## Movie Recommender System

### KNNeighbors and KMeans

In [1]:
import pandas as pd
from scipy import linalg as la
import numpy as np
from scipy.sparse import csr_matrix
from pandas.api.types import CategoricalDtype
from sklearn.neighbors import NearestNeighbors
from sklearn.cluster import KMeans
from scipy.sparse import linalg as spla

In [2]:
#Benjamin's path: path = "/Users/Armen/Desktop/SpringDataProject/"
#Ben C's path:
path = "/Users/benchristensen/Desktop/Recommneder_System_Project/ml-20m/"
ratings = pd.read_csv(path + "ratings.csv", nrows = 10000)

In [3]:
#Create variable "count" for number of ratings for each movie
ratings = ratings.merge(ratings.groupby("movieId").size().reset_index(name='count'), how='right', on='movieId')

In [4]:
#Drop any movies that have 17 or fewer ratings
ratings = ratings[ratings["count"]>17].sort_values(['userId', 'movieId'])

In [5]:
#Find the minimum number of movies reviewed by a user
reviews_per_user = ratings.groupby("userId").userId.apply(lambda x: len(x))
reviews_per_user.describe().min()

1.0

In [6]:
#Generate user-movie matrix with ratings as its values
user_c = CategoricalDtype(sorted(ratings.userId.unique()), ordered=True)
movie_c = CategoricalDtype(sorted(ratings.movieId.unique()), ordered=True)

row = ratings.userId.astype(user_c).cat.codes
col = ratings.movieId.astype(movie_c).cat.codes
sparse_matrix = csr_matrix((ratings['rating'], (row, col)), \
                           shape=(user_c.categories.size, movie_c.categories.size))



In [None]:
"""#Change to dense for viewing its data
dense = sparse_matrix.todense()
"""

In [None]:
#Initialize nearest neighbors model
neigh = NearestNeighbors(n_neighbors=1000, algorithm='brute', metric='cosine')

In [None]:
#Fit on the movie ratings we have
model = neigh.fit(sparse_matrix)

In [None]:
m = pd.read_csv(path + "movies.csv")

In [None]:
m.head()

In [None]:
def movies_not_seen(df, user_id, matrix, movie_c):
    """
    Determine what movies a user has not seen; 
    i.e. potential movies to recommend
    
    Parameters:
        df (DataFrame): Dataframe of movies with their titles and genres
        user_id (int): id for user we want to recommend something to
        matrix (csr matrix): user-movie matrix of movie ratings
        
    Returns:
        new (csr matrix): the columns of matrix corresponding to the movies
                          that user_id hasn't seen
        not_seen (list): the indices of the movies in the user-movie matrix
                         that user_id hasn't seen
    """
    #get the movies the user has seen
    movieids_seen = df[df.userId == user_id].movieId.values
    
    #mask is the indices of the movies that userid has seen in the user-movie matrix
    mask = []
    for i, j in enumerate(movie_c.categories):
        if j in df[df.userId == user_id].movieId.values:
            mask.append(i)
            
    #use the mask to get the indices of the movies userid hasn't seen
    not_seen = list(set(range(len(movie_c.categories))) - set(mask))
    
    #cut out the movies (columns) userid has seen from our matrix
    new = matrix[:,not_seen]
    
    return new, not_seen

def count_not_seen(df, movie_c, not_seen, indices):
    """
    This function returns the number a times a movie not seen by the user was seen by the user's nearest neighbors
    Input:
    Output
    """
    #find how many times each movie has been rated by the neighbors
    ratings_count = []
    #Loop through the movie_ids not seen by our user
    for movie_id in np.array(movie_c.categories)[not_seen]:
        count = 0
        #Loop through the user_ids of our user's neighbor
        for id_ in indices + 1:
            #Check if the userId has rated the movie
            if movie_id in df[df.userId == id_].movieId.values:
                count += 1
        ratings_count.append(count)
    return np.array(ratings_count)

def knn_recommend_movies(df, matrix, fitted_model, user_id, n, movie_c):
    """
    This function uses the K-nearest neighbors algorithm to suggest a movie for the user to watch.
    
    Inputs:
    df: sparse dataframe containing user and movie information. Contains columns user_id, movie_id, rating
    matrix: sparse matrix containing user_id on the y axis and movie_id on the x_axis, with the rating for each movie in the rows
    fitted_model: the type on model we run this clustering method on  
    user_id: the user we want to recommend a movie for
    n: number of movies to recommend (MAY NOT USE_ CHECK BEFORE FINAL)
    
    Outputs: 
    movie_id, title. The id number and title of the recommended movie
    """
    alpha=1      #we recommend 1 movie 
    new, not_seen = movies_not_seen(df, user_id, matrix, movie_c)
    
    #Initialize nearest neighbors model
    neigh = NearestNeighbors(n_neighbors=1000, algorithm='brute', metric='cosine')
    #Fit on the movie ratings we have
    model = neigh.fit(sparse_matrix)
    
    #get the indices for nearest neighbors of userId
    indices = fitted_model.kneighbors(matrix[user_id-1], 3)[1][0]
    
    #neighbors is the actual rows of the matrix with each neighbors ratings
    neighbors = new[indices].todense()

    #find how many times each movie has been rated by the neighbors
    ratings_count = count_not_seen(df, movie_c, not_seen, indices)
    
    #Choose the movie to recommend (weight frequently-seen-movies higher)
    position = np.argmax(np.mean(neighbors - 2.5, axis=0)*alpha*ratings_count.reshape((-1,1)))
    
    #get the movieid from the matrix column value
    movie_id = np.array(movie_c.categories)[not_seen][position]
    print('movie_id',movie_id)
    return movie_id, m[m.movieId == movie_id].title

In [None]:
#FIXME: SPARSE MATRIX IS FILLED WITH 0 NOT 2.5
print(knn_recommend_movies(df, sparse_matrix, model, 12, 1, movie_c))

In [None]:
x = np.array([[1,1,.5],
        [1,6,2],
        [2,7,4],
        [3,2,4.5],
        [3,3,3],
        [3,4,1],
        [4,1,2],
        [5,4,1.5],
        [6,3,5],
        [7,10,4.5],
        [8,5,4.5],
        [9,8,.5],
        [9,9,1.5],
        [10,3,3.5]])
df = pd.DataFrame(x,columns=['userId','movieId','rating'])

user_c = CategoricalDtype(sorted(df.userId.unique()), ordered=True)
movie_c = CategoricalDtype(sorted(df.movieId.unique()), ordered=True)

row = df.userId.astype(user_c).cat.codes
col = df.movieId.astype(movie_c).cat.codes
sparse_matrix = csr_matrix((df['rating'], (row, col)), \
                           shape=(user_c.categories.size, movie_c.categories.size))

dense = sparse_matrix.todense()

In [None]:
def model(avg_rating, score_from_model, recommend = False):
    recommended_score = avg_rating + score_from_model
    if recommended_score >= 3.5:
        recommend = True
    pass

In [None]:
def knn_recommend_movies_old(df, matrix, fitted_model, user_id, n):
    """
    This function uses the K-nearest neighbors algorithm to suggest a movie for the user to watch.
    
    Inputs:
    df: sparse dataframe containing user and movie information. Contains columns user_id, movie_id, rating
    matrix: sparse matrix containing user_id on the y axis and movie_id on the x_axis, with the rating for each movie in the rows
    fitted_model: the type on model we run this clustering method on  
    user_id: the user we want to recommend a movie for
    n: number of movies to recommend (MAY NOT USE_ CHECK BEFORE FINAL)
    
    Outputs: 
    movie_id, title. The id number and title of the recommended movie
    """
    alpha=1
    movieids_seen = df[df.userId == user_id].movieId.values
    #mask is the indices of the movies that userid has seen in the user-movie matrix
    mask = []
    for i, j in enumerate(movie_c.categories):
        if j in df[df.userId == user_id].movieId.values:
            mask.append(i)
    #use the mask to get the indices of the movies userid hasn't seen
    not_seen = list(set(range(len(movie_c.categories))) - set(mask))
    #cut out the movies (columns) userid has seen from our matrix
    new = matrix[:,not_seen]
    #get the indices for nearest neighbors of userId
    indices = fitted_model.kneighbors(matrix[user_id-1], 3)[1][0]
    #neighbors is the actual rows of the matrix with each neighbors ratings
    neighbors = new[indices].todense()
    #find how many times each movie has been rated by the neighbors
    neigh_movies = []

    #find how many times each movie has been rated by the neighbors
    ratings_count = []
    #Loop through the movie_ids not seen by our user
    for movie_id in np.array(movie_c.categories)[not_seen]:
        count = 0
        #Loop through the user_ids of our user's neighbor
        for id_ in indices + 1:
            #Check if the userId has rated the movie
            if movie_id in df[df.userId == id_].movieId.values:
                count += 1
        ratings_count.append(count)
    ratings_count = np.array(ratings_count)
    #Choose the movie to recommend (weight frequently-seen-movies higher)
    position = np.argmax(np.mean(neighbors - 2.5, axis=0)*alpha*ratings_count.reshape((-1,1)))
    #get the movieid from the matrix column value
    movie_id = np.array(movie_c.categories)[not_seen][position]
    print('movie_id',movie_id)
    return movie_id, m[m.movieId == movie_id].title

print(knn_recommend_movies_old(df, sparse_matrix, model, 12, 1))

## inherit from sklearn?
 - Scoring Accuracy:
 - predict for every movie of the user (binary recommend/do not recommend)
 - that will be your 'x test'
 - create a 'y test' that gives if the user liked each movie or not
 - percent correctly predicted.

## KMeans - Clustering Users

In [7]:
#Create sparse dataframe from user-movie matrix
dfs = pd.SparseDataFrame(sparse_matrix, \
                         index=user_c.categories, \
                         columns=movie_c.categories) #,
                         #default_fill_value=2.5)

In [8]:
#Initialize KMeans model
##Kmeans can't be performed in SciKitLearn on data with missing values
##So we fill NA with 2.5 to find the clusters
KM_model = KMeans(n_clusters=5).fit(dfs.fillna(2.5))
KM_model.labels_


array([4, 4, 3, 2, 1, 2, 4, 1, 2, 2, 3, 2, 1, 4, 2, 2, 2, 2, 2, 2, 0, 0,
       0, 3, 0, 1, 2, 2, 1, 2, 4, 1, 2, 1, 0, 2, 2, 2, 2, 4, 2, 2, 2, 2,
       2, 0, 2, 0, 0, 2, 0, 3, 4, 0, 2, 3, 2, 2, 2, 2, 2, 1, 2, 2, 4, 2,
       3, 0, 2, 0, 0, 2, 1, 4, 4, 4, 2, 2, 2, 0, 2, 1, 2, 2, 2, 2, 2, 1,
       3], dtype=int32)

In [176]:
np.argmax((dfs.mean(0).values - 2.5)*(dfs.notnull().sum().values))

17

In [177]:
np.argmax(dfs.mean(0).values)

44

In [188]:
KM_model.cluster_centers_

array([[2.92857143, 2.75      , 3.25      , 2.75      , 3.67857143,
        4.21428571, 3.78571429, 3.42857143, 2.85714286, 2.53571429,
        2.5       , 2.5       , 2.60714286, 2.75      , 2.85714286,
        4.60714286, 2.39285714, 4.39285714, 2.28571429, 2.5       ,
        2.67857143, 3.10714286, 2.60714286, 2.60714286, 2.57142857,
        2.71428571, 2.64285714, 2.67857143, 2.78571429, 3.67857143,
        3.17857143, 2.67857143, 2.5       , 3.17857143, 3.10714286,
        2.78571429, 4.07142857, 2.71428571, 2.60714286, 3.92857143,
        2.75      , 2.57142857, 2.35714286, 2.5       , 3.5       ,
        3.35714286, 2.60714286, 3.39285714, 3.        , 3.21428571,
        3.35714286, 2.82142857, 3.42857143, 2.89285714, 3.57142857,
        3.10714286, 3.14285714, 3.03571429, 3.14285714, 2.82142857,
        2.92857143, 2.96428571, 3.17857143, 3.14285714, 3.28571429,
        3.10714286, 2.82142857, 3.39285714, 3.64285714, 3.07142857,
        3.07142857, 2.85714286, 2.85714286, 2.78

In [381]:
def KMeans_Test(model, df, n, alpha=1.):
    """
    Test the accuracy of a fitted KMeans clustering model.
        To do this we randomly select n movies. For each of these movies
        we replace every non-NAN entry to NAN, then predict the new ratings 
        as the averages of the non-NAN ratings in their KMeans clusters. 
        Then compare these predicted ratings with the actual ratings. 
        Currently using the sum of squared residuals. 
        Change to softmax loss function??
    
    Parameters:
        model (KMeans): The fitted model
        df (DataFrame): The sparse dataframe including NANs that the
                        model was trained on
        n (int): The number of movies to test on
        alpha (float): hyperparameter
    
    Returns:
        score (float): The accuracy of the KMeans clustering
        predictions (list): A flattened list of all predictions made
        actual (list): A flattened list of all the true ratings
    """
    labels = model.labels_
    #Choose indices for the n random movies
    movie_ids = df.iloc[:,np.random.randint(0, df.shape[1], 5)].columns.values#This has been changed from: np.random.randint(0, df.shape[1], n)
    actual, predictions = np.array([]), list()
    for m_id in movie_ids:
        #Find the users that have rated the m_id movie
        m_rated_mask = df.loc[:,m_id].notnull().values.values
        user_ids = df.iloc[m_rated_mask].index.values
        #Grab the ratings and store them in the flattened actual list
        actual = np.concatenate((actual,df.iloc[m_rated_mask].loc[:,m_id].values.values))
        #Find the clusters for each user (df index starts at 1)
        clusters = labels[m_rated_mask]
        #Calculate the predicted ratings as the average of the ratings
        ##of the other users in their clusters. This is different than using the 
        ##cluster centers because the cluster centers used a dataframe with 2.5
        ##filled in for every NaN value
        for i, user_id in enumerate(user_ids):
            #Remove the user_id from the dataframe for cluster comparisons
            temp_df = df[df.index != user_id].copy()
            temp_labels = np.delete(labels, np.where(m_rated_mask)[0][i])
            neighbor_ratings = temp_df.iloc[temp_labels == clusters[i]].loc[:,m_id]
            neighbor_avgs = temp_df.iloc[temp_labels == clusters[i]].mean(1)
            user_avg = df.loc[user_id].mean()
            #Calculate the number of ratings this user's neighbors have given
            ##for every movie that this user hasn't seen
            #within_cluster_index = np.where(df.iloc[labels==clusters[i]].columns.values == m_id)[0][0]
            #not_seen = np.concatenate((df.iloc[labels==clusters[i]].iloc[i].isnull().values,[within_cluster_index]))
            num_ratings = temp_df.iloc[temp_labels == clusters[i]].iloc[:,df.loc[user_id].isnull().values].notnull().sum()
            total = num_ratings.sum()
            try:
                share = (num_ratings/total).loc[m_id]
            except:
                print("Movie ID:", m_id)
                print("User ID:", user_id)
                print("cluster:", cluster)
                print('i:', i)
                print("Num_ratings:", num_ratings, sep="\n")
                raise
            predictions.append(round(2*(user_avg + np.mean(neighbor_ratings - neighbor_avgs)*alpha*share))/2)
            print("worked")
    
    return np.sum((np.array(predictions) - actual)**2), predictions, actual

In [376]:
m_id, user_id, i = 736, 5, 0

In [375]:
cluster = 3

In [378]:
temp_df.iloc[labels == cluster]#.iloc[:,df.loc[user_id].isnull().values].notnull().sum()

IndexError: indices are out-of-bounds

In [368]:
clusters

array([4, 3, 2, 0, 0, 0, 3, 0, 1, 1, 1, 0, 2, 2, 0, 2, 0, 0, 3, 0, 3, 2,
       3, 0, 0, 2, 2, 3], dtype=int32)

In [382]:
KMeans_Test(KM_model, dfs, 5)

Movie ID: 1097
User ID: 1
cluster: 3
i: 0
Num_ratings:
1       2
21      0
39      1
110     6
111     1
150     1
153     0
165     0
185     0
231     0
316     4
329     0
339     2
344     2
356     2
364     3
377     1
380     2
457     2
480     6
500     1
527     1
587     2
588     3
590     2
592     0
595     1
597     1
608     0
648     0
733     1
736     1
780     2
858     1
1073    3
1197    5
1210    8
1213    0
1265    3
1270    4
1580    5
1721    3
1923    1
2028    3
2329    1
2571    5
2858    4
3578    3
dtype: int64


KeyError: 1097

In [271]:
cluster = 4

In [279]:
df.iloc[labels == cluster].iloc[:,df.iloc[labels == cluster].iloc[i].isnull().values].notnull().sum()

21      5
47      5
50      6
111     3
329     4
339     4
457     6
590     5
733     5
924     4
1208    4
1259    4
1265    5
1923    4
2329    4
dtype: int64

In [237]:
(df.iloc[labels == cluster].iloc[:,df.loc[3].isnull().values].notnull().sum() / df.iloc[labels == cluster].iloc[:,df.loc[3].isnull().values].notnull().sum().sum()).iloc[0]#.loc[21]

0.025

In [245]:
df.iloc[df.iloc[:,m_index].notnull().values.values].iloc[0]

1       NaN
21      NaN
32      3.5
39      NaN
47      3.5
50      3.5
110     NaN
111     NaN
150     NaN
153     NaN
165     NaN
185     NaN
231     NaN
253     4.0
260     4.0
296     4.0
316     NaN
318     4.0
329     NaN
339     NaN
344     NaN
356     NaN
364     NaN
367     3.5
377     NaN
380     NaN
457     NaN
480     NaN
500     NaN
527     NaN
       ... 
858     NaN
924     3.5
1073    NaN
1097    4.0
1136    3.5
1196    4.5
1197    NaN
1198    4.5
1208    3.5
1210    NaN
1213    NaN
1240    4.0
1259    4.0
1265    NaN
1270    NaN
1291    3.5
1580    NaN
1721    NaN
1923    NaN
2028    NaN
2329    NaN
2571    NaN
2716    3.5
2762    4.0
2858    NaN
2959    4.0
3578    NaN
4306    4.0
4993    5.0
5952    5.0
Name: 1, Length: 74, dtype: Sparse[float64, nan]
IntIndex
Indices: array([ 2,  4,  5, 13, 14, 15, 17, 23, 30, 33, 36, 45, 47, 48, 49, 51, 52,
       55, 56, 59, 66, 67, 69, 71, 72, 73], dtype=int32)