## Movie Recommender System

### KNNeighbors and KMeans

In [1]:
import pandas as pd
from scipy import linalg as la
import numpy as np
from scipy.sparse import csr_matrix
from pandas.api.types import CategoricalDtype
from sklearn.neighbors import NearestNeighbors
from sklearn.cluster import KMeans
from scipy.sparse import linalg as spla
from time import time

### Notes
 - Create a graph showing the distribution of average ratings for all users
 - Maybe another graph grouped by clusters

In [None]:
#Benjamin's path: path = "/Users/Armen/Desktop/SpringDataProject/"
#Ben C's path:
path = "/Users/benchristensen/Desktop/Recommneder_System_Project/ml-20m/"
ratings = pd.read_csv(path + "ratings.csv", nrows = 10000)

In [None]:
ratings.head(20)

In [None]:
#Create variable "count" for number of ratings for each movie
ratings = ratings.merge(ratings.groupby("movieId").size().reset_index(name='count'), how='right', on='movieId')

In [None]:
#Drop any movies that have 17 or fewer ratings
ratings = ratings[ratings["count"]>17].sort_values(['userId', 'movieId'])

In [None]:
#Find the minimum number of movies reviewed by a user
reviews_per_user = ratings.groupby("userId").userId.apply(lambda x: len(x))
reviews_per_user.describe().min()

In [None]:
#Generate user-movie matrix with ratings as its values
user_c = CategoricalDtype(sorted(ratings.userId.unique()), ordered=True)
movie_c = CategoricalDtype(sorted(ratings.movieId.unique()), ordered=True)

row = ratings.userId.astype(user_c).cat.codes
col = ratings.movieId.astype(movie_c).cat.codes
sparse_matrix = csr_matrix((ratings['rating'], (row, col)), \
                           shape=(user_c.categories.size, movie_c.categories.size))



In [None]:
df.head()

In [None]:
"""#Change to dense for viewing its data
dense = sparse_matrix.todense()
"""

In [None]:
#Initialize nearest neighbors model
neigh = NearestNeighbors(n_neighbors=1000, algorithm='brute', metric='cosine')

In [None]:
#Fit on the movie ratings we have
model = neigh.fit(sparse_matrix)

In [None]:
m = pd.read_csv(path + "movies.csv")

In [None]:
m.head()

In [None]:
def movies_not_seen(df, user_id, matrix, movie_c):
    """
    Determine what movies a user has not seen; 
    i.e. potential movies to recommend
    
    Parameters:
        df (DataFrame): Dataframe of movies with their titles and genres
        user_id (int): id for user we want to recommend something to
        matrix (csr matrix): user-movie matrix of movie ratings
        
    Returns:
        new (csr matrix): the columns of matrix corresponding to the movies
                          that user_id hasn't seen
        not_seen (list): the indices of the movies in the user-movie matrix
                         that user_id hasn't seen
    """
    #get the movies the user has seen
    movieids_seen = df[df.userId == user_id].movieId.values
    
    #mask is the indices of the movies that userid has seen in the user-movie matrix
    mask = []
    for i, j in enumerate(movie_c.categories):
        if j in df[df.userId == user_id].movieId.values:
            mask.append(i)
            
    #use the mask to get the indices of the movies userid hasn't seen
    not_seen = list(set(range(len(movie_c.categories))) - set(mask))
    
    #cut out the movies (columns) userid has seen from our matrix
    new = matrix[:,not_seen]
    
    return new, not_seen

def count_not_seen(df, movie_c, not_seen, indices):
    """
    This function returns the number a times a movie not seen by the user was seen by the user's nearest neighbors
    Input:
    Output
    """
    #find how many times each movie has been rated by the neighbors
    ratings_count = []
    #Loop through the movie_ids not seen by our user
    for movie_id in np.array(movie_c.categories)[not_seen]:
        count = 0
        #Loop through the user_ids of our user's neighbor
        for id_ in indices + 1:
            #Check if the userId has rated the movie
            if movie_id in df[df.userId == id_].movieId.values:
                count += 1
        ratings_count.append(count)
    return np.array(ratings_count)

def knn_recommend_movies(df, matrix, fitted_model, user_id, n, movie_c):
    """
    This function uses the K-nearest neighbors algorithm to suggest a movie for the user to watch.
    
    Inputs:
    df: sparse dataframe containing user and movie information. Contains columns user_id, movie_id, rating
    matrix: sparse matrix containing user_id on the y axis and movie_id on the x_axis, with the rating for each movie in the rows
    fitted_model: the type on model we run this clustering method on  
    user_id: the user we want to recommend a movie for
    n: number of movies to recommend (MAY NOT USE_ CHECK BEFORE FINAL)
    
    Outputs: 
    movie_id, title. The id number and title of the recommended movie
    """
    alpha=1      #we recommend 1 movie 
    new, not_seen = movies_not_seen(df, user_id, matrix, movie_c)
    
    #Initialize nearest neighbors model
    neigh = NearestNeighbors(n_neighbors=1000, algorithm='brute', metric='cosine')
    #Fit on the movie ratings we have
    model = neigh.fit(sparse_matrix)
    
    #get the indices for nearest neighbors of userId
    indices = fitted_model.kneighbors(matrix[user_id-1], 3)[1][0]
    
    #neighbors is the actual rows of the matrix with each neighbors ratings
    neighbors = new[indices].todense()

    #find how many times each movie has been rated by the neighbors
    ratings_count = count_not_seen(df, movie_c, not_seen, indices)
    
    #Choose the movie to recommend (weight frequently-seen-movies higher)
    position = np.argmax(np.mean(neighbors - 2.5, axis=0)*alpha*ratings_count.reshape((-1,1)))
    
    #get the movieid from the matrix column value
    movie_id = np.array(movie_c.categories)[not_seen][position]
    print('movie_id',movie_id)
    return movie_id, m[m.movieId == movie_id].title

In [None]:
#FIXME: SPARSE MATRIX IS FILLED WITH 0 NOT 2.5
print(knn_recommend_movies(df, sparse_matrix, model, 12, 1, movie_c))

In [None]:
x = np.array([[1,1,.5],
        [1,6,2],
        [2,7,4],
        [3,2,4.5],
        [3,3,3],
        [3,4,1],
        [4,1,2],
        [5,4,1.5],
        [6,3,5],
        [7,10,4.5],
        [8,5,4.5],
        [9,8,.5],
        [9,9,1.5],
        [10,3,3.5]])
df = pd.DataFrame(x,columns=['userId','movieId','rating'])

user_c = CategoricalDtype(sorted(df.userId.unique()), ordered=True)
movie_c = CategoricalDtype(sorted(df.movieId.unique()), ordered=True)

row = df.userId.astype(user_c).cat.codes
col = df.movieId.astype(movie_c).cat.codes
sparse_matrix = csr_matrix((df['rating'], (row, col)), \
                           shape=(user_c.categories.size, movie_c.categories.size))

dense = sparse_matrix.todense()

In [None]:
def model(avg_rating, score_from_model, recommend = False):
    recommended_score = avg_rating + score_from_model
    if recommended_score >= 3.5:
        recommend = True
    pass

In [None]:
def knn_recommend_movies_old(df, matrix, fitted_model, user_id, n):
    """
    This function uses the K-nearest neighbors algorithm to suggest a movie for the user to watch.
    
    Inputs:
    df: sparse dataframe containing user and movie information. Contains columns user_id, movie_id, rating
    matrix: sparse matrix containing user_id on the y axis and movie_id on the x_axis, with the rating for each movie in the rows
    fitted_model: the type on model we run this clustering method on  
    user_id: the user we want to recommend a movie for
    n: number of movies to recommend (MAY NOT USE_ CHECK BEFORE FINAL)
    
    Outputs: 
    movie_id, title. The id number and title of the recommended movie
    """
    alpha=1
    movieids_seen = df[df.userId == user_id].movieId.values
    #mask is the indices of the movies that userid has seen in the user-movie matrix
    mask = []
    for i, j in enumerate(movie_c.categories):
        if j in df[df.userId == user_id].movieId.values:
            mask.append(i)
    #use the mask to get the indices of the movies userid hasn't seen
    not_seen = list(set(range(len(movie_c.categories))) - set(mask))
    #cut out the movies (columns) userid has seen from our matrix
    new = matrix[:,not_seen]
    #get the indices for nearest neighbors of userId
    indices = fitted_model.kneighbors(matrix[user_id-1], 3)[1][0]
    #neighbors is the actual rows of the matrix with each neighbors ratings
    neighbors = new[indices].todense()
    #find how many times each movie has been rated by the neighbors
    neigh_movies = []

    #find how many times each movie has been rated by the neighbors
    ratings_count = []
    #Loop through the movie_ids not seen by our user
    for movie_id in np.array(movie_c.categories)[not_seen]:
        count = 0
        #Loop through the user_ids of our user's neighbor
        for id_ in indices + 1:
            #Check if the userId has rated the movie
            if movie_id in df[df.userId == id_].movieId.values:
                count += 1
        ratings_count.append(count)
    ratings_count = np.array(ratings_count)
    #Choose the movie to recommend (weight frequently-seen-movies higher)
    position = np.argmax(np.mean(neighbors - 2.5, axis=0)*alpha*ratings_count.reshape((-1,1)))
    #get the movieid from the matrix column value
    movie_id = np.array(movie_c.categories)[not_seen][position]
    print('movie_id',movie_id)
    return movie_id, m[m.movieId == movie_id].title

print(knn_recommend_movies_old(df, sparse_matrix, model, 12, 1))

## inherit from sklearn?
 - Scoring Accuracy:
 - predict for every movie of the user (binary recommend/do not recommend)
 - that will be your 'x test'
 - create a 'y test' that gives if the user liked each movie or not
 - percent correctly predicted.

## KMeans - Clustering Users

Method for predicting user $i$'s movie rating for movie $j$:

$\bar{u}_i:$ User $i$'s average movie rating

$\bar{m}_j:$ Cluster's average rating for movie $j$

$\bar{c}:$ Cluster's average movie rating

$C:$ Cluster corresponding to user $i$

$M:$ All movies

$\gamma_j : \frac{\sum_{i\in C}{\mathbbm{1}(r_{i,j})}}{\sum_{i\in C, k\in M}\mathbbm{1}(r_{i,k})}$

$$prediction_{i,j} = \bar{u}_i + (\bar{m}_j - \bar{c})*\frac{\gamma_j}{\max_{k\in M}{\gamma_k}} $$



In [2]:
def KMeans_Predict(model, df):
    """
    Use a fitted KMeans clustering model to predict a user's movie rating
    
    Parameter:
        model (): Fitted KMeans Model
        df (Sparse DataFrame): user-movie rating sparse dataframe
    
    Returns:
        df (DataFrame): New Dataframe with NaNs filled with predicted ratings
        test (DataFrame): Dataframe for testing that includes predictions rather
                          than any actual ratings.
    """
    #Assign clusters
    df['cluster'] = model.labels_
    #Find the proportion of ratings given for each movie by cluster
    share_df = df.groupby(['cluster']).count()/df.groupby(['cluster']).count().sum(1).values.reshape((-1,1)) #SHARE
    #Find the average rating for each movie by cluster
    cluster_avgs_df = df.groupby(["cluster"]).mean()  #m_bar
    #Define the average movie rating over all movies by user
    df['avg'] = df.mean(1)                            #u_bar
    #Find the average movie rating over all movies by cluster
    cluster_avgs = df.groupby(["cluster"]).mean().avg #c_bar
    #Calculate the adjustment for predicting purposes
    diff_df = (cluster_avgs_df - cluster_avgs.values.values.reshape((-1,1)))*(share_df/share_df.max(0))
    #If no one in cluster has seen the movie, adjustment is zero
    diff_df = diff_df.fillna(0)
    #Make the diff_df same size as df
    update_df = df[['cluster']].reset_index().merge(diff_df, how='left', on='cluster').set_index('index')
    #Add the adjustment to each user's avg movie rating
    update_df = update_df.iloc[:,1:].apply(lambda x: x + df.avg)
    #Fill the NaNs with the predictions
    df.update(update_df, overwrite=False)
    
    return df, update_df

def Test_KMeans(updated, predictions):
    return ((updated - predictions)**2).sum().sum()

def standardize(df):
    #Only pass in columns of movie_IDs to standardize movie ratings for
    #Wouldn't want to change clusters
    clipped = df.clip(.5, 5)
    return clipped.apply(lambda x: round(2*x)/2)
    

In [3]:
path = "/Users/benchristensen/Desktop/Recommneder_System_Project/ml-20m/"
ratings = pd.read_csv(path + "ratings.csv", nrows = 10000)
#Generate user-movie matrix with ratings as its values
user_c = CategoricalDtype(sorted(ratings.userId.unique()), ordered=True)
movie_c = CategoricalDtype(sorted(ratings.movieId.unique()), ordered=True)

row = ratings.userId.astype(user_c).cat.codes
col = ratings.movieId.astype(movie_c).cat.codes
sparse_matrix = csr_matrix((ratings['rating'], (row, col)), \
                           shape=(user_c.categories.size, movie_c.categories.size))

In [4]:
#Create sparse dataframe from user-movie matrix
dfs = pd.SparseDataFrame(sparse_matrix, \
                         index=user_c.categories, \
                         columns=movie_c.categories) #,
                         #default_fill_value=2.5)
#Initialize KMeans model
##Kmeans can't be performed in SciKitLearn on data with missing values
##So we fill NA with 2.5 to find the clusters
KM_model = KMeans(n_clusters=5).fit(dfs.fillna(2.5))
KM_model.labels_
df = dfs.copy()

In [6]:
new, test = KMeans_Predict(KM_model, df)
SSR = Test_KMeans(new, test)

In [13]:
print("Sum of squared residuals:", SSR)

Sum of squared residuals: 5035.22465712373
