## Movie Recommender System

### KNNeighbors and KMeans

In [2]:
import pandas as pd
from scipy import linalg as la
import numpy as np
from scipy.sparse import csr_matrix
from pandas.api.types import CategoricalDtype
from sklearn.neighbors import NearestNeighbors
from sklearn.cluster import KMeans
from scipy.sparse import linalg as spla
from time import time

### Notes
 - Create a graph showing the distribution of average ratings for all users
 - Maybe another graph grouped by clusters

In [22]:
#Benjamin's path: path = "/Users/Armen/Desktop/SpringDataProject/"
#Ben C's path:
path = "/Users/benchristensen/Desktop/Recommneder_System_Project/ml-20m/"
ratings = pd.read_csv(path + "ratings.csv", nrows = 10000)

In [4]:
#Create variable "count" for number of ratings for each movie
ratings = ratings.merge(ratings.groupby("movieId").size().reset_index(name='count'), how='right', on='movieId')

In [5]:
#Drop any movies that have 17 or fewer ratings
ratings = ratings[ratings["count"]>17].sort_values(['userId', 'movieId'])

In [6]:
#Find the minimum number of movies reviewed by a user
reviews_per_user = ratings.groupby("userId").userId.apply(lambda x: len(x))
reviews_per_user.describe().min()

1.0

## inherit from sklearn?
 - Scoring Accuracy:
 - predict for every movie of the user (binary recommend/do not recommend)
 - that will be your 'x test'
 - create a 'y test' that gives if the user liked each movie or not
 - percent correctly predicted.

## KMeans - Clustering Users

Method for predicting user $i$'s movie rating for movie $j$:

$P_{i, j}:$ Prediction for user $i$'s rating of movie $j$

$\bar{u}_i:$ User $i$'s average movie rating

$\bar{m}_j:$ Cluster's average rating for movie $j$

$\bar{c}:$ Cluster's average movie rating

$C:$ Cluster corresponding to user $i$

$M:$ All movies

$\mathbb{1}(r_{i,j}):$ A function indiciating if user $i$ has rated movie $j$ 

$\gamma_j : \frac{\sum_{i\in C}{\mathbb{1}(r_{i,j})}}{\sum_{i\in C, k\in M}\mathbb{1}(r_{i,k})}$

$$P_{i, j} = \bar{u}_i + (\bar{m}_j - \bar{c})*\frac{\gamma_j}{\max_{k\in M}{\gamma_k}} $$



In [45]:
def KMeans_Predict(model, df):
    """
    Use a fitted KMeans clustering model to predict a user's movie rating
    
    Parameter:
        model (): Fitted KMeans Model
        df (Sparse DataFrame): user-movie rating sparse dataframe
    
    Returns:
        df (DataFrame): New Dataframe with NaNs filled with predicted ratings
        update_df (DataFrame): Dataframe for testing that includes predictions rather
                              than any actual ratings.
    """
    #Assign clusters
    df['cluster'] = model.labels_
    #Find the proportion of ratings given for each movie by cluster
    share_df = df.groupby(['cluster']).count()/df.groupby(['cluster']).count().sum(1).values.reshape((-1,1)) #SHARE
    #Find the average rating for each movie by cluster
    cluster_avgs_df = df.groupby(["cluster"]).mean()  #m_bar
    #Define the average movie rating over all movies by user
    df['avg'] = df.mean(1)                            #u_bar
    #Find the average movie rating over all movies by cluster
    cluster_avgs = df.groupby(["cluster"]).mean().avg #c_bar
    #Calculate the adjustment for predicting purposes
    diff_df = (cluster_avgs_df - cluster_avgs.values.values.reshape((-1,1)))*(share_df/share_df.max(0))
    #If no one in cluster has seen the movie, adjustment is zero
    diff_df = diff_df.fillna(0)
    #Make the diff_df same size as df
    update_df = df[['cluster']].reset_index().merge(diff_df, how='left', on='cluster').set_index('index')
    #Add the adjustment to each user's avg movie rating
    update_df = update_df.iloc[:,1:].apply(lambda x: x + df.avg)
    #Fill the NaNs with the predictions
    df.update(update_df, overwrite=False)
    
    return df.iloc[:,:-2], update_df

def Test_KMeans(updated, predictions):
    #Find the sum of squared residuals REPLACE WITH ANOTHER LOSS METRIC? SOFTMAX??
    residuals = updated - predictions
    return ((residuals)**2).sum().sum() / (residuals!=0).sum().sum()

def standardize(df):
    """
    Change Predictions to standard movie rating format:
        Only takes values 0.5, 1, 1.5, ..., 5.0
        
    Parameters:
        df(Sparse Dataframe): user-movie rating dataframe
        
    Returns:
        (Sparse Dataframe): Standardized user-movie rating dataframe
    """
    #Only pass in columns of movie_IDs to standardize movie ratings for
    #Wouldn't want to change clusters
    clipped = df.clip(.5, 5)
    return clipped.apply(lambda x: round(2*x)/2)
    

In [4]:
path = "/Users/benchristensen/Desktop/Recommneder_System_Project/ml-20m/"
ratings = pd.read_csv(path + "ratings.csv", nrows = 10000)
#Generate user-movie matrix with ratings as its values
user_c = CategoricalDtype(sorted(ratings.userId.unique()), ordered=True)
movie_c = CategoricalDtype(sorted(ratings.movieId.unique()), ordered=True)

row = ratings.userId.astype(user_c).cat.codes
col = ratings.movieId.astype(movie_c).cat.codes
sparse_matrix = csr_matrix((ratings['rating'], (row, col)), \
                           shape=(user_c.categories.size, movie_c.categories.size))

In [5]:
#Create sparse dataframe from user-movie matrix
dfs = pd.SparseDataFrame(sparse_matrix, \
                         index=user_c.categories, \
                         columns=movie_c.categories) #,
                         #default_fill_value=2.5)
#Initialize KMeans model
##Kmeans can't be performed in SciKitLearn on data with missing values
##So we fill NA with 2.5 to find the clusters
KM_model = KMeans(n_clusters=5).fit(dfs.fillna(2.5))
KM_model.labels_
df = dfs.copy()

In [6]:
KM_model = KMeans(n_clusters=5).fit(dfs.fillna(2.5))
KM_model.labels_
df = dfs.copy()

In [20]:
new, test = KMeans_Predict(KM_model, df)
mse1 = Test_KMeans(new, test)
mse2 = Test_KMeans(standardize(new), standardize(test))


In [21]:
print("mean squared error:", mse1)
print('mean squared error of standardized:', mse2)

mean squared error: 0.018543372035064457
mean squared error of standardized: 0.6335435779816514


## KNN with cosine similarity

In [15]:
path = "/Users/benchristensen/Desktop/Recommneder_System_Project/ml-20m/"
ratings = pd.read_csv(path + "ratings.csv", nrows = 10000)
#Generate user-movie matrix with ratings as its values
user_c = CategoricalDtype(sorted(ratings.userId.unique()), ordered=True)
movie_c = CategoricalDtype(sorted(ratings.movieId.unique()), ordered=True)

row = ratings.userId.astype(user_c).cat.codes
col = ratings.movieId.astype(movie_c).cat.codes
sparse_matrix = csr_matrix((ratings['rating'], (row, col)), \
                           shape=(user_c.categories.size, movie_c.categories.size))
#Create sparse dataframe from user-movie matrix
dfs = pd.SparseDataFrame(sparse_matrix, \
                         index=user_c.categories, \
                         columns=movie_c.categories) #,
                         #default_fill_value=2.5)

Cosine Similarity:

$$sim(u, v) = \frac{\sum_i{(r_{ui}-\bar{r_u})(r_{vi}-\bar{r_v})}}{\sqrt{\sum_i{(r_{ui}-\bar{r_u})^2}}\sqrt{\sum_i{(r_{vi}-\bar{r_v})^2}}}$$

In [16]:
def cosine_similarity(u, v):
    """
    Find the cosine similarity between two users given their 
        movie ratings.
        
    Parameters:
        u (Series): user 'u' movie ratings
        v (Series): user 'v' movie ratings
        
    Returns:
        (float): cosine similarity between u and v
    """
    #Demean the ratings
    #    then fill NaN values with 0
    #    this penalizes not seeing the same movies
    u_hat = (u-u.mean()).fillna(0)
    v_hat = (v-v.mean()).fillna(0)
    
    return np.sum(u_hat*v_hat)/(np.sqrt(np.sum(u_hat**2))*np.sqrt(np.sum(v_hat**2)))
    

Prediction using cosine similarity (Pham et. al):

$P_{i, j}:$ Prediction for user $i$'s rating of movie $j$

$r_{k,j}:$ User $k$'s rating of movie $j$ (Equal to $\bar{r}_k$ if user $k$ hasn't rated movie $j$)

$\bar{r}_i:$ User $i$'s average movie rating

$w_{i,k}:$ Cosine similarity between user $i$ and user $k$

$N_i:$ The set of user $i$'s neighbors

$$P_{i,j} = \bar{r}_i + \frac{\sum_{k\in N_i}{(r_{k,j}-\bar{r}_k)w_{i,k}}}{\sum_{k\in N_i}{\mid w_{i,k}\mid}}    $$


In [125]:
def KNN_Predict(df, k):
    """
    Peform K-Nearest Neighbors and use it to predict movie ratings
    
    Parameter:
        df (Sparse DataFrame): user-movie rating sparse dataframe
        k (int): Hyperparameter - the number of neighbors
    
    Returns:
        df (DataFrame): New Dataframe with NaNs filled with predicted ratings
        update_df (DataFrame): Dataframe for testing that includes predictions rather
                              than any actual ratings.
    """
    update_df = df.copy().to_dense()
    for user_id in df.index:
        user = df.loc[user_id]
        #Find the cosine similarity for 'user' with every other user
        similarities = df.apply(lambda row: cosine_similarity(row, user), axis=1)
        similarity_df = pd.DataFrame(data={"similarity": similarities, "idx": similarities.index})
        similarity_df.sort_values("similarity", ascending = False, inplace=True)
        #Find the k nearest neighbors
        neighbors = similarity_df.iloc[1:k+1].idx.values
        #Predict movie rating using eq. (4) from Pham et. al paper
        predict = user.mean() + ((df.loc[neighbors] - df.loc[neighbors].mean(1).values.reshape((-1,1))) * similarity_df.iloc[1:k+1]["similarity"].values.values.reshape((-1,1))).sum()/similarity_df.iloc[1:k+1]["similarity"].abs().sum()  
        update_df.loc[user_id] = predict
    
    df.update(update_df, overwrite=False)
    return df, update_df

In [90]:
df = dfs.copy()

In [126]:
new, test = KNN_Predict(df, 5)

In [44]:
new

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,111921,112138,112290,112556,112852,116797,117511,117590,118696,125916
1,4.042261,3.500000,3.558134,2.613690,3.419937,3.882921,3.648473,3.113690,3.613690,3.536767,...,4.613690,4.113690,5.113690,5.113690,4.613690,3.613690,3.113690,3.613690,5.113690,0.613690
2,4.277049,3.195423,4.000000,2.848477,3.654724,4.117708,3.883260,3.348477,3.848477,3.771554,...,4.848477,4.348477,5.348477,5.348477,4.848477,3.848477,3.348477,3.848477,5.348477,0.848477
3,4.000000,3.557599,3.806699,4.106383,3.894101,3.951173,3.761223,4.106383,4.106383,4.210909,...,4.106383,4.106383,4.106383,4.106383,4.106383,4.106383,4.106383,4.106383,4.106383,4.106383
4,3.826548,2.744922,3.342421,2.397977,3.204224,3.000000,3.432760,2.897977,3.397977,4.000000,...,4.397977,3.897977,4.897977,4.897977,4.397977,3.397977,2.897977,3.397977,4.897977,0.397977
5,4.548113,3.000000,4.063986,3.119541,3.925788,4.388772,4.154324,3.619541,4.119541,4.042618,...,5.119541,4.619541,5.619541,5.619541,5.119541,4.119541,3.619541,4.119541,5.619541,1.119541
6,5.000000,2.907681,3.000000,2.560735,3.366982,3.829966,5.000000,3.060735,3.560735,3.483812,...,4.560735,4.060735,5.060735,5.060735,4.560735,3.560735,3.060735,3.560735,5.060735,0.560735
7,3.594505,2.512879,3.000000,2.165934,2.972181,3.435165,3.000000,2.665934,3.165934,3.089011,...,4.165934,3.665934,4.665934,4.665934,4.165934,3.165934,2.665934,3.165934,4.665934,0.165934
8,4.000000,3.002329,5.000000,2.655383,3.461630,3.000000,3.690166,3.155383,3.655383,4.000000,...,4.655383,4.155383,5.155383,5.155383,4.655383,3.655383,3.155383,3.655383,5.155383,0.655383
9,3.337084,2.255459,2.852958,1.908513,2.714760,3.177744,2.943296,2.408513,2.908513,2.831590,...,3.908513,3.408513,4.408513,4.408513,3.908513,2.908513,2.408513,2.908513,4.408513,-0.091487
10,4.000000,3.073835,3.671334,2.726889,3.533136,3.996120,3.761672,3.226889,3.726889,3.649966,...,4.726889,4.226889,5.226889,5.226889,4.726889,3.726889,3.226889,3.726889,5.226889,0.726889


In [130]:
mse = Test_KMeans(standardize(new), standardize(test))
print("mean squared error:", mse)

mean squared error: 94.4532967032967


In [46]:
mse = Test_KMeans(new, test)
print("mean squared error:", mse)

mean squared error: 0.018543372035064457


## Selecting Hyperparameters