In [2]:
import os
import timeit
from time import time

import numpy as np
import pandas as pd
from IPython.display import clear_output
from pandas.api.types import CategoricalDtype
from scipy import linalg as la
from scipy.sparse import csr_matrix
from scipy.sparse import linalg as spla
from sklearn.cluster import KMeans
from sklearn.neighbors import NearestNeighbors

In [3]:
#Directory that holds the MovieLens csv files
#    Set the MOVIELENS_DATA_DIR environment variable to read from another
#    location; the default below is Ben C's local copy
#Benjamin's path: path = "/Users/Armen/Desktop/SpringDataProject/"
#Ben C's path:
path = os.environ.get(
    "MOVIELENS_DATA_DIR",
    "/Users/benchristensen/Desktop/Recommneder_System_Project/ml-20m/",
)
#Guarantee a trailing separator so that `path + file_name` stays a valid path
path = os.path.join(path, "")
ratings = pd.read_csv(path + "ratings.csv")#, nrows = 10000)
movies = pd.read_csv(path+"movies.csv")#, nrows=10000)
genome_scores = pd.read_csv(path+"genome-scores.csv")#, nrows=10000)

In [4]:
#Build the user-by-movie matrix whose values are the ratings
#    Ordered categoricals map the raw ids onto contiguous row/column positions
user_c = CategoricalDtype(sorted(ratings["userId"].unique()), ordered=True)
movie_c = CategoricalDtype(sorted(ratings["movieId"].unique()), ordered=True)

#Position of every rating's user (row) and movie (column) in the matrix
row = ratings["userId"].astype(user_c).cat.codes
col = ratings["movieId"].astype(movie_c).cat.codes
sparse_matrix = csr_matrix(
    (ratings["rating"], (row, col)),
    shape=(user_c.categories.size, movie_c.categories.size),
)
#Wrap the matrix in a sparse dataframe labelled with the original ids
#    NOTE(review): pd.SparseDataFrame was removed in pandas 1.0, so this cell
#    needs an older pandas release to run
rating_df = pd.SparseDataFrame(
    sparse_matrix,
    index=user_c.categories,
    columns=movie_c.categories,
)

In [7]:
#Density of the sparse rating dataframe, expressed as a percentage
rating_df.density * 100

0.539984781355445

Similarity between two movies using Pearson Correlation:

$r_{ui}:$ User $u$'s rating of movie $i$

$\bar{r_i}:$ Average movie rating for movie $i$

$$sim(i, j) = \frac{\sum_u{(r_{ui}-\bar{r_i})(r_{uj}-\bar{r_j})}}{\sqrt{\sum_u{(r_{ui}-\bar{r_i})^2}}\sqrt{\sum_u{(r_{uj}-\bar{r_j})^2}}}$$

Since there are no NaN values in the content-based-filtering approach, the cosine similarity of the mean-centered vectors is equivalent to the Pearson correlation.

In [None]:
def cosine_similarity(u, v):
    """
    Compute the cosine similarity of two mean-centered vectors.
    
    Each vector is centered on its own mean. Entries that are NaN are
    replaced with 0 after centering, so a coordinate that is missing from
    either vector adds nothing to the dot product.
        
    Parameters:
        u (Series): first vector
        v (Series): second vector
        
    Returns:
        (float): cosine similarity between the centered u and v
                 (NaN when either centered vector is all zeros)
    """
    #Center each vector on its own mean, then zero out the missing entries
    centered_u = u.sub(u.mean()).fillna(0)
    centered_v = v.sub(v.mean()).fillna(0)
    
    dot_product = np.sum(centered_u * centered_v)
    norm_u = np.sqrt(np.sum(centered_u**2))
    norm_v = np.sqrt(np.sum(centered_v**2))
    
    return dot_product / (norm_u * norm_v)
    

Prediction using cosine similarity (Pham et al.):

$P_{i, j}:$ Prediction for user $i$'s rating of movie $j$

$r_{i,k}:$ User $i$'s rating of movie $k$ (equal to $\bar{r}_k$ if user $i$ hasn't rated movie $k$)

$\bar{r}_k:$ Average rating for movie $k$

$w_{j,k}:$ Cosine similarity between movie $j$ and movie $k$

$N_j:$ The set of movie $j$'s neighbors

$$P_{i,j} = \bar{r}_i + \frac{\sum_{k\in N_j}{(r_{i,k}-\bar{r}_k)w_{j,k}}}{\sum_{k\in N_j}{\mid w_{j,k}\mid}}    $$



In [None]:
def KNN_Predict(df, rating_df, k):
    """
    Perform K-Nearest Neighbors on movie features and use it to predict movie ratings
    
    For every movie in df, the k most similar other movies are found using
    cosine_similarity on their feature rows. A user's rating of the movie is
    then predicted as the user's mean rating plus the similarity-weighted
    average of the user's mean-centered ratings of those neighbors.
    
    Parameter:
        df (DataFrame): movie feature dataframe (one row per movie, indexed by movie id)
        rating_df (DataFrame): user-movie rating dataframe (rows are users, columns
                               are movie ids, NaN where unrated). It is not modified.
        k (int): Hyperparameter - the number of neighbors
    
    Returns:
        filled_df (DataFrame): Copy of rating_df with NaNs filled with predicted ratings
        update_df (DataFrame): Dataframe for testing that includes predictions rather
                              than any actual ratings.
    """
    update_df = rating_df.copy()#.to_dense()
    #Each user's mean rating does not depend on the movie, so compute it once
    user_means = rating_df.mean(axis=1)
    n_movies = len(df.index)
    start = timeit.default_timer()
    for i, movie_id in enumerate(df.index):
        clear_output(wait=True)
        movie = df.iloc[i]
        #Find the cosine similarity for 'movie' with every other movie
        similarities = df.apply(lambda row: cosine_similarity(row, movie), axis=1)
        #Drop the movie itself by position instead of assuming it sorts first:
        #    a movie with an identical feature row ties at similarity 1
        others = similarities[np.arange(n_movies) != i]
        #Find the k nearest neighbors (NaN similarities sort last)
        nearest = others.sort_values(ascending=False).iloc[:k]
        neighbors = nearest.index.values
        weights = nearest.values
        
        #Ratings of the neighbors, centered on each neighbor's mean rating
        neighbor_ratings = rating_df.reindex(neighbors, axis='columns')
        deviations = neighbor_ratings - neighbor_ratings.mean(axis=0)
        #Unrated neighbors are NaN and are skipped by the row sum
        predict = user_means + (deviations*weights).sum(axis=1)/nearest.abs().sum()
        update_df.loc[:,movie_id] = predict
        
        elapsed = timeit.default_timer() - start
        progress = i/n_movies
        
        if (progress*100) < 5:
            expected_time = "Calculating..."
        else:
            expected_time = np.round((elapsed / progress)/60,2)
        
        print("Current progress: ", np.round(progress * 100, 2), "%", sep="")
        print("Current run time:", np.round(elapsed/60,2), "minutes")
        print("Expected Run Time:", expected_time, "minutes")
    
    #Fill only the missing ratings, on a copy so the caller's rating_df keeps
    #    its NaNs for any later prediction run
    filled_df = rating_df.copy()
    filled_df.update(update_df, overwrite=False)
    return filled_df, update_df

Method for predicting user $i$'s movie rating for movie $j$:

$P_{i, j}:$ Prediction for user $i$'s rating of movie $j$

$\bar{r}_i:$ Average rating for user $i$ if $i$ represents a user

$\bar{r}_k:$ Average rating for movie $k$ if $k$ represents a movie

$N_j:$ The neighbors in movie $j$'s cluster

$\mid N_j\mid :$ Number of neighbors in movie $j$'s cluster

$M:$ All movies

$\mathbb{1}(r_{i,k}):$ A function indicating if user $i$ has rated movie $k$

$\gamma_j = \frac{\sum_{k\in N_j}{\mathbb{1}(r_{i,k})}}{\mid N_j \mid}:$ The share of the movies in movie $j$'s cluster that user $i$ has rated

$$P_{i, j} = \bar{r}_i + \frac{\sum_{k\in N_j}{(r_{i,k} - \bar{r}_k)}}{\mid N_j \mid}*\frac{\gamma_j}{\max_{k\in M}{\gamma_k}} $$

Change to average movie rating by cluster instead of the current difference?




In [None]:
def KMeans_Predict(model, df, rating_df):
    """
    Use a fitted KMeans clustering model to predict a user's movie rating
    
    Every movie in df is assigned the cluster in model.labels_. A user's rating
    of a movie is predicted as the user's mean rating plus the user's average
    mean-centered rating of the movies in that cluster, scaled by the share of
    the cluster the user has rated (relative to the largest such share).
    
    Parameter:
        model (): Fitted KMeans Model; model.labels_ must hold one label per row of df
        df (DataFrame): movie feature dataframe the model was fitted on (one row
                        per movie, indexed by movie id). It is not modified.
        rating_df (DataFrame): user-movie rating dataframe (rows are users, columns
                               are movie ids, NaN where unrated). It is not modified.
    
    Returns:
        filled_df (DataFrame): Copy of rating_df with NaNs filled with predicted ratings
        update_df (DataFrame): Dataframe for testing that includes predictions rather
                              than any actual ratings.
    """
    #Assign clusters
    #    kept in a local Series so no 'cluster' column is added to the caller's df
    clusters = pd.Series(model.labels_, index=df.index)
    #Movie ids belonging to each cluster (same for every iteration)
    members_by_cluster = clusters.groupby(clusters.values).groups
    #Each user's mean rating; taken from rating_df so it is indexed by user
    user_means = rating_df.mean(axis=1)
    update_df = rating_df.copy()#.to_dense()
    n_movies = len(df.index)
    #Movies in the same cluster share one prediction, so compute it once per cluster
    cluster_predictions = {}
    start = timeit.default_timer()
    for i, movie_id in enumerate(df.index):
        clear_output(wait=True)
        label = clusters.iloc[i]
        if label not in cluster_predictions:
            neighbors = members_by_cluster[label]
            cluster_ratings = rating_df.reindex(neighbors, axis='columns')
            #Average deviation from each neighbor's mean rating, per user
            update = (cluster_ratings - cluster_ratings.mean()).mean(axis=1)
            #Share of the cluster that each user has rated
            share = cluster_ratings.count(axis=1)/len(neighbors)
            #NOTE(review): the scaling uses the largest share across users, while
            #    the formula in the notebook takes the max over movies - confirm
            cluster_predictions[label] = user_means + update*(share/share.max())
        update_df.loc[:,movie_id] = cluster_predictions[label]
        
        elapsed = timeit.default_timer() - start
        progress = i/n_movies
        
        if (progress*100) < 5:
            expected_time = "Calculating..."
        else:
            expected_time = np.round((elapsed / progress)/60,2)
        
        print("Current progress: ", np.round(progress * 100, 2), "%", sep="")
        print("Current run time:", np.round(elapsed/60,2), "minutes")
        print("Expected Run Time:", expected_time, "minutes")
        
    #Fill only the missing ratings, on a copy so the caller's rating_df keeps
    #    its NaNs for any later prediction run
    filled_df = rating_df.copy()
    filled_df.update(update_df, overwrite=False)
    
    return filled_df, update_df

## Using Genome Scores

In [None]:
#Preview the first rows of the genome scores table
genome_scores.head()

In [None]:
#Build the movie-by-tag matrix whose values are the relevance scores
#    Note: movie_c, row, col and sparse_matrix are rebound here and no longer
#    describe the rating matrix after this cell runs
movie_c = CategoricalDtype(sorted(genome_scores["movieId"].unique()), ordered=True)
tag_c = CategoricalDtype(sorted(genome_scores["tagId"].unique()), ordered=True)

#Position of every score's movie (row) and tag (column) in the matrix
row = genome_scores["movieId"].astype(movie_c).cat.codes
col = genome_scores["tagId"].astype(tag_c).cat.codes
sparse_matrix = csr_matrix(
    (genome_scores["relevance"], (row, col)),
    shape=(movie_c.categories.size, tag_c.categories.size),
)
#Wrap the movie-tag matrix in a sparse dataframe, then convert it to dense
dfs = pd.SparseDataFrame(
    sparse_matrix,
    index=movie_c.categories,
    columns=tag_c.categories,
)
genome_df = dfs.to_dense()

### KNN

In [None]:
#Predict ratings from each movie's 10 nearest neighbors in genome-tag space
new_KNN_1, test_KNN_1 = KNN_Predict(df=genome_df, rating_df=rating_df, k=10)

### KMeans

In [None]:
#Cluster the movies on their genome-tag relevance scores
#    random_state is pinned so the clusters, and the predictions built on
#    them, are the same after a kernel restart
KM_model = KMeans(n_clusters=5, random_state=0).fit(genome_df)
new_KMeans_1, test_KMeans_1 = KMeans_Predict(KM_model, genome_df, rating_df)

## Using Genres

In [None]:
#Preview the first rows of the movies table
movies.head()

In [None]:
#Perform one-hot encoding on genres
genres = ["Action", "Adventure", "Animation", "Children's", "Comedy", "Crime", 
          "Documentary", "Drama", "Fantasy", "Film-Noir", "Horror", "Musical", 
          "Mystery", "Romance", "Sci-Fi", "Thriller", "War", "Western"]
for genre in genres:
    #NOTE(review): movies.csv appears to spell this genre "Children" - verify
    #    against the file. Matching on "Children" catches both spellings.
    genre_pattern = "Children" if genre == "Children's" else genre
    #Plain substring match (no regex); a missing genres value counts as no match
    movies[genre] = movies["genres"].str.contains(genre_pattern, regex=False, na=False).astype(int)
#Select the indicator columns by name rather than by position so the result
#    does not depend on how many other columns `movies` holds
movie_df = movies.set_index("movieId")[genres].copy()

### KNN

In [None]:
#Predict ratings from each movie's 10 nearest neighbors in genre space
new_KNN_2, test_KNN_2 = KNN_Predict(df=movie_df, rating_df=rating_df, k=10)

### KMeans

In [None]:
#Cluster the movies on their genre indicators
#    random_state is pinned so the clusters are the same after a kernel restart
KM_model = KMeans(n_clusters=50, random_state=0).fit(movie_df)

In [None]:
#Predict ratings from the genre-based clusters
new_KMeans_2, test_KMeans_2 = KMeans_Predict(model=KM_model, df=movie_df, rating_df=rating_df)