Team members:  

XIAOZHU MA (xiaozhu3)  
FAN YANG (fanyang3)


In [37]:
import pandas as pd
from surprise import Dataset
from surprise import Reader
from surprise import KNNBasic, KNNWithZScore, KNNWithMeans, accuracy, SVD
from surprise.model_selection import KFold, GridSearchCV, RandomizedSearchCV

import numpy as np
from IPython.display import Image

# Notebook display settings: show up to 100 characters per cell and never
# elide columns when a DataFrame is rendered.
pd.options.display.max_colwidth = 100
pd.options.display.max_columns = None

In [3]:
# Load the three raw tables (movies, ratings, users) from the working directory,
# in that order, with pandas' default CSV parsing.
df_movie, df_rating, df_users = map(
    pd.read_csv,
    ('movies.csv', 'ratings.csv', 'users.csv'),
)

### System I: recommendation based on genres

#### How does our recommendation scheme work?
Our algorithms recommend movies based on the existing reviews, so they give the same results to every user; no user input is needed.
 - scheme 1: recommend top 5 movies that have most reviews
 - scheme 2: recommend top 5 movies that have the highest average rating score

In [4]:
# Per-movie review statistics: one row per MovieID with the number of ratings
# (rating_ct) and the mean rating (avg_rating).
grp_rating = (
    df_rating
    .groupby('MovieID')['Rating']
    .agg(['count', 'mean'])
    .rename(columns={'count': 'rating_ct', 'mean': 'avg_rating'})
    .reset_index()
)

In [5]:
# merge to movie to get genre info
# The left join keeps movies that have no ratings at all; their rating_ct /
# avg_rating are NaN, and sort_values() places NaN last by default, so such
# movies never displace a rated one.
df_movie_rating = df_movie.merge(grp_rating, 'left', on = 'MovieID')
# get top 5 by each genre
genre_list = ["Action", "Adventure", "Animation",
               "Children's", "Comedy", "Crime",
               "Documentary", "Drama", "Fantasy",
               "Film-Noir", "Horror", "Musical",
               "Mystery", "Romance", "Sci-Fi",
               "Thriller", "War", "Western"]
top_n = 5
genre_summary_list = []
for genre in genre_list:
    # 0/1 indicator column: 1 when the genre name occurs in the movie's Genres
    # text. regex=False matches the name literally; na=False treats a missing
    # Genres value as "not in this genre" instead of failing.
    df_movie_rating[genre] = (
        df_movie_rating['Genres'].str.contains(genre, regex=False, na=False).astype(int)
    )

    # Filter once and sort once per scheme, then read both the ids and the
    # titles from the same sorted frame so the two lists always line up.
    in_genre = df_movie_rating[df_movie_rating[genre] == 1]
    most_reviewed = in_genre.sort_values('rating_ct', ascending=False).iloc[:top_n]
    # NOTE(review): no minimum rating_ct is required here, so a movie with a
    # single 5-star review can outrank widely reviewed titles. Consider a
    # review-count threshold if that is not intended.
    best_rated = in_genre.sort_values('avg_rating', ascending=False).iloc[:top_n]

    top5_rating_ct_id = most_reviewed['MovieID'].tolist()
    top5_rating_id = best_rated['MovieID'].tolist()
    top5_rating_ct_name = most_reviewed['Title'].tolist()
    top5_rating_name = best_rated['Title'].tolist()
    summary = {
        'Genres': genre,
        'Top5_most_rating_id': top5_rating_ct_id,
        'Top5_highest_rating_id': top5_rating_id,
        'Top5_most_rating_name': top5_rating_ct_name,
        'Top5_highest_rating_name': top5_rating_name
    }
    genre_summary_list.append(summary)

# One row per genre; columns follow the key order of the summary dicts.
df_top5 = pd.DataFrame(genre_summary_list)

#### Top 5 most reviewed movies by genre

In [17]:
# Scheme 1 view: per genre, the ids and titles of the most-reviewed movies.
df_top5.loc[:, ['Genres', 'Top5_most_rating_id', 'Top5_most_rating_name']]

Unnamed: 0,Genres,Top5_most_rating_id,Top5_most_rating_name
0,Action,"[260, 1196, 1210, 480, 2028]","[Star Wars: Episode IV - A New Hope (1977), Star Wars: Episode V - The Empire Strikes Back (1980..."
1,Adventure,"[260, 1196, 1210, 480, 1580]","[Star Wars: Episode IV - A New Hope (1977), Star Wars: Episode V - The Empire Strikes Back (1980..."
2,Animation,"[1, 2987, 2355, 3114, 588]","[Toy Story (1995), Who Framed Roger Rabbit? (1988), Bug's Life, A (1998), Toy Story 2 (1999), Al..."
3,Children's,"[1097, 1, 34, 919, 2355]","[E.T. the Extra-Terrestrial (1982), Toy Story (1995), Babe (1995), Wizard of Oz, The (1939), Bug..."
4,Comedy,"[2858, 1270, 1580, 2396, 1197]","[American Beauty (1999), Back to the Future (1985), Men in Black (1997), Shakespeare in Love (19..."
5,Crime,"[608, 1617, 858, 296, 50]","[Fargo (1996), L.A. Confidential (1997), Godfather, The (1972), Pulp Fiction (1994), Usual Suspe..."
6,Documentary,"[2064, 246, 162, 3007, 1147]","[Roger & Me (1989), Hoop Dreams (1994), Crumb (1994), American Movie (1999), When We Were Kings ..."
7,Drama,"[2858, 1196, 2028, 593, 608]","[American Beauty (1999), Star Wars: Episode V - The Empire Strikes Back (1980), Saving Private R..."
8,Fantasy,"[260, 1097, 2628, 2174, 2797]","[Star Wars: Episode IV - A New Hope (1977), E.T. the Extra-Terrestrial (1982), Star Wars: Episod..."
9,Film-Noir,"[1617, 541, 2987, 1252, 913]","[L.A. Confidential (1997), Blade Runner (1982), Who Framed Roger Rabbit? (1988), Chinatown (1974..."


#### Top 5 highest rated movies by genre

In [19]:
# Scheme 2 view: per genre, the ids and titles of the movies with the highest mean rating.
df_top5.loc[:, ['Genres', 'Top5_highest_rating_id', 'Top5_highest_rating_name']]

Unnamed: 0,Genres,Top5_highest_rating_id,Top5_highest_rating_name
0,Action,"[2905, 2019, 858, 1198, 260]","[Sanjuro (1962), Seven Samurai (The Magnificent Seven) (Shichinin no samurai) (1954), Godfather,..."
1,Adventure,"[3172, 2905, 1198, 260, 1204]","[Ulysses (Ulisse) (1954), Sanjuro (1962), Raiders of the Lost Ark (1981), Star Wars: Episode IV ..."
2,Animation,"[745, 1148, 720, 1223, 3429]","[Close Shave, A (1995), Wrong Trousers, The (1993), Wallace & Gromit: The Best of Aardman Animat..."
3,Children's,"[919, 3114, 1, 2761, 1023]","[Wizard of Oz, The (1939), Toy Story 2 (1999), Toy Story (1995), Iron Giant, The (1999), Winnie ..."
4,Comedy,"[3233, 1830, 3607, 745, 1148]","[Smashing Time (1967), Follow the Bitch (1998), One Little Indian (1973), Close Shave, A (1995),..."
5,Crime,"[3656, 858, 50, 3517, 3435]","[Lured (1947), Godfather, The (1972), Usual Suspects, The (1995), Bells, The (1926), Double Inde..."
6,Documentary,"[3881, 787, 3338, 2930, 128]","[Bittersweet Motel (2000), Gate of Heavenly Peace, The (1995), For All Mankind (1989), Return wi..."
7,Drama,"[3382, 989, 3607, 3245, 53]","[Song of Freedom (1936), Schlafes Bruder (Brother of Sleep) (1995), One Little Indian (1973), I ..."
8,Fantasy,"[260, 792, 1097, 247, 1073]","[Star Wars: Episode IV - A New Hope (1977), Hungarian Fairy Tale, A (1987), E.T. the Extra-Terre..."
9,Film-Noir,"[922, 3435, 913, 1252, 1267]","[Sunset Blvd. (a.k.a. Sunset Boulevard) (1950), Double Indemnity (1944), Maltese Falcon, The (19..."


### System II: collaborative recommendation system
For both algorithms: 
- we selected RMSE as the evaluation metric
- the review data was split into 10 folds: 9 folds were used for training and the remaining fold was used for testing. We therefore iterated 10 times, each time with 90% of the data as training data and 10% as test data
- due to the large sample size and long process time, we only tuned the parameters in the first iteration. The selected parameters were used in all the 10 iterations

#### Algorithm 1: item-based CF
- We applied the KNNWithMeans algorithm from the Python package Surprise. The algorithm takes into account the mean of all ratings for each item, so each rating is normalized by the mean rating of its item.
- The number of neighbors was selected through 3-fold cross-validation. The best k selected was 50.
- Cosine similarity was used in the algorithm.
- The prediction is weighted by similarity value.
- In prediction, if a user or item is missing from the training set, the prediction is set to the global mean of all ratings.

In [22]:
# Wrap the ratings frame in a Surprise Dataset. The columns are handed over in
# user, item, rating order and the Reader declares a 1-5 rating scale.
rating_columns = ['UserID', 'MovieID', 'Rating']
reader = Reader(rating_scale=(1, 5))
rating_data = Dataset.load_from_df(df_rating[rating_columns], reader)

In [23]:
# split data to 10 folds
# A fixed integer seed makes kf.split() produce the same shuffled folds on every
# call and on every re-run of the notebook; without it the folds are drawn from
# NumPy's global RNG and change each time.
fold = 10
kf = KFold(n_splits=fold, random_state=0)

##### Select best k in first iteration

In [24]:
# Tune the neighbourhood size k for item-based KNNWithMeans (cosine similarity)
# with a 3-fold cross-validated grid search.
#
# The search is fitted on the complete rating_data; GridSearchCV performs its
# own 3-fold split internally, so no outer kf.split() loop is needed here.
# NOTE(review): because tuning sees every rating, including those later held
# out as test folds, the 10-fold test RMSE reported afterwards is slightly
# optimistic.
param_grid = {'k': [10, 50, 100],
              'sim_options': {'name': ['cosine'],
                              'user_based': [False]}
              }
gs = GridSearchCV(KNNWithMeans, param_grid, measures=['rmse'], cv=3)
gs.fit(rating_data)

# Full cross-validation table (one row per parameter combination) and the k
# with the lowest mean RMSE.
df_cv = pd.DataFrame.from_dict(gs.cv_results)
best_k = gs.best_params['rmse']['k']
print(best_k)

Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
50


##### Run 10 iterations using the selected k

In [25]:
# Item-based KNN-with-means using the tuned neighbourhood size;
# user_based=False makes Surprise compute similarities between items.
sim_options = dict(name='cosine', user_based=False)
algo = KNNWithMeans(k=best_k, sim_options=sim_options)

# Refit on each training fold and record the RMSE on the matching held-out fold.
accuracy_list = []
for trainset, testset in kf.split(rating_data):
    algo.fit(trainset)
    predictions = algo.test(testset)
    # accuracy.rmse prints the score (verbose) and returns it.
    current_acc = accuracy.rmse(predictions, verbose=True)
    accuracy_list.append(current_acc)

Computing the cosine similarity matrix...
Done computing similarity matrix.
RMSE: 0.8902
Computing the cosine similarity matrix...
Done computing similarity matrix.
RMSE: 0.8915
Computing the cosine similarity matrix...
Done computing similarity matrix.
RMSE: 0.8892
Computing the cosine similarity matrix...
Done computing similarity matrix.
RMSE: 0.8942
Computing the cosine similarity matrix...
Done computing similarity matrix.
RMSE: 0.8824
Computing the cosine similarity matrix...
Done computing similarity matrix.
RMSE: 0.8897
Computing the cosine similarity matrix...
Done computing similarity matrix.
RMSE: 0.8876
Computing the cosine similarity matrix...
Done computing similarity matrix.
RMSE: 0.8873
Computing the cosine similarity matrix...
Done computing similarity matrix.
RMSE: 0.8900
Computing the cosine similarity matrix...
Done computing similarity matrix.
RMSE: 0.8903


##### Test RMSE of 10 iterations

In [30]:
# One row per fold, in iteration order, with that fold's test RMSE.
df_accuracy = pd.DataFrame({'TEST_RMSE': accuracy_list})
df_accuracy

Unnamed: 0,TEST_RMSE
0,0.890239
1,0.891516
2,0.889219
3,0.894242
4,0.882414
5,0.889701
6,0.887558
7,0.887265
8,0.890021
9,0.890343


#### Algorithm 2: SVD
- We applied the SVD algorithm from the Python package Surprise. The prediction and optimization functions are shown below (source: Surprise documentation)
<img src="target_function.PNG" width=600 height=400>   

where
    
$r_{ui}$ ($\hat{r}_{ui}$) is the true (estimated) rating of user $u$ for item $i$  
$\mu$ is the mean of all ratings  
$b_{u}$ is the user bias  
$b_{i}$ is the item bias  
$p_{u}$ is the user factor  
$q_{i}$ is the item factor 

    

- Parameters we tuned in the algorithm are:
    - n_factors: the number of factors in SVD  
    - n_epochs: the number of iterations of the SGD procedure  
    - lr_all: the learning rate for all parameters  
    - reg_all: the regularization term for all parameters

##### parameter tuning

In [44]:
# 10-fold splitter used by the evaluation cell below. The fixed seed makes
# kf.split() return the same folds on every call and every re-run.
fold = 10
kf = KFold(n_splits=fold, random_state=0)

# parameter tuning
# Randomized search over the SVD hyper-parameters: 10 of the 24 combinations in
# param_grid are sampled and scored with 3-fold cross-validation.
# random_state fixes which combinations are sampled.
#
# The search is fitted on the complete rating_data and does its own 3-fold
# split, so no outer kf.split() loop is needed here.
# NOTE(review): because tuning sees every rating, including those later held
# out as test folds, the 10-fold test RMSE reported afterwards is slightly
# optimistic.
param_grid = {'n_factors': [50, 100],
              'lr_all': [0.002, 0.005, 0.2],
              'reg_all': [0.02, 0.2],
              'n_epochs': [10, 20]
              }
gs = RandomizedSearchCV(SVD, param_grid, n_iter=10, measures=['rmse'], cv=3,
                        random_state=0)
gs.fit(rating_data)

# Full cross-validation table and the parameter values with the lowest mean RMSE.
df_cv_result = pd.DataFrame.from_dict(gs.cv_results)
best_factor = gs.best_params['rmse']['n_factors']
best_lr = gs.best_params['rmse']['lr_all']
best_reg = gs.best_params['rmse']['reg_all']
best_epoch = gs.best_params['rmse']['n_epochs']


In [45]:
# Display the best parameter combination found by the search, keyed by measure name.
gs.best_params

{'rmse': {'n_factors': 50, 'lr_all': 0.005, 'reg_all': 0.02, 'n_epochs': 20}}

##### Run 10 iterations using the selected parameters

In [46]:
# SVD with the tuned hyper-parameters. random_state fixes the RNG used to
# initialise the factors, so the per-fold RMSEs are reproducible across re-runs.
algo = SVD(n_factors = best_factor,  lr_all= best_lr, reg_all = best_reg,
           n_epochs = best_epoch, random_state = 0)

# Refit on each training fold and record the RMSE on the matching held-out fold.
SVD_accuracy_list = []
for trainset, testset in kf.split(rating_data):
    # train and test algorithm.
    algo.fit(trainset)
    predictions = algo.test(testset)
    # Compute and print Root Mean Squared Error
    current_acc = accuracy.rmse(predictions)
    SVD_accuracy_list.append(current_acc)

RMSE: 0.8605
RMSE: 0.8662
RMSE: 0.8640
RMSE: 0.8650
RMSE: 0.8656
RMSE: 0.8610
RMSE: 0.8627
RMSE: 0.8624
RMSE: 0.8631
RMSE: 0.8622


##### Test RMSE of 10 iterations

In [47]:
# One row per fold, in iteration order, with that fold's SVD test RMSE.
df_accuracy_svd = pd.DataFrame({'TEST_RMSE': SVD_accuracy_list})
df_accuracy_svd

Unnamed: 0,TEST_RMSE
0,0.860533
1,0.866249
2,0.863996
3,0.864996
4,0.865613
5,0.86105
6,0.862664
7,0.862379
8,0.86308
9,0.86218
