# Mod4 Project

## Import Libraries

In [57]:

import pandas as pd
import numpy as np
from scipy import stats
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from surprise import Reader, Dataset
from surprise.model_selection import cross_validate
from surprise.prediction_algorithms import SVD
from surprise.prediction_algorithms import KNNWithMeans, KNNBasic, KNNBaseline
from surprise.model_selection import GridSearchCV
from surprise.model_selection import train_test_split
from surprise.model_selection import LeaveOneOut


# pd.set_option("display.max_rows", None, "display.max_columns", None)

import warnings
warnings.filterwarnings("ignore")

In [2]:
#Import data files
movies_df = pd.read_csv('movies.csv',index_col=False)
ratings_df = pd.read_csv('ratings.csv',index_col=False)
tags_df = pd.read_csv('tags.csv',index_col=False)

First thing is to look at each file to get an idea of what is contained. Then we will merge any relavent data sets and perform some EDA

In [3]:
movies_df.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [15]:
ratings_df.head()

Unnamed: 0,userId,movieId,rating
0,1,1,4.0
1,1,3,4.0
2,1,6,4.0
3,1,47,5.0
4,1,50,5.0


We're going to drop the timestamp column as it is not useful for our model

In [14]:
ratings_df.drop(columns='timestamp',inplace=True)

In [5]:
tags_df.head()

Unnamed: 0,userId,movieId,tag,timestamp
0,2,60756,funny,1445714994
1,2,60756,Highly quotable,1445714996
2,2,60756,will ferrell,1445714992
3,2,89774,Boxing story,1445715207
4,2,89774,MMA,1445715200


In [6]:

movies_df.set_index('movieId')

Unnamed: 0_level_0,title,genres
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
2,Jumanji (1995),Adventure|Children|Fantasy
3,Grumpier Old Men (1995),Comedy|Romance
4,Waiting to Exhale (1995),Comedy|Drama|Romance
5,Father of the Bride Part II (1995),Comedy
...,...,...
193581,Black Butler: Book of the Atlantic (2017),Action|Animation|Comedy|Fantasy
193583,No Game No Life: Zero (2017),Animation|Comedy|Fantasy
193585,Flint (2017),Drama
193587,Bungo Stray Dogs: Dead Apple (2018),Action|Animation


In [7]:
df = pd.merge(movies_df,tags_df, on='movieId', how='left')

In [8]:
tags_df.shape

(3683, 4)

In [9]:
ratings_df.shape

(100836, 4)

In [10]:
df.head()

Unnamed: 0,movieId,title,genres,userId,tag,timestamp
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,336.0,pixar,1139046000.0
1,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,474.0,pixar,1137207000.0
2,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,567.0,fun,1525286000.0
3,2,Jumanji (1995),Adventure|Children|Fantasy,62.0,fantasy,1528844000.0
4,2,Jumanji (1995),Adventure|Children|Fantasy,62.0,magic board game,1528844000.0


In [11]:
df.shape

(11853, 6)

## EDA

## Collaborative Model

The first thing we are going to do is build some baseline Collaborative Recommendation models. We will begin with Surprise and loading in our data. We'll first look at RMSE for evaluation but then expand to other metrics.

In [16]:

reader = Reader(rating_scale=(1, 5))
data = Dataset.load_from_df(ratings_df, reader)

Let's look at how many users and items we have in our dataset. If using neighborhood-based methods, this will help us determine whether or not we should perform user-user or item-item similarity

In [17]:
dataset = data.build_full_trainset()
print('Number of users: ', dataset.n_users, '\n')
print('Number of items: ', dataset.n_items)

Number of users:  610 

Number of items:  9724


In [49]:
#Creating SVD Model and showing RMSE
params = {'n_factors': [20, 50, 100],
         'reg_all': [0.02, 0.05, 0.1]}
g_s_svd = GridSearchCV(SVD,param_grid=params,n_jobs=-1)
g_s_svd.fit(data)
print(g_s_svd.best_score)
print(g_s_svd.best_params)

{'rmse': 0.868973528326013, 'mae': 0.6682495745722845}
{'rmse': {'n_factors': 50, 'reg_all': 0.05}, 'mae': {'n_factors': 50, 'reg_all': 0.05}}


In [47]:
sim_metrics = ['pearson','MSD','cosine','pearson_baseline']
#Creating Basic KNN Model and showing RMSE, checking all distance metrics
for metric in sim_metrics:
    knn_basic = KNNBasic(sim_options={'name': metric, 'user_based':True})
    cv_knn_basic = cross_validate(knn_basic, data, n_jobs=-1)
    print('Similarity Metric = ', metric,'---', 'test_rmse = ', np.mean(cv_knn_basic['test_rmse']),'\n')
    
    

Similarity Metric =  pearson --- test_rmse =  0.9721466915905544 

Similarity Metric =  MSD --- test_rmse =  0.9454790111648448 

Similarity Metric =  cosine --- test_rmse =  0.971750865890526 

Similarity Metric =  pearson_baseline --- test_rmse =  0.9707781381764551 



In [48]:
# cross validating with KNNBaseline
for metric in sim_metrics:
    knn_baseline = KNNBaseline(sim_options={'name':'pearson', 'user_based':True})
    cv_knn_baseline = cross_validate(knn_baseline,data)
    print('Similarity Metric = ', metric,'---', 'test_rmse = ', np.mean(cv_knn_baseline['test_rmse']),'\n')

Estimating biases using als...
Computing the pearson similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the pearson similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the pearson similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the pearson similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the pearson similarity matrix...
Done computing similarity matrix.
Similarity Metric =  pearson --- test_rmse =  0.8780708372168192 

Estimating biases using als...
Computing the pearson similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the pearson similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the pearson similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the pearson similarity matri

### Trying hit rate as an evaluation

In [54]:
trainSet, testSet = train_test_split(data, test_size=.25, random_state=0)

In [58]:
def GetTopN(predictions, n=10, minimumRating=4.0):
    topN = defaultdict(list)
    for userID, movieID, actualRating, estimatedRating, _ in predictions:
        if (estimatedRating >= minimumRating):
            topN[int(userID)].append((int(movieID), estimatedRating))

    for userID, ratings in topN.items():
        ratings.sort(key=lambda x: x[1], reverse=True)
        topN[int(userID)] = ratings[:n]

    return topN
    
LOOCV = LeaveOneOut(n_splits=1, random_state=1)



In [59]:
svd_model=SVD(n_factors=50,reg_all=0.05)
for trainSet, testSet in LOOCV.split(data):
    # Train model without left-out ratings
    svd_model.fit(trainSet)
    # Predicts ratings for left-out ratings only
    leftOutPredictions = svd_model.test(testSet)
    # Build predictions for all ratings not in the training set
    bigTestSet = trainSet.build_anti_testset()
    allPredictions = svd_model.test(bigTestSet)
    # Compute top 10 recs for each user
    topNPredicted = GetTopN(allPredictions, n=10)

NameError: name 'defaultdict' is not defined

In [None]:
def HitRate(topNPredicted, leftOutPredictions):
    hits = 0
    total = 0

 # For each left-out rating
    for leftOut in leftOutPredictions:
        userID = leftOut[0]
        leftOutMovieID = leftOut[1]
        # Is it in the predicted top 10 for this user?
        hit = False
        for movieID, predictedRating in topNPredicted[int(userID)]:
            if (int(leftOutMovieID) == int(movieID)):
                hit = True
                break
        if (hit) :
            hits += 1

        total += 1

    # Compute overall precision
    return hits/total
print("\nHit Rate: ", HitRate(topNPredicted, leftOutPredictions))