**Project Title: Movie Recommendation System using a Hybrid-Filtering Approach**

In [None]:
 #Importing Libraries
import pandas as pd
import numpy as np
#For content
from ast import literal_eval
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import linear_kernel, cosine_similarity
import matplotlib.pyplot as plt

**Data Pre-Processing**

In [None]:
#Reading the datasets
movies = pd.read_csv('/content/movies.csv')
ratings = pd.read_csv('/content/ratings.csv')
tags = pd.read_csv('/content/tags.csv')

In [None]:
#Shape of the datasets
print('movies: ', movies.shape)
print('ratings: ', ratings.shape)
print('tags: ', tags.shape)

movies:  (9742, 3)
ratings:  (100836, 4)
tags:  (3683, 4)


In [None]:
movies.head(5)

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [None]:
ratings.head(5)

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [None]:
tags.head(5)

Unnamed: 0,userId,movieId,tag,timestamp
0,2,60756,funny,1445714994
1,2,60756,Highly quotable,1445714996
2,2,60756,will ferrell,1445714992
3,2,89774,Boxing story,1445715207
4,2,89774,MMA,1445715200


In [None]:
#Merging the movies and ratings datasets
data = pd.merge(ratings, movies, on='movieId' , how='left')
data.head()

Unnamed: 0,userId,movieId,rating,timestamp,title,genres
0,1,1,4.0,964982703,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,1,3,4.0,964981247,Grumpier Old Men (1995),Comedy|Romance
2,1,6,4.0,964982224,Heat (1995),Action|Crime|Thriller
3,1,47,5.0,964983815,Seven (a.k.a. Se7en) (1995),Mystery|Thriller
4,1,50,5.0,964982931,"Usual Suspects, The (1995)",Crime|Mystery|Thriller


In [None]:
data['genres'] = data['genres'].str.split('|')

In [None]:
data.head()

Unnamed: 0,userId,movieId,rating,timestamp,title,genres
0,1,1,4.0,964982703,Toy Story (1995),"[Adventure, Animation, Children, Comedy, Fantasy]"
1,1,3,4.0,964981247,Grumpier Old Men (1995),"[Comedy, Romance]"
2,1,6,4.0,964982224,Heat (1995),"[Action, Crime, Thriller]"
3,1,47,5.0,964983815,Seven (a.k.a. Se7en) (1995),"[Mystery, Thriller]"
4,1,50,5.0,964982931,"Usual Suspects, The (1995)","[Crime, Mystery, Thriller]"


In [None]:
#Data Cleaning
data = data.drop('title', axis=1)
tags.drop('timestamp', axis=1, inplace=True)

In [None]:
#Splitting tags
tags['tag'] = tags['tag'].str.split('|')

In [None]:
#Here, we group the tags of a userID with its respective movieID
tags = tags.groupby(['userId','movieId'])['tag'].apply(lambda x: ','.join(x.astype(str))).reset_index()
tags.head(5)

Unnamed: 0,userId,movieId,tag
0,2,60756,"['funny'],['Highly quotable'],['will ferrell']"
1,2,89774,"['Boxing story'],['MMA'],['Tom Hardy']"
2,2,106782,"['drugs'],['Leonardo DiCaprio'],['Martin Scors..."
3,7,48516,['way too long']
4,18,431,"['Al Pacino'],['gangster'],['mafia']"


In [None]:
#Merging the tags dataset with our dataset containing movies and ratings
data = pd.merge(data, tags, on=['userId','movieId'], how='left')

In [None]:
data.shape

(100836, 6)

In [None]:
data.head()

Unnamed: 0,userId,movieId,rating,timestamp,genres,tag
0,1,1,4.0,964982703,"[Adventure, Animation, Children, Comedy, Fantasy]",
1,1,3,4.0,964981247,"[Comedy, Romance]",
2,1,6,4.0,964982224,"[Action, Crime, Thriller]",
3,1,47,5.0,964983815,"[Mystery, Thriller]",
4,1,50,5.0,964982931,"[Crime, Mystery, Thriller]",


In [None]:
data['tag'] = data['tag'].apply(lambda d: d if isinstance(d, list) else [])
data['genres'] = data['genres'].apply(lambda d: d if isinstance(d, list) else [])

In [None]:
data.head()

Unnamed: 0,userId,movieId,rating,timestamp,genres,tag
0,1,1,4.0,964982703,"[Adventure, Animation, Children, Comedy, Fantasy]",[]
1,1,3,4.0,964981247,"[Comedy, Romance]",[]
2,1,6,4.0,964982224,"[Action, Crime, Thriller]",[]
3,1,47,5.0,964983815,"[Mystery, Thriller]",[]
4,1,50,5.0,964982931,"[Crime, Mystery, Thriller]",[]


**Train-Test Split**

In [None]:
from sklearn.model_selection import train_test_split
train_data, test_data = train_test_split(data, test_size=0.2, stratify=data.userId)

In [None]:
train_data = train_data.sort_values(['userId', 'movieId'])

In [None]:
test_data = test_data.sort_values(['userId','movieId'])

In [None]:
 #Here, we save the datasets into csv files
 train_data.to_csv('training_data.csv', index = False)
 test_data.to_csv('testing_data.csv', index = False)

In [None]:
#Here, we pre-process the movie data
movies['genres'] = movies['genres'].str.split('|')
movies['genres'] = movies['genres'].apply(lambda d: d if isinstance(d, list) else [])
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),"[Adventure, Animation, Children, Comedy, Fantasy]"
1,2,Jumanji (1995),"[Adventure, Children, Fantasy]"
2,3,Grumpier Old Men (1995),"[Comedy, Romance]"
3,4,Waiting to Exhale (1995),"[Comedy, Drama, Romance]"
4,5,Father of the Bride Part II (1995),[Comedy]


In [None]:
movies.to_csv('movies.csv', index = False)

**Content-Based Filtering**

In order to create a customer vector based on the content, this method investigates the genre and rating linked to the movie and the customer. This customer vector is used to produce the suggestions.

In [None]:
# Movies data gets loaded
movies = pd.read_csv("movies.csv", converters={"genres": literal_eval})
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),"[Adventure, Animation, Children, Comedy, Fantasy]"
1,2,Jumanji (1995),"[Adventure, Children, Fantasy]"
2,3,Grumpier Old Men (1995),"[Comedy, Romance]"
3,4,Waiting to Exhale (1995),"[Comedy, Drama, Romance]"
4,5,Father of the Bride Part II (1995),[Comedy]


In [None]:
# Training data gets loaded
ratings_train = pd.read_csv("training_data.csv", converters={"genres": literal_eval, "tag": literal_eval})

ratings_train.head()

Unnamed: 0,userId,movieId,rating,timestamp,genres,tag
0,1,3,4.0,964981247,"[Comedy, Romance]",[]
1,1,6,4.0,964982224,"[Action, Crime, Thriller]",[]
2,1,47,5.0,964983815,"[Mystery, Thriller]",[]
3,1,50,5.0,964982931,"[Crime, Mystery, Thriller]",[]
4,1,70,3.0,964982400,"[Action, Comedy, Horror, Thriller]",[]


In [None]:
#movies['tags'] = movies['tags'].fillna('')
#movies['description'] = movies['overview'] + movies['tagline']
#movies['description'] = movies['description'].fillna('')

In [None]:
# Unique genres
genre_unique = movies['genres'].explode().unique()

# Count of each genre
genre_count = ratings_train['genres'].explode().value_counts()

# Making a dicionary by assigning an index to a genre
genre_dict = {k: v for v, k in enumerate(genre_unique)}
genre_dict

{'Adventure': 0,
 'Animation': 1,
 'Children': 2,
 'Comedy': 3,
 'Fantasy': 4,
 'Romance': 5,
 'Drama': 6,
 'Action': 7,
 'Crime': 8,
 'Thriller': 9,
 'Horror': 10,
 'Mystery': 11,
 'Sci-Fi': 12,
 'War': 13,
 'Musical': 14,
 'Documentary': 15,
 'IMAX': 16,
 'Western': 17,
 'Film-Noir': 18,
 '(no genres listed)': 19}

In [None]:
movies['movie_vector'] = ""
for ind, row in movies.iterrows():
    genres = row.genres
    movie_vector = np.zeros(len(genre_dict))

    for g in genres:
        movie_vector[genre_dict[g]] = 1
    movies.at[ind, 'movie_vector'] = movie_vector

# Saving the final dataframe
movies.to_csv("movie_vector.csv")

In [None]:
movies.head()

Unnamed: 0,movieId,title,genres,movie_vector
0,1,Toy Story (1995),"[Adventure, Animation, Children, Comedy, Fantasy]","[1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, ..."
1,2,Jumanji (1995),"[Adventure, Children, Fantasy]","[1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, ..."
2,3,Grumpier Old Men (1995),"[Comedy, Romance]","[0.0, 0.0, 0.0, 1.0, 0.0, 1.0, 0.0, 0.0, 0.0, ..."
3,4,Waiting to Exhale (1995),"[Comedy, Drama, Romance]","[0.0, 0.0, 0.0, 1.0, 0.0, 1.0, 1.0, 0.0, 0.0, ..."
4,5,Father of the Bride Part II (1995),[Comedy],"[0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."


In [None]:
#Personalising based on individual user
user_total = ratings_train['userId'].unique()
u_data = pd.DataFrame(columns=['userId', 'user_vector', 'avg_rating', 'num_movies_rated'])

In [None]:
for user_id in user_total:
    u_rating_data = ratings_train[(ratings_train['userId'] == user_id)]

    user_vector = np.zeros(len(genre_dict))
    count_vector = np.zeros(len(genre_dict))

    user_avg_rating = 0
    movies_rated_count = 0

    for _, row in u_rating_data.iterrows():
        user_avg_rating += row.rating
        movies_rated_count += 1
        genres = row.genres

        user_movie_vector = np.zeros(len(genre_dict))

        for g in genres:
            user_movie_vector[genre_dict[g]] = 1
            count_vector[genre_dict[g]] += 1

        user_vector += user_movie_vector*row.rating

    count_vector = np.where(count_vector==0, 1, count_vector)
    user_vector = np.divide(user_vector, count_vector)
    user_avg_rating /= movies_rated_count
    row_df = pd.DataFrame([[user_id, user_vector, user_avg_rating, movies_rated_count]],
                          columns=['userId', 'user_vector', 'avg_rating', 'num_movies_rated'])
    u_data = pd.concat([u_data, row_df], ignore_index=True)

In [None]:
#Saving user data into a dataframe
u_data.to_csv("user_info.csv")

In [None]:
u_data.head()

Unnamed: 0,userId,user_vector,avg_rating,num_movies_rated
0,1,"[4.367647058823529, 4.76, 4.6, 4.3030303030303...",4.392473,186
1,2,"[4.5, 0.0, 0.0, 4.0, 0.0, 4.5, 4.0, 4.0, 4.1, ...",4.043478,23
2,3,"[2.611111111111111, 0.5, 0.5, 0.5, 3.333333333...",2.209677,31
3,4,"[3.619047619047619, 4.0, 3.75, 3.4756097560975...",3.514451,173
4,5,"[3.0, 4.2, 4.0, 3.5, 4.0, 3.3333333333333335, ...",3.657143,35


In [None]:
ratings_test = pd.read_csv("testing_data.csv", converters={"genres": literal_eval, "tag": literal_eval})
ratings_test.head()

Unnamed: 0,userId,movieId,rating,timestamp,genres,tag
0,1,1,4.0,964982703,"[Adventure, Animation, Children, Comedy, Fantasy]",[]
1,1,260,5.0,964981680,"[Action, Adventure, Sci-Fi]",[]
2,1,316,3.0,964982310,"[Action, Adventure, Sci-Fi]",[]
3,1,362,5.0,964982588,"[Adventure, Children, Romance]",[]
4,1,423,3.0,964982363,"[Action, Thriller]",[]


In [None]:
ratings_test.iloc[0]

userId                                                       1
movieId                                                      1
rating                                                     4.0
timestamp                                            964982703
genres       [Adventure, Animation, Children, Comedy, Fantasy]
tag                                                         []
Name: 0, dtype: object

In [None]:
u_rating_data[u_rating_data['movieId']==1]

Unnamed: 0,userId,movieId,rating,timestamp,genres,tag
79627,610,1,5.0,1479542900,"[Adventure, Animation, Children, Comedy, Fantasy]",[]


In [None]:
genres = u_rating_data[u_rating_data['movieId']==1].genres.values[0]
vector = np.zeros(len(genre_dict))
for g in genres:
    vector[genre_dict[g]] = 1
print(vector)

[1. 1. 1. 1. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]


In [None]:
u_data[u_data['userId']==1].user_vector[0]

array([4.36764706, 4.76      , 4.6       , 4.3030303 , 4.29411765,
       4.26315789, 4.53571429, 4.34782609, 4.36585366, 4.125     ,
       3.35714286, 4.17647059, 4.25925926, 4.73333333, 4.75      ,
       0.        , 0.        , 4.5       , 5.        , 0.        ])

In [None]:
x = vector*u_data[u_data['userId']==1].user_vector[0]
np.nanmean(np.where(x!=0,x,np.nan))

4.464959001782531

In [None]:
contentbased_predictions = pd.DataFrame(columns=['userId', 'movieId', 'user_vector', 'movie_vector', 'og_rating', 'pred_rating'])
for ind, row in ratings_test.iterrows():
    userId = row['userId']
    movieId = row['movieId']
    og_rating = row['rating']

    try:
        user_vector = u_data[u_data['userId'] == int(userId)].user_vector.values[0]
        movie_vector = movies[movies['movieId'] == int(movieId)].movie_vector.values[0]

        predicted_rating = user_vector*movie_vector

        if predicted_rating.any():
            predicted_rating = np.nanmean(np.where(predicted_rating!=0, predicted_rating, np.nan))

        else:
            predicted_rating = 0

        row_df = pd.DataFrame([[userId, movieId, user_vector, movie_vector, og_rating, predicted_rating]],
                    columns=['userId', 'movieId', 'user_vector', 'movie_vector', 'og_rating', 'pred_rating'])
        contentbased_predictions = pd.concat([contentbased_predictions, row_df], ignore_index=True)
    except:
        print("User not found: ", userId)

In [None]:
#Calculating RMSE
rmse = ((contentbased_predictions.og_rating - contentbased_predictions.pred_rating) ** 2).mean() ** .5
print(rmse)

0.9185843165499578


In [None]:
#Calculating MAE
mae = (((contentbased_predictions.og_rating - contentbased_predictions.pred_rating) ** 2) ** .5).mean()
mae

0.7079421459753336

**KNN Analysis (CF)**

Here, I have provided a contrast between user-user based collaborative filtering with item-item based collaborative filtering.

In [None]:
!pip install surprise

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting surprise
  Downloading surprise-0.1-py2.py3-none-any.whl (1.8 kB)
Collecting scikit-surprise
  Downloading scikit-surprise-1.1.3.tar.gz (771 kB)
[K     |████████████████████████████████| 771 kB 7.4 MB/s 
Building wheels for collected packages: scikit-surprise
  Building wheel for scikit-surprise (setup.py) ... [?25l[?25hdone
  Created wheel for scikit-surprise: filename=scikit_surprise-1.1.3-cp38-cp38-linux_x86_64.whl size=2626480 sha256=3cd4292935d59153be7c7e846f93753158cc16b1eb929b03e95bc55c593036f4
  Stored in directory: /root/.cache/pip/wheels/af/db/86/2c18183a80ba05da35bf0fb7417aac5cddbd93bcb1b92fd3ea
Successfully built scikit-surprise
Installing collected packages: scikit-surprise, surprise
Successfully installed scikit-surprise-1.1.3 surprise-0.1


In [None]:
from surprise import SVD, BaselineOnly, SVDpp
from surprise import Dataset
from surprise.prediction_algorithms import KNNBaseline
from surprise import accuracy
from surprise.model_selection import train_test_split

In [None]:
# Here, I loaded the Movielens-1M dataset directly from the website to have more data
data = Dataset.load_builtin('ml-1m')

Dataset ml-1m could not be found. Do you want to download it? [Y/n] Y
Trying to download dataset from https://files.grouplens.org/datasets/movielens/ml-1m.zip...
Done! Dataset ml-1m has been saved to /root/.surprise_data/ml-1m


In [None]:
#Train-Test Split
trainset, testset = train_test_split(data, test_size=.20)

In [None]:
#Here, a function for the Collaborative Filtering Algorithm has been defined.
def recommendation(CFalgo, trainset, testset):
  CFalgo.fit(trainset)
  predictions = CFalgo.test(testset)
  #RMSE
  accuracy.rmse(predictions)
  #MAE
  accuracy.mae(predictions)
  return

In [None]:
# CF algorithm whihc takes a baseline rating
#Item-Item Based
similarity = {'name': 'cosine',
               'user_based': False
               }
CFalgo = KNNBaseline(sim_options=similarity)

CFalgo.fit(trainset)
predictions = CFalgo.test(testset)

accuracy.rmse(predictions)
accuracy.mae(predictions)
print("Done!")

Estimating biases using als...
Computing the cosine similarity matrix...
Done computing similarity matrix.
RMSE: 0.8953
MAE:  0.7038
Done!


In [None]:
#User-User based
similarity = {'name': 'cosine'}
CFalgo = KNNBaseline(similarity=similarity)
recommendation(CFalgo, trainset, testset)

Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
RMSE: 0.8966
MAE:  0.7071


**Generating predictions**

In [None]:
def traintest_conversion_for_surprise(training_dataframe, testing_dataframe):
    reader = Reader(rating_scale=(0, 5))
    trainset = Dataset.load_from_df(training_dataframe[['userId', 'movieId', 'rating']], reader)
    testset = Dataset.load_from_df(testing_dataframe[['userId', 'movieId', 'rating']], reader)
    trainset = trainset.construct_trainset(trainset.raw_ratings)
    testset = testset.construct_testset(testset.raw_ratings)
    return trainset, testset

In [None]:
def recommendation(CFalgo, trainset, testset):
    CFalgo.fit(trainset)

#Here, we make predictions on testing
    test_predictions = CFalgo.test(testset)
    test_rmse = accuracy.rmse(test_predictions)
    test_mae = accuracy.mae(test_predictions)

    return test_rmse, test_mae, test_predictions

In [None]:
file_path_train = 'training_data.csv'
file_path_test = 'testing_data.csv'
traindf = pd.read_csv(file_path_train)
testdf = pd.read_csv(file_path_test)
trainset, testset = traintest_conversion_for_surprise(traindf, testdf)

In [None]:
#1
CFalgo = BaselineOnly()
test_base_rmse, test_base_mae, test_base_pred = recommendation(CFalgo, trainset, testset)

Estimating biases using als...
RMSE: 0.8667
MAE:  0.6684


In [None]:
#2
CFalgo = KNNBaseline(similarity=similarity)
test_knn_rmse, test_knn_mae, test_knn_pred = recommendation(CFalgo, trainset, testset)

Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
RMSE: 0.8702
MAE:  0.6651


In [None]:
# 3
CFalgo = SVD()
test_svd_rmse, test_svd_mae, test_svd_pred = recommendation(CFalgo, trainset, testset)

2
RMSE: 0.8726
MAE:  0.6693


In [None]:
#4
CFalgo = SVDpp()
test_svdpp_rmse, test_svdpp_mae, test_svdpp_pred = recommendation(CFalgo, trainset, testset)

In [None]:
test_pred_df = pd.DataFrame(
    columns=['uid', 'iid', 'og_rating', 'svd_rating', 'knn_rating'])
test_svd_df = pd.DataFrame(
    columns=['uid', 'iid', 'og_rating', 'est_rating'])
test_svdpp_df = pd.DataFrame(
   columns=['uid', 'iid', 'og_rating', 'est_rating'])
test_knnb_df = pd.DataFrame(
    columns=['uid', 'iid', 'og_rating', 'est_rating'])

num_test = len(test_base_pred)
for i in range(num_test):
    svd = test_svd_pred[i]
    knn = test_knn_pred[i]
    svdpp = test_svdpp_pred[i]
    df = pd.DataFrame([[svd.uid, svd.iid, svd.r_ui, svd.est, knn.est]],
                      columns=['uid', 'iid', 'og_rating', 'svd_rating', 'knn_rating'])
    df_svd = pd.DataFrame([[svd.uid, svd.iid, svd.r_ui, svd.est]],
                          columns=['uid', 'iid', 'og_rating', 'est_rating'])
    df_svdpp = pd.DataFrame([[svd.uid, svd.iid, svd.r_ui, svdpp.est]],
                          columns=['uid', 'iid', 'og_rating', 'est_rating'])
    df_knnb = pd.DataFrame([[svd.uid, svd.iid, svd.r_ui, knn.est]],
                           columns=['uid', 'iid', 'og_rating', 'est_rating'])

    test_pred_df = pd.concat([df, test_pred_df], ignore_index=True)
    test_svd_df = pd.concat([df_svd, test_svd_df], ignore_index=True)
    test_svdpp_df = pd.concat([df_svdpp, test_svdpp_df], ignore_index=True)
    test_knnb_df = pd.concat([df_knnb, test_knnb_df], ignore_index=True)

In [None]:
#Making csvs
test_pred_df.to_csv('test_prediction_HP.csv')
test_svd_df.to_csv('test_predictions_svd.csv')
test_svdpp_df.to_csv('test_predictions_svdpp.csv')
test_knnb_df.to_csv('test_predictions_knnb.csv')

7


**Surprise model predictions**

In [None]:
# We will load the 1M dataset
data = Dataset.load_builtin('ml-1m')

In [None]:
#Train-Test Split
trainset, testset = train_test_split(data, test_size=.20)

In [None]:
def traintest_conversion_for_surprise(training_dataframe, testing_dataframe):
    reader = Reader(rating_scale=(0, 5))
    trainset = Dataset.load_from_df(training_dataframe[['userId', 'movieId', 'rating']], reader)
    testset = Dataset.load_from_df(testing_dataframe[['userId', 'movieId', 'rating']], reader)
    trainset = trainset.construct_trainset(trainset.raw_ratings)
    testset = testset.construct_testset(testset.raw_ratings)
    return trainset, testset

In [None]:
file_path_train = 'training_data.csv'
file_path_test = 'testing_data.csv'
traindf = pd.read_csv(file_path_train)
testdf = pd.read_csv(file_path_test)
trainset, testset = traintest_conversion_for_surprise(traindf, testdf)

In [None]:
def recommendation(CFalgo, trainset, testset):
  CFalgo.fit(trainset)
  test_predictions = CFalgo.test(testset)
  test_rmse = accuracy.rmse(test_predictions)
  test_mae = accuracy.mae(test_predictions)

  return test_rmse, test_mae, test_predictions

In [None]:
#Cross-Validation
#results = cross_validate(SVD(), data, measures=['RMSE', 'MAE'], cv=5, verbose=False)

In [None]:
# KNNBaseline

CFalgo = KNNBaseline()
test_knn_rmse, test_knn_mae, test_knn_pred = recommendation(CFalgo, trainset, testset)

Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
RMSE: 0.8702
MAE:  0.6651


In [None]:
# SVD

CFalgo = SVD()
test_svd_rmse, test_svd_mae, test_svd_pred  = recommendation(CFalgo, trainset, testset)

RMSE: 0.8716
MAE:  0.6687


In [None]:
#SVDpp

algo = SVDpp()
test_svdpp_rmse, test_svdpp_mae, test_svdpp_pred = recommendation(algo, trainset, testset)

In [None]:
#BaselineOnly()

CFalgo = BaselineOnly()
test_base_rmse, test_base_mae, test_base_pred  = recommendation(CFalgo, trainset, testset)

Estimating biases using als...
RMSE: 0.8667
MAE:  0.6684


In [None]:
test_pred_df = pd.DataFrame(columns= ['uid', 'iid', 'og_rating', 'svd_rating', 'knn_rating','svdpp_rating','baseline_rating'])

In [None]:
num_test = len(test_base_pred)
print(num_test)

20168


In [None]:
for i in range(num_test):
  svd = test_svd_pred[i]
  knn = test_knn_pred[i]
  svdpp = test_svdpp_pred[i]
  baseline = test_base_pred[i]
  df = pd.DataFrame([[svd.uid, svd.iid, svd.r_ui, svd.est, knn.est, baseline.est]], columns=['uid', 'iid', 'og_rating', 'svd_rating', 'knn_rating', 'baseline_rating'])
  test_pred_df = pd.concat([df, test_pred_df], ignore_index=True)

In [None]:
test_pred_df

Unnamed: 0,uid,iid,og_rating,svd_rating,knn_rating,baseline_rating
0,610,166528,4.0,3.881544,4.011200,3.833082
1,610,163981,3.5,3.603143,3.629444,3.629444
2,610,163937,3.5,3.603143,3.629444,3.629444
3,610,160527,4.5,3.603143,3.629444,3.629444
4,610,160341,2.5,3.603143,3.629444,3.629444
...,...,...,...,...,...,...
20163,1,423,3.0,4.101641,3.658938,3.995889
20164,1,362,5.0,4.490989,4.487416,4.335101
20165,1,316,3.0,4.044845,3.883333,4.057276
20166,1,260,5.0,4.991579,4.976157,4.919510


In [None]:
test_pred_df.to_csv('test_prediction.csv')

**Surprise Model Recommendation**

In [None]:
def traintest_conversion_for_surprise(training_dataframe, testing_dataframe):
    reader = Reader(rating_scale=(0, 5))
    trainset = Dataset.load_from_df(training_dataframe[['userId', 'movieId', 'rating']], reader)
    testset = Dataset.load_from_df(testing_dataframe[['userId', 'movieId', 'rating']], reader)
    trainset = trainset.construct_trainset(trainset.raw_ratings)
    testset = testset.construct_testset(testset.raw_ratings)
    return trainset, testset

In [None]:
file_path_train = 'training_data.csv'
file_path_test = 'testing_data.csv'
traindf = pd.read_csv(file_path_train)
testdf = pd.read_csv(file_path_test)
trainset, testset = traintest_conversion_for_surprise(traindf, testdf)

In [None]:
def get_top_n(predictions, n):
    # Mapping predcitions to each user
    top_n = defaultdict(list)
    original_ratings = defaultdict(list)

    for uid, iid, true_r, est, _ in predictions:
        top_n[uid].append((iid, est))
        original_ratings[uid].append((iid, true_r))

    # Sorting predictions and retriving the highest k
    for uid, user_ratings in top_n.items():
        user_ratings.sort(key=lambda x: x[1], reverse=True)
        top_n[uid] = user_ratings[:n]

    return top_n, original_ratings

In [None]:
#Calculating Precision, Recall and FMeasure
def precision_recall_at_k(predictions, k=5, threshold=3.5):

    # Mapping predictions to each and every user.
    user_est_true = defaultdict(list)
    for uid, _, true_r, est, _ in predictions:
        user_est_true[uid].append((est, true_r))

    precisions = dict()
    recalls = dict()
    for uid, user_ratings in user_est_true.items():

        user_ratings.sort(key=lambda x: x[0], reverse=True)
        n_rel = sum((true_r >= threshold) for (_, true_r) in user_ratings)
        n_rec_k = sum((est >= threshold) for (est, _) in user_ratings[:k])
        n_rel_and_rec_k = sum(((true_r >= threshold) and (est >= threshold))
                              for (est, true_r) in user_ratings[:k])

        precisions[uid] = n_rel_and_rec_k / n_rec_k if n_rec_k != 0 else 1
        recalls[uid] = n_rel_and_rec_k / n_rel if n_rel != 0 else 1

    precision = (sum(prec for prec in precisions.values()) / len(precisions))
    recall = (sum(rec for rec in recalls.values()) / len(recalls))

    return precision, recall

In [None]:
def recommendation(CFalgo, trainset, testset):
  start_fit = time.time()
  CFalgo.fit(trainset)
  end_fit = time.time()
  fit_time = end_fit - start_fit

  start_test = time.time()
  test_predictions = CFalgo.test(testset)
  end_test = time.time()
  test_time = end_test - start_test

  test_rmse = accuracy.rmse(test_predictions)
  test_mae = accuracy.mae(test_predictions)

  top_n, org_ratings = get_top_n(test_predictions, 5)

  precision, recall = precision_recall_at_k(test_predictions)

  f_measure = (2*precision*recall)/(precision+recall)

  return (test_rmse, test_mae, fit_time, test_time, precision, recall, f_measure, test_predictions)


In [None]:
#Collaborative Filtering Model
similarity = {'name': 'cosine',
               'user_based': False
               }
algo = KNNBaseline(similarity=similarity)

results = recommendation(CFalgo,trainset,testset)
print(results[0])
print(results[1])
print(results[2])
print(results[3])
print(results[4])
print(results[5])
print(results[6])
print(results[7])

Estimating biases using als...
RMSE: 0.8667
MAE:  0.6684
0.8666587911022093
0.6683674330262719
0.32454895973205566
0.1068723201751709
0.8192076502732264
0.4030169062430222
0.5402518400133981
[Prediction(uid=1, iid=1, r_ui=4.0, est=4.639018534870931, details={'was_impossible': False}), Prediction(uid=1, iid=260, r_ui=5.0, est=4.919509704168233, details={'was_impossible': False}), Prediction(uid=1, iid=316, r_ui=3.0, est=4.057276256428144, details={'was_impossible': False}), Prediction(uid=1, iid=362, r_ui=5.0, est=4.3351014561959795, details={'was_impossible': False}), Prediction(uid=1, iid=423, r_ui=3.0, est=3.9958886969232594, details={'was_impossible': False}), Prediction(uid=1, iid=441, r_ui=4.0, est=4.64094729617061, details={'was_impossible': False}), Prediction(uid=1, iid=592, r_ui=4.0, est=4.198133394201726, details={'was_impossible': False}), Prediction(uid=1, iid=804, r_ui=4.0, est=4.061112047631246, details={'was_impossible': False}), Prediction(uid=1, iid=943, r_ui=4.0, est=

In [None]:
CFalgo = CoClustering(2,5,50)

test_rmse, test_mae, test_predictions, fit_time, test_time, precision, recall, f_measure = recommendation(CFalgo,trainset,testset)
print(test_rmse)
print(test_mae)
print(fit_time)
print(test_time)
print(precision)
print(recall)
print(f_measure)

RMSE: 0.9402
MAE:  0.7266
0.9401658030996545
0.7265549058596027
0.10609817504882812
0.7815573770491827
0.3927332797668501
0.5227727739003697
[Prediction(uid=1, iid=1, r_ui=4.0, est=4.890648132015216, details={'was_impossible': False}), Prediction(uid=1, iid=260, r_ui=5.0, est=5, details={'was_impossible': False}), Prediction(uid=1, iid=316, r_ui=3.0, est=4.271873127301098, details={'was_impossible': False}), Prediction(uid=1, iid=362, r_ui=5.0, est=4.694487250949471, details={'was_impossible': False}), Prediction(uid=1, iid=423, r_ui=3.0, est=3.522408223082966, details={'was_impossible': False}), Prediction(uid=1, iid=441, r_ui=4.0, est=4.757431264557621, details={'was_impossible': False}), Prediction(uid=1, iid=592, r_ui=4.0, est=4.3921973890266335, details={'was_impossible': False}), Prediction(uid=1, iid=804, r_ui=4.0, est=3.9509796516543947, details={'was_impossible': False}), Prediction(uid=1, iid=943, r_ui=4.0, est=5, details={'was_impossible': False}), Prediction(uid=1, iid=954,

In [None]:
surprise_df = pd.DataFrame(columns= ['Algorithm', 'test_rmse', 'test_mae', 'fit_time', 'test_time', 'Precision', 'Recall', 'F-measure'])

In [None]:
# Here, we iterate iver all the algorithms we have chosen
for algorithm in [SVD(), KNNBaseline(), BaselineOnly(), CoClustering()]:
    results = recommendation(algorithm,trainset,testset)

    name =str(algorithm).split(' ')[0].split('.')[-1]
    print("Algorithm:", name)
    df = pd.DataFrame([[name, results[0], results[1], results[2], results[3], results[4], results[5], results[6]]], columns= ['Algorithm', 'test_rmse', 'test_mae', 'fit_time', 'test_time', 'Precision', 'Recall', 'F-measure'])
    surprise_df = pd.concat([df, surprise_df], ignore_index=True)
surprise_df.sort_values(by='test_rmse', ascending=False)

RMSE: 0.8724
MAE:  0.6685
Algorithm: SVD
Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
RMSE: 0.8702
MAE:  0.6651
Algorithm: KNNBaseline
Estimating biases using als...
RMSE: 0.8667
MAE:  0.6684
Algorithm: BaselineOnly
RMSE: 0.9434
MAE:  0.7286
Algorithm: CoClustering


Unnamed: 0,Algorithm,test_rmse,test_mae,fit_time,test_time,Precision,Recall,F-measure
0,CoClustering,0.943359,0.728634,2.284726,0.109849,0.774235,0.382614,0.512138
3,SVD,0.872406,0.668516,2.042173,0.172292,0.804891,0.39886,0.533397
2,KNNBaseline,0.870163,0.665052,0.360369,1.92659,0.802295,0.418721,0.550259
1,BaselineOnly,0.866659,0.668367,0.240502,0.099157,0.819208,0.403017,0.540252


In [None]:
surprise_df.sort_values(by='test_rmse')

Unnamed: 0,Algorithm,test_rmse,test_mae,fit_time,test_time,Precision,Recall,F-measure
1,BaselineOnly,0.866659,0.668367,0.240502,0.099157,0.819208,0.403017,0.540252
2,KNNBaseline,0.870163,0.665052,0.360369,1.92659,0.802295,0.418721,0.550259
3,SVD,0.872406,0.668516,2.042173,0.172292,0.804891,0.39886,0.533397
0,CoClustering,0.943359,0.728634,2.284726,0.109849,0.774235,0.382614,0.512138


In [None]:
surprise_df.sort_values(by='F-measure', ascending=False)

Unnamed: 0,Algorithm,test_rmse,test_mae,fit_time,test_time,Precision,Recall,F-measure
2,KNNBaseline,0.870163,0.665052,0.360369,1.92659,0.802295,0.418721,0.550259
1,BaselineOnly,0.866659,0.668367,0.240502,0.099157,0.819208,0.403017,0.540252
3,SVD,0.872406,0.668516,2.042173,0.172292,0.804891,0.39886,0.533397
0,CoClustering,0.943359,0.728634,2.284726,0.109849,0.774235,0.382614,0.512138


In [None]:
surprise_df.head()

Unnamed: 0,Algorithm,test_rmse,test_mae,fit_time,test_time,Precision,Recall,F-measure
0,CoClustering,0.943359,0.728634,2.284726,0.109849,0.774235,0.382614,0.512138
1,BaselineOnly,0.866659,0.668367,0.240502,0.099157,0.819208,0.403017,0.540252
2,KNNBaseline,0.870163,0.665052,0.360369,1.92659,0.802295,0.418721,0.550259
3,SVD,0.872406,0.668516,2.042173,0.172292,0.804891,0.39886,0.533397


In [None]:
surprise_df.to_csv('Surprise_results.csv')

In [None]:
surprise_df.sort_values(by='test_rmse')

Unnamed: 0,Algorithm,test_rmse,test_mae,fit_time,test_time,Precision,Recall,F-measure
1,BaselineOnly,0.866659,0.668367,0.240502,0.099157,0.819208,0.403017,0.540252
2,KNNBaseline,0.870163,0.665052,0.360369,1.92659,0.802295,0.418721,0.550259
3,SVD,0.872406,0.668516,2.042173,0.172292,0.804891,0.39886,0.533397
0,CoClustering,0.943359,0.728634,2.284726,0.109849,0.774235,0.382614,0.512138


In [None]:
surprise_df.sort_values(by='F-measure', ascending=False)

Unnamed: 0,Algorithm,test_rmse,test_mae,fit_time,test_time,Precision,Recall,F-measure
2,KNNBaseline,0.870163,0.665052,0.360369,1.92659,0.802295,0.418721,0.550259
1,BaselineOnly,0.866659,0.668367,0.240502,0.099157,0.819208,0.403017,0.540252
3,SVD,0.872406,0.668516,2.042173,0.172292,0.804891,0.39886,0.533397
0,CoClustering,0.943359,0.728634,2.284726,0.109849,0.774235,0.382614,0.512138


**Combined Model**

In [None]:
import pandas as pd
import numpy as np
import math

In [None]:
pred_data = pd.read_csv('test_prediction_HP.csv')
pred_data.head()

Unnamed: 0.1,Unnamed: 0,uid,iid,og_rating,svd_rating,knn_rating
0,0,610,166528,4.0,3.835845,4.0112
1,1,610,163981,3.5,3.595239,3.629444
2,2,610,163937,3.5,3.595239,3.629444
3,3,610,160527,4.5,3.595239,3.629444
4,4,610,160341,2.5,3.595239,3.629444


In [None]:
T = pred_data.shape[0]
print(T)

20168


In [None]:
svd_wt = 0.05
knn_wt = 0.6
svdpp_wt = 0.4
baseline_wt = 0

In [None]:
rmse = ((pred_data.og_rating - pred_data.knn_rating) ** 2).mean() ** .5
print(rmse)
mae = (((pred_data.og_rating - pred_data.knn_rating) ** 2) ** .5).mean()
print(mae)

0.8701632187118834
0.6650521435883117


In [None]:
rmse = ((pred_data.og_rating - pred_data.svdpp_rating) ** 2).mean() ** .5
print(rmse)
mae = (((pred_data.og_rating - pred_data.svdpp_rating) ** 2) ** .5).mean()
print(mae)

In [None]:
sqr_sum = 0
abs_sum = 0

for ind, row in pred_data.iterrows():
  org_r = row['og_rating']
  pred_r = svd_wt*row['svd_rating'] + knn_wt*row['knn_rating']
  diff = np.abs(org_r - pred_r)
  # print(diff)
  abs_sum += diff
  sqr_sum += diff**2

rmse = np.sqrt(sqr_sum/T)
print("RMSE", rmse)
mae = abs_sum/T
print("MAE", mae)

RMSE 1.5047087157024743
MAE 1.3357093053298659


**Hyperparameter Tuning + Matrix Factorization**

In [None]:
import time
import pandas as pd

In [None]:
def traintest_conversion_for_surprise(training_dataframe, testing_dataframe):
    reader = Reader(rating_scale=(0, 5))
    trainset = Dataset.load_from_df(training_dataframe[['userId', 'movieId', 'rating']], reader)
    testset = Dataset.load_from_df(testing_dataframe[['userId', 'movieId', 'rating']], reader)
    trainset = trainset.construct_trainset(trainset.raw_ratings)
    testset = testset.construct_testset(testset.raw_ratings)
    return trainset, testset

In [None]:
file_path_train = 'training_data.csv'
file_path_test = 'testing_data.csv'
traindf = pd.read_csv(file_path_train)
testdf = pd.read_csv(file_path_test)
trainset, testset = traintest_conversion_for_surprise(traindf, testdf)

In [None]:
def recommendation(CFalgo, trainset, testset):
  start_fit = time.time()
  CFalgo.fit(trainset)
  end_fit = time.time()
  fit_time = end_fit - start_fit

  start_test = time.time()
  test_predictions = CFalgo.test(testset)
  end_test = time.time()
  test_time = end_test - start_test

  test_rmse = accuracy.rmse(test_predictions)
  test_mae = accuracy.mae(test_predictions)

  return test_rmse, test_mae, test_predictions, fit_time, test_time

In [None]:
CFalgo = BaselineOnly()

test_rmse, test_mae, test_predictions, fit_time, test_time = recommendation(CFalgo,trainset,testset)
print(fit_time)
print(test_time)

Estimating biases using als...
RMSE: 0.8667
MAE:  0.6684
0.46881103515625
0.21196913719177246


In [None]:
# Probabilistic Matrix Factorization

CFalgo = SVD()

CFalgo.fit(trainset)
predictions = CFalgo.test(testset)

accuracy.rmse(predictions)
accuracy.mae(predictions)
print("Done!")

RMSE: 0.8734
MAE:  0.6702
Done!


In [None]:
# SVDpp (an extension of SVD which takes implicit ratings)

CFalgo = SVDpp()

CFalgo.fit(trainset)
predictions = CFalgo.test(testset)

accuracy.rmse(predictions)
accuracy.mae(predictions)
print("Done!")

RMSE: 0.8597
MAE:  0.6591
Done!


In [None]:
# CF Algorithm
CFalgo = KNNBaseline()

CFalgo.fit(trainset)
predictions = CFalgo.test(testset)

accuracy.rmse(predictions)
accuracy.mae(predictions)
print("Done!")

Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
RMSE: 0.8702
MAE:  0.6651
Done!


In [None]:
from surprise import NormalPredictor
from surprise.model_selection import GridSearchCV

In [None]:
#Implementing GridSearch for Latent Factor
parameter_grid = {'n_factors':[25,50,100], 'n_epochs': [5, 10, 20], 'lr_all': [0.01, 0.02],
              'reg_all': [0.01,0.02]}
gs = GridSearchCV(SVD, parameter_grid, measures=['rmse', 'mae'], cv=5)
gs.fit(data)

# Best RMSE
print(gs.best_score['rmse'])
# Parameters which gave best RMSE
print(gs.best_params['rmse'])

0.8712191987406065
{'n_factors': 50, 'n_epochs': 10, 'lr_all': 0.01, 'reg_all': 0.02}


In [None]:
# best RMSE score
print(gs.best_score['mae'])

0.6809274299789226


**Cold-Start Analysis**

In [None]:
def traintest_conversion_for_surprise(training_dataframe, testing_dataframe):
    reader = Reader(rating_scale=(0, 5))
    trainset = Dataset.load_from_df(training_dataframe[['userId', 'movieId', 'rating']], reader)
    testset = Dataset.load_from_df(testing_dataframe[['userId', 'movieId', 'rating']], reader)
    trainset = trainset.construct_trainset(trainset.raw_ratings)
    testset = testset.construct_testset(testset.raw_ratings)
    return trainset, testset

In [None]:
file_path_train = 'training_data.csv'
file_path_test = 'testing_data.csv'
traindf = pd.read_csv(file_path_train)
testdf = pd.read_csv(file_path_test)
trainset, testset = traintest_conversion_for_surprise(traindf, testdf)

In [None]:
traindf.head()

Unnamed: 0,userId,movieId,rating,timestamp,genres,tag
0,1,3,4.0,964981247,"['Comedy', 'Romance']",[]
1,1,6,4.0,964982224,"['Action', 'Crime', 'Thriller']",[]
2,1,47,5.0,964983815,"['Mystery', 'Thriller']",[]
3,1,50,5.0,964982931,"['Crime', 'Mystery', 'Thriller']",[]
4,1,70,3.0,964982400,"['Action', 'Comedy', 'Horror', 'Thriller']",[]


In [None]:
CFalgo_svd = SVD()
CFalgo_svdpp = SVDpp()
CFalgo_knn = KNNBaseline()

CFalgo_svd.fit(trainset)
predictions_svd = CFalgo_svd.test(testset)

CFalgo_svdpp.fit(trainset)
predictions_svdpp = CFalgo_svdpp.test(testset)

CFalgo_knn.fit(trainset)
predictions_knn = CFalgo_knn.test(testset)

dump.dump('./dump_SVD', predictions_svd, CFalgo_svd)
dump.dump('./dump_SVDpp', predictions_svdpp, CFalgo_svdpp)
dump.dump('./dump_KNN', predictions_knn, CFalgo_knn)

Computing the msd similarity matrix...
Done computing similarity matrix.


In [None]:
df_svd = pd.DataFrame(predictions_svd, columns=['uid', 'iid', 'rui', 'est', 'details'])
df_svdpp = pd.DataFrame(predictions_svdpp, columns=['uid', 'iid', 'rui', 'est', 'details'])
df_knn = pd.DataFrame(predictions_knn, columns=['uid', 'iid', 'rui', 'est', 'details'])

In [None]:
similarity = {'name': 'cosine',
               'user_based': False
               }
CFalgo_knnbaseline = KNNBaseline(similarity=similarity)
CFalgo_knnbaseline.fit(trainset)
predictions_knnbaseline = CFalgo_knnbaseline.test(testset)

Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.


In [None]:
df_knnbaseline = pd.DataFrame(predictions_knnbaseline, columns=['uid', 'iid', 'rui', 'est', 'details'])
df_knnbaseline['err'] = abs(df_knnbaseline.est - df_knnbaseline.rui)
df_knnbaseline['sqr_err'] = (df_knnbaseline.est - df_knnbaseline.rui)**2

In [None]:
df_svd['err'] = abs(df_svd.est - df_svd.rui)
df_svdpp['err'] = abs(df_svdpp.est - df_svdpp.rui)
df_knn['err'] = abs(df_knn.est - df_knn.rui)

In [None]:
df_svd['sqr_err'] = (df_svd.est - df_svd.rui)**2
df_svdpp['sqr_err'] = (df_svdpp.est - df_svdpp.rui)**2
df_knn['sqr_err'] = (df_knn.est - df_knn.rui)**2

In [None]:
CFalgo_baselineonly = BaselineOnly()
CFalgo_baselineonly.fit(trainset)
predictions_baselineonly = CFalgo_baselineonly.test(testset)

df_baselineonly = pd.DataFrame(predictions_baselineonly, columns=['uid', 'iid', 'rui', 'est', 'details'])
df_baselineonly['err'] = abs(df_baselineonly.est - df_baselineonly.rui)
df_baselineonly['sqr_err'] = (df_baselineonly.est - df_baselineonly.rui)**2
#df_baselineonly['Iu'] = df_baselineonly.uid.apply(get_Iu)

Estimating biases using als...


In [None]:
similarity = {'name': 'cosine',
               'user_based': True  # compute  similarities between items
               }
CFalgo_knnbaseline_user = KNNBaseline(similarity=similarity)
CFalgo_knnbaseline_user.fit(trainset)
predictions_knnbaseline_user = CFalgo_knnbaseline_user.test(testset)

df_knn_user = pd.DataFrame(predictions_knnbaseline_user, columns=['uid', 'iid', 'rui', 'est', 'details'])
df_knn_user['err'] = abs(df_knn_user.est - df_knn_user.rui)
df_knn_user['sqr_err'] = (df_knn_user.est - df_knn_user.rui)**2
#df_knn_user['Iu'] = df_knn_user.uid.apply(get_Iu)

Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.


In [None]:
df_svd.head()

Unnamed: 0,uid,iid,rui,est,details,err,sqr_err
0,1,1,4.0,4.728007,{'was_impossible': False},0.728007,0.529994
1,1,260,5.0,5.0,{'was_impossible': False},0.0,0.0
2,1,316,3.0,4.175096,{'was_impossible': False},1.175096,1.38085
3,1,362,5.0,4.481851,{'was_impossible': False},0.518149,0.268479
4,1,423,3.0,4.088422,{'was_impossible': False},1.088422,1.184663


In [None]:
def get_Iu(uid):
    try:
        return traindf[traindf['userId'] == uid].shape[0]
    except ValueError:  # user was not part of the trainset
        return 0

In [None]:
df_knn['Iu'] = df_knn.uid.apply(get_Iu)
df_svd['Iu'] = df_svd.uid.apply(get_Iu)
df_svdpp['Iu'] = df_svdpp.uid.apply(get_Iu)
df_knnbaseline['Iu'] = df_knnbaseline.uid.apply(get_Iu)

In [None]:
print("--------------------------MAE-----------------------")
print("KNN Basic                 ",df_knn[df_knn.Iu < 18].err.mean())
print("SVD                       ", df_svd[df_svd.Iu < 18].err.mean())
print("SVDpp                     ",  df_svdpp[df_svdpp.Iu < 18].err.mean())
print("KNN Baseline (item-item)  ", df_knnbaseline[df_knnbaseline.Iu < 18].err.mean())
#print("BaselineOnly              ",df_baselineonly[df_baselineonly.Iu < 18].err.mean() )
#print("KNN Baseline (user-user)  ",df_knn_user[df_knn_user.Iu < 18].err.mean() )

--------------------------MAE-----------------------
KNN Basic                  0.875737130145179
SVD                        0.7392264827404429
SVDpp                      0.7540791923644738
KNN Baseline (item-item)   0.7705034000026011


In [None]:
print("--------------------------RMSE-----------------------")
print("KNN Basic                ",df_knn[df_knn.Iu < 18].sqr_err.mean()** .5)
print("SVD                      ", df_svd[df_svd.Iu < 18].sqr_err.mean()** .5)
print("SVDpp                    ",  df_svdpp[df_svdpp.Iu < 18].sqr_err.mean()** .5)
print("KNN Baseline (item-item) ", df_knnbaseline[df_knnbaseline.Iu < 18].sqr_err.mean()** .5)
#print("BaselineOnly             ",df_baselineonly[df_baselineonly.Iu < 18].sqr_err.mean()** .5 )
#print("KNN Baseline (user-user) ",df_knn_user[df_knn_user.Iu < 18].sqr_err.mean()** .5)

--------------------------RMSE-----------------------
KNN Basic                 1.0798672461989702
SVD                       0.9486578163190558
SVDpp                     0.9628796496994164
KNN Baseline (item-item)  0.9694807089495964


In [None]:
print("--------------------------MAE-----------------------")
print("KNN Basic                 ",df_knn[df_knn.Iu > 1000].err.mean())
print("SVD                       ", df_svd[df_svd.Iu > 1000].err.mean())
print("SVDpp                     ",  df_svdpp[df_svdpp.Iu > 1000].err.mean())
print("KNN Baseline (item-item)  ", df_knnbaseline[df_knnbaseline.Iu > 1000].err.mean())
#print("BaselineOnly              ",df_baselineonly[df_baselineonly.Iu > 1000].err.mean() )
#print("KNN Baseline (user-user)  ",df_knn_user[df_knn_user.Iu > 1000].err.mean() )

--------------------------MAE-----------------------
KNN Basic                  0.6830666941398182
SVD                        0.6175435254272961
SVDpp                      0.6114047306281747
KNN Baseline (item-item)   0.6086864976697254


In [None]:
iid_df = traindf.groupby(['userId'],as_index=False).movieId.count()
iid_df.movieId.max()

2158

**Popularity**

In [None]:
movies = pd.read_csv("/content/tmdb_5000_movies.csv")

In [None]:
movies.sort_values(by='popularity', ascending=False)

Unnamed: 0,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,production_companies,production_countries,release_date,revenue,runtime,spoken_languages,status,tagline,title,vote_average,vote_count
546,74000000,"[{""id"": 10751, ""name"": ""Family""}, {""id"": 16, ""...",http://www.minionsmovie.com/,211672,"[{""id"": 3487, ""name"": ""assistant""}, {""id"": 179...",en,Minions,"Minions Stuart, Kevin and Bob are recruited by...",875.581305,"[{""name"": ""Universal Pictures"", ""id"": 33}, {""n...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2015-06-17,1156730962,91.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"Before Gru, they had a history of bad bosses",Minions,6.4,4571
95,165000000,"[{""id"": 12, ""name"": ""Adventure""}, {""id"": 18, ""...",http://www.interstellarmovie.net/,157336,"[{""id"": 83, ""name"": ""saving the world""}, {""id""...",en,Interstellar,Interstellar chronicles the adventures of a gr...,724.247784,"[{""name"": ""Paramount Pictures"", ""id"": 4}, {""na...","[{""iso_3166_1"": ""CA"", ""name"": ""Canada""}, {""iso...",2014-11-05,675120017,169.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,Mankind was born on Earth. It was never meant ...,Interstellar,8.1,10867
788,58000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.foxmovies.com/movies/deadpool,293660,"[{""id"": 2095, ""name"": ""anti hero""}, {""id"": 307...",en,Deadpool,Deadpool tells the origin story of former Spec...,514.569956,"[{""name"": ""Twentieth Century Fox Film Corporat...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2016-02-09,783112979,108.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,Witness the beginning of a happy ending,Deadpool,7.4,10995
94,170000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 878, ""na...",http://marvel.com/guardians,118340,"[{""id"": 8828, ""name"": ""marvel comic""}, {""id"": ...",en,Guardians of the Galaxy,"Light years from Earth, 26 years after being a...",481.098624,"[{""name"": ""Marvel Studios"", ""id"": 420}, {""name...","[{""iso_3166_1"": ""GB"", ""name"": ""United Kingdom""...",2014-07-30,773328629,121.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,All heroes start somewhere.,Guardians of the Galaxy,7.9,9742
127,150000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.madmaxmovie.com/,76341,"[{""id"": 2964, ""name"": ""future""}, {""id"": 3713, ...",en,Mad Max: Fury Road,An apocalyptic story set in the furthest reach...,434.278564,"[{""name"": ""Village Roadshow Pictures"", ""id"": 7...","[{""iso_3166_1"": ""AU"", ""name"": ""Australia""}, {""...",2015-05-13,378858340,120.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,What a Lovely Day.,Mad Max: Fury Road,7.2,9427
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4625,0,"[{""id"": 27, ""name"": ""Horror""}]",,426067,[],en,Midnight Cabaret,A Broadway producer puts on a play with a Devi...,0.001389,[],[],1990-01-01,0,94.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,The hot spot where Satan's waitin'.,Midnight Cabaret,0.0,0
4118,0,[],,325140,[],en,Hum To Mohabbat Karega,"Raju, a waiter, is in love with the famous TV ...",0.001186,[],[],2000-05-26,0,0.0,[],Released,,Hum To Mohabbat Karega,0.0,0
4727,0,"[{""id"": 28, ""name"": ""Action""}, {""id"": 18, ""nam...",,65448,"[{""id"": 378, ""name"": ""prison""}, {""id"": 209476,...",en,Penitentiary,A hitchhiker named Martel Gordone gets in a fi...,0.001117,[],[],1979-12-01,0,99.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"There's only one way out, and 100 fools stand ...",Penitentiary,4.9,8
3361,0,"[{""id"": 27, ""name"": ""Horror""}, {""id"": 28, ""nam...",,77156,[],en,Alien Zone,A man who is having an affair with a married w...,0.000372,[],"[{""iso_3166_1"": ""US"", ""name"": ""United States o...",1978-11-22,0,90.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,Don't you dare go in there!,Alien Zone,4.0,3


In [None]:
genres = {'Adventure': 0,
 'Animation': 1,
 'Children': 2,
 'Comedy': 3,
 'Fantasy': 4,
 'Romance': 5,
 'Drama': 6,
 'Action': 7,
 'Crime': 8,
 'Thriller': 9,
 'Horror': 10,
 'Mystery': 11,
 'Sci-Fi': 12,
 'War': 13,
 'Musical': 14,
 'Documentary': 15,
 'IMAX': 16,
 'Western': 17,
 'Film-Noir': 18,
 '(no genres listed)': 19}

In [None]:
def genre_based_on_popularity(genre):
    mask = movies.genres.apply(lambda x: genre in x)
    filtered_movie = movies[mask]
    filtered_movie = filtered_movie.sort_values(by='popularity', ascending=False)
    return filtered_movie

In [None]:
genre_based_on_popularity('Animation')[['title', 'popularity']].head(25)

Unnamed: 0,title,popularity
546,Minions,875.581305
88,Big Hero 6,203.73459
124,Frozen,165.125366
506,Despicable Me 2,136.886704
77,Inside Out,128.655964
55,Brave,125.114374
2294,Spirited Away,118.968562
614,Despicable Me,113.858273
231,"Monsters, Inc.",106.815545
160,How to Train Your Dragon 2,100.21391


In [None]:
genre_based_on_popularity('Romance')[['title', 'popularity']].head(25)

Unnamed: 0,title,popularity
809,Forrest Gump,138.133331
1337,Twilight,127.084938
81,Maleficent,110.620647
612,The Twilight Saga: Eclipse,107.069763
326,Cinderella,101.187052
25,Titanic,100.025899
172,The Twilight Saga: Breaking Dawn - Part 2,99.687084
1154,Fifty Shades of Grey,98.755657
898,The Twilight Saga: New Moon,94.815867
1695,Aladdin,92.982009


In [None]:
genre_based_on_popularity('Action')[['title', 'popularity']].head(25)

Unnamed: 0,title,popularity
788,Deadpool,514.569956
94,Guardians of the Galaxy,481.098624
127,Mad Max: Fury Road,434.278564
28,Jurassic World,418.708552
199,Pirates of the Caribbean: The Curse of the Bla...,271.972889
82,Dawn of the Planet of the Apes,243.791743
88,Big Hero 6,203.73459
108,Terminator Genisys,202.042635
26,Captain America: Civil War,198.372395
65,The Dark Knight,187.322927


In [None]:
vote_counts = movies[movies['vote_count'].notnull()]['vote_count'].astype('int') #V

vote_averages = movies[movies['vote_average'].notnull()]['vote_average'].astype('int') #R

C = vote_averages.mean()
C

5.6529252550489275

In [None]:
m = vote_counts.quantile(0.95)
m

3040.8999999999996

In [None]:
def weighted_rating(x):
    v = x['vote_count']
    R = x['vote_average']
    return (v/(v+m) * R) + (m/(m+v) * C)

In [None]:
movies['wr'] = movies.apply(weighted_rating, axis=1)

In [None]:
movies.head()

Unnamed: 0,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,production_companies,...,release_date,revenue,runtime,spoken_languages,status,tagline,title,vote_average,vote_count,wr
0,237000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.avatarmovie.com/,19995,"[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...",en,Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,"[{""name"": ""Ingenious Film Partners"", ""id"": 289...",...,2009-12-10,2787965087,162.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...",Released,Enter the World of Pandora.,Avatar,7.2,11800,6.883004
1,300000000,"[{""id"": 12, ""name"": ""Adventure""}, {""id"": 14, ""...",http://disney.go.com/disneypictures/pirates/,285,"[{""id"": 270, ""name"": ""ocean""}, {""id"": 726, ""na...",en,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...",139.082615,"[{""name"": ""Walt Disney Pictures"", ""id"": 2}, {""...",...,2007-05-19,961000000,169.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"At the end of the world, the adventure begins.",Pirates of the Caribbean: At World's End,6.9,4500,6.397112
2,245000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.sonypictures.com/movies/spectre/,206647,"[{""id"": 470, ""name"": ""spy""}, {""id"": 818, ""name...",en,Spectre,A cryptic message from Bond’s past sends him o...,107.376788,"[{""name"": ""Columbia Pictures"", ""id"": 5}, {""nam...",...,2015-10-26,880674609,148.0,"[{""iso_639_1"": ""fr"", ""name"": ""Fran\u00e7ais""},...",Released,A Plan No One Escapes,Spectre,6.3,4466,6.037883
3,250000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 80, ""nam...",http://www.thedarkknightrises.com/,49026,"[{""id"": 849, ""name"": ""dc comics""}, {""id"": 853,...",en,The Dark Knight Rises,Following the death of District Attorney Harve...,112.31295,"[{""name"": ""Legendary Pictures"", ""id"": 923}, {""...",...,2012-07-16,1084939099,165.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,The Legend Ends,The Dark Knight Rises,7.6,9106,7.112562
4,260000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://movies.disney.com/john-carter,49529,"[{""id"": 818, ""name"": ""based on novel""}, {""id"":...",en,John Carter,"John Carter is a war-weary, former military ca...",43.926995,"[{""name"": ""Walt Disney Pictures"", ""id"": 2}]",...,2012-03-07,284139100,132.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"Lost in our world, found in another.",John Carter,6.1,2124,5.836779


In [None]:
def genre_based_popularity_PT(genre):
    mask = movies.genres.apply(lambda x: genre in x)
    filtered_movie = movies[mask]
    filtered_movie = filtered_movie.sort_values(by='wr', ascending=False)
    return filtered_movie

In [None]:
genre_based_popularity_PT('Animation')[['title', 'wr', 'popularity']].head(10)

Unnamed: 0,title,wr,popularity
77,Inside Out,7.256609,128.655964
494,The Lion King,7.152037,90.457886
2294,Spirited Away,7.130169,118.968562
57,WALL·E,7.100727,66.390712
88,Big Hero 6,7.088458,203.73459
66,Up,7.071909,92.201962
328,Finding Nemo,6.953823,85.688789
1541,Toy Story,6.9509,73.640445
231,"Monsters, Inc.",6.878463,106.815545
42,Toy Story 3,6.824805,59.995418


In [None]:
genre_based_popularity_PT('Action')[['title', 'wr', 'popularity']].head(25)

Unnamed: 0,title,wr,popularity
65,The Dark Knight,7.685113,187.322927
96,Inception,7.656878,167.58371
329,The Lord of the Rings: The Return of the King,7.429908,123.630332
262,The Lord of the Rings: The Fellowship of the Ring,7.392365,138.049577
94,Guardians of the Galaxy,7.365448,481.098624
1990,The Empire Strikes Back,7.331672,78.51783
2912,Star Wars,7.330069,126.393695
634,The Matrix,7.328089,104.309993
330,The Lord of the Rings: The Two Towers,7.322066,106.914973
571,Inglourious Basterds,7.178513,72.595961


In [None]:
genre_based_popularity_PT('Romance')[['title', 'wr', 'popularity']].head(25)

Unnamed: 0,title,wr,popularity
809,Forrest Gump,7.493812,138.133331
25,Titanic,6.970261,100.025899
1997,Her,6.942697,53.682367
2152,Eternal Sunshine of the Spotless Mind,6.879048,56.481487
2547,The Theory of Everything,6.772112,61.182331
1260,Amélie,6.77195,73.720244
2838,The Fault in Our Stars,6.729273,74.358971
1559,The Notebook,6.680836,55.109138
493,A Beautiful Mind,6.671066,59.248437
2776,The Perks of Being a Wallflower,6.664045,43.444135


**Hybrid Model**

In [None]:
def traintest_conversion_for_surprise(training_dataframe, testing_dataframe):
    reader = Reader(rating_scale=(0, 5))
    trainset = Dataset.load_from_df(training_dataframe[['userId', 'movieId', 'rating']], reader)
    testset = Dataset.load_from_df(testing_dataframe[['userId', 'movieId', 'rating']], reader)
    trainset = trainset.construct_trainset(trainset.raw_ratings)
    testset = testset.construct_testset(testset.raw_ratings)
    return trainset, testset

In [None]:
file_path_train = 'training_data.csv'
file_path_test = 'testing_data.csv'
traindf = pd.read_csv(file_path_train)
testdf = pd.read_csv(file_path_test)
trainset, testset = traintest_conversion_for_surprise(traindf, testdf)

In [None]:
testdf.head()

Unnamed: 0,userId,movieId,rating,timestamp,genres,tag
0,1,1,4.0,964982703,"['Adventure', 'Animation', 'Children', 'Comedy...",[]
1,1,260,5.0,964981680,"['Action', 'Adventure', 'Sci-Fi']",[]
2,1,316,3.0,964982310,"['Action', 'Adventure', 'Sci-Fi']",[]
3,1,362,5.0,964982588,"['Adventure', 'Children', 'Romance']",[]
4,1,423,3.0,964982363,"['Action', 'Thriller']",[]


In [None]:
similarity = {'name': 'cosine',
               'user_based': False
               }
knnbaseline_algo = KNNBaseline(similarity=similarity)

knnbaseline_algo.fit(trainset)
knnbaseline_predictions = knnbaseline_algo.test(testset)

file_name = 'KnnBaseline_model'
dump.dump(file_name, algo=knnbaseline_predictions)

accuracy.rmse(knnbaseline_predictions)
accuracy.mae(knnbaseline_predictions)
print("Done!")

Estimating biases using als...
Computing the cosine similarity matrix...
Done computing similarity matrix.
RMSE: 0.8951
MAE:  0.6897
Done!


In [None]:
svd_algo = SVD()

svd_algo.fit(trainset)
svd_predictions = svd_algo.test(testset)

file_name = 'svd_model'
dump.dump(file_name, algo=svd_algo)

accuracy.rmse(svd_predictions)
accuracy.mae(svd_predictions)
print("Done!")

RMSE: 0.8723
MAE:  0.6688
Done!


In [None]:
svdpp_algo = SVDpp()

svdpp_algo.fit(trainset)
svdpp_predictions = svdpp_algo.test(testset)

file_name = 'svd_model'
dump.dump(file_name, algo=svdpp_algo)

accuracy.rmse(svdpp_predictions)
accuracy.mae(svdpp_predictions)
print("Done!")

RMSE: 0.8606
MAE:  0.6592
Done!


In [None]:
knn_baseline = dump.load('KnnBaseline_model')
svdpp = dump.load('svd_model')

In [None]:
# Users in testing data represented as a list
user_list = testdf['userId'].unique()

In [None]:
test_movies = testdf[testdf['userId'] == 60]
test_movies.head()

Unnamed: 0,userId,movieId,rating,timestamp,genres,tag
1745,60,48,3.0,1393541734,"['Animation', 'Children', 'Drama', 'Musical', ...",[]
1746,60,527,5.0,1393542064,"['Drama', 'War']",[]
1747,60,1242,4.0,1393541757,"['Drama', 'War']",[]
1748,60,2067,3.0,1393541982,"['Drama', 'Romance', 'War']",[]


In [None]:
movies = pd.read_csv("movie_vector.csv")

In [None]:
genre_to_idx = {'Adventure': 0,
 'Animation': 1,
 'Children': 2,
 'Comedy': 3,
 'Fantasy': 4,
 'Romance': 5,
 'Drama': 6,
 'Action': 7,
 'Crime': 8,
 'Thriller': 9,
 'Horror': 10,
 'Mystery': 11,
 'Sci-Fi': 12,
 'War': 13,
 'Musical': 14,
 'Documentary': 15,
 'IMAX': 16,
 'Western': 17,
 'Film-Noir': 18,
 '(no genres listed)': 19}

In [None]:
idx_to_genre = {0: 'Adventure',
 1: 'Animation',
 2: 'Children',
 3: 'Comedy',
 4: 'Fantasy',
 5: 'Romance',
 6: 'Drama',
 7: 'Action',
 8: 'Crime',
 9: 'Thriller',
 10: 'Horror',
 11: 'Mystery',
 12: 'Sci-Fi',
 13: 'War',
 14: 'Musical',
 15: 'Documentary',
 16: 'IMAX',
 17: 'Western',
 18: 'Film-Noir',
 19: '(no genres listed)'}

In [None]:
movies.head()

Unnamed: 0.1,Unnamed: 0,movieId,title,genres,movie_vector
0,0,1,Toy Story (1995),"['Adventure', 'Animation', 'Children', 'Comedy...",[1. 1. 1. 1. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. ...
1,1,2,Jumanji (1995),"['Adventure', 'Children', 'Fantasy']",[1. 0. 1. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. ...
2,2,3,Grumpier Old Men (1995),"['Comedy', 'Romance']",[0. 0. 0. 1. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. ...
3,3,4,Waiting to Exhale (1995),"['Comedy', 'Drama', 'Romance']",[0. 0. 0. 1. 0. 1. 1. 0. 0. 0. 0. 0. 0. 0. 0. ...
4,4,5,Father of the Bride Part II (1995),['Comedy'],[0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. ...


In [None]:
tf_new = TfidfVectorizer(analyzer='word', ngram_range=(1, 2), min_df=0, stop_words='english')
tfidf_matrix_new = tf_new.fit_transform(movies['genres'])

In [None]:
cosine_sim_new = linear_kernel(tfidf_matrix_new, tfidf_matrix_new)

In [None]:
movies = movies.reset_index()
titles = movies['title']
indices = pd.Series(movies.index, index=movies['title'])
indices.head(2)

title
Toy Story (1995)    0
Jumanji (1995)      1
dtype: int64

In [None]:
def get_recommendations_new(title):
    idx = indices[title]
    if type(idx) != np.int64:
        if len(idx)>1:
            print("ALERT: Multiple values")
            idx = idx[0]
    sim_scores = list(enumerate(cosine_sim_new[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:11]
    movie_indices = [i[0] for i in sim_scores]
    return movies['movieId'].iloc[movie_indices]

In [None]:
def genre_based_on_popularity(genre):
    mask = movies.genres.apply(lambda x: genre in x)
    filtered_movie = movies[mask]
    filtered_movie = filtered_movie.sort_values(by='popularity', ascending=False)
    return filtered_movie['movieId'].head(10).values.tolist()


In [None]:
user_info = pd.read_csv('user_info.csv')

In [None]:
user_info['user_vector'] = user_info['user_vector'].apply(lambda x: x.replace('[', ' ').replace(']', ' ').strip().split())
user_info['user_vector'] = user_info['user_vector'].apply(lambda x: np.asarray(x).astype(float))

In [None]:
def user_top_genre(userId):
    user_vec = user_info['user_vector'][user_info['userId'] == userId].values[0].copy()
    print("User Vector: ", user_vec)
    top_genre_indices = np.flip(np.argsort(user_vec))
    genre_list = []
    for i in top_genre_indices[:3]:
        genre_list.append(idx_to_genre[i])
    return genre_list

In [None]:
user_top_genre(1)

User Vector:  [4.36764706 4.76       4.6        4.3030303  4.29411765 4.26315789
 4.53571429 4.34782609 4.36585366 4.125      3.35714286 4.17647059
 4.25925926 4.73333333 4.75       0.         0.         4.5
 5.         0.        ]


['Film-Noir', 'Animation', 'Musical']

In [None]:
knn_baseline = dump.load('KnnBaseline_model')
svdpp = dump.load('svd_model')

In [None]:
user_list = testdf['userId'].unique()

In [None]:
test_movies = testdf[testdf['userId'] == 60]
test_movies.head()

Unnamed: 0,userId,movieId,rating,timestamp,genres,tag
1745,60,48,3.0,1393541734,"['Animation', 'Children', 'Drama', 'Musical', ...",[]
1746,60,527,5.0,1393542064,"['Drama', 'War']",[]
1747,60,1242,4.0,1393541757,"['Drama', 'War']",[]
1748,60,2067,3.0,1393541982,"['Drama', 'Romance', 'War']",[]


In [None]:
#Hybrid Model

def hybrid(userId):
    user_movies = testdf[testdf['userId'] == userId]
    user_movies['est'] = user_movies['movieId'].apply(lambda x: 0.6*knnbaseline_algo.predict(userId,x).est + 0.4*svdpp_algo.predict(userId, x).est)
    user_movies = user_movies.sort_values(by ='est', ascending=False).head(4)
    user_movies['Model'] = 'SVD + CF'

    recommend_list = user_movies[['movieId', 'est', 'Model']]
    print(recommend_list.head())

    movie_list = recommend_list['movieId'].values.tolist()
    print(movie_list)
    sim_movies_list = []
    for movie_id in movie_list:
        movie_title = movies['title'][movies['movieId'] == movie_id].values[0]
        sim_movies = get_recommendations_new(movie_title)
        sim_movies_list.extend(sim_movies)


    # Ratings for the popular movies are computed
    for movie_id in sim_movies_list:
        pred_rating = 0.6*knnbaseline_algo.predict(userId, movie_id).est + 0.4*svdpp_algo.predict(userId, movie_id).est
        row_df = pd.DataFrame([[movie_id, pred_rating, 'Movie similarity']], columns=['movieId', 'est','Model'])
        recommend_list = pd.concat([recommend_list, row_df], ignore_index=True)

    return recommend_list

In [None]:
traindf[traindf['userId'] == 524].sort_values(by = 'rating', ascending = False)

Unnamed: 0,userId,movieId,rating,timestamp,genres,tag
66067,524,1200,5.0,851609623,"['Action', 'Adventure', 'Horror', 'Sci-Fi']",[]
66072,524,1233,5.0,851609788,"['Action', 'Drama', 'War']",[]
66035,524,589,5.0,851608875,"['Action', 'Sci-Fi']",[]
66021,524,457,5.0,851608781,['Thriller'],[]
66019,524,377,5.0,851608745,"['Action', 'Romance', 'Thriller']",[]
...,...,...,...,...,...,...
66007,524,173,1.0,851609191,"['Action', 'Crime', 'Sci-Fi']",[]
66063,524,1193,1.0,851609665,['Drama'],[]
66010,524,208,1.0,851609297,"['Action', 'Adventure', 'Sci-Fi']",[]
66030,524,544,1.0,851609066,"['Action', 'Crime']",[]


In [None]:
testdf[testdf['userId'] == 574]

Unnamed: 0,userId,movieId,rating,timestamp,genres,tag
17795,574,150,4.0,834634383,"['Adventure', 'Drama', 'IMAX']",[]
17796,574,161,4.0,834634464,"['Drama', 'Thriller', 'War']",[]
17797,574,300,3.0,834634504,['Drama'],[]
17798,574,344,5.0,834634408,['Comedy'],[]
17799,574,595,4.0,834634443,"['Animation', 'Children', 'Fantasy', 'Musical'...",[]


In [None]:
movie_ids = hybrid(1)

    movieId       est     Model
22     2028  4.971498  SVD + CF
1       260  4.969774  SVD + CF
11     1196  4.966499  SVD + CF
12     1197  4.963478  SVD + CF
[2028, 260, 1196, 1197]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  user_movies['est'] = user_movies['movieId'].apply(lambda x: 0.6*knnbaseline_algo.predict(userId,x).est + 0.4*svdpp_algo.predict(userId, x).est)


In [None]:
def get_title(x):
    mid = x['movieId']
    return movies['title'][movies['movieId'] == mid].values

In [None]:
def get_genre(x):
    mid = x['movieId']
    return movies['genres'][movies['movieId'] == mid].values

In [None]:
movie_ids['title'] = movie_ids.apply(get_title, axis=1)
movie_ids['genre'] = movie_ids.apply(get_genre, axis=1)

In [None]:
movie_ids.sort_values(by='est', ascending = False).head(10)

Unnamed: 0,movieId,est,Model,title,genre
5,1233,5.0,Movie similarity,"[Boot, Das (Boat, The) (1981)]","[['Action', 'Drama', 'War']]"
0,2028,4.971498,SVD + CF,[Saving Private Ryan (1998)],"[['Action', 'Drama', 'War']]"
7,2028,4.971498,Movie similarity,[Saving Private Ryan (1998)],"[['Action', 'Drama', 'War']]"
24,260,4.969774,Movie similarity,[Star Wars: Episode IV - A New Hope (1977)],"[['Action', 'Adventure', 'Sci-Fi']]"
1,260,4.969774,SVD + CF,[Star Wars: Episode IV - A New Hope (1977)],"[['Action', 'Adventure', 'Sci-Fi']]"
14,260,4.969774,Movie similarity,[Star Wars: Episode IV - A New Hope (1977)],"[['Action', 'Adventure', 'Sci-Fi']]"
28,1210,4.967985,Movie similarity,[Star Wars: Episode VI - Return of the Jedi (1...,"[['Action', 'Adventure', 'Sci-Fi']]"
18,1210,4.967985,Movie similarity,[Star Wars: Episode VI - Return of the Jedi (1...,"[['Action', 'Adventure', 'Sci-Fi']]"
17,1196,4.966499,Movie similarity,[Star Wars: Episode V - The Empire Strikes Bac...,"[['Action', 'Adventure', 'Sci-Fi']]"
2,1196,4.966499,SVD + CF,[Star Wars: Episode V - The Empire Strikes Bac...,"[['Action', 'Adventure', 'Sci-Fi']]"
