In [17]:
import pandas as pd
import numpy as np
from tqdm.notebook import tqdm
from surprise import Dataset
from surprise import Reader
from surprise import SVD
from surprise import accuracy
from surprise import KNNBasic
from surprise import NormalPredictor
from surprise.model_selection import train_test_split
from surprise.model_selection import LeaveOneOut
from tabulate import tabulate
from sklearn.metrics.pairwise import cosine_similarity

In [8]:
ratings_df = pd.read_csv("merged_dataset.csv")
ratings_df.drop(columns=["Unnamed: 0"], inplace=True)
ratings_df.head(1)

  ratings_df = pd.read_csv("merged_dataset.csv")


Unnamed: 0,uid,iid,rating,timestamp,movie_name,date,url,5,6,7,...,18,19,20,21,22,23,age,gender,occupation,zip
0,196,242,3,881250949,Kolya (1996),24-Jan-1997,http://us.imdb.com/M/title-exact?Kolya%20(1996),,,,...,,,,,,,49,M,writer,55105


In [10]:
reader = Reader(rating_scale=(1, 5))
data = Dataset.load_from_df(ratings_df[['uid', 'iid', 'rating']], reader)
trainset, testset = train_test_split(data, test_size=.25)

LOOCV = LeaveOneOut(n_splits=1, random_state=1)

train_loocv, test_loocv = list(LOOCV.split(data))[0]

In [42]:
from collections import defaultdict

def GetTopN(predictions, n=10, minimumRating=4.0):
  topN = defaultdict(list)

  for userID, movieID, actualRating, estimatedRating, _ in predictions:
    if (estimatedRating >= minimumRating):
        topN[userID].append((movieID, estimatedRating))

  for userID, ratings in topN.items():
    ratings.sort(key=lambda x: x[1], reverse=True)
    topN[userID] = ratings[:n]

  return topN

def HitRate(topNPredicted, leftOutPredictions):
  hits = 0
  total = 0

  # For each left-out rating
  for leftOut in leftOutPredictions:
    userID = leftOut[0]
    leftOutMovieID = leftOut[1]
    # Is it in the predicted top 10 for this user?
    hit = False
    for movieID, predictedRating in topNPredicted[userID]:
      if leftOutMovieID == movieID:
        hit = True
        break
    if (hit) :
      hits += 1

    total += 1

  # Compute overall precision
  return hits/total

def get_hitrate_results(algo, train_loocv, test_loocv):
  algo.fit(train_loocv)
  left_out_predictions = algo.test(test_loocv)
  loocv_anti_testset = train_loocv.build_anti_testset()
  all_predictions = algo.test(loocv_anti_testset)
  top_n_predicted = GetTopN(all_predictions)
  hitrate = HitRate(top_n_predicted, left_out_predictions)
  print(f'HitRate: {hitrate}')
  return all_predictions

def get_algo_results(algo, trainset, testset):
  algo.fit(trainset)
  predictions = algo.test(testset)
  accuracy.rmse(predictions)


def get_most_similar_movies(movies_df, movie_embeddings, trainset, target_movie_id, top_k=10):
    inner_movie_id = trainset.to_inner_iid(target_movie_id)
    sims = cosine_similarity(movie_embeddings, movie_embeddings)
    target_movie_sims_sorted = [trainset.to_raw_iid(x) for x in np.argsort(sims[inner_movie_id])[::-1]]
    most_similar_movies = movies_df.loc[target_movie_sims_sorted].iloc[:top_k]
    return most_similar_movies


def filter_predictions_for_user(predictions, user_id, movies_df, top_k=10):
    top_preds = sorted([pred for pred in predictions if pred.uid == user_id], key=lambda pred: pred.est, reverse=True)[:top_k]
    movie_ids = [pred.iid for pred in top_preds]
    relevant_movies = movies_df.loc[movie_ids]
    relevant_movies['rating'] = [pred.est for pred in top_preds]
    return relevant_movies


def get_algorithm_report(algo_class, trainset, testset, train_loocv, test_loocv, movies_df, target_movie_id=1, target_user_id=1, top_k=10, algo_args=[], algo_kwargs={}, calc_most_similar=True):
    algo_inst = algo_class(*algo_args, **algo_kwargs)
    get_algo_results(algo_inst, trainset, testset)
    algo_inst_for_hitrate = algo_class(*algo_args, **algo_kwargs)
    all_predictions = get_hitrate_results(algo_inst_for_hitrate, train_loocv, test_loocv)
    if calc_most_similar:
        if hasattr(algo_inst_for_hitrate, 'qi'):
            sims = algo_inst_for_hitrate.qi
        else:
            sims = algo_inst_for_hitrate.sim
        most_similar_movies = get_most_similar_movies(movies_df, sims, train_loocv, target_movie_id, top_k=top_k)
        print(most_similar_movies.head(top_k))
        print(f'Most similar movies to {movies_df.loc[target_movie_id].movie_name}:')
        print(tabulate(most_similar_movies.head(top_k)[['movie_name'] + [i for i in range (5,24)]], headers='keys'))
    predictions_for_user = filter_predictions_for_user(all_predictions, target_user_id, movies_df)
    print(f'Top predictions for user {target_user_id}:')
    print(tabulate(predictions_for_user.head(top_k)))
    return predictions_for_user.head(top_k)

class SVDWithTqdm(SVD):
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

    def test(self, testset, verbose=False):
        # The ratings are translated back to their original scale.
        predictions = [self.predict(uid,
                                    iid,
                                    r_ui_trans,
                                    verbose=verbose)
                        for (uid, iid, r_ui_trans) in tqdm(testset, desc='making predictions')]
        return predictions


class KNNBasicWithTqdm(KNNBasic):
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

    def test(self, testset, verbose=False):
        # The ratings are translated back to their original scale.
        predictions = [self.predict(uid,
                                    iid,
                                    r_ui_trans,
                                    verbose=verbose)
                        for (uid, iid, r_ui_trans) in tqdm(testset, desc='making predictions')]
        return predictions



## Random recommender system

In [21]:
normal_predictor = NormalPredictor()
get_algo_results(normal_predictor, trainset, testset)
get_hitrate_results(normal_predictor, train_loocv, test_loocv)    

RMSE: 1.5146
HitRate: 0.03499469777306469


[Prediction(uid=196, iid=302, r_ui=3.5288268370736042, est=3.6555083136589235, details={'was_impossible': False}),
 Prediction(uid=196, iid=222, r_ui=3.5288268370736042, est=3.408075033344083, details={'was_impossible': False}),
 Prediction(uid=196, iid=1, r_ui=3.5288268370736042, est=3.6753720847776497, details={'was_impossible': False}),
 Prediction(uid=196, iid=546, r_ui=3.5288268370736042, est=5, details={'was_impossible': False}),
 Prediction(uid=196, iid=277, r_ui=3.5288268370736042, est=3.340847013557001, details={'was_impossible': False}),
 Prediction(uid=196, iid=246, r_ui=3.5288268370736042, est=3.163226081105145, details={'was_impossible': False}),
 Prediction(uid=196, iid=979, r_ui=3.5288268370736042, est=2.3944864194269524, details={'was_impossible': False}),
 Prediction(uid=196, iid=1137, r_ui=3.5288268370736042, est=2.3510191811406145, details={'was_impossible': False}),
 Prediction(uid=196, iid=100, r_ui=3.5288268370736042, est=3.512894623112314, details={'was_impossibl

## Recommending movies with collaborative filtering

In [43]:
algo_kwargs = dict(k=50, sim_options={'name': 'pearson', 'user_based': True, 'verbose' : True})
svg_results = get_algorithm_report(SVDWithTqdm, trainset, testset, train_loocv, test_loocv, 
                                   ratings_df, target_movie_id=242, target_user_id=196, top_k=10, 
                                   calc_most_similar=False)

making predictions:   0%|          | 0/25000 [00:00<?, ?it/s]

RMSE: 0.9375


making predictions:   0%|          | 0/943 [00:00<?, ?it/s]

making predictions:   0%|          | 0/1487069 [00:00<?, ?it/s]

HitRate: 0.03711558854718982
Top predictions for user 196:
---  ---  ---  -------  ---------  -------------------------------------------------------------------  -----------  ----------------------------------------------------------------------------------------------------------------------------  ---  ------  ---------  ---  ---  ------  ---  ---  -----  ---  ---  ---  ---  -------  -------  ------  --------  ---  ---  --  -  ---------  -----
178  226  270  4.68621  883888639  Gattaca (1997)                                                       01-Jan-1997  http://us.imdb.com/M/title-exact?Gattaca+(1997)                                                                               nan  nan     nan        nan  nan  nan     nan  nan  Drama  nan  nan  nan  nan  nan      nan      Sci-Fi  Thriller  nan  nan  28  M  student    92103
496  271  405  4.63334  885848179  Mission: Impossible (1996)                                           22-May-1996  http://us.imdb.com/M/title-exact?Mission

In [94]:
algo_kwargs = dict(k=50, sim_options={'name': 'pearson', 'user_based': True, 'verbose' : True})
knn_results = get_algorithm_report(KNNBasicWithTqdm, trainset, testset, train_loocv, test_loocv,
                                   ratings_df, target_movie_id=242, target_user_id=196, top_k=10,
                                   calc_most_similar=False)

Computing the msd similarity matrix...
Done computing similarity matrix.


making predictions:   0%|          | 0/25000 [00:00<?, ?it/s]

RMSE: 0.9811
Computing the msd similarity matrix...
Done computing similarity matrix.


making predictions:   0%|          | 0/943 [00:00<?, ?it/s]

making predictions:   0%|          | 0/1487069 [00:00<?, ?it/s]

HitRate: 0.0
Top predictions for user 196:
----  ---  ---  -  ---------  ------------------------------  -----------  -------------------------------------------------------------------------  ---  ------  ---------  ---  ---  ------  ---  ---  -----  ---  ---  ---  ---  ---  ---  ------  --------  ---  ---  --  -  ----------  -----
1189  354  174  5  891218068  Raiders of the Lost Ark (1981)  01-Jan-1981  http://us.imdb.com/M/title-exact?Raiders%20of%20the%20Lost%20Ark%20(1981)  nan  Action  Adventure  nan  nan  nan     nan  nan  nan    nan  nan  nan  nan  nan  nan  nan     nan       nan  nan  29  F  librarian   48197
1201  354   79  5  891217274  Fugitive, The (1993)            01-Jan-1993  http://us.imdb.com/M/title-exact?Fugitive,%20The%20(1993)                  nan  Action  nan        nan  nan  nan     nan  nan  nan    nan  nan  nan  nan  nan  nan  nan     Thriller  nan  nan  29  F  librarian   48197
 814  201    7  5  884112201  Twelve Monkeys (1995)           01-Jan-1995  http:/

In [87]:
def combine_genres(df):
    for i in range(5,24):
        df[str(i)] = pd.Series(df[str(i)], dtype="string")
        df = df.fillna('') 
    df['genres'] = df.loc[:, [str(i) for i in range(5,24)]].agg(' '.join, axis=1)
    return df

In [92]:
print("Initial movie:")
init_movie = combine_genres(ratings_df[ratings_df.loc[:,"iid"] == 242][ratings_df.loc[:,"uid"] == 196])
init_movie.head().loc[:, ["movie_name", "rating", "genres"]]

Initial movie:


  init_movie = combine_genres(ratings_df[ratings_df.loc[:,"iid"] == 242][ratings_df.loc[:,"uid"] == 196])


Unnamed: 0,movie_name,rating,genres
0,Kolya (1996),3,Comedy


In [96]:
print("SVG recommendations:")

rec_movies = combine_genres(svg_results)
rec_movies.head(10).loc[:, ["movie_name", "rating", "genres"]]

SVG recommendations:


Unnamed: 0,movie_name,rating,genres
178,Gattaca (1997),4.686208,Drama Sci-Fi Thriller
496,Mission: Impossible (1996),4.633341,Action Adventure Mystery
134,"Silence of the Lambs, The (1991)",4.53045,Drama Thriller
657,"Piano, The (1993)",4.508115,Drama Romance
648,"Englishman Who Went Up a Hill, But Came Down a...",4.504572,Comedy Romance
98,Tin Cup (1996),4.500843,Comedy Romance
197,Contact (1997),4.499752,Drama Sci-Fi
483,"Quiet Man, The (1952)",4.49104,Comedy Romance
480,It's a Wonderful Life (1946),4.485508,Drama
513,Mars Attacks! (1996),4.481071,Action Comedy Sci-Fi War


In [97]:
print("KNN recommendations:")
rec_movies = combine_genres(knn_results)
rec_movies.head(10).loc[:, ["movie_name", "rating", "genres"]]

KNN recommendations:


Unnamed: 0,movie_name,rating,genres
1189,Raiders of the Lost Ark (1981),5,Action Adventure
1201,"Fugitive, The (1993)",5,Action Thriller
814,Twelve Monkeys (1995),5,Drama Sci-Fi
1500,Get Shorty (1995),5,Action Comedy Drama
1653,"Clockwork Orange, A (1971)",5,Sci-Fi
1467,Air Force One (1997),5,Action Thriller
1599,"Fifth Element, The (1997)",5,Action Sci-Fi
1536,"Rock, The (1996)",5,Action Adventure Thriller
1122,"Postman, The (1997)",5,Drama
1450,Ulee's Gold (1997),5,Drama
