In [146]:
# recommender function
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import MinMaxScaler
from surprise import SVD, KNNBaseline, KNNBasic, KNNWithMeans, KNNWithZScore, NMF, NormalPredictor, BaselineOnly, CoClustering
from surprise import Dataset
from surprise.model_selection import cross_validate
from surprise import Reader
pd.options.mode.chained_assignment = None  # default='warn'

In [147]:
movies_original = pd.read_csv('movies.csv')
ratings_original = pd.read_csv('ratings.csv')
links_original = pd.read_csv('links.csv')

In [148]:
reader = Reader()
reader = Reader(rating_scale=(0, 5))
svd = SVD()
KNNBaseline = KNNBaseline()
KNNBasic = KNNBasic()
KNNWithMeans = KNNWithMeans()
KNNWithZScore = KNNWithZScore()
movies = movies_original.copy()
ratings = ratings_original.copy()


In [149]:
def user_based_recommender(user_id, n):
        movies = movies_original.copy()
        ratings = ratings_original.copy()
        
        if len(ratings.loc[ratings['userId']== user_id]):
            # user based approach using cosine similarity
           
            # user item matrix
            users_crosstab_original = ratings.pivot_table(
    index='userId', columns='movieId', values='rating')
            users_crosstab_original.fillna(0, inplace=True)
            users_crosstab = users_crosstab_original.copy()
            # user similarity matrix
            user_similarity = cosine_similarity(users_crosstab)
            # turn similarities to weights
            user_similarity = MinMaxScaler().fit_transform(user_similarity)
            # estimate the missing ratings
            user_predicted_ratings = np.dot(user_similarity, users_crosstab)
            # turn the predicted ratings to a dataframe
            user_predicted_ratings = pd.DataFrame(
                user_predicted_ratings, index=users_crosstab.index, columns=users_crosstab.columns)
            # get the movies that the user has already rated
            user_rated = users_crosstab.loc[user_id, :]
            # get the movies that the user has not rated
            user_not_rated = user_rated[user_rated == 0]
            # get the movies that the user has not rated and sort them by the predicted ratings
            user_predicted_ratings = user_predicted_ratings.loc[user_id, user_not_rated.index]
            user_predicted_ratings = user_predicted_ratings.sort_values(    ascending=False)
            # get the top n movies
            user_predicted_ratings = user_predicted_ratings.head(n)
            # get movie name
            movieslist = movies.loc[movies['movieId'].isin(user_predicted_ratings.index), ['movieId', 'title']]
            
            # similarity = pd.DataFrame(user_similarity, index=users_crosstab.index, columns=users_crosstab.index)
            return movieslist
            
        else:
            
            return "User not found"

In [173]:
def user_base_recommender_using_KNNBaseline(user_id, n):

    # get the ratings that the user has not rated
    ratings_not_rated_by_user = ratings.loc[ratings['userId'] != user_id]

    data = Dataset.load_from_df(
        ratings_not_rated_by_user[['userId', 'movieId', 'rating']], reader)
    # cross_validate(KNNBaseline, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)
    trainset = data.build_full_trainset()
    KNNBaseline.fit(trainset)
    # sort the movies upon the highest accuracy of prediction
    ratings_not_rated_by_user['est'] = ratings_not_rated_by_user.apply(
        lambda x: KNNBaseline.predict(x['userId'], x['movieId']).est, axis=1)
    ratings_not_rated_by_user = ratings_not_rated_by_user.sort_values(
        by='est', ascending=False)
    # get the top n movies
    return movies_original.loc[movies_original['movieId'].isin(ratings_not_rated_by_user.head(n)['movieId']), ['movieId', 'title']]


In [174]:
def user_base_recommender_using_svd(user_id, n):
    # find best movie according to svd
    movies = movies_original.copy()
    ratings = ratings_original.copy()
    # get the ratings that the user has not rated
    ratings_not_rated_by_user = ratings.loc[ratings['userId'] != user_id]
    
    data = Dataset.load_from_df(
        ratings_not_rated_by_user[['userId', 'movieId', 'rating']], reader)
        # cross_validate(svd, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)
    trainset = data.build_full_trainset()
    svd.fit(trainset)
    # sort the movies upon the highest accuracy of prediction 
    ratings_not_rated_by_user['est'] = ratings_not_rated_by_user.apply(
        lambda x: svd.predict(x['userId'], x['movieId']).est, axis=1)
    ratings_not_rated_by_user = ratings_not_rated_by_user.sort_values(
        by='est', ascending=False)
    # get the top n movies
    return movies_original.loc[movies_original['movieId'].isin(ratings_not_rated_by_user.head(n)['movieId']), ['movieId', 'title']]
    

In [175]:
user_base_recommender_using_svd(610, 5)

Unnamed: 0,movieId,title
520,608,Fargo (1996)
659,858,"Godfather, The (1972)"
704,922,Sunset Blvd. (a.k.a. Sunset Boulevard) (1950)
1883,2502,Office Space (1999)
7039,68954,Up (2009)


In [176]:
user_base_recommender_using_KNNBaseline(610, 5)


Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.


Unnamed: 0,movieId,title
1298,1732,"Big Lebowski, The (1998)"
1917,2542,"Lock, Stock & Two Smoking Barrels (1998)"
1939,2571,"Matrix, The (1999)"
2226,2959,Fight Club (1999)
7828,93022,Miss Nobody (2010)


In [153]:
user_based_recommender(610, 5)

Unnamed: 0,movieId,title
322,364,"Lion King, The (1994)"
896,1193,One Flew Over the Cuckoo's Nest (1975)
1183,1580,Men in Black (a.k.a. MIB) (1997)
1284,1704,Good Will Hunting (1997)
1734,2329,American History X (1998)


In [154]:
# user based recommendation evaluation using surprise
# rating scale is choose upon the ratings for the movies 0 -5
# the reader is used to specify the rating scale of the data
reader = Reader(rating_scale=(0, 5))
# It is important to note that, the method expects the 'rating' column to be the last argument passed to it, so you should always pass the rating column as the last argument.
data = Dataset.load_from_df(
    ratings_original[['userId', 'movieId', 'rating']], reader)

# Use the famous Singular Value Decomposition (SVD) algorithm. This is a collaborative filtering algorithm, which will be used to make recommendations.
algo_svd = SVD()
# use other algorithms
algo_knn = KNNBasic()
algo_knnwithz = KNNWithZScore()
algo_knnwithmeans = KNNWithMeans()
algo_knnbaseline = KNNBaseline()

# Run 5-fold cross-validation and print results.
cross_validate(algo_svd, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)
# recommend movies for a user
algo_svd.fit(data.build_full_trainset())
algo_knn.fit(data.build_full_trainset())
algo_knnwithz.fit(data.build_full_trainset())
algo_knnwithmeans.fit(data.build_full_trainset())
algo_knnbaseline.fit(data.build_full_trainset())

algo_svd.predict(1, 302).est


#  if the est value is closer to 5, it means that the algorithm predicts that the user would rate the movie highly, whereas if the est value is closer to 1, it means that the algorithm predicts that the user would rate the movie poorly


Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.8753  0.8753  0.8687  0.8708  0.8769  0.8734  0.0031  
MAE (testset)     0.6735  0.6718  0.6698  0.6701  0.6720  0.6714  0.0014  
Fit time          1.65    1.43    1.82    1.67    1.64    1.64    0.12    
Test time         0.24    0.48    0.30    0.24    0.36    0.32    0.09    
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.


4.271535335111293

In [155]:
# function that compares the algorithms and choose the best for the user


def recommend_movies_surprise(user_id, n_movies):
    # compare the algorithms
    top_movies = []
    # choose the best five movies among the top 40
    for movie in n_movies:
        top_movies.append(algo_svd.predict(user_id, movie).est)
        top_movies.append(algo_knn.predict(user_id, movie).est)
        top_movies.append(algo_knnwithz.predict(user_id, movie).est)
        top_movies.append(algo_knnwithmeans.predict(user_id, movie).est)
        top_movies.append(algo_knnbaseline.predict(user_id, movie).est)

    return top_movies


    # choose the most accurate movieId
recommend_movies_surprise(610, top40.movieId.tolist())


[3.556089640584532,
 3.665123623882493,
 3.6312215262555068,
 3.621201470066743,
 3.6925560962627006,
 4.157006844949859,
 3.7557695115122924,
 3.7174617956520475,
 3.7448785412496854,
 3.80758871087271,
 3.7465101621602552,
 3.9641571272223453,
 3.7653469829692154,
 3.8035680734768036,
 3.939101344578736,
 3.802560788060825,
 3.8346325609808534,
 3.76918474032462,
 3.738259547360352,
 3.801693843163667,
 4.129206674846923,
 3.8897735064722543,
 3.859868873510169,
 3.8700274368811747,
 3.947475854143009,
 3.5966599583264798,
 3.5010618802472995,
 3.4446564349830076,
 3.422163477354076,
 3.505295777260116,
 3.9476334729625346,
 4.197465552202121,
 4.151171263103374,
 4.085703831900589,
 4.18873214743467,
 3.5444869964862136,
 3.63628787527841,
 3.5463648568289625,
 3.554025551276136,
 3.6227342727983776,
 3.987513410207303,
 4.138374544787008,
 4.078274930552186,
 4.025751603497499,
 4.1230751183003544,
 3.7821595366686376,
 3.8831259849995723,
 3.687847226442862,
 3.7137284178999503,
 