In [2]:
import pandas as pd
import numpy as np
from surprise import Reader
from surprise import accuracy
from sklearn.decomposition import NMF
from surprise.prediction_algorithms.random_pred import NormalPredictor
import seaborn as sns
from surprise.model_selection.split import train_test_split
from surprise import SVD
from surprise import Dataset
from surprise.model_selection import GridSearchCV
from surprise import NormalPredictor
from surprise import KNNBasic
from surprise.model_selection import cross_validate
import matplotlib.pyplot as plt
from sklearn.metrics.pairwise import cosine_similarity


In [3]:
# Load the raw MovieLens tables. The first CSV column is kept as a regular column
# (no index_col / reset_index round trip) so later merges and pivots can use it by name.
movies = pd.read_csv("movies.csv")
ratings = pd.read_csv("ratings.csv")

# Preview the movies table with missing values shown as 0. `fillna` returns a new
# frame, so this only affects the displayed output: `movies` and `ratings` are left
# unchanged (a 0 rating would fall outside the 0.5-5 scale used by the Reader later).
movies.fillna(0)

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy
...,...,...,...
9737,193581,Black Butler: Book of the Atlantic (2017),Action|Animation|Comedy|Fantasy
9738,193583,No Game No Life: Zero (2017),Animation|Comedy|Fantasy
9739,193585,Flint (2017),Drama
9740,193587,Bungo Stray Dogs: Dead Apple (2018),Action|Animation


In [13]:
from sklearn.metrics.pairwise import cosine_similarity

def predict_new_user_ratings(movies, ratings, new_ratings):
    """
    Score every movie for a new user with user-based collaborative filtering.

    The new user's ratings are added to the existing ratings under a fresh user ID.
    Each user's ratings are then standardised (mean 0, std 1 across the movies that
    user rated), and every movie receives the similarity-weighted average of all
    users' standardised ratings, weighted by cosine similarity to the new user.

    Parameters:
    movies (pd.DataFrame): DataFrame containing movie data, with columns "movieId", "title", and "genres".
    ratings (pd.DataFrame): DataFrame containing movie ratings data, with columns "userId", "movieId", and "rating".
    new_ratings (list): List of tuples representing the new user's ratings, with each tuple containing the movie ID and the rating.

    Returns:
    pd.DataFrame: DataFrame with columns "title" and "predicted_rating", sorted by
        "predicted_rating" in descending order. "predicted_rating" is a standardised
        score (it can be negative), not a value on the original rating scale.

    Raises:
    ValueError: If new_ratings is empty, because there is then no new user to score.
    """
    # Convert new_ratings to a DataFrame
    new_user_ratings = pd.DataFrame(new_ratings, columns=["movieId", "rating"])
    if new_user_ratings.empty:
        # Without this guard the "last row" of the pivot would be an existing user.
        raise ValueError("new_ratings must contain at least one (movieId, rating) pair")

    # Use a user ID that does not exist in the ratings DataFrame
    new_user_id = ratings["userId"].max() + 1
    new_user_ratings["userId"] = new_user_id

    # DataFrame.append was removed in pandas 2.0; pd.concat is the supported equivalent
    updated_ratings = pd.concat([ratings, new_user_ratings])

    # Pivot the ratings DataFrame to get a user-movie matrix
    pivot = updated_ratings.pivot(index="userId", columns="movieId", values="rating")
    # Standardise each user's row; unrated movies stay NaN and are zero-filled below
    normalized_pivot = pivot.apply(lambda x: (x - x.mean()) / x.std(), axis=1)
    filled = normalized_pivot.fillna(0)

    # Only the new user's similarities are needed, so compare that single row
    # against all users instead of building the full user-by-user matrix.
    new_user_pos = filled.index.get_loc(new_user_id)
    new_user_sim = cosine_similarity(filled.values[[new_user_pos]], filled.values)[0]

    # Similarity-weighted average of the standardised ratings for each movie
    weighted_ratings = np.dot(new_user_sim, filled.values) / np.sum(new_user_sim)
    predicted_ratings = pd.DataFrame({'movieId': normalized_pivot.columns, 'predicted_rating': weighted_ratings})

    # Merge predicted_ratings with movies DataFrame to get movie titles
    predicted_ratings = predicted_ratings.merge(movies[['movieId', 'title']], on='movieId')

    # Return predicted_ratings DataFrame sorted by predicted rating in descending order
    return predicted_ratings[['title', 'predicted_rating']].sort_values('predicted_rating', ascending=False)


# Example usage:
# Reload both tables straight from disk so the example starts from the raw data
movies = pd.read_csv('movies.csv')
ratings = pd.read_csv('ratings.csv')

# (movieId, rating) pairs supplied by the hypothetical new user
new_ratings = [
    (1, 4.5), (2, 3.0), (3, 2.5), (4, 4.0), (5, 3.5),
    (6, 5.0), (7, 4.0), (8, 2.0), (9, 4.5), (10, 5.0),
]

predictions = predict_new_user_ratings(movies, ratings, new_ratings)
print(predictions)


  updated_ratings = ratings.append(new_user_ratings)


                                                 title  predicted_rating
0                                     Toy Story (1995)          1.157230
9                                     GoldenEye (1995)          1.078134
5                                          Heat (1995)          0.995420
910  Star Wars: Episode VI - Return of the Jedi (1983)          0.401780
899  Raiders of the Lost Ark (Indiana Jones and the...          0.386580
..                                                 ...               ...
84                                 Broken Arrow (1996)         -0.294319
4                   Father of the Bride Part II (1995)         -0.351121
7                                  Tom and Huck (1995)         -0.516481
1                                       Jumanji (1995)         -0.657062
2                              Grumpier Old Men (1995)         -0.767475

[9724 rows x 2 columns]


In [41]:
# replace punctuation in tags (a space), movie name (a space), and genres (no space). These will eventually be folded into the tags list
# regex=True is passed explicitly: from pandas 2.0 `Series.str.replace` treats the
# pattern as a literal string by default, which would turn these calls into silent
# no-ops. Raw strings avoid Python's invalid-escape warning for \w and \s.
movies_merged['title'] = movies_merged['title'].str.replace(r'[^\w\s]', ' ', regex=True)
movies_merged['genres'] = movies_merged['genres'].str.replace(r'[^\w\s]', ' ', regex=True)
movies_merged['userId'] = movies_merged['userId'].astype(str).str.replace(r'[^\w\s]', '', regex=True)
# NOTE(review): this also strips the decimal point from ratings (e.g. "4.5" -> "45") — confirm intended
movies_merged['rating'] = movies_merged['rating'].astype(str).str.replace(r'[^\w\s]', '', regex=True)



  movies_merged['title'] = movies_merged['title'].str.replace('[^\w\s]',' ')
  movies_merged['genres'] = movies_merged['genres'].str.replace('[^\w\s]',' ')
  movies_merged['title'] = movies_merged['title'].str.replace('[^\w\s]',' ')
  movies_merged['userId'] = movies_merged['userId'].astype(str).str.replace('[^\w\s]','')
  movies_merged['rating'] = movies_merged['rating'].astype(str).str.replace('[^\w\s]','')


In [29]:
# Peek at the first rows of the movies table
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story 1995,Adventure Animation Children Comedy Fantasy
1,2,Jumanji 1995,Adventure Children Fantasy
2,3,Grumpier Old Men 1995,Comedy Romance
3,4,Waiting to Exhale 1995,Comedy Drama Romance
4,5,Father of the Bride Part II 1995,Comedy


In [43]:
# Sanity check: list (up to 200) rows whose title is missing.
# NOTE: set_option changes the display limit for every later cell as well.
pd.set_option('display.max_rows', None)
movies_merged.loc[movies_merged['title'].isna(), ['movieId','rating','title','genres']].head(200)

Unnamed: 0,movieId,rating,title,genres


In [None]:
# Preview only the first rows: display.max_rows was set to None above, so a bare
# `movies_merged` would render every row of the frame.
movies_merged.head()

In [6]:
# drop=True keeps this cell idempotent: without it every (re-)run would insert the
# old index as an extra "index" column in `ratings`.
ratings = ratings.reset_index(drop=True)


In [42]:

# Extract the year from the title column: the first four-digit number in parentheses.
# A raw string is used so that \( and \d reach the regex engine without triggering
# Python's invalid-escape-sequence warning.
movies['year'] = movies['title'].str.extract(r'\((\d{4})\)', expand=False)

# Print the first 10 rows of the DataFrame to check the result
print(movies.head(10))

   movieId                               title  \
0        1                    Toy Story (1995)   
1        2                      Jumanji (1995)   
2        3             Grumpier Old Men (1995)   
3        4            Waiting to Exhale (1995)   
4        5  Father of the Bride Part II (1995)   
5        6                         Heat (1995)   
6        7                      Sabrina (1995)   
7        8                 Tom and Huck (1995)   
8        9                 Sudden Death (1995)   
9       10                    GoldenEye (1995)   

                                        genres  year  
0  Adventure|Animation|Children|Comedy|Fantasy  1995  
1                   Adventure|Children|Fantasy  1995  
2                               Comedy|Romance  1995  
3                         Comedy|Drama|Romance  1995  
4                                       Comedy  1995  
5                        Action|Crime|Thriller  1995  
6                               Comedy|Romance  1995  
7        

## PreProcessing

In [5]:
# min_ratings = 2 
# min_ratings_count =  5 
# # Delete movie rated under 2 
# # Delete movie rated less than 5 times
# movies = movies.groupby("movieId").filter(lambda x: x['movieId'].count() >= min_ratings)
# ratings = ratings.groupby("userId").filter(lambda x: x['userId'].count() >= min_ratings_count)


In [6]:
# (rows, columns) of the ratings table
ratings.shape

(100836, 4)

In [7]:
# (rows, columns) of the movies table
movies.shape

(9742, 4)

In [8]:
#!pip install scikit-surprise

### Train on a whole trainset and the predict() method

In [9]:
# Ratings are declared to lie on a 0.5-5 scale
reader = Reader(rating_scale=(0.5, 5)) #line_format by default order of the fields

# Surprise Dataset Load method: columns must be passed in (user, item, rating) order
data = Dataset.load_from_df(ratings[["userId","movieId","rating"]], reader=reader)

# Build a trainset from the whole dataset (no hold-out split)
trainset = data.build_full_trainset()

# NOTE(review): this `testset` is reassigned in the "Predictions using function" cell
# before anything visible in the notebook reads it. Per the Surprise docs the
# anti-testset holds every (user, item) pair absent from the trainset, so it is
# presumably large and costly to build — confirm whether it is still needed.
testset = trainset.build_anti_testset()


In [10]:
# Display the Trainset object (shows only its repr, not the ratings)
trainset

<surprise.trainset.Trainset at 0x7f7f9902fe50>

### Tune algorithm parameters with GridSearchCV
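I use the SVD algorithm from the Surprise library to make predictions on the MovieLens 100K dataset. The planned tuning procedure is to define a grid of hyperparameters to search over and then use GridSearchCV to perform the search: GridSearchCV performs a 3-fold cross-validation for each combination of hyperparameters, computes the RMSE and MAE scores, and returns the best set of hyperparameters with the corresponding scores. Note that the cells below do not call GridSearchCV yet; they sweep a single hyperparameter, the number of latent factors `n_factors`, and score each value by its mean RMSE over a 5-fold `cross_validate`.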

In [11]:
 ## checking the effect of number of latent factors k on the model's performance.
def rmse_vs_factors(algorithm, data):
    """Sweep the number of latent factors and record the cross-validated RMSE.

    Args:
        algorithm: matrix-factorization algorithm class (e.g. SVD or NMF) that is
            instantiated as ``algorithm(n_factors=k)``.
        data: Surprise dataset (surprise.dataset.DatasetAutoFolds).

    Returns:
        list: 100 values; entry ``i`` is the mean test RMSE over 5 folds for
        ``n_factors = i + 1``.
    """
    def mean_cv_rmse(n_factors):
        # "test_rmse" holds one RMSE per fold; the mean summarizes this factor count
        fold_rmse = cross_validate(
            algorithm(n_factors = n_factors), data, measures=['RMSE'], cv=5, verbose=False
        )["test_rmse"]
        return fold_rmse.mean()

    return [mean_cv_rmse(n_factors) for n_factors in range(1, 101)]


In [12]:
#rmse_svd = rmse_vs_factors(SVD,data)
#rmse_svd

In [13]:
def plot_rmse(rmse, algorithm):
    """Plot mean cross-validated RMSE against the number of latent factors.

    Draws two stacked panels: the full curve (x axis 0-100) and a zoom on the first
    50 factors with a vertical line at the factor count that has the lowest RMSE.

    Args:
        rmse: sequence of mean RMSE values in which ``rmse[i]`` belongs to
            ``n_factors = i + 1``, as returned by ``rmse_vs_factors``.
        algorithm: algorithm name, used in the title and the x-axis label.
    """
    # rmse_vs_factors starts its sweep at k = 1, so shift the 0-based positions by
    # one; plotting against the raw index would draw every point, and the best-k
    # marker, one factor too low.
    n_factors = np.arange(1, len(rmse) + 1)
    best_k = n_factors[np.argmin(rmse)]

    plt.figure(num=None, figsize=(11, 5), dpi=80, facecolor='w', edgecolor='k')

    # Top panel: the whole sweep
    plt.subplot(2,1,1)
    plt.plot(n_factors, rmse)
    plt.xlim(0,100)
    plt.title("{0} Performance: RMSE Against Number of Factors".format(algorithm), size = 20 )
    plt.ylabel("Mean RMSE (cv=5)")

    # Bottom panel: zoom on the first 50 factors and mark the best one
    plt.subplot(2,1,2)
    plt.plot(n_factors, rmse)
    plt.xlim(0,50)
    plt.xticks(np.arange(0, 52, step=2))

    plt.xlabel("{0}(n_factor = k)".format(algorithm))
    plt.ylabel("Mean RMSE (cv=5)")
    plt.axvline(best_k, color = "b")


In [14]:
#plot_rmse(rmse_svd,"SVD")

In [15]:
from collections import defaultdict

### Predictions using function 

In [16]:
# Define the testset
# Define the testset
# NOTE(review): the test set is derived from the full trainset, i.e. (per Surprise's
# build_testset) the same ratings the model is fitted on below, so these predictions
# are in-sample and cover only movies each user has already rated — confirm intended.
testset = data.build_full_trainset().build_testset()
# Train the SVD model
svd = SVD(n_factors=100, n_epochs=20, lr_all=0.005, reg_all=0.4)
svd.fit(trainset)
# Generate the predictions using the SVD model
# NOTE: this rebinds `predictions`, which the earlier example-usage cell assigned to
# the DataFrame returned by predict_new_user_ratings.
predictions = svd.test(testset)


In [18]:
def get_top_n(predictions, userId, movies, ratings, n = 10):
    '''Return one user's rating history and their top-N predicted movies.

    Args:
        predictions: iterable of 5-item predictions ``(uid, iid, true_r, est, details)``,
            e.g. the list returned by a Surprise algorithm's ``test()``.
        userId: ID of the user to report on.
        movies (pd.DataFrame): movie metadata containing a "movieId" column.
        ratings (pd.DataFrame): ratings with "userId", "movieId" and "rating" columns.
        n (int): number of highest-estimate predictions to keep (default 10).

    Returns:
        tuple: ``(old_usr, pred_usr)`` where ``old_usr`` holds every rating the user
        has given (highest rating first) joined with the movie metadata, and
        ``pred_usr`` holds the user's ``n`` highest estimated ratings (columns
        "userId", "movieId", "rat_pred" plus the movie metadata), best first.

    Side effects:
        Prints how many movies the user has already rated.
    '''
    # Only the requested user's predictions are needed, so filter before sorting
    # instead of grouping and sorting the predictions of every user.
    user_preds = [(uid, iid, est) for uid, iid, _true_r, est, _details in predictions
                  if uid == userId]

    # sorted() is stable, so predictions with equal estimates keep their input order
    top_preds = sorted(user_preds, key = lambda pred: pred[2], reverse = True)[:n]

    # Tells how many movies the user has already rated
    user_history = ratings[ratings.userId == userId]
    print('User {0} has already rated {1} movies.'.format(userId, user_history.shape[0]))

    # Top-N recommended movies with (merged) titles and genres
    pred_usr = pd.DataFrame(top_preds, columns=["userId", "movieId", "rat_pred"]).merge(
        movies, how = 'left', on = 'movieId')

    # Full rating history, best-rated first, with (merged) titles and genres for comparison
    old_usr = user_history.sort_values("rating", ascending = False).merge(
        movies, how = 'left', on = 'movieId')

    return old_usr, pred_usr

In [20]:
# Rating history and top-10 (default n) SVD predictions for user 1
old_SVD, pred_SVD = get_top_n(predictions, movies = movies, userId = 1, ratings = ratings)

User 1 has already rated 232 movies.


In [21]:
# User 1's highest-rated movies (history is sorted by rating, descending)
old_SVD.head(15)

Unnamed: 0,userId,movieId,rating,timestamp,title,genres,year
0,1,5060,5.0,964984002,M*A*S*H (a.k.a. MASH) (1970),Comedy|Drama|War,1970
1,1,2872,5.0,964981680,Excalibur (1981),Adventure|Fantasy,1981
2,1,1291,5.0,964981909,Indiana Jones and the Last Crusade (1989),Action|Adventure,1989
3,1,1298,5.0,964984086,Pink Floyd: The Wall (1982),Drama|Musical,1982
4,1,2948,5.0,964982191,From Russia with Love (1963),Action|Adventure|Thriller,1963
5,1,2947,5.0,964982176,Goldfinger (1964),Action|Adventure|Thriller,1964
6,1,2944,5.0,964981872,"Dirty Dozen, The (1967)",Action|Drama|War,1967
7,1,2899,5.0,964982703,Gulliver's Travels (1939),Adventure|Animation|Children,1939
8,1,2858,5.0,964980868,American Beauty (1999),Drama|Romance,1999
9,1,2700,5.0,964980985,"South Park: Bigger, Longer and Uncut (1999)",Animation|Comedy|Musical,1999


In [None]:
# Top predicted movies for user 1, highest estimated rating first
pred_SVD

In [None]:
# plot_rmse expects the list of mean RMSE values produced by rmse_vs_factors, not the
# recommendations DataFrame `pred_SVD`.
# NOTE: the sweep cross-validates 100 factor settings with 5 folds each, so it is slow.
rmse_svd = rmse_vs_factors(SVD, data)
plot_rmse(rmse_svd, "SVD")