In [1]:
import pandas as pd
import numpy as np
import surprise
from surprise import Reader, Dataset, SVD, SlopeOne, KNNBasic, KNNWithMeans
from surprise.model_selection import cross_validate
from surprise import accuracy
from sklearn.model_selection import train_test_split
from collections import defaultdict

## EDA 

In [21]:
movie_df = pd.read_csv('/Users/darshan/Desktop/MDM_Project/data/movie.csv', sep=',')

In [22]:
movie_df.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [23]:
rating_df = df = pd.read_csv('/Users/darshan/Desktop/MDM_Project/rating.csv', sep=',', usecols=['userId', 'movieId', 'rating','timestamp'])
rating_df.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,2,3.5,2005-04-02 23:53:47
1,1,29,3.5,2005-04-02 23:31:16
2,1,32,3.5,2005-04-02 23:33:39
3,1,47,3.5,2005-04-02 23:32:07
4,1,50,3.5,2005-04-02 23:29:40


In [24]:
ratings_group = rating_df['rating'].value_counts().rename_axis('unique_ratings').reset_index(name='size')
print (ratings_group)

   unique_ratings     size
0             4.0  5561926
1             3.0  4291193
2             5.0  2898660
3             3.5  2200156
4             4.5  1534824
5             2.0  1430997
6             2.5   883398
7             1.0   680732
8             1.5   279252
9             0.5   239125


In [25]:
import plotly.graph_objs as go


# Create a bar graph using Plotly
fig = go.Figure(
    data=[go.Bar(x = ratings_group['unique_ratings'], y = ratings_group['size'], text=ratings_group['size'], textposition='auto')],
    layout=go.Layout(
        title='Movie Ratings Distribution',
        xaxis=dict(title='Ratings'),
        yaxis=dict(title='Count')
    )
)

# Set the text font and size for the bar labels
fig.update_traces(texttemplate='%{text:.2s}', textfont=dict(size=12))

# Show the graph
fig.show()

### Total Number of Unique Movies in the Dataset

In [26]:
rating_df['movieId'].nunique()

26744

### Total Number of Users who provided ratings in the Dataset

In [27]:
rating_df['userId'].nunique()

138493

In [28]:
top_15_most_rated = ratings.movieId.value_counts()[:10]
print('MovieID\tNo.of ratings')
top_15_most_rated

MovieID	No.of ratings


296     3320
356     3314
593     3080
318     3061
480     2979
260     2729
110     2667
589     2572
2571    2567
527     2475
Name: movieId, dtype: int64

In [29]:
top_15_most_rated = ratings.movieId.value_counts()[:10]
top_15_most_rated_df = pd.DataFrame({'movieId': top_15_most_rated.index, 'No.of ratings': top_15_most_rated.values})
print(top_15_most_rated_df)

   movieId  No.of ratings
0      296           3320
1      356           3314
2      593           3080
3      318           3061
4      480           2979
5      260           2729
6      110           2667
7      589           2572
8     2571           2567
9      527           2475


### Top 10 movies with highest number of ratings in the Dataset

In [30]:
top_10_movies = pd.merge(movie_df, top_15_most_rated_df, on="movieId")
top_10_movies.sort_values('No.of ratings', ascending=False)

Unnamed: 0,movieId,title,genres,No.of ratings
2,296,Pulp Fiction (1994),Comedy|Crime|Drama|Thriller,3320
4,356,Forrest Gump (1994),Comedy|Drama|Romance|War,3314
8,593,"Silence of the Lambs, The (1991)",Crime|Horror|Thriller,3080
3,318,"Shawshank Redemption, The (1994)",Crime|Drama,3061
5,480,Jurassic Park (1993),Action|Adventure|Sci-Fi|Thriller,2979
1,260,Star Wars: Episode IV - A New Hope (1977),Action|Adventure|Sci-Fi,2729
0,110,Braveheart (1995),Action|Drama|War,2667
7,589,Terminator 2: Judgment Day (1991),Action|Sci-Fi,2572
9,2571,"Matrix, The (1999)",Action|Sci-Fi|Thriller,2567
6,527,Schindler's List (1993),Drama|War,2475


In [31]:
import plotly.graph_objs as go


# Create a bar graph using Plotly
fig = go.Figure(
    data=[go.Bar(x = top_10_movies['title'], y = top_10_movies['No.of ratings'], text = top_10_movies['No.of ratings'], textposition='auto')],
    layout=go.Layout(
        title='Top 10 Movies with the Highest Number of User Ratings',
        xaxis=dict(title='Top 10 movies'),
        yaxis=dict(title='No of Ratings')
    )
)


# Set the text font and size for the bar labels
fig.update_traces(texttemplate='%{text:.2s}', textfont=dict(size=12))

# Show the graph
fig.show()

### Average user ratings for each movie in the dataset

In [32]:
mean_movie_ratings = rating_df.groupby('movieId').agg({'rating': [np.size, np.mean]})
mean_movie_ratings.head(25)

Unnamed: 0_level_0,rating,rating
Unnamed: 0_level_1,size,mean
movieId,Unnamed: 1_level_2,Unnamed: 2_level_2
1,49695,3.92124
2,22243,3.211977
3,12735,3.15104
4,2756,2.861393
5,12161,3.064592
6,23899,3.83493
7,12961,3.366484
8,1415,3.142049
9,3960,3.004924
10,29005,3.430029


### Top 15 movies : Average user ratings for movies with more than 50 user ratings in the dataset

In [33]:
avg_ratings_50 = mean_movie_ratings['rating']['size'] >= 50
movies_50_rating_avg = mean_movie_ratings[avg_ratings_50].sort_values([('rating', 'mean')], ascending=False)[:15]
movies_50_rating_avg

Unnamed: 0_level_0,rating,rating
Unnamed: 0_level_1,size,mean
movieId,Unnamed: 1_level_2,Unnamed: 2_level_2
318,63366,4.44699
858,41355,4.364732
50,47006,4.334372
527,50054,4.310175
1221,27398,4.275641
2019,11611,4.27418
904,17449,4.271334
7502,4305,4.263182
912,24349,4.258327
922,6525,4.256935


In [34]:
import plotly.express as px
import numpy as np

# group the ratings by user and calculate the mean rating
mean_ratings = ratings.groupby('userId')['rating'].mean()

# create a histogram of the mean ratings using plotly
fig = go.Figure(data=[go.Histogram(x=mean_ratings,
                                   nbinsx=10,
                                   text=np.histogram(mean_ratings, bins=10)[0],
                                   hovertemplate='Rating: %{x}<br>Number of Users: %{text}<extra></extra>')])

fig.update_layout(title='Mean Rating Given by Each User',
                  xaxis_title='Mean Rating',
                  yaxis_title='Number of Users')

# add a legend to the plot
fig.update_traces(name='Mean rating given by each user')

fig.show()

In [35]:
import plotly.express as px



fig = px.histogram(ratings.groupby('movieId').mean(), 
                   x='rating', 
                   nbins=20,
                   labels={'rating': 'Mean rating of each movie',
                           'count': 'Number of movies'})

fig.update_layout(title='Distribution of Mean Ratings for Movies',
                  xaxis_title='Mean Rating',
                  yaxis_title='Number of Movies')

fig.show()


In [36]:
df = pd.read_csv('/Users/darshan/Desktop/MDM_Project/rating.csv', sep=',', usecols=['userId', 'movieId', 'rating','timestamp'])

In [37]:
df

Unnamed: 0,userId,movieId,rating,timestamp
0,1,2,3.5,2005-04-02 23:53:47
1,1,29,3.5,2005-04-02 23:31:16
2,1,32,3.5,2005-04-02 23:33:39
3,1,47,3.5,2005-04-02 23:32:07
4,1,50,3.5,2005-04-02 23:29:40
...,...,...,...,...
20000258,138493,68954,4.5,2009-11-13 15:42:00
20000259,138493,69526,4.5,2009-12-03 18:31:48
20000260,138493,69644,3.0,2009-12-07 18:10:57
20000261,138493,70286,5.0,2009-11-13 15:42:24


In [13]:
ratings = df.iloc[:1000000,:]

In [14]:
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,2,3.5,2005-04-02 23:53:47
1,1,29,3.5,2005-04-02 23:31:16
2,1,32,3.5,2005-04-02 23:33:39
3,1,47,3.5,2005-04-02 23:32:07
4,1,50,3.5,2005-04-02 23:29:40


In [27]:
ratings.shape

(1000000, 4)

In [31]:
ratings.isnull().values.any()


False

In [32]:
ratings = ratings.dropna()

In [33]:
ratings.shape

(1000000, 4)

In [36]:
ratings.dtypes

userId         int64
movieId        int64
rating       float64
timestamp     object
dtype: object

In [37]:
ratings['rating'] = ratings['rating'].astype(int)


In [133]:
# Create empty dataframes for the training and test sets
train_set = pd.DataFrame(columns=['userId', 'movieId', 'rating', 'timestamp'])
test_set = pd.DataFrame(columns=['userId', 'movieId', 'rating', 'timestamp'])

# Split the ratings data into training and test sets for each user
for user_id in ratings['userId'].unique():
    # Select the ratings for the current user
    user_ratings = ratings.loc[ratings['userId'] == user_id]
    
    # Split the user's ratings into training and test sets
    train_ratings, test_ratings = train_test_split(user_ratings, test_size=0.2)
    
    # Add the training and test ratings to the corresponding dataframes
    train_set = train_set.append(train_ratings, ignore_index=True)
    test_set = test_set.append(test_ratings, ignore_index=True)

# Create a reader object and load the training data into a dataset
reader = Reader()
train_data = Dataset.load_from_df(train_set[['userId', 'movieId', 'rating']], reader)

# Build a full training set from the training data
trainset = train_data.build_full_trainset()



The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.


The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.



In [134]:
def recommend_items(predictions):
    """
    Returns a dictionary of recommended items for each user based on the predicted ratings.

    Args:
        predictions: list of (user_id, item_id, true_rating, predicted_rating, additional_info) tuples.

    Returns:
        A dictionary where the keys are user_ids and the values are lists of (item_id, predicted_rating) tuples.
    """
    recommendations = {}

    # Group the predictions by user_id and create a list of (item_id, predicted_rating) tuples for each user
    for user_id, item_id, true_rating, predicted_rating, additional_info in predictions:
        if user_id not in recommendations:
            recommendations[user_id] = []
        recommendations[user_id].append((item_id, predicted_rating))

    # Sort the recommended items for each user by their predicted ratings in descending order
    for user_id, items in recommendations.items():
        items.sort(key=lambda x: x[1], reverse=True)

    # Limit the recommended items to 10 per user
    recommendations = {user_id: items[:10] for user_id, items in recommendations.items()}

    return recommendations


In [135]:
def precision_recall_at_k(predictions, k=10, threshold=3.5):
    """
    Computes precision and recall at k for a list of predictions.

    Args:
        predictions: list of (user_id, item_id, true_rating, predicted_rating, additional_info) tuples.
        k: number of recommended items to consider for computing precision and recall.
        threshold: threshold for considering an item as relevant.

    Returns:
        Two dictionaries: one for precision values and one for recall values, where the keys are user_ids and the 
        values are the corresponding precision and recall values.
    """
    user_predictions = {}

    # Group the predictions by user_id and create a list of (predicted_rating, true_rating) tuples for each user
    for user_id, item_id, true_rating, predicted_rating, additional_info in predictions:
        if user_id not in user_predictions:
            user_predictions[user_id] = []
        user_predictions[user_id].append((predicted_rating, true_rating))

    # Sort the predicted ratings for each user's items in descending order
    for user_id, predictions in user_predictions.items():
        predictions.sort(key=lambda x: x[0], reverse=True)

    # Compute precision and recall for each user
    precisions = {}
    recalls = {}
    for user_id, predictions in user_predictions.items():
        relevant_items = sum((true_r >= threshold) for (_, true_r) in predictions)
        recommended_items = [x[0] for x in predictions[:k]]
        relevant_and_recommended_items = sum((true_r >= threshold) and (pred_r >= threshold)
                                              for pred_r, true_r in predictions[:k])
        precisions[user_id] = relevant_and_recommended_items / k if k != 0 else 1
        recalls[user_id] = relevant_and_recommended_items / relevant_items if relevant_items != 0 else 1

    return precisions, recalls


In [136]:
import numpy as np

def calculate_norm_dcg_score(predictions, k=10, gains="exponential"):
    """
    Calculates the normalized discounted cumulative gain (nDCG) for each user in the predictions.

    Args:
        predictions: list of (user_id, item_id, true_rating, predicted_rating, additional_info) tuples.
        k: int, optional (default=10). The maximum number of recommended items per user to consider.
        gains: str, optional (default="exponential"). The gain function to use. Either "exponential" or "linear".

    Returns:
        A dictionary where the keys are user_ids and the values are the normalized DCG scores for each user.
    """
    user_dict = {}
    for user_id, item_id, true_rating, predicted_rating, additional_info in predictions:
        if user_id in user_dict:
            user_dict[user_id].append((predicted_rating, true_rating))
        else:
            user_dict[user_id] = [(predicted_rating, true_rating)]

    norm_dcg_scores = {}
    for user_id, items in user_dict.items():
        # Sort the items for the user by predicted rating in descending order and limit to k items
        items.sort(key=lambda x: x[0], reverse=True)
        items = items[:k]

        # Extract the predicted ratings and true ratings for the top k items
        predicted_ratings, true_ratings = zip(*items)
        predicted_ratings = np.array(predicted_ratings)
        true_ratings = np.array(true_ratings)

        # Calculate the DCG score for the user
        if gains == "exponential":
            gain_val = 2 ** true_ratings - 1
        elif gains == "linear":
            gain_val = true_ratings
        else:
            raise ValueError("Invalid gains option.")
        discounts = np.log2(np.arange(len(true_ratings)) + 2)
        dcg_score = np.sum(gain_val / discounts)

        # Calculate the IDCG score for the user
        if len(true_ratings) < k:
            idcg_len = len(true_ratings)
            idcg_order = np.argsort(true_ratings)[::-1]
            idcg_gain_val = gain_val[idcg_order]
            idcg_discounts = np.log2(np.arange(idcg_len) + 2)
            idcg_score = np.sum(idcg_gain_val / idcg_discounts)
        else:
            idcg_len = k
            idcg_gain_val = np.sort(gain_val)[::-1]
            idcg_discounts = np.log2(np.arange(idcg_len) + 2)
            idcg_score = np.sum(idcg_gain_val / idcg_discounts)

        # Calculate the nDCG score for the user
        if idcg_score == 0:
            norm_dcg_scores[user_id] = 0.0
        else:
            norm_dcg_scores[user_id] = dcg_score / idcg_score

    return norm_dcg_scores


In [44]:
# Define a list of algorithms to evaluate
algos = [SVD(), SlopeOne(), KNNBasic(), KNNWithMeans()]
algo_names = ["SVD", "SlopeOne", "KNNBasic", "KNNWithMeans"]

# Evaluate each algorithm
for i, algo in enumerate(algos):
    print(f"\n Algorithm: {algo_names[i]}")
    
    # Train the algorithm on the training set
    algo.fit(trainset)
    
    # Make predictions on the test set
    predictions = [algo.predict(row.userId, row.movieId, row.rating) for _, row in test_set.iterrows()]
    
    # Calculate evaluation metrics
    rmse = accuracy.rmse(predictions, verbose=False)
    mae = accuracy.mae(predictions, verbose=False)
    top_recommendations = recommend_items(predictions)
    norm_dcg = norm_dcg_score(predictions, k=10, gains="exponential")
    precisions, recalls = precision_recall_at_k(predictions, k=10, threshold=4)
    avg_norm_dcg = sum(ndcg for ndcg in norm_dcg.values()) / len(norm_dcg)
    avg_prec = sum(prec for prec in precisions.values()) / len(precisions)
    avg_recall = sum(rec for rec in recalls.values()) / len(recalls)
    fscore = 2 * (avg_prec * avg_recall) / (avg_prec + avg_recall)
    
    # Print evaluation metrics
    print(f"RMSE: {rmse}")
    print(f"MAE: {mae}")
    print(f"Fscore: {fscore}")
    print(f"Average Precision: {avg_prec}")
    print(f"Average Recall: {avg_recall}")
    print(f"Average Normalized Discounted Cumulative Gain: {avg_norm_dcg}")
    #print(f"Top recommendations: {top_recommendations}")



 Algorithm: SVD
RMSE 0.8699159356668529
MAE 0.6684890617273206
Fscore: 0.4121907030324629
Average Precision: 0.8842626921746012
Average Recall: 0.26872781900628145
Average Normalized Discounted Cumulative Gain: 0.8999717104444959
Top recommendations:  {1: [(318, 4.271105217290782), (5816, 3.9334119329625583), (8368, 3.8893199060169032), (8961, 3.860170693180902), (8482, 3.849752624830232), (1196, 3.8387775545936087), (593, 3.833603549499861), (1291, 3.717435774108142), (6242, 3.686785999196332), (1374, 3.6809710512171123)], 2: [(1270, 4.743650055858979), (1196, 4.682405456880901), (908, 4.4488549681250085), (1580, 4.3306487248850765), (3959, 4.136491390764536), (1544, 4.1232073881299165), (3926, 3.8950812196309337), (3917, 3.7528175363907996), (3, 3.7388512044164717), (3930, 3.4836025559619346)], 3: [(2571, 5), (318, 4.994294856983464), (541, 4.98879813427113), (593, 4.956346487820741), (1221, 4.912261952701813), (1219, 4.754534147925324), (1247, 4.696384576095343), (1228, 4.684066675