In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
import operator

### load ratings into dataframe

In [2]:
# Load the ratings table; the path is relative to the notebook's working directory.
ratings_df = pd.read_csv("ContentBasedRecommenderSystem/ratings.csv")
# Preview the first rows (columns shown below: userId, movieId, rating, timestamp).
ratings_df.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


### load movies into dataframe

In [3]:
# Load the movie metadata table; the path is relative to the notebook's working directory.
movies_df = pd.read_csv("ContentBasedRecommenderSystem/movies.csv")
# Preview the first rows (columns shown below: movieId, title, genres).
movies_df.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


### merge the two dataframes

In [4]:
# Inner-join every rating with its movie's metadata on the shared "movieId" key,
# so only ratings whose movieId also appears in movies_df are kept.
df = ratings_df.merge(movies_df, on="movieId", how="inner")
df.head()

Unnamed: 0,userId,movieId,rating,timestamp,title,genres
0,1,1,4.0,964982703,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,5,1,4.0,847434962,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
2,7,1,4.5,1106635946,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
3,15,1,2.5,1510577970,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
4,17,1,4.5,1305696483,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy


In [5]:
# Inspect row count, dtypes and non-null counts of the merged frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 100836 entries, 0 to 100835
Data columns (total 6 columns):
 #   Column     Non-Null Count   Dtype  
---  ------     --------------   -----  
 0   userId     100836 non-null  int64  
 1   movieId    100836 non-null  int64  
 2   rating     100836 non-null  float64
 3   timestamp  100836 non-null  int64  
 4   title      100836 non-null  object 
 5   genres     100836 non-null  object 
dtypes: float64(1), int64(3), object(2)
memory usage: 5.4+ MB


### subset 200 users and 200 movies

In [6]:
# Keep only ratings whose userId is among the first 223 distinct userIds in
# ratings_df AND whose movieId is among the first 223 distinct movieIds in
# movies_df. (The pivoted matrix built from this subset is reported as
# 200 x 200 further down in the notebook.)
mask1 = df["userId"].isin(ratings_df["userId"].unique()[:223])
mask2 = df["movieId"].isin(movies_df["movieId"].unique()[:223])
df_200 = df.loc[mask1 & mask2]
df_200.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2517 entries, 0 to 93604
Data columns (total 6 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   userId     2517 non-null   int64  
 1   movieId    2517 non-null   int64  
 2   rating     2517 non-null   float64
 3   timestamp  2517 non-null   int64  
 4   title      2517 non-null   object 
 5   genres     2517 non-null   object 
dtypes: float64(1), int64(3), object(2)
memory usage: 137.6+ KB


### build the user × movie ratings matrix (rows: users, columns: movie titles)

In [7]:
# Pivot the long ratings table into matrix format: one row per userId, one
# column per movie title, cells holding the rating (NaN where a user has not
# rated a movie).
matrix = pd.pivot_table(df_200, index="userId", columns="title", values="rating")
matrix.head()

title,Ace Ventura: When Nature Calls (1995),"Amazing Panda Adventure, The (1995)","American President, The (1995)",Angels and Insects (1995),Anne Frank Remembered (1995),Antonia's Line (Antonia) (1995),Apollo 13 (1995),Assassins (1995),"Awfully Big Adventure, An (1995)",Babe (1995),...,Up Close and Personal (1996),"Usual Suspects, The (1995)",Vampire in Brooklyn (1995),Waiting to Exhale (1995),"Walk in the Clouds, A (1995)",Waterworld (1995),When Night Is Falling (1995),White Man's Burden (1995),White Squall (1996),Wild Bill (1995)
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,,,,,,,,,,,...,,5.0,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,,,,,,,3.0,,,4.0,...,,4.0,,,,,,,,
6,2.0,4.0,4.0,,,,4.0,,,4.0,...,3.0,1.0,4.0,3.0,4.0,3.0,,3.0,5.0,4.0


In [8]:
# Enforce (not merely display) that the matrix is 200 x 200, so a fresh
# Restart & Run All fails loudly here if the subsetting above ever yields a
# different shape.
assert matrix.shape == (200, 200), f"expected a (200, 200) matrix, got {matrix.shape}"
matrix.shape

(200, 200)

### fill null values in the matrix with 0 (an unrated movie is treated as a rating of 0)

In [9]:
# Replace every missing rating with 0 so each movie column becomes a dense
# numeric vector; the original `matrix` (with NaNs) is left untouched.
matrix_filled = matrix.fillna(value=0)
matrix_filled.head()

title,Ace Ventura: When Nature Calls (1995),"Amazing Panda Adventure, The (1995)","American President, The (1995)",Angels and Insects (1995),Anne Frank Remembered (1995),Antonia's Line (Antonia) (1995),Apollo 13 (1995),Assassins (1995),"Awfully Big Adventure, An (1995)",Babe (1995),...,Up Close and Personal (1996),"Usual Suspects, The (1995)",Vampire in Brooklyn (1995),Waiting to Exhale (1995),"Walk in the Clouds, A (1995)",Waterworld (1995),When Night Is Falling (1995),White Man's Burden (1995),White Squall (1996),Wild Bill (1995)
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,4.0,...,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,2.0,4.0,4.0,0.0,0.0,0.0,4.0,0.0,0.0,4.0,...,3.0,1.0,4.0,3.0,4.0,3.0,0.0,3.0,5.0,4.0


### 1. calculate the similarity matrix using cosine_similarity function from scikit-learn

In [10]:
# Item-item similarity: transpose so that each row handed to cosine_similarity
# is one movie's vector of ratings across users.
cosine_similarity_matrix = cosine_similarity(matrix_filled.T)
print(cosine_similarity_matrix.shape)
# Label both axes with the movie titles for display only; the raw ndarray
# stays in `cosine_similarity_matrix`.
pd.DataFrame(cosine_similarity_matrix, index=matrix.columns, columns=matrix.columns)

(200, 200)


title,Ace Ventura: When Nature Calls (1995),"Amazing Panda Adventure, The (1995)","American President, The (1995)",Angels and Insects (1995),Anne Frank Remembered (1995),Antonia's Line (Antonia) (1995),Apollo 13 (1995),Assassins (1995),"Awfully Big Adventure, An (1995)",Babe (1995),...,Up Close and Personal (1996),"Usual Suspects, The (1995)",Vampire in Brooklyn (1995),Waiting to Exhale (1995),"Walk in the Clouds, A (1995)",Waterworld (1995),When Night Is Falling (1995),White Man's Burden (1995),White Squall (1996),Wild Bill (1995)
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Ace Ventura: When Nature Calls (1995),1.000000,0.159271,0.310537,0.000000,0.000000,0.076598,0.379255,0.100258,0.000000,0.316254,...,0.079895,0.323153,0.204717,0.099504,0.270278,0.396349,0.000000,0.132672,0.111833,0.106137
"Amazing Panda Adventure, The (1995)",0.159271,1.000000,0.226492,0.000000,0.000000,0.297044,0.225513,0.155520,0.000000,0.279027,...,0.220323,0.060005,0.423405,0.342997,0.271031,0.275700,0.000000,0.685994,0.513994,0.857493
"American President, The (1995)",0.310537,0.226492,1.000000,0.021642,0.000000,0.000000,0.529211,0.152078,0.000000,0.406007,...,0.321909,0.353543,0.116448,0.188667,0.200328,0.336156,0.188667,0.188667,0.335735,0.264133
Angels and Insects (1995),0.000000,0.000000,0.021642,1.000000,0.573539,0.331133,0.127883,0.000000,0.573539,0.195309,...,0.184205,0.151276,0.088499,0.000000,0.000000,0.030734,0.000000,0.000000,0.000000,0.000000
Anne Frank Remembered (1995),0.000000,0.000000,0.000000,0.573539,1.000000,0.577350,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Waterworld (1995),0.396349,0.275700,0.336156,0.030734,0.000000,0.061876,0.426612,0.253766,0.053586,0.410072,...,0.217999,0.304672,0.198445,0.080380,0.291109,1.000000,0.000000,0.160759,0.205772,0.257215
When Night Is Falling (1995),0.000000,0.000000,0.188667,0.000000,0.000000,0.000000,0.114344,0.000000,0.000000,0.000000,...,0.000000,0.107657,0.000000,0.000000,0.000000,0.000000,1.000000,0.000000,0.000000,0.000000
White Man's Burden (1995),0.132672,0.685994,0.188667,0.000000,0.000000,0.000000,0.114344,0.000000,0.000000,0.151348,...,0.321173,0.026914,0.617213,0.500000,0.395092,0.160759,0.000000,1.000000,0.468293,0.800000
White Squall (1996),0.111833,0.513994,0.335735,0.000000,0.000000,0.000000,0.214186,0.113243,0.000000,0.269326,...,0.150403,0.093268,0.289037,0.234146,0.185019,0.205772,0.000000,0.468293,1.000000,0.599415


### define top_10_similar_movies function to get top 10 similar movies to a movie

In [11]:
def top_10_similar_movies(movie_title, n=10):
    '''
    Return the titles of the movies most similar to a given movie.

    Similarity scores are read from the module-level ``cosine_similarity_matrix``;
    the row for ``movie_title`` is located through its position in
    ``matrix.columns``, and positions are mapped back to titles the same way.

    Args:
    movie_title: The title of the desired movie; must be a column of ``matrix``.
    n: The maximum number of similar movies to return (default 10).

    Returns:
    A list of up to ``n`` movie titles, ordered from most to least similar.
    The query movie itself is never part of the result.

    Raises:
    KeyError: if ``movie_title`` is not a column of ``matrix``.
    '''
    # Position of the movie in the ratings matrix (and thus in the similarity matrix)
    movie_index = matrix.columns.get_loc(movie_title)

    # Pair every *other* movie's position with its similarity to the query.
    # The query is excluded by position rather than by assuming it sorts first:
    # another movie with an identical rating vector also scores 1.0 and, with a
    # lower position, would otherwise push the query itself into the results.
    candidates = [
        (position, score)
        for position, score in enumerate(cosine_similarity_matrix[movie_index])
        if position != movie_index
    ]

    # Stable sort, highest similarity first; ties keep their column order
    candidates.sort(key=lambda pair: pair[1], reverse=True)

    # Clamp n so that a negative value yields an empty list instead of a
    # surprising "all but the last few" slice.
    return [matrix.columns[position] for position, _ in candidates[:max(n, 0)]]

### 2. get 10 similar movies to "Toy Story (1995)" and "Waiting to Exhale (1995)"

In [12]:
# Show the most similar titles for Toy Story (default n=10) as a one-column table.
pd.DataFrame(top_10_similar_movies("Toy Story (1995)"))

Unnamed: 0,0
0,Apollo 13 (1995)
1,Braveheart (1995)
2,"Usual Suspects, The (1995)"
3,Babe (1995)
4,Happy Gilmore (1996)
5,Die Hard: With a Vengeance (1995)
6,Twelve Monkeys (a.k.a. 12 Monkeys) (1995)
7,Seven (a.k.a. Se7en) (1995)
8,Waterworld (1995)
9,Batman Forever (1995)


In [13]:
# Show the most similar titles for Waiting to Exhale (default n=10) as a one-column table.
pd.DataFrame(top_10_similar_movies("Waiting to Exhale (1995)"))

Unnamed: 0,0
0,Before and After (1996)
1,Mighty Morphin Power Rangers: The Movie (1995)
2,Before the Rain (Pred dozhdot) (1994)
3,Bushwhacked (1995)
4,"Cure, The (1995)"
5,Georgia (1995)
6,Gordy (1995)
7,Hideaway (1995)
8,Jefferson in Paris (1995)
9,Mad Love (1995)


### 3. get 3 movie recommendations for user with userId = 200

In [14]:
# Pivot the same subset the other way round: one row per movie title, one
# column per userId, cells holding the rating (NaN where unrated).
matrix_user = pd.pivot_table(df_200, index="title", columns="userId", values="rating")
matrix_user.head()

userId,1,3,4,5,6,7,8,9,11,12,...,214,215,216,217,218,219,220,221,222,223
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Ace Ventura: When Nature Calls (1995),,,,,2.0,,,,,,...,,,,1.0,,2.5,,,3.5,
"Amazing Panda Adventure, The (1995)",,,,,4.0,,,,,,...,,,,,,,,,,
"American President, The (1995)",,,,,4.0,,4.0,,,,...,,,,,,,,,,
Angels and Insects (1995),,,,,,,,,,,...,,,,,,,,,,
Anne Frank Remembered (1995),,,,,,,,,,,...,,,,,,,,,,


In [15]:
# Cosine similarity between the rows of matrix_user (one row per title), with
# unrated entries treated as 0. Built directly as a title x title DataFrame so
# the name is bound once, to one kind of object, and the cell is safe to re-run.
cosine_similarity_matrix_user = pd.DataFrame(
    cosine_similarity(matrix_user.fillna(0)),
    index=matrix_user.index,
    columns=matrix_user.index,
)
# Report the shape of the matrix computed in THIS cell (the original printed
# the shape of the earlier cosine_similarity_matrix by mistake).
print(cosine_similarity_matrix_user.shape)
cosine_similarity_matrix_user.head()

(200, 200)


title,Ace Ventura: When Nature Calls (1995),"Amazing Panda Adventure, The (1995)","American President, The (1995)",Angels and Insects (1995),Anne Frank Remembered (1995),Antonia's Line (Antonia) (1995),Apollo 13 (1995),Assassins (1995),"Awfully Big Adventure, An (1995)",Babe (1995),...,Up Close and Personal (1996),"Usual Suspects, The (1995)",Vampire in Brooklyn (1995),Waiting to Exhale (1995),"Walk in the Clouds, A (1995)",Waterworld (1995),When Night Is Falling (1995),White Man's Burden (1995),White Squall (1996),Wild Bill (1995)
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Ace Ventura: When Nature Calls (1995),1.0,0.159271,0.310537,0.0,0.0,0.076598,0.379255,0.100258,0.0,0.316254,...,0.079895,0.323153,0.204717,0.099504,0.270278,0.396349,0.0,0.132672,0.111833,0.106137
"Amazing Panda Adventure, The (1995)",0.159271,1.0,0.226492,0.0,0.0,0.297044,0.225513,0.15552,0.0,0.279027,...,0.220323,0.060005,0.423405,0.342997,0.271031,0.2757,0.0,0.685994,0.513994,0.857493
"American President, The (1995)",0.310537,0.226492,1.0,0.021642,0.0,0.0,0.529211,0.152078,0.0,0.406007,...,0.321909,0.353543,0.116448,0.188667,0.200328,0.336156,0.188667,0.188667,0.335735,0.264133
Angels and Insects (1995),0.0,0.0,0.021642,1.0,0.573539,0.331133,0.127883,0.0,0.573539,0.195309,...,0.184205,0.151276,0.088499,0.0,0.0,0.030734,0.0,0.0,0.0,0.0
Anne Frank Remembered (1995),0.0,0.0,0.0,0.573539,1.0,0.57735,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [16]:
def recommeder(picked_userid, number_of_similar_items=5, n=3):
    '''
    Return n recommended movies for the user with id "picked_userid".

    For every movie the user has not rated, the predicted rating is the
    similarity-weighted average of the user's own ratings for the
    ``number_of_similar_items`` rated movies most similar to that movie.

    Reads the module-level ``matrix_user`` (ratings with titles as the index,
    named 'title', and userIds as columns) and ``cosine_similarity_matrix_user``
    (title x title similarity DataFrame with an index named 'title').

    Args:
    picked_userid: The id of the user that needs recommendations; must be a
        column of ``matrix_user``.
    number_of_similar_items: Number of rated movies used to predict the rating
        of each unrated movie.
    n: The number of recommended movies to return.

    Returns:
    A list of up to n (movie title, predicted rating) tuples, highest predicted
    rating first; ratings are rounded to 6 decimals. Unrated movies whose
    selected similarity weights sum to zero (no usable similarity to anything
    the user rated, or the user rated nothing) get no prediction and are
    left out.

    Raises:
    KeyError: if ``picked_userid`` is not a column of ``matrix_user``.
    '''
    user_ratings = matrix_user[picked_userid]

    # Movies that the target user has not watched (NaN in the user's column)
    picked_userid_unwatched = user_ratings[user_ratings.isna()].index.tolist()

    # Movies that the target user has watched, highest rating first,
    # as a two-column frame: 'title', 'rating'
    picked_userid_watched = (user_ratings.dropna()
                             .sort_values(ascending=False)
                             .rename('rating')
                             .reset_index())

    # Dictionary to save the unwatched movie and predicted rating pair
    rating_prediction = {}
    for picked_movie in picked_userid_unwatched:
        # Similarity of the picked movie with every other movie
        picked_movie_similarity_score = (cosine_similarity_matrix_user[[picked_movie]]
                                         .reset_index()
                                         .rename(columns={picked_movie: 'similarity_score'}))

        # Keep the user's watched movies most similar to the picked unwatched movie
        picked_userid_watched_similarity = (
            pd.merge(left=picked_userid_watched,
                     right=picked_movie_similarity_score,
                     on='title',
                     how='inner')
            .sort_values('similarity_score', ascending=False)[:number_of_similar_items]
        )

        weights = picked_userid_watched_similarity['similarity_score']
        # np.average raises ZeroDivisionError when the weights sum to zero, so a
        # movie with no similarity to the user's watched movies cannot be
        # scored. Skip it generically instead of removing hard-coded titles
        # (list.remove also raised ValueError when the user HAD watched one).
        if weights.sum() == 0:
            continue

        # Weighted average of the picked user's ratings, weighted by similarity
        rating_prediction[picked_movie] = round(
            np.average(picked_userid_watched_similarity['rating'], weights=weights), 6)

    # Return the top recommended movies
    return sorted(rating_prediction.items(), key=operator.itemgetter(1), reverse=True)[:n]

In [17]:
# Top 3 predicted-rating recommendations for user 200, each prediction based on
# the 5 most similar movies that user has already rated.
recommended_movie = recommeder(picked_userid=200, number_of_similar_items=5, n=3)
recommended_movie

[('Fair Game (1995)', 5.0),
 ('Friday (1995)', 4.375048),
 ('Richard III (1995)', 4.233868)]