Libraries import:

In [17]:
import pandas as pd
import numpy as np
import scipy.stats

import seaborn as sns

from sklearn.metrics.pairwise import cosine_similarity

Import ratings:

In [18]:
# Number of rating rows to load; only this many rows are read from disk
# instead of parsing the whole file and truncating afterwards.
N_RATINGS = 1_000_000
# The file is ';'-separated and writes decimals with a comma ("4,5"), so let
# the parser convert them directly via `decimal=','`.
ratings = pd.read_csv(
    'ratings.csv',
    sep=';',
    encoding='utf-8',
    decimal=',',
    nrows=N_RATINGS,
)
# Guarantee a float column even if the loaded rows contain only whole numbers;
# this also fails loudly if any value could not be parsed as a number.
ratings['rating'] = ratings['rating'].astype(float)
ratings.info()
ratings.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000000 entries, 0 to 999999
Data columns (total 3 columns):
 #   Column   Non-Null Count    Dtype  
---  ------   --------------    -----  
 0   userId   1000000 non-null  int64  
 1   movieId  1000000 non-null  int64  
 2   rating   1000000 non-null  float64
dtypes: float64(1), int64(2)
memory usage: 22.9 MB


Unnamed: 0,userId,movieId,rating
0,91882,356,5.0
1,91882,364,3.0
2,91882,435,3.0
3,91882,440,3.0
4,91882,454,4.0


Import movies:

In [19]:
# Read the movie catalogue (UTF-16, ';'-separated), keep only the columns used
# below, and rename `id` to `movieId` so it matches the ratings key.
movies_raw = pd.read_csv('recomendMovies.csv', sep=';', encoding='utf-16')
movies = movies_raw[['id', 'title', 'genres']].rename(columns={'id': 'movieId'})
movies.info()
movies.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 56239 entries, 0 to 56238
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   movieId  56239 non-null  int64 
 1   title    56239 non-null  object
 2   genres   55255 non-null  object
dtypes: int64(1), object(2)
memory usage: 1.3+ MB


Unnamed: 0,movieId,title,genres
0,4470,Ariel,Drama-Comedy-Crime
1,18,Four Rooms,Crime-Comedy
2,260,Star Wars,Adventure-Action-Science Fiction
3,6377,Finding Nemo,Animation-Family
4,356,Forrest Gump,Comedy-Drama-Romance


Merge dataframes

In [20]:
# Attach title and genres to every rating; the inner join drops ratings whose
# movieId has no match in `movies`.
df = ratings.merge(movies, how='inner', on='movieId')
df.info()
df.head()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 999949 entries, 0 to 999948
Data columns (total 5 columns):
 #   Column   Non-Null Count   Dtype  
---  ------   --------------   -----  
 0   userId   999949 non-null  int64  
 1   movieId  999949 non-null  int64  
 2   rating   999949 non-null  float64
 3   title    999949 non-null  object 
 4   genres   999765 non-null  object 
dtypes: float64(1), int64(2), object(2)
memory usage: 45.8+ MB


Unnamed: 0,userId,movieId,rating,title,genres
0,91882,356,5.0,Forrest Gump,Comedy-Drama-Romance
1,91883,356,4.0,Forrest Gump,Comedy-Drama-Romance
2,91885,356,5.0,Forrest Gump,Comedy-Drama-Romance
3,91889,356,0.5,Forrest Gump,Comedy-Drama-Romance
4,91892,356,2.0,Forrest Gump,Comedy-Drama-Romance


Get movies with more than 800 ratings:

In [21]:
# Per-title summary: mean rating and how many ratings each title received.
agg_ratings = (
    df.groupby('title')['rating']
    .agg(mean_rating='mean', number_of_ratings='count')
    .reset_index()
)
# Keep only titles with more than 800 ratings.
# NOTE: the `_GT100` suffix does not match the 800 threshold; the name is kept
# because later cells reference it.
has_enough_ratings = agg_ratings['number_of_ratings'].gt(800)
agg_ratings_GT100 = agg_ratings.loc[has_enough_ratings]
agg_ratings.info()
agg_ratings.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19916 entries, 0 to 19915
Data columns (total 3 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   title              19916 non-null  object 
 1   mean_rating        19916 non-null  float64
 2   number_of_ratings  19916 non-null  int64  
dtypes: float64(1), int64(1), object(1)
memory usage: 466.9+ KB


Unnamed: 0,title,mean_rating,number_of_ratings
0,#1 Cheerleader Camp,4.0,1
1,#realityhigh,4.5,1
2,$,3.0,2
3,$5 a Day,2.75,2
4,$9.99,3.0,2


Keep only the ratings of the most frequently rated movies:

In [22]:
# Restrict the ratings to titles that passed the rating-count filter by
# inner-joining on the list of retained titles.
popular_titles = agg_ratings_GT100[['title']]
df_GT100 = df.merge(popular_titles, how='inner', on='title')
df_GT100.head()

Unnamed: 0,userId,movieId,rating,title,genres
0,91882,356,5.0,Forrest Gump,Comedy-Drama-Romance
1,91883,356,4.0,Forrest Gump,Comedy-Drama-Romance
2,91885,356,5.0,Forrest Gump,Comedy-Drama-Romance
3,91889,356,0.5,Forrest Gump,Comedy-Drama-Romance
4,91892,356,2.0,Forrest Gump,Comedy-Drama-Romance


Create matrix where users are rows and movies are columns:

In [23]:
# User x movie rating matrix: one row per userId, one column per title.
# Cells are NaN where the user has not rated the movie; duplicate
# (user, title) pairs are averaged (pivot_table's default aggregation).
matrix = pd.pivot_table(
    df_GT100,
    values='rating',
    index='userId',
    columns='title',
)
matrix.info()
matrix.head()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 6430 entries, 91882 to 98400
Columns: 206 entries, 2001: A Space Odyssey to X-Men
dtypes: float64(206)
memory usage: 10.2 MB


title,2001: A Space Odyssey,A Beautiful Mind,A Bug's Life,A Clockwork Orange,Ace Ventura: Pet Detective,Ace Ventura: When Nature Calls,Aladdin,Alien,Aliens,Amadeus,...,Twister,Up,V for Vendetta,WALL·E,Waterworld,When Harry Met Sally...,While You Were Sleeping,Who Framed Roger Rabbit,Willy Wonka & the Chocolate Factory,X-Men
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
91882,,,,,,,4.0,,,,...,,,,,,,,,,
91883,,4.5,,4.0,,,,4.0,,,...,,4.0,4.0,4.0,2.0,,,,,3.5
91884,,,,,,,,,,,...,,4.5,,4.0,,,,,,
91885,,,,,,,,,,,...,,,,,,,4.0,,,
91886,,,,,,,,,,4.0,...,,,,,,,,,,


Normalize users

In [24]:
# Mean-centre each user's ratings: subtract the user's own average (computed
# over the movies they rated) from every rating in that row.
user_mean_rating = matrix.mean(axis=1)
matrix_norm = matrix.sub(user_mean_rating, axis=0)
matrix_norm.head()

title,2001: A Space Odyssey,A Beautiful Mind,A Bug's Life,A Clockwork Orange,Ace Ventura: Pet Detective,Ace Ventura: When Nature Calls,Aladdin,Alien,Aliens,Amadeus,...,Twister,Up,V for Vendetta,WALL·E,Waterworld,When Harry Met Sally...,While You Were Sleeping,Who Framed Roger Rabbit,Willy Wonka & the Chocolate Factory,X-Men
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
91882,,,,,,,0.0,,,,...,,,,,,,,,,
91883,,0.784188,,0.284188,,,,0.284188,,,...,,0.284188,0.284188,0.284188,-1.715812,,,,,-0.215812
91884,,,,,,,,,,,...,,0.38,,-0.12,,,,,,
91885,,,,,,,,,,,...,,,,,,,0.588235,,,
91886,,,,,,,,,,-0.583333,...,,,,,,,,,,


Identify similar users

In [25]:
# Pairwise Pearson correlation between users. `corr` works column-wise, so the
# matrix is transposed to put users in columns first.
# NOTE(review): no `min_periods` is set, so a pair of users with only a couple
# of co-rated movies can score exactly +/-1.0 — consider requiring a minimum
# overlap.
user_similarity = matrix_norm.transpose().corr(method='pearson')
user_similarity.head()

userId,91882,91883,91884,91885,91886,91887,91888,91889,91890,91891,...,98391,98392,98393,98394,98395,98396,98397,98398,98399,98400
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
91882,1.0,,,0.745499,,0.866025,,,,,...,-0.175035,,0.225877,0.874147,1.0,-0.866025,1.0,0.866025,,0.43823
91883,,1.0,0.547373,-0.367607,0.43082,-0.456775,-0.307794,-0.155043,-0.473626,0.702439,...,-0.129903,0.11547,-0.486828,0.323695,0.018132,-0.707107,0.0,0.267307,-0.320256,0.439111
91884,,0.547373,1.0,,-0.333333,-0.25,-1.0,,-0.68313,,...,0.301511,1.0,-0.5,0.0,1.0,,,0.366667,,0.106112
91885,0.745499,-0.367607,,1.0,,0.0,,,,,...,0.835229,,0.511766,0.192961,,-1.0,1.0,-0.944911,,-0.093076
91886,,0.43082,-0.333333,,1.0,-0.091287,,,,1.0,...,-0.569803,,-1.0,0.57735,,,,-0.316228,,0.316228


Pick user

In [26]:
# Pick a user ID
picked_userid = 91882
# Remove the picked user's own row from the candidate list (the column is kept;
# it is used below to look up similarities to this user).
# Rebinding with errors='ignore' instead of an in-place drop keeps this cell
# idempotent: re-running it no longer raises KeyError for the missing label.
user_similarity = user_similarity.drop(index=picked_userid, errors='ignore')
# Take a look at the data
user_similarity.head()

userId,91882,91883,91884,91885,91886,91887,91888,91889,91890,91891,...,98391,98392,98393,98394,98395,98396,98397,98398,98399,98400
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
91883,,1.0,0.547373,-0.367607,0.43082,-0.456775,-0.307794,-0.155043,-0.473626,0.702439,...,-0.129903,0.11547,-0.486828,0.323695,0.018132,-0.707107,0.0,0.267307,-0.320256,0.439111
91884,,0.547373,1.0,,-0.333333,-0.25,-1.0,,-0.68313,,...,0.301511,1.0,-0.5,0.0,1.0,,,0.366667,,0.106112
91885,0.745499,-0.367607,,1.0,,0.0,,,,,...,0.835229,,0.511766,0.192961,,-1.0,1.0,-0.944911,,-0.093076
91886,,0.43082,-0.333333,,1.0,-0.091287,,,,1.0,...,-0.569803,,-1.0,0.57735,,,,-0.316228,,0.316228
91887,0.866025,-0.456775,-0.25,0.0,-0.091287,1.0,0.662266,,,,...,0.232495,-0.5,0.880705,0.054153,,0.333333,-0.918559,-0.01942,,-0.14935


In [27]:
# How many neighbours to keep
n = 10
# Minimum similarity a user must exceed to count as a neighbour
user_similarity_threshold = 0.3
# Similarity of every remaining user to the picked user
picked_user_similarity = user_similarity[picked_userid]
# Keep users above the threshold, most similar first, and take the top n
similar_users = (
    picked_user_similarity[picked_user_similarity > user_similarity_threshold]
    .sort_values(ascending=False)
    .head(n)
)
# Print out top n similar users
print(f'The similar users for user {picked_userid} are', similar_users)

The similar users for user 91882 are userId
97478    1.0
94124    1.0
93864    1.0
95703    1.0
95676    1.0
97284    1.0
97305    1.0
95665    1.0
95651    1.0
97321    1.0
Name: 91882, dtype: float64


In [28]:
# Movies the target user has rated: select that user's row of the normalized
# matrix and drop every movie column that is entirely NaN.
is_picked_user = matrix_norm.index == picked_userid
picked_userid_watched = matrix_norm.loc[is_picked_user].dropna(axis='columns', how='all')
picked_userid_watched

title,Aladdin,Batman,Dances with Wolves,Forrest Gump,Ghost,Mrs. Doubtfire,Pretty Woman,Schindler's List,Sleepless in Seattle,The Firm,The Lion King,The Silence of the Lambs
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
91882,0.0,-1.0,0.0,1.0,0.0,0.0,-1.0,1.0,0.0,0.0,-1.0,1.0


In [29]:
# Normalized ratings of the similar users only; movie columns that none of
# them rated (all NaN) are dropped.
is_similar_user = matrix_norm.index.isin(similar_users.index)
similar_user_movies = matrix_norm.loc[is_similar_user].dropna(axis='columns', how='all')
similar_user_movies

title,2001: A Space Odyssey,A Bug's Life,A Clockwork Orange,Ace Ventura: Pet Detective,Ace Ventura: When Nature Calls,Aladdin,Alien,Aliens,Amadeus,American Beauty,...,Twelve Monkeys,Twister,Up,V for Vendetta,WALL·E,Waterworld,When Harry Met Sally...,While You Were Sleeping,Who Framed Roger Rabbit,X-Men
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
93864,,0.136364,,,,,,,,0.136364,...,,,,,,,,,0.136364,0.136364
94124,,,,,,,1.573529,1.573529,,,...,0.573529,0.073529,,-0.926471,,,,,,
95651,,,,,,,,-0.4,-0.4,,...,0.6,,,,,,0.6,,,
95665,,,,,,,,,,,...,,,,,,,,,,
95676,,,,,,,,,,,...,,-1.7,,,,,,,1.3,
95703,,-0.388889,,,,,,,,,...,,0.611111,0.111111,,,,,0.111111,,
97284,,,,-0.625,-0.625,1.375,,,,,...,-0.625,,,,,-1.625,,-0.625,,
97305,,,,,,,0.878049,0.878049,,,...,,,,,,,0.878049,,,
97321,-0.12069,,1.37931,,,-0.12069,,,,0.37931,...,,,0.87931,,0.87931,,,,,
97478,,,,,,,,,,,...,,,,0.537037,,,,,,0.037037


In [30]:
# Exclude movies the picked user has already rated; titles that are not
# present among the similar users' movies are skipped silently.
already_watched = picked_userid_watched.columns
similar_user_movies = similar_user_movies.drop(columns=already_watched, errors='ignore')
# Take a look at the data
similar_user_movies

title,2001: A Space Odyssey,A Bug's Life,A Clockwork Orange,Ace Ventura: Pet Detective,Ace Ventura: When Nature Calls,Alien,Aliens,Amadeus,American Beauty,American History X,...,Twelve Monkeys,Twister,Up,V for Vendetta,WALL·E,Waterworld,When Harry Met Sally...,While You Were Sleeping,Who Framed Roger Rabbit,X-Men
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
93864,,0.136364,,,,,,,0.136364,,...,,,,,,,,,0.136364,0.136364
94124,,,,,,1.573529,1.573529,,,,...,0.573529,0.073529,,-0.926471,,,,,,
95651,,,,,,,-0.4,-0.4,,0.6,...,0.6,,,,,,0.6,,,
95665,,,,,,,,,,,...,,,,,,,,,,
95676,,,,,,,,,,,...,,-1.7,,,,,,,1.3,
95703,,-0.388889,,,,,,,,,...,,0.611111,0.111111,,,,,0.111111,,
97284,,,,-0.625,-0.625,,,,,,...,-0.625,,,,,-1.625,,-0.625,,
97305,,,,,,0.878049,0.878049,,,,...,,,,,,,0.878049,,,
97321,-0.12069,,1.37931,,,,,,0.37931,0.87931,...,,,0.87931,,0.87931,,,,,
97478,,,,,,,,,,,...,,,,0.537037,,,,,,0.037037


In [32]:
# Score each candidate movie as the average, over the similar users who rated
# it, of (user similarity x that user's mean-centred rating).
#
# Vectorized replacement for the nested Python loops:
#   * `mul(..., axis=0)` aligns `similar_users` with the rows by userId and
#     multiplies every rating in a row by that user's similarity;
#   * `mean(axis=0)` skips NaN, so it equals total / count over the users who
#     actually rated the movie. A movie with no ratings yields NaN instead of
#     raising ZeroDivisionError.
weighted_ratings = similar_user_movies.mul(similar_users, axis=0)
item_score = (
    weighted_ratings.mean(axis=0)
    .rename_axis('movie')
    .reset_index(name='movie_score')
)

# Sort the movies by score
ranked_item_score = item_score.sort_values(by='movie_score', ascending=False)
# Select top m movies
m = 10
ranked_item_score.head(m)

Unnamed: 0,movie,movie_score
2,A Clockwork Orange,1.37931
22,Beauty and the Beast,1.375
71,Men in Black,1.3
5,Alien,1.225789
88,Rear Window,1.205556
114,The Blues Brothers,1.089024
11,Apocalypse Now,0.884615
80,One Flew Over the Cuckoo's Nest,0.884615
153,WALL·E,0.87931
89,Requiem for a Dream,0.87931
