In [1]:
import pandas as pd
import numpy as np

In [3]:
# Read the four CSV tables from the working directory; each name is bound to
# the DataFrame loaded from the file of the same name.
links, movies, tags, ratings = map(
    pd.read_csv,
    ('links.csv', 'movies.csv', 'tags.csv', 'ratings.csv'),
)

In [11]:
# Preview the first two rows of the links table.
links.head(n=2)

Unnamed: 0,movieId,imdbId,tmdbId
0,1,114709,862.0
1,2,113497,8844.0


In [12]:
# Preview the first two rows of the movies table.
movies.head(n=2)

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy


In [13]:
# Preview the first two rows of the tags table.
tags.head(n=2)

Unnamed: 0,userId,movieId,tag,timestamp
0,2,60756,funny,1445714994
1,2,60756,Highly quotable,1445714996


In [14]:
# Preview the first two rows of the ratings table.
ratings.head(n=2)

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247


In [20]:
# Popularity by explicit rating.
# Group the ratings by movie and average them; the rating value is the
# explicit feedback given by users.

rating = ratings.groupby('movieId')['rating'].mean().to_frame()

# Highest average ratings first.
rating.sort_values(by="rating", ascending=False).head(5)

Unnamed: 0_level_0,rating
movieId,Unnamed: 1_level_1
88448,5.0
100556,5.0
143031,5.0
143511,5.0
143559,5.0


In [22]:
# Show every raw rating recorded for movie 88448.
ratings.loc[ratings["movieId"] == 88448]

Unnamed: 0,userId,movieId,rating,timestamp
77875,483,88448,5.0,1315437602


In [27]:
# Count how many ratings each movie received; the number of ratings is used
# here as an implicit popularity signal alongside the explicit average.
rating = rating.assign(
    ratings_count=ratings.groupby('movieId')['rating'].count()
)
rating.sort_values(by='ratings_count', ascending=False).head(5)

Unnamed: 0_level_0,rating,ratings_count
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
356,4.164134,329
318,4.429022,317
296,4.197068,307
593,4.16129,279
2571,4.192446,278


In [31]:
# movieId of the most-rated movie: the first index label once the table is
# sorted by ratings_count in descending order.
most_rated_first = rating.sort_values(by='ratings_count', ascending=False)
top_popular_movie = most_rated_first.index[0]
# Look up the title and genres of that movie.
movies.loc[movies['movieId'] == top_popular_movie]

Unnamed: 0,movieId,title,genres
314,356,Forrest Gump (1994),Comedy|Drama|Romance|War


In [32]:
# Hybrid ranking: recommend movies that are both highly rated and rated often.
# Keep only movies with more than `n` ratings, then take the ten with the
# highest average rating.

n = 5
movie_stats = ratings.groupby('movieId').agg(
    avg_rating=('rating', 'mean'),
    n_ratings=('userId', 'count'),
)
enough_ratings = movie_stats[movie_stats['n_ratings'] > n]
new_frame = enough_ratings.nlargest(10, 'avg_rating')
new_frame.head(3)

Unnamed: 0_level_0,avg_rating,n_ratings
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
177593,4.75,8
2239,4.666667,6
1041,4.590909,11


In [34]:
# Attach title/genres, then any user tags, to the top-rated movies.
# Left joins keep movies that have no matching tag rows.
with_titles = pd.merge(new_frame, movies, how='left', on='movieId')
with_tags = pd.merge(with_titles, tags, how='left', on='movieId')
with_tags.head(2)

Unnamed: 0,movieId,avg_rating,n_ratings,title,genres,userId,tag,timestamp
0,177593,4.75,8,"Three Billboards Outside Ebbing, Missouri (2017)",Crime|Drama,,,
1,2239,4.666667,6,Swept Away (Travolti da un insolito destino ne...,Comedy|Drama,,,


In [None]:
# Making Recommendations Based on Correlation





In [37]:
# User-based recommendation.
# Build the users x movies rating matrix: one row per userId, one column per
# movieId, NaN where a user has not rated a movie.

users_items = ratings.pivot_table(values='rating',
                                  index='userId',
                                  columns='movieId')

users_items.head(5)

movieId,1,2,3,4,5,6,7,8,9,10,...,193565,193567,193571,193573,193579,193581,193583,193585,193587,193609
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,4.0,,4.0,,,4.0,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,4.0,,,,,,,,,,...,,,,,,,,,,


In [38]:
# Replace missing ratings (NaN) with 0 so every user has a complete numeric vector.

users_items = users_items.fillna(0)
users_items.head(5)

movieId,1,2,3,4,5,6,7,8,9,10,...,193565,193567,193571,193573,193579,193581,193583,193585,193587,193609
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,4.0,0.0,4.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [40]:
# Pairwise cosine similarity between the users' rating vectors.
from sklearn.metrics.pairwise import cosine_similarity

similarity_matrix = cosine_similarity(users_items)
# Label both axes with the user ids so a pair of users can be looked up by id.
user_similarities = pd.DataFrame(similarity_matrix,
                                 index=users_items.index,
                                 columns=users_items.index)
user_similarities.head(5)

userId,1,2,3,4,5,6,7,8,9,10,...,601,602,603,604,605,606,607,608,609,610
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1.0,0.027283,0.05972,0.194395,0.12908,0.128152,0.158744,0.136968,0.064263,0.016875,...,0.080554,0.164455,0.221486,0.070669,0.153625,0.164191,0.269389,0.291097,0.093572,0.145321
2,0.027283,1.0,0.0,0.003726,0.016614,0.025333,0.027585,0.027257,0.0,0.067445,...,0.202671,0.016866,0.011997,0.0,0.0,0.028429,0.012948,0.046211,0.027565,0.102427
3,0.05972,0.0,1.0,0.002251,0.00502,0.003936,0.0,0.004941,0.0,0.0,...,0.005048,0.004892,0.024992,0.0,0.010694,0.012993,0.019247,0.021128,0.0,0.032119
4,0.194395,0.003726,0.002251,1.0,0.128659,0.088491,0.11512,0.062969,0.011361,0.031163,...,0.085938,0.128273,0.307973,0.052985,0.084584,0.200395,0.131746,0.149858,0.032198,0.107683
5,0.12908,0.016614,0.00502,0.128659,1.0,0.300349,0.108342,0.429075,0.0,0.030611,...,0.068048,0.418747,0.110148,0.258773,0.148758,0.106435,0.152866,0.135535,0.261232,0.060792


In [43]:
# Compute the weights: every other user's similarity to the target user,
# normalised so that the weights add up to 1.

user_id = 603

# Drop the target user's own row so their self-similarity is not counted.
peer_similarities = user_similarities.query("userId!=@user_id")[user_id]
weights = peer_similarities / sum(peer_similarities)
weights.head(6)

userId
1    0.003259
2    0.000177
3    0.000368
4    0.004531
5    0.001621
6    0.001442
Name: 603, dtype: float64

In [44]:
# Find the movies the target user has not rated: the mask is True wherever
# the user's entry in the users x movies matrix is 0.

users_items.loc[user_id].eq(0)

movieId
1         False
2          True
3          True
4          True
5          True
          ...  
193581     True
193583     True
193585     True
193587     True
193609     True
Name: 603, Length: 9724, dtype: bool

In [46]:
# Select the movies the target user has not rated (columns) for every user
# except the target user (rows), since that user has no weight of their own.
other_users = users_items.index != user_id
unrated_by_user = users_items.loc[user_id, :] == 0
not_watched_movies = users_items.loc[other_users, unrated_by_user]
not_watched_movies.T

userId,1,2,3,4,5,6,7,8,9,10,...,600,601,602,604,605,606,607,608,609,610
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2,0.0,0.0,0.0,0.0,0.0,4.0,0.0,4.0,0.0,0.0,...,4.0,0.0,4.0,5.0,3.5,0.0,0.0,2.0,0.0,0.0
3,4.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,...,1.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,...,2.5,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0
7,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,...,3.5,0.0,0.0,0.0,0.0,2.5,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
193581,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
193583,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
193585,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
193587,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [47]:
# Predicted score for each unrated movie: the other users' ratings combined
# using the similarity weights.

not_watched_movies.transpose().dot(weights)

movieId
2         0.770990
3         0.378475
4         0.037113
5         0.261547
7         0.341309
            ...   
193581    0.000291
193583    0.000254
193585    0.000254
193587    0.000254
193609    0.003072
Length: 8781, dtype: float64

In [48]:
# Dot product between the not-watched-movies matrix and the weights, kept as a
# one-column frame named "predicted_rating".
predicted = not_watched_movies.T.dot(weights)
weighted_averages = predicted.to_frame(name="predicted_rating")
weighted_averages

Unnamed: 0_level_0,predicted_rating
movieId,Unnamed: 1_level_1
2,0.770990
3,0.378475
4,0.037113
5,0.261547
7,0.341309
...,...
193581,0.000291
193583,0.000254
193585,0.000254
193587,0.000254


In [50]:
# Attach titles/genres to the predictions and show the five movies with the
# highest predicted rating.

recommendations = pd.merge(weighted_averages, movies,
                           left_index=True, right_on="movieId")
recommendations.sort_values(by="predicted_rating", ascending=False).head(5)

Unnamed: 0,predicted_rating,movieId,title,genres
277,2.620249,318,"Shawshank Redemption, The (1994)",Crime|Drama
46,2.011586,50,"Usual Suspects, The (1995)",Crime|Mystery|Thriller
507,1.966673,589,Terminator 2: Judgment Day (1991),Action|Sci-Fi
418,1.940126,480,Jurassic Park (1993),Action|Adventure|Sci-Fi|Thriller
43,1.79498,47,Seven (a.k.a. Se7en) (1995),Mystery|Thriller


In [51]:
def user_similarity(userID="U1001",n=10,user_movies=users_items,rest_names=movies):
  """Interactively recommend movies with user-based collaborative filtering.

  The function prompts on stdin for the user id and for the number of movies
  to return. As in the original implementation, the prompted values replace
  the ``userID`` and ``n`` arguments, which remain in the signature only for
  backward compatibility.

  Parameters
  ----------
  userID : ignored placeholder; the id is read with ``input()``.
  n : ignored placeholder; the count is read with ``input()``.
  user_movies : DataFrame with one row per user and one column per movie,
      holding ratings with 0 where a user has not rated a movie.
  rest_names : DataFrame with a ``movieId`` column; its remaining columns
      are attached to the predictions.

  Returns
  -------
  DataFrame with the ``n`` highest ``predicted_rating`` rows, merged with
  ``rest_names``.

  Raises
  ------
  KeyError : the entered id is not a label of ``user_movies.index``.
  ValueError : the entered count is not an integer, or the chosen user has
      zero similarity to every other user (weights cannot be normalised).
  """
  raw_user = input("What is your userID ").strip()
  n = int(input("How many movies do you want to get "))

  # input() always returns text, while the index labels may be integers:
  # try the text as typed first, then fall back to its integer form.
  userID = raw_user
  if userID not in user_movies.index:
    try:
      userID = int(raw_user)
    except ValueError:
      pass
  if userID not in user_movies.index:
    raise KeyError(f"userID {raw_user!r} not found in user_movies.index")

  user_similarities = pd.DataFrame(cosine_similarity(user_movies),
                                 columns=user_movies.index,
                                 index=user_movies.index)

  # Boolean row mask instead of query(): it does not depend on the index
  # being named "userId" and uses the same variable as the lookups below.
  other_users = user_movies.index != userID
  peer_similarities = user_similarities.loc[other_users, userID]
  total_similarity = peer_similarities.sum()
  if total_similarity == 0:
    raise ValueError(
      f"userID {raw_user!r} has zero similarity to every other user; "
      "cannot compute weights"
    )
  weights = peer_similarities / total_similarity

  # Only movies the chosen user has not rated (stored as 0) are candidates.
  unrated = user_movies.loc[userID, :] == 0
  not_watched_movies = user_movies.loc[other_users, unrated]
  weighted_averages = pd.DataFrame(not_watched_movies.T.dot(weights), columns=["predicted_rating"])
  recommendations = weighted_averages.merge(rest_names, left_index=True, right_on="movieId")
  return recommendations.sort_values("predicted_rating", ascending=False).head(n)