In [30]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score

In [31]:
# Load the ratings table from a CSV file given by a relative path.
df = pd.read_csv('ratings_small.csv')


## Quick EDA

In [32]:
# Display 10 randomly sampled rows as a quick look at the loaded data.
df.sample(10)

Unnamed: 0,userId,movieId,rating,timestamp
96673,647,1354,5.0,947292957
31403,227,2321,1.0,913134235
53840,388,1183,4.0,973840441
48619,355,5952,5.0,1130103479
55027,394,31364,0.5,1297603425
58532,426,2571,4.5,1310374252
66526,468,6373,2.0,1296196987
34505,247,69,4.0,953189620
31609,231,111,4.0,977005465
70628,492,1233,3.0,898108428


In [33]:
# Print column dtypes, non-null counts and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100004 entries, 0 to 100003
Data columns (total 4 columns):
 #   Column     Non-Null Count   Dtype  
---  ------     --------------   -----  
 0   userId     100004 non-null  int64  
 1   movieId    100004 non-null  int64  
 2   rating     100004 non-null  float64
 3   timestamp  100004 non-null  int64  
dtypes: float64(1), int64(3)
memory usage: 3.1 MB


In [34]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
df.describe()

Unnamed: 0,userId,movieId,rating,timestamp
count,100004.0,100004.0,100004.0,100004.0
mean,347.01131,12548.664363,3.543608,1129639000.0
std,195.163838,26369.198969,1.058064,191685800.0
min,1.0,1.0,0.5,789652000.0
25%,182.0,1028.0,3.0,965847800.0
50%,367.0,2406.5,4.0,1110422000.0
75%,520.0,5418.0,4.0,1296192000.0
max,671.0,163949.0,5.0,1476641000.0


## Convert timestamp to datetime

In [35]:
# Parse the epoch-seconds `timestamp` column into a proper datetime column named `date`.
df = df.assign(date=pd.to_datetime(df['timestamp'], unit='s'))

In [36]:
# Remove the raw epoch column; the parsed `date` column replaces it.
# Reassigning instead of `inplace=True`, together with `errors='ignore'`,
# keeps this cell safe to re-run without raising a KeyError.
df = df.drop(columns='timestamp', errors='ignore')

## Split train test

Our business problem is to predict the rating each user would give to movies they have not seen yet, and to recommend the best-fitting movies to a specific user. So our target column is `rating`.

In [37]:
# Column the model has to predict.
target = 'rating'
# Columns kept as model inputs.
features = [
    'userId',
    'movieId',
    'date',
]

In [38]:
# Separate the inputs from the label, then hold out 20% of the rows for testing.
# The fixed random_state makes the split reproducible.
X, y = df[features], df[target]

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=444,
)

## Baseline model

There are three kinds of recommendation systems:
- Demographic filtering: this approach is based on movie popularity, so it offers every user the same recommendations.
- Content-based filtering: this approach is based on the movie's metadata (cast, director, genre, etc.). It offers users movies similar to the ones they liked.
- Collaborative filtering: this approach is based on other users' ratings. It matches users with similar ratings/tastes and offers recommendations based on these similarities.

As a baseline, we will build a demographic filtering model, which is simple and basic; later we will try to build models based on the other two kinds of filtering: content-based and collaborative.

So, to build our demographic filtering model, we will group our dataframe by movieId and compute the mean rating per movie.
To do that, we first need to add the rating column back to our X_train.

In [39]:
# Re-attach the label column to the training features; rows are matched on their shared index.
df_train = X_train.join(y_train)

In [40]:
# Number of (non-null) userId entries per movie in the training set, i.e. votes per movie.
user_count = df_train['userId'].groupby(df_train['movieId']).count()

In [41]:
# Average training rating of each movie.
rating_mean = df_train['rating'].groupby(df_train['movieId']).mean()

In [42]:
# Per-movie table (indexed by movieId) holding the vote count and the mean rating side by side.
df_demo = pd.concat(
    [user_count.rename('user_count'), rating_mean.rename('rating_mean')],
    axis=1,
)

In [43]:
# Preview the first rows of the per-movie table.
df_demo.head()

Unnamed: 0_level_0,user_count,rating_mean
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
1,187,3.885027
2,88,3.454545
3,49,3.142857
4,10,2.35
5,46,3.315217


The issue with this rating is that it does not take into account the number of votes per movie, which can have an impact on the rating. So to get a fairer rating, we can use IMDB's weighted rating (WR):
$$WR = (\frac{v}{v+m})R+ (\frac{m}{m+v})C$$

with:

v: number of votes for this movie

m: minimum number of votes required to be in the list (here 1)

R: mean rating of this movie

C: mean rating over the whole dataset (here, the training set)

In [44]:
# Global mean rating over the training set; used as the `C` term of the weighted rating.
C = df_train['rating'].mean()
def weighted_rating(x, m=1, mean_rating=None):
    """Return the IMDB-style weighted rating WR = v/(v+m) * R + m/(m+v) * C.

    Parameters
    ----------
    x : mapping-like (dict, pandas Series row, or DataFrame)
        Must provide ``x['user_count']`` (v, number of votes) and
        ``x['rating_mean']`` (R, mean rating). Passing a whole DataFrame
        computes the result for every row at once.
    m : number, default 1
        Weight given to the global mean (the "minimum votes" term of the formula).
    mean_rating : number, optional
        Global mean rating (C). When omitted, the module-level ``C`` is used.

    Returns
    -------
    Scalar for a single row, or a Series when ``x`` is a DataFrame.
    """
    v = x['user_count']
    R = x['rating_mean']
    # Fall back to the notebook-level global mean only when no explicit value is supplied.
    c = C if mean_rating is None else mean_rating
    # Calculation based on the IMDB formula
    return (v / (v + m) * R) + (m / (m + v) * c)

In [45]:
# Add the weighted rating to the per-movie table.
# `weighted_rating` only uses column lookups and arithmetic, so it can be called on the
# whole frame at once instead of row by row through `apply(axis=1)`.
df_demo['weighted_rating'] = weighted_rating(df_demo)

Weighted rating is our prediction. Now we can compute some metrics to check our model, and use these metrics from now on as baseline.

## Metrics

This is a regression problem, so we will calculate RMSE and R2. The true labels are the ratings per movie, per user. Our prediction is the weighted rating.

In [46]:
# Add the prediction to the train frame, to have the weighted rating per user and per movie.
# Any `weighted_rating` column left by a previous run is dropped first; otherwise re-running
# this cell would produce `weighted_rating_x` / `weighted_rating_y` and break the cells below.
df_train = pd.merge(
    df_train.drop(columns='weighted_rating', errors='ignore'),
    df_demo['weighted_rating'],
    on="movieId",
)

In [47]:
# Observed ratings and baseline predictions for the training rows.
y_true, y_pred = df_train['rating'], df_train['weighted_rating']

In [48]:
# Compute RMSE and R2 of the baseline on the training set.
# RMSE is taken as sqrt(MSE): the `squared=False` argument of `mean_squared_error`
# was deprecated in scikit-learn 1.4 and later removed.
rmse = np.sqrt(mean_squared_error(y_true, y_pred))
R2 = r2_score(y_true, y_pred)
print('For the train set, we have rmse={} and R2={}'.format(rmse, R2))

For the train set, we have rmse=0.8983812277918807 and R2=0.27566346580221734


Let's check the test set.

In [49]:
# Look up each test row's movie in the per-movie table and attach its weighted rating.
# A left join keeps every test row; movies absent from the table get NaN.
df_test = X_test.merge(
    df_demo['weighted_rating'],
    how='left',
    on="movieId",
)

Some movies from the test set are not in the train set, so for those we will use the global average rating of the training set as the prediction.

In [50]:
# Movies without a weighted rating fall back to the training-set global mean `C`.
df_test = df_test.fillna({'weighted_rating': C})

In [51]:
# Observed ratings and baseline predictions for the held-out rows.
y_test_true, y_test_pred = y_test, df_test['weighted_rating']

In [52]:
# Sanity check: labels and predictions should have the same length.
y_test_true.shape, y_test_pred.shape

((20001,), (20001,))

In [53]:
# Compute RMSE and R2 of the baseline on the held-out test set.
# RMSE is taken as sqrt(MSE): the `squared=False` argument of `mean_squared_error`
# was deprecated in scikit-learn 1.4 and later removed.
rmse = np.sqrt(mean_squared_error(y_test_true, y_test_pred))
R2 = r2_score(y_test_true, y_test_pred)
# These are test-set metrics, so the message says "test" (it previously said "train").
print('For the test set, we have rmse={} and R2={}'.format(rmse, R2))

For the train set, we have rmse=0.9913519353869503 and R2=0.13823706575577988


In [54]:
# Display the test frame with its predicted `weighted_rating` column.
df_test

Unnamed: 0,userId,movieId,date,weighted_rating
0,111,1617,2004-10-10 17:51:48,4.149947
1,214,1961,2001-01-01 20:58:03,3.939847
2,336,1269,2001-07-22 18:41:50,3.946106
3,525,2,2002-06-24 14:36:34,3.455560
4,472,1489,1999-10-04 02:38:26,3.544861
...,...,...,...,...
19996,580,7147,2006-12-26 18:17:30,3.819201
19997,659,592,1996-06-12 16:47:20,3.435679
19998,537,235,1997-11-14 11:26:14,3.910897
19999,110,592,1996-08-15 09:12:40,3.435679
