In [16]:
# Baseline Recommendation Models

## This notebook implements simple baseline recommenders to establish performance benchmarks before applying collaborative filtering models.


In [17]:
import pandas as pd
import numpy as np

# Read the three CSV tables from the shared data directory in one pass;
# the unpacking order matches the order of the paths below.
ratings, movies, tags = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../data/ratings.csv",
        "../data/movies.csv",
        "../data/tags.csv",
    )
)



In [5]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rating rows for evaluation; the fixed random_state
# keeps the split identical from run to run.
train, test = train_test_split(ratings, test_size=0.2, random_state=42)

# Row counts of the two partitions, shown as (train, test).
tuple(map(len, (train, test)))


(80668, 20168)

In [6]:
# Mean of the 'rating' column over the training rows only; used below as the
# constant prediction and as the fill value for movies absent from `train`.
global_mean = train.loc[:, 'rating'].mean()
global_mean


np.float64(3.502572271532702)

In [8]:
from sklearn.metrics import mean_squared_error
import numpy as np

# Global-average baseline: predict the training-set mean rating for every
# test row. The prediction column is built in this cell so the notebook runs
# top to bottom on a fresh kernel. `assign` returns a new frame rather than
# writing into the object handed back by train_test_split, and re-running
# the cell simply rebuilds the same column.
test = test.assign(pred_global=global_mean)

# RMSE between the held-out ratings and the constant prediction.
rmse_global = np.sqrt(
    mean_squared_error(
        test['rating'],
        test['pred_global']
    )
)

rmse_global

### Global Average Baseline
# - Extremely simple model
# - No personalization
# - Acts as a minimum performance benchmark


np.float64(1.0488405992661316)

In [9]:
# Average training rating per movie: a Series of 'rating' means indexed by
# 'movieId'. Only training rows contribute, so movies that appear solely in
# the test split have no entry here.
movie_means = train['rating'].groupby(train['movieId']).mean()
movie_means.head()


movieId
1    3.893678
2    3.373626
3    3.162500
4    2.250000
5    2.955882
Name: rating, dtype: float64

In [11]:
from sklearn.metrics import mean_squared_error
import numpy as np

# Movie-average baseline: look up each test row's movie in `movie_means`;
# movies with no entry there map to NaN and fall back to the global mean.
test['pred_movie'] = test['movieId'].map(movie_means).fillna(global_mean)

# RMSE between the held-out ratings and the per-movie predictions.
mse_movie = mean_squared_error(test['rating'], test['pred_movie'])
rmse_movie = np.sqrt(mse_movie)

rmse_movie


### Movie Average Baseline
# - Adapts predictions at the item level (each movie's mean training rating)
# - Performs better than global average
# - Still ignores user preferences



np.float64(0.9827389937822489)

In [12]:
# Popularity baseline: count training ratings per movie, then keep the ten
# movies with the highest counts.
ratings_per_movie = train.groupby('movieId').size()
popular_movies = ratings_per_movie.sort_values(ascending=False).head(10)

popular_movies


movieId
356     276
318     257
296     256
593     226
2571    214
260     194
110     189
480     185
589     182
527     174
dtype: int64

In [13]:
# Titles of the most-rated movies, in the same order as `popular_movies`
# (rows are selected by movieId label, columns restricted to 'title').
(
    movies
    .set_index('movieId')
    .loc[popular_movies.index, ['title']]
)


Unnamed: 0_level_0,title
movieId,Unnamed: 1_level_1
356,Forrest Gump (1994)
318,"Shawshank Redemption, The (1994)"
296,Pulp Fiction (1994)
593,"Silence of the Lambs, The (1991)"
2571,"Matrix, The (1999)"
260,Star Wars: Episode IV - A New Hope (1977)
110,Braveheart (1995)
480,Jurassic Park (1993)
589,Terminator 2: Judgment Day (1991)
527,Schindler's List (1993)


In [14]:
# Side-by-side RMSE of the two rating-prediction baselines, one row per model.
pd.DataFrame(
    [
        ("Global Average", rmse_global),
        ("Movie Average", rmse_movie),
    ],
    columns=["Model", "RMSE"],
)

## Baseline Model Observations

# - Movie-average baseline outperforms global-average baseline.
# - Neither model captures user-specific preferences.
# - These results justify the need for collaborative filtering models.


Unnamed: 0,Model,RMSE
0,Global Average,1.048841
1,Movie Average,0.982739
