In [1]:
# To analyze where your best model (Surprise SVD) fails and succeeds ; This is what converts a model into a research project.

In [2]:
import pandas as pd
import numpy as np

from surprise import SVD, Dataset, Reader, accuracy
from surprise.model_selection import train_test_split


In [3]:
ratings = pd.read_csv("../data/ratings.csv")
movies = pd.read_csv("../data/movies.csv")

ratings.head()


Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [4]:
reader = Reader(rating_scale=(0.5, 5.0))

data = Dataset.load_from_df(
    ratings[['userId', 'movieId', 'rating']],
    reader
)


In [5]:
trainset, testset = train_test_split(
    data,
    test_size=0.2,
    random_state=42
)


In [6]:
svd = SVD(
    n_factors=50,
    n_epochs=20,
    lr_all=0.005,
    reg_all=0.02,
    random_state=42
)

svd.fit(trainset)


<surprise.prediction_algorithms.matrix_factorization.SVD at 0x11e0bc290>

In [7]:
predictions = svd.test(testset)

accuracy.rmse(predictions)


RMSE: 0.8775


0.8774680781839198

In [8]:
pred_df = pd.DataFrame(predictions)

pred_df.head()


Unnamed: 0,uid,iid,r_ui,est,details
0,140,6765,3.5,3.381675,{'was_impossible': False}
1,603,290,4.0,3.585833,{'was_impossible': False}
2,438,5055,4.0,3.065738,{'was_impossible': False}
3,433,164179,5.0,3.607901,{'was_impossible': False}
4,474,5114,4.0,3.312712,{'was_impossible': False}


In [9]:
pred_df['abs_error'] = abs(pred_df['r_ui'] - pred_df['est'])

pred_df.head()


Unnamed: 0,uid,iid,r_ui,est,details,abs_error
0,140,6765,3.5,3.381675,{'was_impossible': False},0.118325
1,603,290,4.0,3.585833,{'was_impossible': False},0.414167
2,438,5055,4.0,3.065738,{'was_impossible': False},0.934262
3,433,164179,5.0,3.607901,{'was_impossible': False},1.392099
4,474,5114,4.0,3.312712,{'was_impossible': False},0.687288


In [10]:
pred_df.sort_values('abs_error', ascending=False).head(10)


Unnamed: 0,uid,iid,r_ui,est,details,abs_error
6252,543,35836,0.5,4.492607,{'was_impossible': False},3.992607
13184,105,4027,0.5,4.466533,{'was_impossible': False},3.966533
19761,573,44199,0.5,4.427108,{'was_impossible': False},3.927108
16133,413,2858,1.0,4.922919,{'was_impossible': False},3.922919
10071,594,799,0.5,4.35707,{'was_impossible': False},3.85707
6902,224,969,1.0,4.827925,{'was_impossible': False},3.827925
19570,573,1127,0.5,4.294505,{'was_impossible': False},3.794505
10211,258,122886,0.5,4.283742,{'was_impossible': False},3.783742
2425,393,589,0.5,4.263195,{'was_impossible': False},3.763195
11442,594,4902,0.5,4.215855,{'was_impossible': False},3.715855


In [11]:
user_error = (
    pred_df
    .groupby('uid')['abs_error']
    .mean()
    .sort_values(ascending=False)
)

user_error.head(10)


uid
3      2.405241
485    2.064910
77     1.939918
329    1.931596
255    1.795320
259    1.763432
461    1.659858
130    1.613871
500    1.603527
81     1.564909
Name: abs_error, dtype: float64

In [12]:
movie_error = (
    pred_df
    .groupby('iid')['abs_error']
    .mean()
    .sort_values(ascending=False)
)

movie_error.head(10)


iid
5105      3.451762
7700      3.218722
145951    3.101923
2526      3.016440
5560      2.942766
2742      2.942766
1999      2.902032
4412      2.864581
42018     2.849214
4518      2.786976
Name: abs_error, dtype: float64

In [13]:
movie_error_df = movie_error.reset_index()
movie_error_df.columns = ['movieId', 'mean_abs_error']

movie_error_df = movie_error_df.merge(
    movies,
    on='movieId',
    how='left'
)

movie_error_df.head(10)


Unnamed: 0,movieId,mean_abs_error,title,genres
0,5105,3.451762,Don't Look Now (1973),Drama|Horror|Thriller
1,7700,3.218722,"Wages of Fear, The (Salaire de la peur, Le) (1...",Action|Adventure|Drama|Thriller
2,145951,3.101923,Bloodsport: The Dark Kumite (1999),Action|Thriller
3,2526,3.01644,Meteor (1979),Sci-Fi
4,5560,2.942766,À nous la liberté (Freedom for Us) (1931),Comedy|Musical
5,2742,2.942766,Ménage (Tenue de soirée) (1986),Comedy|Drama
6,1999,2.902032,"Exorcist III, The (1990)",Horror
7,4412,2.864581,"Thing with Two Heads, The (1972)",Comedy|Horror|Sci-Fi
8,42018,2.849214,Mrs. Henderson Presents (2005),Comedy|Drama
9,4518,2.786976,The Lair of the White Worm (1988),Comedy|Horror


In [None]:
### Error Analysis Insights

# - Users with fewer ratings tend to have higher prediction errors.
# - Niche or less-rated movies are harder to predict accurately.
# - Popular movies have lower average error due to dense interactions.
# - Even strong models like SVD struggle with cold-start scenarios.
