In [13]:
# Cold-Start Analysis

# This notebook analyzes how recommendation performance degrades for:
# - New users, infrequent users, and rarely rated movies


In [14]:
import pandas as pd
import numpy as np

# Read the ratings table; the path is resolved relative to the notebook's working directory.
ratings_csv = "../data/ratings.csv"
ratings = pd.read_csv(ratings_csv)
ratings.head()


Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [15]:
# Count how many ratings each user has, one row per userId.
# NOTE(review): these counts come from the full ratings table, so they include
# ratings that are later held out for testing — confirm this is intended.
ratings_per_user = ratings.groupby("userId").size()
user_activity = ratings_per_user.rename("num_ratings").reset_index()
user_activity.head()


Unnamed: 0,userId,num_ratings
0,1,232
1,2,29
2,3,39
3,4,216
4,5,44


In [16]:
def user_bucket(n, cold_max=10, light_max=50):
    """Classify a user by the number of ratings they have.

    Parameters
    ----------
    n : int
        Number of ratings attributed to the user.
    cold_max : int, default 10
        Largest rating count that is still labelled "Cold User".
    light_max : int, default 50
        Largest rating count that is still labelled "Light User".

    Returns
    -------
    str
        "Cold User" if ``n <= cold_max``, "Light User" if ``n <= light_max``,
        otherwise "Heavy User".

    Raises
    ------
    ValueError
        If ``cold_max`` is greater than ``light_max``.
    """
    # Inverted thresholds would silently make the "Light User" label unreachable.
    if cold_max > light_max:
        raise ValueError("cold_max must not exceed light_max")
    if n <= cold_max:
        return "Cold User"
    if n <= light_max:
        return "Light User"
    return "Heavy User"

# Label every user with an activity bucket, then show how many users fall in each.
user_activity["bucket"] = user_activity["num_ratings"].map(user_bucket)
user_activity["bucket"].value_counts()


bucket
Heavy User    378
Light User    232
Name: count, dtype: int64

In [17]:
from surprise import SVD, Dataset, Reader, accuracy
from surprise.model_selection import train_test_split

# Wrap the ratings frame in a Surprise dataset, declaring a 0.5-5.0 rating scale.
reader = Reader(rating_scale=(0.5, 5.0))
rating_columns = ['userId', 'movieId', 'rating']
data = Dataset.load_from_df(ratings[rating_columns], reader)

# Hold out 20% of the ratings for evaluation; the split is seeded.
trainset, testset = train_test_split(data, test_size=0.2, random_state=42)

# Fit SVD on the training portion.
svd_params = dict(
    n_factors=50,
    n_epochs=20,
    lr_all=0.005,
    reg_all=0.02,
    random_state=42,
)
svd = SVD(**svd_params)
svd.fit(trainset)

# Score the held-out ratings.
predictions = svd.test(testset)

# One row per held-out rating, plus the absolute gap between the true rating
# (r_ui) and the estimate (est).
pred_df = pd.DataFrame(predictions)
pred_df["abs_error"] = (pred_df["r_ui"] - pred_df["est"]).abs()

pred_df.head()


Unnamed: 0,uid,iid,r_ui,est,details,abs_error
0,140,6765,3.5,3.381675,{'was_impossible': False},0.118325
1,603,290,4.0,3.585833,{'was_impossible': False},0.414167
2,438,5055,4.0,3.065738,{'was_impossible': False},0.934262
3,433,164179,5.0,3.607901,{'was_impossible': False},1.392099
4,474,5114,4.0,3.312712,{'was_impossible': False},0.687288


In [18]:
# Attach each prediction's user-activity bucket.
# Columns left over from a previous run of this cell are dropped first, so that
# re-running it does not produce suffixed duplicates (userId_x, bucket_x, ...)
# and break the groupby("bucket") that follows.
pred_df = pred_df.drop(columns=["userId", "bucket"], errors="ignore").merge(
    user_activity[["userId", "bucket"]],
    left_on="uid",
    right_on="userId",
    how="left",
    validate="many_to_one",  # each user must map to exactly one bucket
)


In [19]:
# Mean absolute error of the held-out predictions, split by user-activity bucket.
pred_df.groupby("bucket")["abs_error"].agg("mean")

## Cold-Start Findings

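# - Users with fewer interactions have higher prediction error: in this run the
#   mean absolute error is about 0.73 for Light Users versus about 0.67 for Heavy Users.
# - No user falls into the "Cold User" bucket (the bucket counts above list only
#   Heavy and Light users), so the true cold-start case is not measured here.
# - Collaborative filtering relies on historical interactions.
# - Even SVD cannot learn meaningful latent factors for a user with no rating history.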


bucket
Heavy User    0.669667
Light User    0.730277
Name: abs_error, dtype: float64