In [7]:
import pandas as pd
import numpy as np
from surprise import SVD, Dataset, Reader

ratings = pd.read_csv("../data/ratings.csv")
movies = pd.read_csv("../data/movies.csv")


In [8]:
reader = Reader(rating_scale=(0.5, 5.0))
data = Dataset.load_from_df(
    ratings[['userId', 'movieId', 'rating']],
    reader
)

trainset = data.build_full_trainset()

svd = SVD(
    n_factors=50,
    n_epochs=20,
    lr_all=0.005,
    reg_all=0.02,
    random_state=42
)

svd.fit(trainset)
reader = Reader(rating_scale=(0.5, 5.0))
data = Dataset.load_from_df(
    ratings[['userId', 'movieId', 'rating']],
    reader
)

trainset = data.build_full_trainset()

svd = SVD(
    n_factors=50,
    n_epochs=20,
    lr_all=0.005,
    reg_all=0.02,
    random_state=42
)

svd.fit(trainset)


<surprise.prediction_algorithms.matrix_factorization.SVD at 0x12073e810>

In [9]:
def recommend_movies(user_id, n=10):
    all_movies = movies['movieId'].unique()
    rated_movies = ratings[ratings['userId'] == user_id]['movieId'].values

    unseen_movies = [m for m in all_movies if m not in rated_movies]

    preds = [
        (movie_id, svd.predict(user_id, movie_id).est)
        for movie_id in unseen_movies
    ]

    top_preds = sorted(preds, key=lambda x: x[1], reverse=True)[:n]

    return movies[movies['movieId'].isin([p[0] for p in top_preds])]


In [10]:
recommend_movies(user_id=1, n=10)


Unnamed: 0,movieId,title,genres
277,318,"Shawshank Redemption, The (1994)",Crime|Drama
602,750,Dr. Strangelove or: How I Learned to Stop Worr...,Comedy|War
659,858,"Godfather, The (1972)",Crime|Drama
681,899,Singin' in the Rain (1952),Comedy|Musical|Romance
686,904,Rear Window (1954),Mystery|Thriller
694,912,Casablanca (1942),Drama|Romance
841,1104,"Streetcar Named Desire, A (1951)",Drama
903,1201,"Good, the Bad and the Ugly, The (Buono, il bru...",Action|Adventure|Western
906,1204,Lawrence of Arabia (1962),Adventure|Drama|War
922,1221,"Godfather: Part II, The (1974)",Crime|Drama


In [11]:
def cold_start_recommend(n=10):
    popular_movies = (
        ratings.groupby('movieId')
        .size()
        .sort_values(ascending=False)
        .head(n)
        .index
    )
    return movies[movies['movieId'].isin(popular_movies)]

cold_start_recommend(10)


Unnamed: 0,movieId,title,genres
97,110,Braveheart (1995),Action|Drama|War
224,260,Star Wars: Episode IV - A New Hope (1977),Action|Adventure|Sci-Fi
257,296,Pulp Fiction (1994),Comedy|Crime|Drama|Thriller
277,318,"Shawshank Redemption, The (1994)",Crime|Drama
314,356,Forrest Gump (1994),Comedy|Drama|Romance|War
418,480,Jurassic Park (1993),Action|Adventure|Sci-Fi|Thriller
461,527,Schindler's List (1993),Drama|War
507,589,Terminator 2: Judgment Day (1991),Action|Sci-Fi
510,593,"Silence of the Lambs, The (1991)",Crime|Horror|Thriller
1939,2571,"Matrix, The (1999)",Action|Sci-Fi|Thriller


In [12]:
## Final Conclusions

# - We built recommender systems from simple baselines to hybrid models
# - Surprise SVD achieved the best overall accuracy
# - Cold-start remains a fundamental challenge
# - Hybrid systems improve robustness in real-world scenarios
# - The final system can generate real recommendations for users
