In [2]:
import pandas as pd
import os
from surprise import Dataset, Reader
from surprise.model_selection import train_test_split


In [3]:
# Path to your data
file_path = "../data/ratings.csv"

# Tell Surprise how to read it
reader = Reader(line_format="user item rating timestamp", sep=",", skip_lines=1, rating_scale=(0.5, 5.0))

# Load the dataset from your CSV
data = Dataset.load_from_file(file_path, reader=reader)

trainset, testset = train_test_split(data, test_size=0.25)
print(f"Train size: {trainset.n_ratings}, Test size: {len(testset)}")




Train size: 75627, Test size: 25209


In [4]:
from surprise import SVD
from surprise import accuracy

# Create the algorithm
model = SVD()

# Train the model
model.fit(trainset)

# Test it
predictions = model.test(testset)

# Evaluate accuracy
accuracy.rmse(predictions)


RMSE: 0.8715


0.8715320706822531

In [5]:
# Get all item IDs from the training set
all_movies = trainset.all_items()

# Convert Surprise’s internal IDs back to raw movie IDs
all_movie_ids = [trainset.to_raw_iid(inner_id) for inner_id in all_movies]

# Pick a target user
target_user = "1"

# Get the user’s rated movies from the training set
user_rated = [item for (item, _) in trainset.ur[trainset.to_inner_uid(target_user)]]
user_rated_ids = [trainset.to_raw_iid(inner_id) for inner_id in user_rated]

# Find unseen movies
unseen_movies = [mid for mid in all_movie_ids if mid not in user_rated_ids]

# Predict ratings
predictions = [model.predict(target_user, mid) for mid in unseen_movies]

# Sort predictions by estimated rating
predictions.sort(key=lambda x: x.est, reverse=True)

# Choose top 10
top_n = predictions[:10]

# Display
for pred in top_n:
    print(f"Movie ID: {pred.iid} | Predicted rating: {round(pred.est, 2)}")


Movie ID: 318 | Predicted rating: 5.0
Movie ID: 356 | Predicted rating: 5.0
Movie ID: 4011 | Predicted rating: 5.0
Movie ID: 58559 | Predicted rating: 5.0
Movie ID: 778 | Predicted rating: 5.0
Movie ID: 3083 | Predicted rating: 5.0
Movie ID: 1193 | Predicted rating: 5.0
Movie ID: 2324 | Predicted rating: 5.0
Movie ID: 1276 | Predicted rating: 5.0
Movie ID: 1104 | Predicted rating: 5.0


In [6]:
import pandas as pd

movies_df = pd.read_csv("../data/movies.csv")
movie_map = dict(zip(movies_df["movieId"].astype(str), movies_df["title"]))

# Display top recommendations with titles
for pred in top_n:
    title = movie_map.get(pred.iid, "Unknown")
    print(f"{title} (Predicted: {round(pred.est, 2)})")


Shawshank Redemption, The (1994) (Predicted: 5.0)
Forrest Gump (1994) (Predicted: 5.0)
Snatch (2000) (Predicted: 5.0)
Dark Knight, The (2008) (Predicted: 5.0)
Trainspotting (1996) (Predicted: 5.0)
All About My Mother (Todo sobre mi madre) (1999) (Predicted: 5.0)
One Flew Over the Cuckoo's Nest (1975) (Predicted: 5.0)
Life Is Beautiful (La Vita è bella) (1997) (Predicted: 5.0)
Cool Hand Luke (1967) (Predicted: 5.0)
Streetcar Named Desire, A (1951) (Predicted: 5.0)


In [7]:
import os

print(os.getcwd())  # Shows your current folder
print(os.listdir())  # Lists all files


C:\Users\andre\Documents\Movie-Recommender\notebooks
['.ipynb_checkpoints', '01_data_something.ipynb', '02_train_model.ipynb']


In [8]:
data_path = os.path.join("data", "ml-latest-small")
movies = pd.read_csv("../data/movies.csv")
ratings = pd.read_csv("../data/ratings.csv")


print("🎥 Movies:")
display(movies.head())

print("⭐ Ratings:")
display(ratings.head())



🎥 Movies:


Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


⭐ Ratings:


Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [9]:
# Check for missing values
print(movies.isnull().sum())

# Check for duplicates
print("Duplicate movieIds:", movies['movieId'].duplicated().sum())


movieId    0
title      0
genres     0
dtype: int64
Duplicate movieIds: 0


In [10]:
# Check first few rows
ratings.head()


Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [11]:
# Check for missing values
print(ratings.isnull().sum())

# Check rating range
print("Min rating:", ratings['rating'].min())
print("Max rating:", ratings['rating'].max())

# Check for duplicates
duplicates = ratings.duplicated(subset=['userId','movieId'])
print("Duplicate ratings:", duplicates.sum())


userId       0
movieId      0
rating       0
timestamp    0
dtype: int64
Min rating: 0.5
Max rating: 5.0
Duplicate ratings: 0


In [12]:
ratings = ratings.drop_duplicates(subset=['userId','movieId'])

num_users = ratings['userId'].nunique()
num_movies = ratings['movieId'].nunique()
print(f"Number of users: {num_users}")
print(f"Number of movies: {num_movies}")


Number of users: 610
Number of movies: 9724


In [13]:
df = ratings.merge(movies, on='movieId', how='left')
df.head()


Unnamed: 0,userId,movieId,rating,timestamp,title,genres
0,1,1,4.0,964982703,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,1,3,4.0,964981247,Grumpier Old Men (1995),Comedy|Romance
2,1,6,4.0,964982224,Heat (1995),Action|Crime|Thriller
3,1,47,5.0,964983815,Seven (a.k.a. Se7en) (1995),Mystery|Thriller
4,1,50,5.0,964982931,"Usual Suspects, The (1995)",Crime|Mystery|Thriller


In [14]:
from surprise import SVD, Dataset, Reader
from surprise.model_selection import cross_validate, train_test_split
import pandas as pd


In [15]:
# Define the rating scale (MovieLens uses 0.5 to 5.0)
reader = Reader(rating_scale=(0.5, 5.0))

# Load the data from the ratings DataFrame
data = Dataset.load_from_df(ratings[['userId', 'movieId', 'rating']], reader)

trainset, testset = train_test_split(data, test_size=0.2)


In [16]:
# Initialize the model
model = SVD()

# Train it on the training set
model.fit(trainset)


<surprise.prediction_algorithms.matrix_factorization.SVD at 0x1712deb1f10>

In [17]:
from surprise import accuracy

# Generate predictions
predictions = model.test(testset)

# Calculate RMSE (Root Mean Squared Error)
accuracy.rmse(predictions)


RMSE: 0.8709


0.8709083097341981

In [18]:
cross_validate(model, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)


Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.8781  0.8644  0.8812  0.8725  0.8717  0.8736  0.0058  
MAE (testset)     0.6740  0.6644  0.6764  0.6674  0.6724  0.6709  0.0044  
Fit time          0.81    0.83    0.84    0.84    0.85    0.83    0.01    
Test time         0.09    0.17    0.10    0.18    0.09    0.13    0.04    


{'test_rmse': array([0.87812292, 0.86437853, 0.88118416, 0.87248884, 0.87171871]),
 'test_mae': array([0.67399946, 0.66435174, 0.67641178, 0.66741755, 0.67235042]),
 'fit_time': (0.8085818290710449,
  0.8257112503051758,
  0.8426251411437988,
  0.839850664138794,
  0.8489053249359131),
 'test_time': (0.09475493431091309,
  0.17395377159118652,
  0.09573984146118164,
  0.17603826522827148,
  0.09477996826171875)}

In [19]:
user_id = 1
all_movie_ids = movies['movieId'].unique()
rated_movies = ratings[ratings['userId'] == user_id]['movieId']
unrated_movies = [m for m in all_movie_ids if m not in rated_movies.values]

predictions = [model.predict(user_id, movie_id) for movie_id in unrated_movies]
predictions.sort(key=lambda x: x.est, reverse=True)
top_predictions = predictions[:10]

top_movie_ids = [pred.iid for pred in top_predictions]
recommended_movies = movies[movies['movieId'].isin(top_movie_ids)]

recommended_movies[['movieId', 'title']]

recommendations = pd.DataFrame([
    {"movieId": pred.iid, "predicted_rating": pred.est}
    for pred in top_predictions
]).merge(movies, on="movieId")[["title", "predicted_rating"]]

recommendations


Unnamed: 0,title,predicted_rating
0,"Shawshank Redemption, The (1994)",5.0
1,Dr. Strangelove or: How I Learned to Stop Worr...,5.0
2,"Streetcar Named Desire, A (1951)",5.0
3,12 Angry Men (1957),5.0
4,Lawrence of Arabia (1962),5.0
5,"Dark Knight, The (2008)",5.0
6,Guardians of the Galaxy (2014),5.0
7,Spotlight (2015),5.0
8,North by Northwest (1959),4.99127
9,Kiss Kiss Bang Bang (2005),4.98924


In [20]:
import numpy as np

# Get movie and user latent features
movie_factors = model.qi
user_factors = model.pu

movie_to_inner_id = {raw_id: inner_id for raw_id, inner_id in model.trainset._raw2inner_id_items.items()}
inner_id_to_movie = {inner_id: raw_id for raw_id, inner_id in movie_to_inner_id.items()}


In [21]:
from numpy.linalg import norm

def get_similar_movies(movie_title, n=10):
    # Find movieId for given title
    movie_id = movies[movies['title'].str.contains(movie_title, case=False, regex=False)]['movieId']
    if movie_id.empty:
        return f"No movie found with title containing '{movie_title}'."
    
    movie_id = movie_id.iloc[0]
    
    # Get inner ID
    inner_id = movie_to_inner_id.get(movie_id)
    if inner_id is None:
        return "Movie not in training set."
    
    # Get vector for this movie
    target_vector = movie_factors[inner_id]
    
    # Compute cosine similarity
    sims = movie_factors.dot(target_vector) / (norm(movie_factors, axis=1) * norm(target_vector))
    
    # Get top similar movies (excluding itself)
    similar_ids = np.argsort(sims)[::-1][1:n+1]
    similar_movie_ids = [inner_id_to_movie[i] for i in similar_ids]
    
    return movies[movies['movieId'].isin(similar_movie_ids)][['title']]


In [23]:
get_similar_movies("Matrix")


Unnamed: 0,title
97,Braveheart (1995)
613,Trainspotting (1996)
839,Top Gun (1986)
898,Star Wars: Episode V - The Empire Strikes Back...
899,"Princess Bride, The (1987)"
1824,You've Got Mail (1998)
3991,Red Dragon (2002)
4277,Salaam Bombay! (1988)
5917,Batman Begins (2005)
6299,"Science of Sleep, The (La science des rêves) (..."


In [26]:
import pickle

with open("svd_model.pkl", "wb") as f:
    pickle.dump(model, f)
