In [2]:
#Importing the required libraries
import pandas as pd
import numpy as np
from surprise import Dataset, Reader, SVD
from surprise.model_selection import train_test_split, cross_validate
import pickle

In [3]:
#Loading the required dataset and reading it
ratings = pd.read_csv("movie_data.csv")
movies = pd.read_csv("Movie_Id_Titles.csv")

print(ratings.head())
print(movies.head())


   user_id  item_id  rating  timestamp
0        0       50       5  881250949
1        0      172       5  881250949
2        0      133       1  881250949
3      196      242       3  881250949
4      186      302       3  891717742
   item_id              title
0        1   Toy Story (1995)
1        2   GoldenEye (1995)
2        3  Four Rooms (1995)
3        4  Get Shorty (1995)
4        5     Copycat (1995)


In [5]:
#Merge the columns of the dataset
data = pd.merge(ratings, movies, on="item_id")

In [8]:
reader = Reader(rating_scale=(0.5, 5.0))  # Set based on actual rating scale
surprise_data = Dataset.load_from_df(data[['user_id', 'item_id', 'rating']], reader)

In [9]:
from surprise.model_selection import train_test_split
from surprise import SVD

trainset, testset = train_test_split(surprise_data, test_size=0.2, random_state=42)

model = SVD()
model.fit(trainset)


<surprise.prediction_algorithms.matrix_factorization.SVD at 0x7adbf8c5eb10>

In [10]:
from surprise.model_selection import cross_validate

results = cross_validate(model, surprise_data, measures=['RMSE', 'MAE'], cv=5, verbose=True)


Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.9365  0.9328  0.9356  0.9309  0.9365  0.9344  0.0022  
MAE (testset)     0.7399  0.7346  0.7375  0.7340  0.7373  0.7367  0.0021  
Fit time          1.86    1.42    1.44    1.42    1.97    1.62    0.24    
Test time         0.11    0.13    0.11    0.23    0.17    0.15    0.05    


In [11]:
prediction = model.predict(uid=1, iid=31)  # Example: user 1, movieId 31
print(prediction.est)


3.714357107702041


In [13]:
from collections import defaultdict

def get_top_n(predictions, n=10):
    top_n = defaultdict(list)
    for uid, iid, true_r, est, _ in predictions:
        top_n[uid].append((iid, est))

    # Sort and get top N
    for uid, user_ratings in top_n.items():
        user_ratings.sort(key=lambda x: x[1], reverse=True)
        top_n[uid] = user_ratings[:n]

    return top_n

# Predict for all pairs
testset_all = trainset.build_anti_testset()
predictions = model.test(testset_all)
top_n = get_top_n(predictions, n=10)

# Show top 10 for user 1
user_1_top = top_n[1]
print("Top 10 recommendations for User 1:")
for movie_id, score in user_1_top:
    title = movies[movies['item_id'] == int(movie_id)]['title'].values[0]
    print(f"{title} (Predicted rating: {score:.2f})")

Top 10 recommendations for User 1:
Schindler's List (1993) (Predicted rating: 4.81)
Henry V (1989) (Predicted rating: 4.72)
Casablanca (1942) (Predicted rating: 4.70)
Godfather, The (1972) (Predicted rating: 4.64)
Amadeus (1984) (Predicted rating: 4.64)
Return of the Jedi (1983) (Predicted rating: 4.61)
Glory (1989) (Predicted rating: 4.58)
Fargo (1996) (Predicted rating: 4.56)
Some Folks Call It a Sling Blade (1993) (Predicted rating: 4.56)
Close Shave, A (1995) (Predicted rating: 4.56)


In [14]:
import pickle

with open("svd_model.pkl", "wb") as f:
    pickle.dump(model, f)
