# This file contains the code to run an **SVD**-based movie recommendation model trained on the Netflix Prize dataset.

In [None]:
# Import the required libraries for reading the dataset
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [None]:
# Load the ratings file (a 15% sample of the original dataset)
ratings_df = pd.read_csv(
    '/kaggle/input/final-dataset/final_dataset.csv',
    encoding="ISO-8859-1",
    header=0,
)

# Show the first 10 rows as a quick sanity check
print(ratings_df.head(10))

In [None]:
# Keep only the columns the recommender needs: who rated, what score, which movie
rating_columns = ['CustomerID', 'Rating', 'MovieID']
dataset = ratings_df[rating_columns]
dataset.head()

The following code is needed to correctly process the movie titles and merge them into a single column. The problem occurs because the titles are stored in a CSV file and some titles themselves contain commas, so a single title can be split across several columns when the file is parsed. The following code resolves this.

In [None]:
# Titles that contain commas are split across several fields when the CSV is
# parsed, so reserve extra columns to catch the overflow fragments and then
# glue the fragments back together into a single 'Title' column.
column_names = ['Movie_Id', 'YearOfRelease', 'Title', 'extra_col1', 'extra_col2', 'extracol3']
title_part_columns = column_names[2:]

df_title_raw = pd.read_csv(
    '/kaggle/input/netflix-prize-data/movie_titles.csv',
    encoding="ISO-8859-1",
    header=None,
    names=column_names,
    # Read every title fragment as text so a numeric-looking fragment is not
    # converted to a float before it is joined back into the title.
    dtype={column: str for column in title_part_columns},
)

# Rebuild each full title from its non-missing fragments. Joining with ','
# puts back the commas the parser consumed; dropping NaN fragments keeps the
# literal text 'nan' out of the result.
full_titles = df_title_raw[title_part_columns].apply(
    lambda row: ','.join(row.dropna()), axis=1
)

# Final table: one row per movie, with the complete title stored in 'Title'
# (the column the recommendation lookup reads).
df_title = df_title_raw[['Movie_Id', 'YearOfRelease']].assign(Title=full_titles)

In [None]:
# Preview the first rows of the movie-title table to check the parsing result
df_title.head()

In [None]:
# Installing the required scikit-surprise library
!pip install scikit-surprise

In [None]:
# Import required libraries to train and test the model
from surprise import Reader, Dataset, SVD
from surprise.model_selection import cross_validate

The following is the **5-fold** and **10-fold** cross validation of the SVD model

In [None]:
# Seed NumPy's global RNG so the run is repeatable: Surprise draws from it for
# both the fold shuffling and the SVD factor initialisation when no explicit
# random_state is supplied.
np.random.seed(42)

# Ratings are on a 1-5 scale (this is also Surprise's default, stated explicitly)
reader = Reader(rating_scale=(1, 5))

# Loading the required data into the data variable.
# Surprise expects the columns in (user, item, rating) order.
data = Dataset.load_from_df(dataset[['CustomerID', 'MovieID', 'Rating']], reader)

# Creating an instance of the SVD algorithm
svd = SVD()

# Running the algorithm for 5-fold cross validation.
result5 = cross_validate(svd, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)

In [None]:
# Per-fold RMSE from the 5-fold cross-validation run
rmse_scores = result5['test_rmse']
fold_numbers = range(1, len(rmse_scores) + 1)

# Line chart with one marker per fold
fig, ax = plt.subplots(figsize=(10, 8))
ax.plot(fold_numbers, rmse_scores, marker='o', linestyle='-')
ax.set_title('RMSE Values across Folds for 5 CV')
ax.set_xlabel('Fold')
ax.set_ylabel('RMSE')
ax.grid(True)
plt.show()

In [None]:
# Running the same experiment for 10-fold cross validation.
# Seed NumPy's global RNG so the run is repeatable: Surprise draws from it for
# both the fold shuffling and the SVD factor initialisation when no explicit
# random_state is supplied.
np.random.seed(42)

# Ratings are on a 1-5 scale (this is also Surprise's default, stated explicitly)
reader = Reader(rating_scale=(1, 5))

# Loading the required data into the data variable.
# Surprise expects the columns in (user, item, rating) order.
data = Dataset.load_from_df(dataset[['CustomerID', 'MovieID', 'Rating']], reader)

# Creating an instance of the SVD algorithm
svd = SVD()

# Running the algorithm for 10-fold cross validation.
result10 = cross_validate(svd, data, measures=['RMSE', 'MAE'], cv=10, verbose=True)

In [None]:
# Per-fold RMSE from the 10-fold cross-validation run
rmse_scores = result10['test_rmse']
fold_numbers = range(1, len(rmse_scores) + 1)

# Line chart with one marker per fold
fig, ax = plt.subplots(figsize=(10, 8))
ax.plot(fold_numbers, rmse_scores, marker='o', linestyle='-')
ax.set_title('RMSE Values across Folds for 10 CV')
ax.set_xlabel('Fold')
ax.set_ylabel('RMSE')
ax.grid(True)
plt.show()

The following is the code to train the model and generate recommendations for any given user

In [None]:
# Initialize a Reader object with the rating scale to be from 1 to 5
reader = Reader(rating_scale=(1, 5))

# Loading the required data into the data variable.
# Surprise expects the columns in (user, item, rating) order.
data = Dataset.load_from_df(dataset[['CustomerID', 'MovieID', 'Rating']], reader)

# A fixed random_state makes the factor initialisation, and therefore the
# recommendations produced further down, repeatable across runs.
svd = SVD(random_state=42)

# Now that we are done verifying the working of the model, we can train the
# prediction model on the entire dataset (no held-out fold).
trainset = data.build_full_trainset()
svd.fit(trainset)

# Function to get unrated movies for a user
def get_user_id_unrated(user_id, ratings=None):
    """Return the IDs of every movie the given user has not rated.

    Parameters
    ----------
    user_id
        Value matched against the 'CustomerID' column.
    ratings : pandas.DataFrame, optional
        Ratings table with 'CustomerID' and 'MovieID' columns. When omitted,
        the notebook-level ``dataset`` frame is used.

    Returns
    -------
    list
        Distinct movie IDs in order of first appearance in the ratings table,
        excluding those the user has already rated. A user with no ratings
        gets every movie ID back.
    """
    if ratings is None:
        ratings = dataset

    # Get all movie IDs
    all_movie_ids = ratings['MovieID'].unique()

    # Movies rated by the user, held in a set so each membership test below is
    # a constant-time lookup instead of a scan over a list.
    rated_by_user = set(ratings.loc[ratings['CustomerID'] == user_id, 'MovieID'].tolist())

    # Filter movies that the user hasn't rated yet
    return [movie_id for movie_id in all_movie_ids if movie_id not in rated_by_user]

In [None]:
# Function to get movie names from movie IDs
def get_movie_names_from_id(movie_ids, titles=None):
    """Return the titles for ``movie_ids``, in the same order as ``movie_ids``.

    Keeping the caller's order matters when the IDs are ranked (for example
    best recommendation first): a plain ``isin`` filter would hand the titles
    back in table order and lose the ranking.

    Parameters
    ----------
    movie_ids : iterable
        Movie IDs to look up.
    titles : pandas.DataFrame, optional
        Table with 'Movie_Id' and 'Title' columns. When omitted, the
        notebook-level ``df_title`` frame is used.

    Returns
    -------
    list
        One title per ID found in the table; IDs with no match are skipped.
    """
    if titles is None:
        titles = df_title

    # Build an id -> title lookup once, then walk the IDs in the given order
    title_by_id = dict(zip(titles['Movie_Id'], titles['Title']))
    return [title_by_id[movie_id] for movie_id in movie_ids if movie_id in title_by_id]

# user_id is the user for whom we want to recommend the movies
user_id = 71

# Since we don't want a movie that is already rated by the user to be
# recommended to them, we get a list of movies not rated by them
unrated_movies = get_user_id_unrated(user_id)

# Using SVD model to predict the ratings of all the unrated movies
predicted_ratings = [svd.predict(user_id, movie_id) for movie_id in unrated_movies]

# The movie the user might like the most has the highest estimated rating,
# so order the predictions from highest to lowest estimate
predicted_ratings.sort(key=lambda prediction: prediction.est, reverse=True)

# Keep the IDs of the top_n highest-rated predictions
top_n = 10
recommended_movie_ids = [prediction.iid for prediction in predicted_ratings[:top_n]]

# Get movie names corresponding to recommended movie IDs
recommended_movie = get_movie_names_from_id(recommended_movie_ids)

# Report the count from top_n so the message stays correct if top_n is changed
print(f"Top {top_n} movie recommendations for user {user_id}:")
print(recommended_movie)

In [None]:
# Sanity check: has the user already rated any of the recommended movies?
user_id = 71

# All MovieIDs rated by this customer
customer_movies = dataset[dataset['CustomerID'] == user_id]['MovieID']

# Rows where this customer rated one of the recommended movies
is_this_customer = dataset['CustomerID'] == user_id
is_recommended = dataset['MovieID'].isin(recommended_movie_ids)
already_rated_rows = dataset[is_this_customer & is_recommended]

if already_rated_rows.empty:
    print("None of the above recommened movies are rated by the given customer.")
else:
    print("Some of the MovieIDs are already present for the given customerID.")
    print(already_rated_rows)