In [1]:
import pandas as pd
from surprise import Dataset, Reader, SVD, accuracy
from surprise.model_selection import train_test_split
import numpy as np

In [2]:
# Load movie ratings data
ratings_df = pd.read_csv("~/Desktop/MovieLens-resources/ratings.csv")
ratings_df = ratings_df.drop('timestamp', axis=1)


In [3]:
# Load movies data
movies_df = pd.read_csv("~/Desktop/MovieLens-resources/movies.csv")


In [4]:
# Create a Surprise Reader and Dataset
reader = Reader(rating_scale=(0.5, 5))
data = Dataset.load_from_df(ratings_df[['userId', 'movieId', 'rating']], reader)


In [5]:
# Split the dataset
trainset, testset = train_test_split(data, test_size=0.2, random_state=42)


In [6]:
# Train the SVD model
svd_model = SVD()
svd_model.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x2197ff505d0>

In [11]:
# Make predictions on the test set
predictions = svd_model.test(testset)

In [13]:
# Evaluate the model
rmse = accuracy.rmse(predictions)
print(f"Root Mean Squared Error (RMSE): {rmse}")

RMSE: 0.7862
Root Mean Squared Error (RMSE): 0.7861738221464648


In [14]:
# Function to get similar movies for a given movie title
def get_similar_movies(movie_title, model, movies_df, n=5):
    # Get the movieId for the input movie title
    movie_id = movies_df[movies_df['title'] == movie_title]['movieId'].iloc[0]
    
    # Get the latent factors for the input movie
    movie_factors = model.qi[movie_id]
    
    # Compute the cosine similarity between the input movie and all other movies
    similarities = np.dot(model.qi, movie_factors)
    
    # Get indices of top n most similar movies
    similar_movie_indices = np.argsort(similarities)[::-1][:n+1]
    
    # Exclude the input movie itself
    similar_movie_indices = similar_movie_indices[similar_movie_indices != movie_id]
    
    # Get the titles of similar movies
    similar_movies = movies_df[movies_df['movieId'].isin(similar_movie_indices)]['title'].tolist()
    
    return similar_movies


In [19]:
# Example: Enter a movie and get similar movies
input_movie = "Incredibles 2 (2018)"
similar_movies = get_similar_movies(input_movie, svd_model, movies_df, n=5)



IndexError: index 187541 is out of bounds for axis 0 with size 79146

In [None]:
# Display the results
print(f"Movies similar to '{input_movie}':")
for i, movie in enumerate(similar_movies, start=1):
    print(f"{i}. {movie}")