In [7]:
from google.colab import files
uploaded = files.upload()
# Upload the extracted 'ratings.csv' and 'movies.csv' files


Saving movies.csv to movies.csv
Saving ratings.csv to ratings.csv


In [8]:
import pandas as pd

ratings = pd.read_csv('ratings.csv')
movies = pd.read_csv('movies.csv')

print(ratings.head())
print(movies.head())


   userId  movieId  rating  timestamp
0       1        1     4.0  964982703
1       1        3     4.0  964981247
2       1        6     4.0  964982224
3       1       47     5.0  964983815
4       1       50     5.0  964982931
   movieId                               title  \
0        1                    Toy Story (1995)   
1        2                      Jumanji (1995)   
2        3             Grumpier Old Men (1995)   
3        4            Waiting to Exhale (1995)   
4        5  Father of the Bride Part II (1995)   

                                        genres  
0  Adventure|Animation|Children|Comedy|Fantasy  
1                   Adventure|Children|Fantasy  
2                               Comedy|Romance  
3                         Comedy|Drama|Romance  
4                                       Comedy  


In [12]:
from sklearn.model_selection import train_test_split

train_ratings, test_ratings = train_test_split(ratings, test_size=0.2, random_state=42)

print(f"Train samples: {len(train_ratings)}, Test samples: {len(test_ratings)}")


Train samples: 80668, Test samples: 20168


In [9]:
user_movie_matrix = ratings.pivot(index='userId', columns='movieId', values='rating').fillna(0)
print('User-Movie matrix shape:', user_movie_matrix.shape)


User-Movie matrix shape: (610, 9724)


In [16]:
train_user_movie = train_ratings.pivot(index='userId', columns='movieId', values='rating').fillna(0)
from sklearn.metrics.pairwise import cosine_similarity
user_similarity = cosine_similarity(train_user_movie)
user_ids = train_user_movie.index.tolist()
user_id_to_idx = {user_id: idx for idx, user_id in enumerate(user_ids)}



In [17]:
import numpy as np

def recommend_movies(user_id, user_movie_matrix, user_similarity, movies, top_n=10):
    user_idx = user_id - 1  # assuming user IDs start at 1 consecutively
    sim_scores = user_similarity[user_idx]

    weighted_ratings = user_movie_matrix.T.dot(sim_scores)
    norm_factors = np.array([np.abs(sim_scores).sum()])
    predicted_ratings = weighted_ratings / norm_factors

    # Exclude movies already rated by user
    user_ratings = user_movie_matrix.iloc[user_idx]
    predicted_ratings[user_ratings > 0] = 0

    # Get top N movie indices with highest predicted ratings
    top_indices = predicted_ratings.argsort()[::-1][:top_n]
    recommended_movie_ids = predicted_ratings.index[top_indices]

    recommended_titles = movies[movies['movieId'].isin(recommended_movie_ids)][['movieId', 'title']]

    return recommended_titles

# Example usage: Recommend 10 movies for user 1
recommendations = recommend_movies(1, user_movie_matrix, user_similarity, movies)
print(recommendations)


      movieId                                              title
31         32          Twelve Monkeys (a.k.a. 12 Monkeys) (1995)
123       150                                   Apollo 13 (1995)
277       318                   Shawshank Redemption, The (1994)
506       588                                     Aladdin (1992)
507       589                  Terminator 2: Judgment Day (1991)
659       858                              Godfather, The (1972)
2078     2762                            Sixth Sense, The (1999)
3638     4993  Lord of the Rings: The Fellowship of the Ring,...
4137     5952      Lord of the Rings: The Two Towers, The (2002)
4800     7153  Lord of the Rings: The Return of the King, The...


In [20]:
def predict_rating(user_id, movie_id, user_movie_matrix, user_similarity, user_id_to_idx):
    if user_id not in user_id_to_idx:
        return np.nan
    user_idx = user_id_to_idx[user_id]

    if movie_id not in user_movie_matrix.columns:
        return np.nan

    sim_scores = user_similarity[user_idx]
    movie_ratings = user_movie_matrix[movie_id]
    weighted_ratings = np.dot(sim_scores, movie_ratings)
    sum_sims = np.sum(np.abs(sim_scores))

    if sum_sims == 0:
        return np.nan

    return weighted_ratings / sum_sims


In [22]:
import numpy as np

test_ratings = test_ratings[test_ratings['movieId'].isin(train_user_movie.columns)]

predictions = []
truths = []

for _, row in test_ratings.iterrows():
    pred = predict_rating(row['userId'], row['movieId'], train_user_movie, user_similarity, user_id_to_idx)
    if not np.isnan(pred):
        predictions.append(pred)
        truths.append(row['rating'])


from sklearn.metrics import mean_squared_error, mean_absolute_error

mse = mean_squared_error(truths, predictions)
mae = mean_absolute_error(truths, predictions)
print(f"Test MSE: {mse:.4f}")
print(f"Test MAE: {mae:.4f}")


Test MSE: 10.4012
Test MAE: 3.0590
