In [4]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from math import sqrt

In [6]:
# Load ratings and movies dataset (preprocessed)
ratings = pd.read_csv("data/ml-100k/u.data", sep="\t", names=["userId", "movieId", "rating", "timestamp"])
movies = pd.read_csv("data/ml-100k/u.item", sep="|", encoding='latin-1', names=[
    "movieId", "title", "release_date", "video_release_date", "IMDb_URL", "unknown",
    "Action","Adventure","Animation","Children's","Comedy","Crime","Documentary","Drama",
    "Fantasy","Film-Noir","Horror","Musical","Mystery","Romance","Sci-Fi","Thriller",
    "War","Western"
], usecols=range(24))

# Check the first few rows
print(ratings.head())
print(movies.head())

   userId  movieId  rating  timestamp
0     196      242       3  881250949
1     186      302       3  891717742
2      22      377       1  878887116
3     244       51       2  880606923
4     166      346       1  886397596
   movieId              title release_date  video_release_date  \
0        1   Toy Story (1995)  01-Jan-1995                 NaN   
1        2   GoldenEye (1995)  01-Jan-1995                 NaN   
2        3  Four Rooms (1995)  01-Jan-1995                 NaN   
3        4  Get Shorty (1995)  01-Jan-1995                 NaN   
4        5     Copycat (1995)  01-Jan-1995                 NaN   

                                            IMDb_URL  unknown  Action  \
0  http://us.imdb.com/M/title-exact?Toy%20Story%2...        0       0   
1  http://us.imdb.com/M/title-exact?GoldenEye%20(...        0       1   
2  http://us.imdb.com/M/title-exact?Four%20Rooms%...        0       0   
3  http://us.imdb.com/M/title-exact?Get%20Shorty%...        0       1   
4  http://

In [7]:
train_data, test_data = train_test_split(ratings, test_size=0.2, random_state=42)
print(f"Train dataset: {train_data.shape}, Test dataset: {test_data.shape}")

Train dataset: (80000, 4), Test dataset: (20000, 4)


In [8]:
# Pivot table: rows = users, columns = movies, values = ratings
train_matrix = train_data.pivot(index='userId', columns='movieId', values='rating')
test_matrix = test_data.pivot(index='userId', columns='movieId', values='rating')

print(train_matrix.head())

movieId  1     2     3     4     5     6     7     8     9     10    ...  \
userId                                                               ...   
1         NaN   3.0   4.0   NaN   3.0   NaN   4.0   NaN   5.0   3.0  ...   
2         4.0   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN  ...   
3         NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN  ...   
4         NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN  ...   
5         4.0   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN  ...   

movieId  1668  1670  1671  1672  1673  1676  1678  1679  1680  1681  
userId                                                               
1         NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN  
2         NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN  
3         NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN  
4         NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN  
5         NaN   NaN   NaN   NaN   NaN   NaN   N

In [9]:
# Calculate mean rating per user
user_mean = train_matrix.mean(axis=1)

# Normalize by subtracting mean
train_matrix_norm = train_matrix.sub(user_mean, axis=0)

# Fill NaNs with 0
train_matrix_norm.fillna(0, inplace=True)

In [10]:
from sklearn.metrics.pairwise import cosine_similarity

# Compute similarity between users
user_similarity = cosine_similarity(train_matrix_norm)
user_similarity = pd.DataFrame(user_similarity, index=train_matrix_norm.index, columns=train_matrix_norm.index)

print(user_similarity.head())

userId       1         2         3         4         5         6         7    \
userId                                                                         
1       1.000000  0.026776 -0.009507  0.071335  0.060719  0.052215  0.095372   
2       0.026776  1.000000  0.001526 -0.045814  0.039236  0.060250  0.098236   
3      -0.009507  0.001526  1.000000 -0.077239  0.000000 -0.031638  0.000118   
4       0.071335 -0.045814 -0.077239  1.000000 -0.003889 -0.046988 -0.024805   
5       0.060719  0.039236  0.000000 -0.003889  1.000000  0.027890  0.042048   

userId       8         9         10   ...       934       935       936  \
userId                                ...                                 
1       0.140417  0.006843 -0.041563  ...  0.011441 -0.010146  0.051570   
2       0.036941  0.052222  0.004292  ...  0.013064 -0.028013  0.031949   
3       0.013674 -0.004630  0.004394  ... -0.001830 -0.002717 -0.024492   
4       0.037080 -0.002601 -0.013575  ... -0.007540  0.004197 -0

In [11]:
# Predict ratings
def predict_ratings(user_sim, ratings):
    pred = user_sim.dot(ratings) / np.abs(user_sim).sum(axis=1).reshape(-1,1)
    return pred

pred_ratings = predict_ratings(user_similarity.values, train_matrix_norm.values)
pred_ratings_df = pd.DataFrame(pred_ratings, index=train_matrix.index, columns=train_matrix.columns)
print(pred_ratings_df.head())

movieId      1         2         3         4         5         6         7     \
userId                                                                          
1        0.113153 -0.052745 -0.018020  0.026617 -0.051129 -0.005442  0.169771   
2        0.047761 -0.025872 -0.028880 -0.001607 -0.017611  0.004798  0.060259   
3        0.005411  0.006962 -0.006218  0.011856 -0.020432  0.007020 -0.002451   
4       -0.032796  0.013490 -0.007832 -0.022128  0.003532 -0.005645 -0.035226   
5        0.218749 -0.038674 -0.030846  0.019017 -0.015649  0.002221  0.170230   

movieId      8         9         10    ...      1668      1670      1671  \
userId                                 ...                                 
1        0.083099  0.182606  0.018347  ...  0.000191  0.000191 -0.001682   
2        0.025508  0.076094  0.003815  ...  0.000204  0.000204 -0.001239   
3        0.014165  0.052800  0.011554  ... -0.000147 -0.000147 -0.002594   
4        0.005020  0.003465  0.017940  ... -0.000352

In [13]:
# Keep only common users and movies
common_users = pred_ratings_df.index.intersection(test_matrix.index)
common_movies = pred_ratings_df.columns.intersection(test_matrix.columns)

pred_ratings_test = pred_ratings_df.loc[common_users, common_movies].fillna(0)
test_matrix_filled = test_matrix.loc[common_users, common_movies].fillna(0)

rmse = sqrt(mean_squared_error(test_matrix_filled.values.flatten(), pred_ratings_test.values.flatten()))
print(f"RMSE: {rmse}")


RMSE: 0.4569524939157551


In [14]:
# Get predicted ratings for a user
user_id = 1
user_pred = pred_ratings_df.loc[user_id].sort_values(ascending=False)

# Show top 10 recommendations
top_10_movies = user_pred.head(10)
print("Top 10 recommended movies for user 1:")
print(top_10_movies)

Top 10 recommended movies for user 1:
movieId
50     0.564475
174    0.392052
127    0.387986
172    0.348122
12     0.331018
173    0.325199
100    0.311497
168    0.295959
98     0.289811
56     0.286030
Name: 1, dtype: float64


In [15]:
top_10_ids = top_10_movies.index
top_10_titles = movies[movies['movieId'].isin(top_10_ids)][['movieId', 'title']]

# Merge predicted ratings with movie titles
top_10_with_titles = top_10_titles.merge(
    top_10_movies.rename("predicted_rating"),
    left_on="movieId",
    right_index=True
).sort_values(by="predicted_rating", ascending=False)

print(top_10_with_titles)


     movieId                                   title  predicted_rating
49        50                        Star Wars (1977)          0.564475
173      174          Raiders of the Lost Ark (1981)          0.392052
126      127                   Godfather, The (1972)          0.387986
171      172         Empire Strikes Back, The (1980)          0.348122
11        12              Usual Suspects, The (1995)          0.331018
172      173              Princess Bride, The (1987)          0.325199
99       100                            Fargo (1996)          0.311497
167      168  Monty Python and the Holy Grail (1974)          0.295959
97        98        Silence of the Lambs, The (1991)          0.289811
55        56                     Pulp Fiction (1994)          0.286030


In [17]:
from pathlib import Path

# Ensure the directory exists
Path("ml").mkdir(parents=True, exist_ok=True)

# Now save the CSV
top_10_with_titles.to_csv("ml/top_10_recommendations_user1.csv", index=False)


In [18]:
top_10_with_titles.to_csv("ml/top_10_recommendations_user1.csv", index=False)