In [12]:
import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.model_selection import train_test_split

# Step 1: Load dataset from MovieLens 100k
url = "https://files.grouplens.org/datasets/movielens/ml-100k/u.data"
df = pd.read_csv(url, sep='\t', names=['userId', 'movieId', 'rating', 'timestamp'])


df.drop('timestamp', axis=1, inplace=True)


df['userId'] = df['userId'].astype('category')
df['movieId'] = df['movieId'].astype('category')

user_id_map = dict(enumerate(df['userId'].cat.categories))
movie_id_map = dict(enumerate(df['movieId'].cat.categories))

df['user_index'] = df['userId'].cat.codes
df['movie_index'] = df['movieId'].cat.codes

n_users = df['user_index'].nunique()
n_movies = df['movie_index'].nunique()

# Step 2: Train-Test Split
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)
train_data = train_df[['user_index', 'movie_index', 'rating']].values
test_data = test_df[['user_index', 'movie_index', 'rating']].values


K = 10  # latent factors
P = np.random.normal(scale=1./K, size=(n_users, K))
Q = np.random.normal(scale=1./K, size=(n_movies, K))

# Step 3: Matrix Factorization via SGD
def train_mf(train_data, P, Q, K, steps=100, alpha=0.01, beta=0.01):
    Q = Q.T
    for step in range(steps):
        total_error = 0
        for i, j, r in train_data:
            prediction = np.dot(P[i], Q[:, j])
            error = r - prediction
            total_error += error**2

            P[i] += alpha * (error * Q[:, j] - beta * P[i])
            Q[:, j] += alpha * (error * P[i] - beta * Q[:, j])

    return P, Q.T

# Train model
P_trained, Q_trained = train_mf(train_data, P, Q, K)

#  Build full prediction matrix
predicted_ratings = np.dot(P_trained, Q_trained.T)

#  Evaluate on test set
true = []
pred = []

for i, j, r in test_data:
    true.append(r)
    pred.append(predicted_ratings[i, j])

rmse = np.sqrt(mean_squared_error(true, pred))

mae = mean_absolute_error(true, pred)

print("\n Evaluation Metrics on Test Set:")
print(f"mean_squared_error: {rmse:.4f}")
print(f" mean_absolute_error : {mae:.4f}")

# Recommend top N movies to a user
def recommend_top_n(user_id, N=5):
    user_index = df[df['userId'] == user_id].iloc[0]['user_index']
    user_ratings = predicted_ratings[int(user_index)]

    # Mask already rated movies
    rated_movies = train_df[train_df['user_index'] == user_index]['movie_index'].values
    user_ratings[rated_movies] = -np.inf  # ignore seen

    # Top N recommendations
    top_indices = np.argsort(user_ratings)[-N:][::-1]
    movie_ids = [movie_id_map[i] for i in top_indices]

    # Load movie titles
    movie_info_url = "https://files.grouplens.org/datasets/movielens/ml-100k/u.item"
    movie_info = pd.read_csv(movie_info_url, sep='|', encoding='latin-1', header=None, usecols=[0, 1], names=['movieId', 'title'])

    top_movies = movie_info[movie_info['movieId'].isin(movie_ids)]
    print(f"\nTop {N} movie recommendations for user {user_id}:\n")
    print(top_movies)

# Recommend for user
recommend_top_n(user_id=10, N=5)



 Evaluation Metrics on Test Set:
mean_squared_error: 1.0276
 mean_absolute_error : 0.7887

Top 5 movie recommendations for user 10:

      movieId                                              title
126       127                              Godfather, The (1972)
317       318                            Schindler's List (1993)
319       320  Paradise Lost: The Child Murders at Robin Hood...
1168     1169                                       Fresh (1994)
1448     1449                             Pather Panchali (1955)
