# Install necessary packages

In [2]:
!pip install pandas



# Import libraries

In [3]:
import numpy as np
import pandas as pd
from sklearn.metrics import mean_squared_error, mean_absolute_error

# Load the dataset
### 1. Download the MovieLens dataset from http://files.grouplens.org/datasets/movielens/ml-latest-small.zip
### 2. Unzip it and use the 'ratings.csv' file.

In [4]:
import requests
import zipfile

url = "https://files.grouplens.org/datasets/movielens/ml-latest-small.zip"
filename = "ml-latest-small.zip"

# Download the file
response = requests.get(url)
with open(filename, "wb") as f:
    f.write(response.content)

# Unzip the file
with zipfile.ZipFile(filename, "r") as zip_ref:
    zip_ref.extractall()

# Now read the CSV file
df = pd.read_csv('ml-latest-small/ratings.csv')
df

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931
...,...,...,...,...
100831,610,166534,4.0,1493848402
100832,610,168248,5.0,1493850091
100833,610,168250,5.0,1494273047
100834,610,168252,5.0,1493846352


# Prepare the utility matrix (User-Item matrix)

In [5]:
# Create a matrix where rows are users and columns are movies, and the values are ratings
utility_matrix = df.pivot(index='userId', columns='movieId', values='rating').fillna(0)
print("Utility Matrix Shape:", utility_matrix.shape)

# Convert to numpy array
R = utility_matrix.to_numpy()

Utility Matrix Shape: (610, 9724)


# Initialize matrices

In [6]:
num_users, num_items = R.shape
num_features = 50  # Number of latent features

# Randomly initialize user and item matrices
P = np.random.normal(scale=1./num_features, size=(num_users, num_features))
Q = np.random.normal(scale=1./num_features, size=(num_items, num_features))

# Apply Regularized Matrix Factorization

In [7]:
alpha = 0.01  # Learning rate
beta = 0.01   # Regularization term
epochs = 1000  # Number of iterations

# Train the model
for epoch in range(epochs):
    for i in range(num_users):
        for j in range(num_items):
            if R[i, j] > 0:  # Only consider observed ratings
                # Calculate the prediction and error
                prediction = np.dot(P[i, :], Q[j, :].T)
                error = R[i, j] - prediction

                # Update the user and item matrices with regularization
                P[i, :] += alpha * (error * Q[j, :] - beta * P[i, :])
                Q[j, :] += alpha * (error * P[i, :] - beta * Q[j, :])

    # Compute the total loss (cost function) to track progress
    loss = 0
    for i in range(num_users):
        for j in range(num_items):
            if R[i, j] > 0:
                prediction = np.dot(P[i, :], Q[j, :].T)
                loss += (
                    (R[i, j] - prediction) ** 2 + \
                    beta * (
                        np.sum(P[i, :]**2) + \
                        np.sum(Q[j, :]**2)
                    )                               # Regularization term
                )
    print(f"Epoch {epoch + 1}/{epochs} - Loss: {loss}")

Epoch 1/1000 - Loss: 743208.478997523
Epoch 2/1000 - Loss: 234697.00084469502
Epoch 3/1000 - Loss: 152287.52233934856
Epoch 4/1000 - Loss: 123719.12100415156
Epoch 5/1000 - Loss: 108525.66745531223
Epoch 6/1000 - Loss: 98647.66456200452
Epoch 7/1000 - Loss: 91378.12720810338
Epoch 8/1000 - Loss: 85630.63019150228
Epoch 9/1000 - Loss: 80869.63321000135
Epoch 10/1000 - Loss: 76711.38135046244
Epoch 11/1000 - Loss: 72887.8487787607
Epoch 12/1000 - Loss: 69273.5646775096
Epoch 13/1000 - Loss: 65835.2654089548
Epoch 14/1000 - Loss: 62574.28840319165
Epoch 15/1000 - Loss: 59496.04282760876
Epoch 16/1000 - Loss: 56600.9481138929
Epoch 17/1000 - Loss: 53886.05102818441
Epoch 18/1000 - Loss: 51348.178092576025
Epoch 19/1000 - Loss: 48984.78806147855
Epoch 20/1000 - Loss: 46792.91599464062
Epoch 21/1000 - Loss: 44767.858977098236
Epoch 22/1000 - Loss: 42902.6625367272
Epoch 23/1000 - Loss: 41188.45800734421
Epoch 24/1000 - Loss: 39615.18887010283
Epoch 25/1000 - Loss: 38172.31503180657
Epoch 26/

# Reconstruct the user-item matrix with reduced dimensions

In [8]:
R_pred = np.dot(P, Q.T)
R_pred_df = pd.DataFrame(R_pred, index=utility_matrix.index, columns=utility_matrix.columns)
R_pred_df

movieId,1,2,3,4,5,6,7,8,9,10,...,193565,193567,193571,193573,193579,193581,193583,193585,193587,193609
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,4.126551,3.657319,4.034163,2.377525,3.295169,3.754228,3.708151,3.854187,2.626967,2.651338,...,3.064907,2.642556,3.578738,3.514393,3.031073,3.562174,3.011349,3.081347,2.988919,3.266050
2,3.715765,2.795949,3.408168,1.895191,3.324955,4.137602,3.179154,2.896886,2.596070,4.127682,...,2.697633,2.330218,3.162701,3.093059,2.739588,3.111554,2.713494,2.699383,2.706353,2.800168
3,2.628177,3.299126,3.083266,-0.076593,0.502430,1.058396,1.000619,2.167538,1.090395,0.849724,...,1.579004,1.374563,1.741228,1.819398,1.506709,1.715514,1.508542,1.610076,1.521945,1.491270
4,3.163148,3.445514,1.637696,1.909919,2.863613,2.020551,2.654312,1.514096,3.209072,3.109881,...,2.025407,1.684198,2.322368,2.241366,1.988396,2.324156,1.997104,1.806910,2.002185,2.249732
5,3.996938,2.961748,3.597617,1.528730,2.661973,4.115115,2.787013,2.276930,1.924917,3.295095,...,2.198580,1.936481,2.653072,2.590200,2.288173,2.571387,2.248529,2.239240,2.153933,2.451902
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
606,2.964003,2.887802,2.585974,1.986791,1.818341,3.037211,2.697948,2.621671,2.613156,2.734175,...,2.488878,2.069982,2.937818,2.891202,2.474394,2.921885,2.495474,2.510080,2.528390,2.689389
607,3.835873,4.273955,4.557245,2.636066,2.369145,2.926902,1.420923,2.957139,3.602219,2.594367,...,2.868092,2.484009,3.307033,3.317915,2.805140,3.181561,2.748136,2.831332,2.804954,2.988867
608,2.724317,2.065899,2.062984,2.033538,1.296116,4.164934,2.405907,2.212925,2.942418,3.894775,...,2.592366,2.211517,2.915962,2.879984,2.557223,2.841407,2.503903,2.600523,2.534893,2.981561
609,3.064412,2.734448,2.959157,2.170881,1.921438,3.501753,2.988056,2.477325,2.577496,3.947326,...,2.508783,2.137269,2.906186,2.837723,2.502637,2.861068,2.471885,2.510795,2.489918,2.655175


# Evaluate accuracy using Mean Squared Error (MSE)

In [9]:
# We will only calculate the MSE for the non-zero elements (actual ratings)
non_zero_indices = np.where(R != 0)
R_actual = R[non_zero_indices]
R_predicted = R_pred[non_zero_indices]

# Calculate the MSE
mse = mean_squared_error(R_actual, R_predicted)
print(f"Mean Squared Error (MSE) of the predicted ratings: {mse}")

# Calculate the RMSE
rmse = np.sqrt(mse)
print(f"Root Mean Squared Error (RMSE) of the predicted ratings: {rmse}")

# Calculate the MAE
mae = mean_absolute_error(R_actual, R_predicted)
print(f"Mean Absolute Error (MAE) of the predicted ratings: {mae}")

Mean Squared Error (MSE) of the predicted ratings: 0.017626854067082868
Root Mean Squared Error (RMSE) of the predicted ratings: 0.13276616311049616
Mean Absolute Error (MAE) of the predicted ratings: 0.09018113876061078


# Recommend movies for a specific user

In [10]:
user_id = 1  # Example user ID
user_ratings = R_pred_df.loc[user_id].sort_values(ascending=False)

# Movies that the user has already rated
already_rated = df[df['userId'] == user_id]['movieId'].values

# Recommend top 5 movies that the user has not rated yet
recommendations = user_ratings.drop(already_rated).head(5)
print("Top 5 movie recommendations for User {}: ".format(user_id))
print(recommendations)

Top 5 movie recommendations for User 1: 
movieId
1221    6.592530
353     6.143069
7361    6.141533
2064    6.110568
2321    6.092081
Name: 1, dtype: float64
