In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
import numpy as np
# Step 1: Load the dataset
file_path = r'./movieRating.csv'
data = pd.read_csv(file_path)

data.head()

Unnamed: 0,TrainDataID,UserID,MovieID,Rating
0,1,796,1193,5
1,2,796,661,3
2,3,796,914,3
3,4,796,3408,4
4,5,796,2355,5


In [2]:
from keras.models import Model
from keras.layers import Input, Embedding, Flatten, Dot, Add, Dense
from keras.optimizers import Adam
from keras.callbacks import EarlyStopping

# Step 1: Prepare the data for Keras
n_users = data['UserID'].nunique()
n_movies = data['MovieID'].nunique()

In [3]:
# Step 2: Split the data into training (80%) and testing (20%) sets randomly
train_data, test_data = train_test_split(data, test_size=0.2, random_state=42)

# Confirm the size of each split
len(train_data), len(test_data)

(719898, 179975)

In [4]:
# Normalize UserID and MovieID to start from 0 for embedding layers
user_id_map = {id_: i for i, id_ in enumerate(data['UserID'].unique())}
movie_id_map = {id_: i for i, id_ in enumerate(data['MovieID'].unique())}


data['UserID'] = data['UserID'].map(user_id_map)
data['MovieID'] = data['MovieID'].map(movie_id_map)

train_data['UserID'] = train_data['UserID'].map(user_id_map)
train_data['MovieID'] = train_data['MovieID'].map(movie_id_map)
test_data['UserID'] = test_data['UserID'].map(user_id_map)
test_data['MovieID'] = test_data['MovieID'].map(movie_id_map)


In [5]:
test_data

Unnamed: 0,TrainDataID,UserID,MovieID,Rating
805081,805082,5404,852,4
408898,408899,2795,727,2
234904,234905,1596,688,5
314846,314847,2055,1191,5
74881,74882,548,29,1
...,...,...,...,...
426494,426495,2908,341,5
343269,343270,2230,130,3
96268,96269,703,1283,3
412626,412627,2823,938,4


In [6]:
# Input data for the model
train_users = train_data['UserID'].values
train_movies = train_data['MovieID'].values
train_ratings = train_data['Rating'].values

test_users = test_data['UserID'].values
test_movies = test_data['MovieID'].values
test_ratings = test_data['Rating'].values

In [7]:
train_users

array([  34, 2568, 4945, ...,  946, 4451,  876], dtype=int64)

In [8]:
# Step 2: Define the Keras Model
embedding_size = 50

# User embedding
user_input = Input(shape=(1,))
user_embedding = Embedding(input_dim=n_users, output_dim=embedding_size)(user_input)
user_vector = Flatten()(user_embedding)

In [9]:
user_vector

<KerasTensor shape=(None, 50), dtype=float32, sparse=False, name=keras_tensor_2>

In [10]:
# Movie embedding
movie_input = Input(shape=(1,))
movie_embedding = Embedding(input_dim=n_movies, output_dim=embedding_size)(movie_input)
movie_vector = Flatten()(movie_embedding)
movie_vector

<KerasTensor shape=(None, 50), dtype=float32, sparse=False, name=keras_tensor_5>

In [11]:
# Dot product of user and movie embeddings
dot_product = Dot(axes=1)([user_vector, movie_vector])
dot_product

<KerasTensor shape=(None, 1), dtype=float32, sparse=False, name=keras_tensor_6>

In [12]:
# Add biases
user_bias = Embedding(input_dim=n_users, output_dim=1)(user_input)
movie_bias = Embedding(input_dim=n_movies, output_dim=1)(movie_input)
user_bias = Flatten()(user_bias)
movie_bias = Flatten()(movie_bias)
rating_prediction = Add()([dot_product, user_bias, movie_bias])
rating_prediction

<KerasTensor shape=(None, 1), dtype=float32, sparse=False, name=keras_tensor_11>

In [13]:
# Define the model
model = Model(inputs=[user_input, movie_input], outputs=rating_prediction)
model.compile(loss='mean_squared_error', optimizer=Adam(learning_rate=0.001))
model

<Functional name=functional, built=True>

In [14]:
# Step 3: Train the Model
early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)
history = model.fit(
    x=[train_users, train_movies],
    y=train_ratings,
    batch_size=64,
    epochs=20,
    validation_split=0.2,
    callbacks=[early_stopping],
    verbose=1
)

history

Epoch 1/20




[1m8999/8999[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m28s[0m 3ms/step - loss: 7.0395 - val_loss: 0.9162
Epoch 2/20
[1m8999/8999[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m27s[0m 3ms/step - loss: 0.8700 - val_loss: 0.8474
Epoch 3/20
[1m8999/8999[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m27s[0m 3ms/step - loss: 0.7801 - val_loss: 0.8166
Epoch 4/20
[1m8999/8999[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m26s[0m 3ms/step - loss: 0.7005 - val_loss: 0.8066
Epoch 5/20
[1m8999/8999[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m27s[0m 3ms/step - loss: 0.6164 - val_loss: 0.8126
Epoch 6/20
[1m8999/8999[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m27s[0m 3ms/step - loss: 0.5359 - val_loss: 0.8364
Epoch 7/20
[1m8999/8999[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m28s[0m 3ms/step - loss: 0.4686 - val_loss: 0.8687


<keras.src.callbacks.history.History at 0x1a33a0d15e0>

In [15]:
# Step 4: Predict and Calculate MAE
predicted_ratings = model.predict([test_users, test_movies]).flatten()
mae = mean_absolute_error(test_ratings, predicted_ratings)

mae

[1m5625/5625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 837us/step


0.7046368865690094