# Import Libraries

In [39]:
import joblib
import numpy as np
import pandas as pd

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import SGDRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error

import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Embedding, Flatten, Input, Dense, concatenate
from tensorflow.keras.models import load_model

# Take a look at MovieLens (small) dataset

In [23]:
# # Load the data files
# ratings = pd.read_csv('Data/ratings.csv')
# movies = pd.read_csv('Data/movies.csv')
# links = pd.read_csv('Data/links.csv')
# tags = pd.read_csv('Data/tags.csv')

# # Display the first few rows of each data file
# print("Ratings Data:")
# print(ratings.head())

# print("\nMovies Data:")
# print(movies.head())

# print("\nLinks Data:")
# print(links.head())

# print("\nTags Data:")
# print(tags.head())

Ratings Data:
   userId  movieId  rating  timestamp
0       1        1     4.0  964982703
1       1        3     4.0  964981247
2       1        6     4.0  964982224
3       1       47     5.0  964983815
4       1       50     5.0  964982931

Movies Data:
   movieId                               title   
0        1                    Toy Story (1995)  \
1        2                      Jumanji (1995)   
2        3             Grumpier Old Men (1995)   
3        4            Waiting to Exhale (1995)   
4        5  Father of the Bride Part II (1995)   

                                        genres  
0  Adventure|Animation|Children|Comedy|Fantasy  
1                   Adventure|Children|Fantasy  
2                               Comedy|Romance  
3                         Comedy|Drama|Romance  
4                                       Comedy  

Links Data:
   movieId  imdbId   tmdbId
0        1  114709    862.0
1        2  113497   8844.0
2        3  113228  15602.0
3        4  114885  3135

# User Recommender

## Import Data

In [18]:
# Load the data files
ratings = pd.read_csv('Data/ratings.csv')
movies = pd.read_csv('Data/movies.csv')

# Display the first few rows of each data file
print("Ratings Data:")
print(ratings.head())

print("\nMovies Data:")
print(movies.head())

Ratings Data:
   userId  movieId  rating  timestamp
0       1        1     4.0  964982703
1       1        3     4.0  964981247
2       1        6     4.0  964982224
3       1       47     5.0  964983815
4       1       50     5.0  964982931

Movies Data:
   movieId                               title   
0        1                    Toy Story (1995)  \
1        2                      Jumanji (1995)   
2        3             Grumpier Old Men (1995)   
3        4            Waiting to Exhale (1995)   
4        5  Father of the Bride Part II (1995)   

                                        genres  
0  Adventure|Animation|Children|Comedy|Fantasy  
1                   Adventure|Children|Fantasy  
2                               Comedy|Romance  
3                         Comedy|Drama|Romance  
4                                       Comedy  


## Data Processing

In [21]:
ratings.shape , movies.shape

((100836, 4), (9742, 3))

In [8]:
ratings['userId'].nunique()

610

In [9]:
ratings['movieId'].nunique()

9724

In [19]:
movies['movieId'].nunique()

9742

In [17]:
print('Max Rating:' + str(ratings['rating'].min()) + ' and '+ ' Min Rating:' + str(ratings['rating'].max()))

Max Rating:0.5 and  Min Rating:5.0


In [25]:
# Encode user and movie IDs
user_encoder = LabelEncoder()
movie_encoder = LabelEncoder()

ratings['userId'] = user_encoder.fit_transform(ratings['userId'])
ratings['movieId'] = movie_encoder.fit_transform(ratings['movieId'])

In [27]:
# Split the data into training and testing sets
train_data, test_data = train_test_split(ratings, test_size=0.2, random_state=42)

train_data.head(), test_data.head()

(       userId  movieId  rating   timestamp
 80568     508     4893     3.0  1435994597
 50582     325     7127     4.0  1322252335
 8344       56     1575     3.0   965798155
 99603     609      855     4.0  1479544102
 71701     461     1808     2.0  1174438249,
        userId  movieId  rating   timestamp
 67037     431     7316     4.5  1335139641
 42175     287      412     3.0   978465565
 93850     598     3217     3.0  1498524542
 6187       41     2248     4.0   996262677
 12229      74     1210     4.0  1158989841)

## Model Training

### Factorization Machine

In [30]:
# Define the Factorization Machine model
fm_model = make_pipeline(StandardScaler(), SGDRegressor(max_iter=1000, tol=1e-3))

# Prepare the training data
X_train = train_data[['userId', 'movieId']]
y_train = train_data['rating']

# Train the model
fm_model.fit(X_train, y_train)

# Save the model
joblib.dump(fm_model, 'fm_model.pkl')

['fm_model.pkl']

### Neural Collaborative Filtering

In [32]:
# Define the NCF model
def NCF_model(num_users, num_movies, embedding_size=50):
    user_input = Input(shape=(1,), name='user_input')
    movie_input = Input(shape=(1,), name='movie_input')
    
    user_embedding = Embedding(input_dim=num_users, output_dim=embedding_size, name='user_embedding')(user_input)
    movie_embedding = Embedding(input_dim=num_movies, output_dim=embedding_size, name='movie_embedding')(movie_input)
    
    user_vecs = Flatten()(user_embedding)
    movie_vecs = Flatten()(movie_embedding)
    
    concat = concatenate([user_vecs, movie_vecs])
    dense = Dense(128, activation='relu')(concat)
    output = Dense(1)(dense)
    
    model = Model([user_input, movie_input], output)
    model.compile(optimizer='adam', loss='mse')
    
    return model

num_users = ratings['userId'].nunique()
num_movies = ratings['movieId'].nunique()

ncf_model = NCF_model(num_users, num_movies)

# Prepare the training data for NCF
X_train_ncf = [train_data['userId'].values, train_data['movieId'].values]
y_train_ncf = train_data['rating'].values

# Train the NCF model
ncf_model.fit(X_train_ncf, y_train_ncf, epochs=10, batch_size=64, validation_split=0.2)

# Save the model
ncf_model.save('ncf_model.h5')


Epoch 1/10
[1m1009/1009[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 4ms/step - loss: 3.3184 - val_loss: 0.8136
Epoch 2/10
[1m1009/1009[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 4ms/step - loss: 0.7020 - val_loss: 0.7845
Epoch 3/10
[1m1009/1009[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 3ms/step - loss: 0.6573 - val_loss: 0.7753
Epoch 4/10
[1m1009/1009[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 4ms/step - loss: 0.6252 - val_loss: 0.7697
Epoch 5/10
[1m1009/1009[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 4ms/step - loss: 0.5893 - val_loss: 0.7728
Epoch 6/10
[1m1009/1009[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 4ms/step - loss: 0.5714 - val_loss: 0.7929
Epoch 7/10
[1m1009/1009[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 4ms/step - loss: 0.5402 - val_loss: 0.8025
Epoch 8/10
[1m1009/1009[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 4ms/step - loss: 0.5132 - val_loss: 0.8174
Epoch 9/10
[1m1009/1009



## Model Inference

### Loading and Predicting with Factorization Machines

In [34]:
# Load the trained Factorization Machine model
fm_model = joblib.load('fm_model.pkl')

selected_user_id = 1

# Get the user's rated movies
user_rated_movies = ratings[ratings['userId'] == selected_user_id]['movieId'].values

# Create a DataFrame with the selected user and all movies
user_predictions = pd.DataFrame({'userId': selected_user_id, 'movieId': range(num_movies)})

# Predict the ratings
user_predictions['predicted_rating'] = fm_model.predict(user_predictions[['userId', 'movieId']])

# Merge with movies data to get titles and genres
user_predictions = pd.merge(user_predictions, movies, on='movieId')

# Display the top 10 movie recommendations
user_recommendations = user_predictions.sort_values(by='predicted_rating', ascending=False).head(10)
print(user_recommendations[['title', 'genres', 'predicted_rating']])

                                title   
0                    Toy Story (1995)  \
1                      Jumanji (1995)   
2             Grumpier Old Men (1995)   
3            Waiting to Exhale (1995)   
4  Father of the Bride Part II (1995)   
5                         Heat (1995)   
6                      Sabrina (1995)   
7                 Tom and Huck (1995)   
8                 Sudden Death (1995)   
9                    GoldenEye (1995)   

                                        genres  predicted_rating  
0  Adventure|Animation|Children|Comedy|Fantasy          3.675007  
1                   Adventure|Children|Fantasy          3.674992  
2                               Comedy|Romance          3.674977  
3                         Comedy|Drama|Romance          3.674962  
4                                       Comedy          3.674947  
5                        Action|Crime|Thriller          3.674932  
6                               Comedy|Romance          3.674917  
7           

### Loading and Predicting with NCF

In [38]:
# Load the trained NCF model
ncf_model = load_model('ncf_model.h5', custom_objects={'mse': tf.keras.losses.MeanSquaredError()})

selected_user_id = 1

num_movies = ratings['movieId'].nunique()

# Get the user's rated movies
user_rated_movies = ratings[ratings['userId'] == selected_user_id]['movieId'].values

# Create input data for the selected user
X_user_ncf = [np.array([selected_user_id] * num_movies), np.array(range(num_movies))]

# Predict the ratings
user_predictions = pd.DataFrame({'userId': selected_user_id, 'movieId': range(num_movies)})
user_predictions['predicted_rating'] = ncf_model.predict(X_user_ncf).flatten()

# Merge with movies data to get titles and genres
user_predictions = pd.merge(user_predictions, movies, on='movieId')

# Display the top 10 movie recommendations
user_recommendations = user_predictions.sort_values(by='predicted_rating', ascending=False).head(10)
print(user_recommendations[['title', 'genres', 'predicted_rating']])



[1m304/304[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step
                                                  title   
3963                        Ernest Scared Stupid (1991)  \
4846  Intolerance: Love's Struggle Throughout the Ag...   
4519                          Jeepers Creepers 2 (2003)   
5163                                  Soul Plane (2004)   
4412                       Bodies, Rest & Motion (1993)   
4674            Forbidden Games (Jeux interdits) (1952)   
4902                               Secret Window (2004)   
3979                                     Trapped (2002)   
4681                     The Great Train Robbery (1978)   
5353                           Country Girl, The (1954)   

                                   genres  predicted_rating  
3963                               Comedy          5.216476  
4846                                Drama          5.210003  
4519                      Horror|Thriller          5.205963  
5163                       

## Models Evaluation

In [40]:
# Function to evaluate the models
def evaluate_model(model, X_test, y_test, model_type='fm'):
    if model_type == 'fm':
        # Predict with Factorization Machines
        y_pred = model.predict(X_test)
    elif model_type == 'ncf':
        # Predict with Neural Collaborative Filtering
        user_input = X_test['userId'].values
        movie_input = X_test['movieId'].values
        y_pred = model.predict([user_input, movie_input]).flatten()
    
    # Calculate metrics
    mse = mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(mse)
    mae = mean_absolute_error(y_test, y_pred)
    
    return mse, rmse, mae

# Prepare test data
X_test_fm = test_data[['userId', 'movieId']]
y_test_fm = test_data['rating']

# Evaluate Factorization Machines model
mse_fm, rmse_fm, mae_fm = evaluate_model(fm_model, X_test_fm, y_test_fm, model_type='fm')
print(f"Factorization Machines - MSE: {mse_fm}, RMSE: {rmse_fm}, MAE: {mae_fm}")

# Evaluate Neural Collaborative Filtering model
mse_ncf, rmse_ncf, mae_ncf = evaluate_model(ncf_model, X_test_fm, y_test_fm, model_type='ncf')
print(f"Neural Collaborative Filtering - MSE: {mse_ncf}, RMSE: {rmse_ncf}, MAE: {mae_ncf}")


Factorization Machines - MSE: 1.098197298864979, RMSE: 1.0479490917334577, MAE: 0.8353305184748032
[1m631/631[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step
Neural Collaborative Filtering - MSE: 0.8358732941926954, RMSE: 0.9142610645722016, MAE: 0.68950717431707
