# Neural Collaborative Filter

This notebook builds a neural network that uses user ratings and the genres of each movie to suggest new movies for user to wach based on their previous watching habbits. This is called an item-based collaborative filter.

In [585]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.layers import Input, Embedding, Flatten, Concatenate, Dense, Dropout
from tensorflow.keras.models import Model
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

In [586]:
train_df = pd.read_csv('user_train_df.csv')
test_df = pd.read_csv('user_test_df.csv')


In [587]:
train_df.index = range(1, len(train_df) + 1)
test_df.index = range(1, len(test_df) + 1)

In [588]:
train_df

Unnamed: 0,User ID,Item ID,Rating,timestamp,Age,Gender,Occupation,zip code,Movie Title,Release Date,...,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
1,1,168,5,874965478,24,M,technician,85711,Monty Python and the Holy Grail (1974),01-Jan-1974,...,0,0.0,0,0,0.0,0,0,0.0,0,0
2,1,172,5,874965478,24,M,technician,85711,"Empire Strikes Back, The (1980)",01-Jan-1980,...,0,0.0,0,0,0.0,1,1,0.0,1,0
3,1,165,5,874965518,24,M,technician,85711,Jean de Florette (1986),01-Jan-1986,...,0,0.0,0,0,0.0,0,0,0.0,0,0
4,1,156,4,874965556,24,M,technician,85711,Reservoir Dogs (1992),01-Jan-1992,...,0,0.0,0,0,0.0,0,0,1.0,0,0
5,1,166,5,874965677,24,M,technician,85711,Manon of the Spring (Manon des sources) (1986),01-Jan-1986,...,0,0.0,0,0,0.0,0,0,0.0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32546,943,229,2,888693158,22,M,student,77841,Star Trek III: The Search for Spock (1984),01-Jan-1984,...,0,0.0,0,0,0.0,0,1,0.0,0,0
32547,943,230,1,888693158,22,M,student,77841,Star Trek IV: The Voyage Home (1986),01-Jan-1986,...,0,0.0,0,0,0.0,0,1,0.0,0,0
32548,943,228,3,888693158,22,M,student,77841,Star Trek: The Wrath of Khan (1982),01-Jan-1982,...,0,0.0,0,0,0.0,0,1,0.0,0,0
32549,943,449,1,888693158,22,M,student,77841,Star Trek: The Motion Picture (1979),01-Jan-1979,...,0,0.0,0,0,0.0,0,1,0.0,0,0


In [589]:
num_users = 943
num_items = 1682
num_genres = 19

Change all the data types so that the genres are all numerical for both test and train df.

In [590]:
# Convert all specified columns to numeric, setting errors='coerce' to convert non-numeric values to NaN

genre_columns = ['Fantasy', 'Horror', 'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western','Film-Noir','Unknown','Action','Adeventure','Animation','Childrens','Comedy','Crime','Documentary','Drama']
train_df[genre_columns] = train_df[genre_columns].apply(pd.to_numeric, errors='coerce')

# Drop rows where any of the specified columns contain NaN values
train_df = train_df.dropna(subset=genre_columns)


test_df[genre_columns] = test_df[genre_columns].apply(pd.to_numeric, errors='coerce')

# Drop rows where any of the specified columns contain NaN values
test_df = test_df.dropna(subset=genre_columns)

In [591]:
# Prepare Input Features
genre_features = genre_columns
train_genre_input = train_df[genre_features].values
test_genre_input = test_df[genre_features].values

Now build the model:

In [592]:
# Model Architecture
# Inputs
user_input = Input(shape=(1,), name='User_Input')
item_input = Input(shape=(1,), name='Item_Input')



genre_input = Input(shape=(num_genres,), name='Genre_Input')

In [593]:
# Embedding layers for user and item
user_embedding = Embedding(num_users+1, 50, name='User_Embedding')(user_input)
item_embedding = Embedding(num_items+1, 50, name='Item_Embedding')(item_input)

In [594]:
# Flatten embeddings
user_vec = Flatten()(user_embedding)
item_vec = Flatten()(item_embedding)

In [595]:
# Concatenate embeddings and genre input
concat = Concatenate()([user_vec, item_vec, genre_input])

In [596]:
# Dense layers
dense1 = Dense(128, activation='relu')(concat)
dropout1 = Dropout(0.3)(dense1)
dense2 = Dense(64, activation='relu')(dropout1)

output = Dense(1, activation='sigmoid')(dense2)

In [597]:
# Model definition
model = Model(inputs=[user_input, item_input, genre_input], outputs=output)
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])


Prepare the training data. If the rating is above 3, we claim that the user enjoyed the film. We treat this as a binary variable, (1 if the user enjoyed the film and 0 otherwise)

In [598]:

# Preparing training data
train_user_input = train_df['User ID'].values
train_item_input = train_df['Item ID'].values
train_ratings = (train_df['Rating'] > 3).astype(int).values  # Binary rating: 1 if >3, else 0



test_user_input = test_df['User ID'].values
test_item_input = test_df['Item ID'].values
test_ratings = (test_df['Rating'] > 3).astype(int).values


In [599]:
model.summary()

Model: "model_16"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 User_Input (InputLayer)        [(None, 1)]          0           []                               
                                                                                                  
 Item_Input (InputLayer)        [(None, 1)]          0           []                               
                                                                                                  
 User_Embedding (Embedding)     (None, 1, 50)        47200       ['User_Input[0][0]']             
                                                                                                  
 Item_Embedding (Embedding)     (None, 1, 50)        84150       ['Item_Input[0][0]']             
                                                                                           

In [600]:
# Training the model
history = model.fit(
    [train_user_input, train_item_input, train_genre_input],
    train_ratings,
    validation_data=([test_user_input, test_item_input, test_genre_input], test_ratings),
    epochs=10,
    batch_size=64
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


The model has now been built and trained. Now to test it out. We take a random user, `user_id`, to see how the model behaves

In [623]:
# Predicting Recommendations for a User
user_id = 212  # Example: User ID for which to recommend
user_movies = np.array(range(num_items))  # All movies


In [624]:
# Combine the DataFrames vertically (stacking them on top of each other)
combined_df = pd.concat([train_df, test_df], ignore_index=True)

genre_df = combined_df.copy()

genre_df.drop(columns=['timestamp','Age','Gender','Occupation','zip code','Release Date','URL','Movie Title','User ID','Item ID','Rating'],inplace=True)



In [625]:
# Genres for all movies
movie_genres_input = genre_df[:user_movies.shape[0]]

In [626]:
# Ensure all inputs are properly shaped and converted to the correct dtype
user_input_predict = np.full((user_movies.shape[0], 1), user_id, dtype=np.int32)  # Shape: (num_items, 1)
item_input_predict = user_movies.reshape(-1, 1).astype(np.int32)  # Shape: (num_items, 1)
movie_genres_input = movie_genres_input.astype(np.float32)  # Ensure genre input is float32


In [627]:
# Predict scores
predicted_scores = model.predict([np.full(user_movies.shape, user_id), user_movies, movie_genres_input])




In [628]:
# Recommend Top-5 Movies
recommended_movies = np.argsort(-predicted_scores.flatten())[:10]
print("Top 10 recommended movies for User ID", user_id, ":", recommended_movies)

Top 10 recommended movies for User ID 212 : [1467 1500  174  199 1625 1616 1396 1466 1599 1515]


In [629]:
recommended_movies

array([1467, 1500,  174,  199, 1625, 1616, 1396, 1466, 1599, 1515],
      dtype=int64)

Now print all the details of these movies:

In [640]:
movie_info = combined_df.copy()
movie_info.drop(columns=['User ID','URL','Rating','timestamp','Age','Gender','Occupation','zip code','Release Date'], inplace = True)
#movie_info

filtered_df = train_df[train_df['Item ID'].isin(recommended_movies)]

filtered_df.drop_duplicates(subset='Item ID', inplace=True)
filtered_df

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_df.drop_duplicates(subset='Item ID', inplace=True)


Unnamed: 0,User ID,Item ID,Rating,timestamp,Age,Gender,Occupation,zip code,Movie Title,Release Date,...,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
30,1,199,4,875072262,24,M,technician,85711,"Bridge on the River Kwai, The (1957)",01-Jan-1957,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
53,1,174,5,875073198,24,M,technician,85711,Raiders of the Lost Ark (1981),01-Jan-1981,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5725,184,1396,4,889913490,37,M,librarian,76013,Stonewall (1995),26-Jul-1996,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7652,243,1466,3,879988104,33,M,educator,60201,Margarets Museum (1995)|01-Jan-1995||http://us...,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7706,244,1467,5,880605553,28,M,technician,80525,"Saint of Fort Washington, The (1993)",01-Jan-1993,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9123,279,1500,5,875306613,33,M,programmer,85251,Santa with Muscles (1996),08-Nov-1996,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10681,308,1515,4,887738346,60,M,retired,95076,Wings of Courage (1995),01-Jan-1995,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
16044,437,1599,5,880142614,27,F,other,20009,Someone Elses America (1995)|10-May-1996||http...,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
18380,500,1616,4,883875501,28,M,administrator,94305,Desert Winds (1995),01-Jan-1995,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
21110,587,1625,4,892871732,26,M,other,14216,Nightwatch (1997),22-Apr-1997,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0


In [631]:
# Function to get column names with positive entries for each row
def get_positive_columns(row):
    return [col for col in row.index if row[col] == 1]

# Apply the function to each row
filtered_df.drop(columns=['User ID','Rating'],inplace=True)
positive_columns_per_row = filtered_df.apply(get_positive_columns, axis=1)

# Show the result
for idx, positive_cols in enumerate(positive_columns_per_row):
    print(f"Row {idx} has positive values in columns: {positive_cols}")

Row 0 has positive values in columns: ['Drama', 'War']
Row 1 has positive values in columns: ['Action', 'Adeventure']
Row 2 has positive values in columns: ['Drama']
Row 3 has positive values in columns: ['Comedy']
Row 4 has positive values in columns: ['Drama']
Row 5 has positive values in columns: ['Comedy']
Row 6 has positive values in columns: ['Adeventure', 'Romance']
Row 7 has positive values in columns: ['Comedy']
Row 8 has positive values in columns: ['Drama']
Row 9 has positive values in columns: ['Horror', 'Thriller']


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_df.drop(columns=['User ID','Rating'],inplace=True)


Above we see the genre of the films suggested by the model. We now compare to see if they are similar to the movies in the test data that the user has watched and liked.

In [632]:
true_ratings = test_df.loc[test_df['User ID'] == user_id].sort_values(by=['Rating'], ascending=False).head(5)

true_ratings.drop(columns=['User ID','URL','Rating','timestamp','Age','Gender','Occupation','zip code','Release Date'], inplace = True)

true_ratings


Unnamed: 0,Item ID,Movie Title,Unknown,Action,Adeventure,Animation,Childrens,Comedy,Crime,Documentary,...,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
1267,423,E.T. the Extra-Terrestrial (1982),0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
1266,179,"Clockwork Orange, A (1971)",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0


In [633]:
# Apply the function to each row
positive_columns_per_row = true_ratings.apply(get_positive_columns, axis=1)

# Show the result
for idx, positive_cols in enumerate(positive_columns_per_row):
    print(f"Row {idx} has positive values in columns: {positive_cols}")

Row 0 has positive values in columns: ['Childrens', 'Drama', 'Fantasy', 'Sci-Fi']
Row 1 has positive values in columns: ['Sci-Fi']


We see for this example, they are indeed very similar. This indicates that the model is somewhat working.

## Now looking at tuning some hyper-parameters
As after running a few different user ID's, the same movie recommendations kepy popping up, when could here have been better matches? Are these movies being favoured for a particular reason?

In [634]:
# Identify rows where 'Column1' has duplicates
duplicates = combined_df[combined_df['Movie Title'].duplicated(keep=False)]  # keep=False to mark all duplicates

# Return the values from 'Column2' where duplicates occur in 'Column1'
duplicated_column2_values = duplicates['Item ID'].tolist()
print("Values from Column2 where Column1 has duplicates:", duplicated_column2_values)

Values from Column2 where Column1 has duplicates: [168, 172, 165, 156, 166, 196, 187, 250, 14, 181, 1, 246, 248, 249, 253, 224, 7, 235, 15, 237, 13, 251, 236, 240, 190, 238, 183, 11, 199, 161, 179, 202, 163, 8, 182, 214, 192, 160, 175, 180, 185, 204, 212, 197, 184, 191, 207, 186, 188, 159, 17, 174, 252, 220, 243, 247, 10, 234, 176, 193, 217, 177, 216, 194, 195, 218, 170, 213, 223, 157, 227, 231, 200, 4, 215, 2, 164, 206, 254, 173, 211, 229, 155, 203, 219, 167, 230, 162, 233, 158, 198, 225, 239, 205, 210, 201, 12, 208, 3, 241, 226, 232, 169, 16, 228, 9, 178, 222, 221, 6, 244, 18, 209, 189, 242, 171, 5, 301, 299, 1, 298, 10, 14, 13, 237, 251, 242, 302, 300, 300, 302, 299, 181, 300, 301, 210, 11, 222, 250, 235, 225, 1, 181, 424, 231, 229, 226, 176, 450, 183, 435, 2, 228, 230, 431, 227, 210, 449, 174, 172, 17, 167, 186, 451, 209, 428, 430, 211, 433, 239, 173, 204, 208, 168, 422, 432, 423, 434, 429, 425, 214, 448, 185, 234, 200, 436, 445, 443, 447, 219, 444, 441, 446, 241, 216, 427, 454, 16

The above chunk lets us know that there are movie titles with multiple item Id's that identify them. This may partially be why the same movie suggestions are coming up over and over again, as there is actually ess choice than initially thought. However we will still look to do a hyper-paramer search to optimize the model best we can.

In [635]:
pip install keras-tuner


Collecting keras-tuner
  Downloading keras_tuner-1.4.7-py3-none-any.whl (129 kB)
     -------------------------------------- 129.1/129.1 kB 2.5 MB/s eta 0:00:00
Collecting kt-legacy
  Downloading kt_legacy-1.0.5-py3-none-any.whl (9.6 kB)
Installing collected packages: kt-legacy, keras-tuner
Successfully installed keras-tuner-1.4.7 kt-legacy-1.0.5
Note: you may need to restart the kernel to use updated packages.




In [636]:
import keras_tuner as kt
import tensorflow as tf
from tensorflow.keras.layers import Input, Embedding, Flatten, Concatenate, Dense, Dropout
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from sklearn.model_selection import train_test_split

# Assuming df has been prepared with 'UserID', 'ItemID', 'Rating', and genre columns

# HyperModel for the neural network
class CollaborativeFilterHyperModel(kt.HyperModel):
    def build(self, hp):
        # Input dimensions
        num_users = 943
        num_items = 1682
        num_genres = 19
        # Model inputs
        user_input = Input(shape=(1,), name='User_Input')
        item_input = Input(shape=(1,), name='Item_Input')
        genre_input = Input(shape=(num_genres,), name='Genre_Input')

        # Hyperparameters for embeddings and dense layers
        user_embedding_dim = hp.Int('user_embedding_dim', min_value=32, max_value=128, step=32)
        item_embedding_dim = hp.Int('item_embedding_dim', min_value=32, max_value=128, step=32)

        
        # Embedding layers for user and item
        user_embedding = Embedding(num_users+1, user_embedding_dim, name='User_Embedding')(user_input)
        item_embedding = Embedding(num_items+1, item_embedding_dim, name='Item_Embedding')(item_input)

        
        # Flatten the embeddings
        user_vec = Flatten()(user_embedding)
        item_vec = Flatten()(item_embedding)

        # Concatenate embeddings with genre input
        concat = Concatenate()([user_vec, item_vec, genre_input])

        # Dense layers with hyperparameter search
        dense1_units = hp.Int('dense1_units', min_value=64, max_value=512, step=64)
        dense2_units = hp.Int('dense2_units', min_value=32, max_value=256, step=32)

        dense1 = Dense(dense1_units, activation='relu')(concat)
        dropout1 = Dropout(hp.Float('dropout1', min_value=0.2, max_value=0.5, step=0.1))(dense1)
        dense2 = Dense(dense2_units, activation='relu')(dropout1)
        
        # Output layer
        output = Dense(1, activation='linear')(dense2)  # Rating is continuous, use linear activation

        # Model definition
        model = Model(inputs=[user_input, item_input, genre_input], outputs=output)

        # Compile the model with an optimizer and loss function
        model.compile(optimizer=Adam(), loss='mse', metrics=['mae'])

        return model

# Preparing the training and testing data (just like before)


train_user_input = train_df['User ID'].values
train_item_input = train_df['Item ID'].values
train_genre_input = train_df[genre_columns].values
train_ratings = train_df['Rating'].values

test_user_input = test_df['User ID'].values
test_item_input = test_df['Item ID'].values
test_genre_input = test_df[genre_columns].values
test_ratings = test_df['Rating'].values

# Instantiate the tuner
tuner = kt.RandomSearch(
    CollaborativeFilterHyperModel(),
    objective='val_mae',  # We are optimizing for Mean Absolute Error
    max_trials=5,  # Number of different hyperparameter combinations to try
    executions_per_trial=3,  # Number of executions for each trial
    #directory='"C:/Users/kwhit/OneDrive/Documents/Maths 4th year/Data Science Toolbox 2024/Group-Assignment-2/Katherine W"',  # Directory to save tuning results
    project_name='collab_filter_search'  # Project name for Keras Tuner
)

# Search for the best hyperparameters
tuner.search(
    [train_user_input, train_item_input, train_genre_input],
    train_ratings,
    validation_data=([test_user_input, test_item_input, test_genre_input], test_ratings),
    epochs=10,
    batch_size=64
)

# Retrieve the best hyperparameters
best_hp = tuner.get_best_hyperparameters()[0]
print("Best hyperparameters:", best_hp.values)

# Build the model with the best hyperparameters
best_model = tuner.hypermodel.build(best_hp)

# Train the model using the best hyperparameters
history = best_model.fit(
    [train_user_input, train_item_input, train_genre_input],
    train_ratings,
    validation_data=([test_user_input, test_item_input, test_genre_input], test_ratings),
    epochs=10,
    batch_size=64
)

# Evaluate the best model
test_loss, test_mae = best_model.evaluate([test_user_input, test_item_input, test_genre_input], test_ratings)
print(f"Test MAE: {test_mae}")


Trial 5 Complete [00h 03m 48s]
val_mae: 0.579120914141337

Best val_mae So Far: 0.5092334349950155
Total elapsed time: 00h 15m 43s
Best hyperparameters: {'user_embedding_dim': 64, 'item_embedding_dim': 96, 'dense1_units': 320, 'dense2_units': 32, 'dropout1': 0.2}
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Test MAE: 0.5151842832565308


In [637]:
# Save the best model to a file
best_model.save('best_collab_filter_model.h5')
print("Model saved!")


Model saved!


In [638]:
from tensorflow.keras.models import load_model

# Load the saved model
model = load_model('best_collab_filter_model.h5')
print("Model loaded!")


Model loaded!


In [639]:
model.summary()

Model: "model_1"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 User_Input (InputLayer)        [(None, 1)]          0           []                               
                                                                                                  
 Item_Input (InputLayer)        [(None, 1)]          0           []                               
                                                                                                  
 User_Embedding (Embedding)     (None, 1, 64)        60416       ['User_Input[0][0]']             
                                                                                                  
 Item_Embedding (Embedding)     (None, 1, 96)        161568      ['Item_Input[0][0]']             
                                                                                            

In [651]:
# Predicting Recommendations for a User
user_id = 33  # Example: User ID for which to recommend
user_movies = np.array(range(num_items))  # All movies


In [652]:
# Combine the DataFrames vertically (stacking them on top of each other)
combined_df = pd.concat([train_df, test_df], ignore_index=True)

genre_df = combined_df.copy()

genre_df.drop(columns=['timestamp','Age','Gender','Occupation','zip code','Release Date','URL','Movie Title','User ID','Item ID','Rating'],inplace=True)



In [653]:
# Genres for all movies
movie_genres_input = genre_df[:user_movies.shape[0]]

In [654]:
# Ensure all inputs are properly shaped and converted to the correct dtype
user_input_predict = np.full((user_movies.shape[0], 1), user_id, dtype=np.int32)  # Shape: (num_items, 1)
item_input_predict = user_movies.reshape(-1, 1).astype(np.int32)  # Shape: (num_items, 1)
movie_genres_input = movie_genres_input.astype(np.float32)  # Ensure genre input is float32


In [655]:
# Predict scores
predicted_scores = model.predict([np.full(user_movies.shape, user_id), user_movies, movie_genres_input])




In [656]:
# Recommend Top-5 Movies
recommended_movies = np.argsort(-predicted_scores.flatten())[:10]
print("Top 10 recommended movies for User ID", user_id, ":", recommended_movies)

Top 10 recommended movies for User ID 33 : [1642  868 1064 1646  241  864 1612 1650  416 1450]


In [657]:
movie_info = combined_df.copy()
movie_info.drop(columns=['User ID','URL','Rating','timestamp','Age','Gender','Occupation','zip code','Release Date'], inplace = True)
#movie_info

filtered_df = train_df[train_df['Item ID'].isin(recommended_movies)]

filtered_df.drop_duplicates(subset='Item ID', inplace=True)
filtered_df

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_df.drop_duplicates(subset='Item ID', inplace=True)


Unnamed: 0,User ID,Item ID,Rating,timestamp,Age,Gender,Occupation,zip code,Movie Title,Release Date,...,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
101,1,241,4,878543133,24,M,technician,85711,"Last of the Mohicans, The (1992)",01-Jan-1992,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
618,13,868,5,882139901,47,M,educator,29206,Hearts and Minds (1996),10-Jan-1997,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
670,13,864,4,882141924,47,M,educator,29206,My Fellow Americans (1996),20-Dec-1996,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1662,48,1064,4,879434688,45,M,administrator,12550,Crossfire (1947),01-Jan-1947,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7443,234,1450,3,892335213,60,M,retired,94702,Golden Earrings (1947),01-Jan-1947,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
17899,489,1612,5,891446623,55,M,other,45218,"Leading Man, The (1996)",16-Jan-1998,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
23517,655,1642,4,888474934,50,F,healthcare,60657,Some Mothers Son (1996)|27-Dec-1996||http://us...,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
23565,655,1646,3,891913577,50,F,healthcare,60657,Men With Guns (1997),06-Mar-1998,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
23569,655,1650,4,892871225,50,F,healthcare,60657,"Butcher Boy, The (1998)",01-Jan-1998,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [658]:
# Function to get column names with positive entries for each row
def get_positive_columns(row):
    return [col for col in row.index if row[col] == 1]

# Apply the function to each row
filtered_df.drop(columns=['User ID','Rating'],inplace=True)
positive_columns_per_row = filtered_df.apply(get_positive_columns, axis=1)

# Show the result
for idx, positive_cols in enumerate(positive_columns_per_row):
    print(f"Row {idx} has positive values in columns: {positive_cols}")

Row 0 has positive values in columns: ['Action', 'Romance', 'War']
Row 1 has positive values in columns: ['Drama']
Row 2 has positive values in columns: ['Comedy']
Row 3 has positive values in columns: ['Crime', 'Film-Noir']
Row 4 has positive values in columns: ['Adeventure', 'Romance']
Row 5 has positive values in columns: ['Romance']
Row 6 has positive values in columns: ['Comedy']
Row 7 has positive values in columns: ['Action', 'Drama']
Row 8 has positive values in columns: ['Drama']


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_df.drop(columns=['User ID','Rating'],inplace=True)


In [659]:
true_ratings = test_df.loc[test_df['User ID'] == user_id].sort_values(by=['Rating'], ascending=False).head(5)

true_ratings.drop(columns=['User ID','URL','Rating','timestamp','Age','Gender','Occupation','zip code','Release Date'], inplace = True)

true_ratings

Unnamed: 0,Item ID,Movie Title,Unknown,Action,Adeventure,Animation,Childrens,Comedy,Crime,Documentary,...,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
255,872,Love Jones (1997),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0


In [660]:
# Apply the function to each row
positive_columns_per_row = true_ratings.apply(get_positive_columns, axis=1)

# Show the result
for idx, positive_cols in enumerate(positive_columns_per_row):
    print(f"Row {idx} has positive values in columns: {positive_cols}")

Row 0 has positive values in columns: ['Romance']


## Measuring performance of NCF model

In [86]:
import numpy as np
from sklearn.metrics import mean_absolute_error, mean_squared_error
from math import log2

In [87]:
# Function to compute RMSE and MAE
def compute_rmse_mae(true_ratings, predicted_ratings):
    rmse = np.sqrt(mean_squared_error(true_ratings, predicted_ratings))
    mae = mean_absolute_error(true_ratings, predicted_ratings)
    return rmse, mae


In [88]:
# Function to compute Precision at K
def precision_at_k(y_true, y_pred, k=10):
    # Get the indices of the top-k predictions
    top_k_preds = np.argsort(y_pred)[:, -k:]
    # Check if the true ratings are in the top-k predictions
    relevant_items = np.array([y_true[i] for i in top_k_preds])
    precision = np.sum(relevant_items) / k
    return precision

In [89]:
# Function to compute Recall at K
def recall_at_k(y_true, y_pred, k=10):
    top_k_preds = np.argsort(y_pred)[:, -k:]
    relevant_items = np.array([y_true[i] for i in top_k_preds])
    recall = np.sum(relevant_items) / np.sum(y_true)
    return recall


In [90]:
# Function to compute NDCG at K
def ndcg_at_k(y_true, y_pred, k=10):
    """
    Compute Normalized Discounted Cumulative Gain (NDCG) at K.
    Args:
    - y_true (1D numpy array): Actual ratings or relevance scores.
    - y_pred (1D numpy array): Predicted ratings or relevance scores.
    - k (int): The number of top recommendations to consider.
    
    Returns:
    - NDCG score.
    """
    # Get the indices of the top-K predicted ratings
    top_k_preds = np.argsort(y_pred)[::-1][:k]
    # Ideal order is the indices of the top-K actual ratings
    ideal_order = np.argsort(y_true)[::-1][:k]

    # Discounted Cumulative Gain (DCG)
    dcg = np.sum([y_true[i] / log2(idx + 2) for idx, i in enumerate(top_k_preds)])

    # Ideal DCG (IDCG) based on the ideal ranking
    idcg = np.sum([y_true[i] / log2(idx + 2) for idx, i in enumerate(ideal_order)])

    # Avoid division by zero if IDCG is zero (e.g., if all relevance scores are zero)
    ndcg = dcg / idcg if idcg > 0 else 0.0

    return ndcg

In [91]:
# Variables needed:
# test_user_input, test_item_input, test_genre_input: test data for users, items, and genres.
# test_ratings: true ratings for the test set.
# predicted_scores: predicted ratings from the model.

In [92]:
# Evaluate the model's predictions
predicted_scores = model.predict([test_user_input, test_item_input, test_genre_input])  # Predicted ratings from the model




In [93]:
# Compute RMSE and MAE
rmse, mae = compute_rmse_mae(test_ratings, predicted_scores)
print(f"RMSE: {rmse}, MAE: {mae}")


RMSE: 0.31596579317509804, MAE: 0.23012207826230416


In [94]:
# Precision and Recall at K
k = 10
precision = precision_at_k(test_ratings, predicted_scores, k)
recall = recall_at_k(test_ratings, predicted_scores, k)
print(f"Precision@{k}: {precision}, Recall@{k}: {recall}")


Precision@10: 0.0, Recall@10: 0.0


In [95]:
import numpy as np
from math import log2


# Now you can call this function to calculate NDCG at K.
k = 10  # Change K based on your needs (e.g., Top-5, Top-10, etc.)
ndcg = ndcg_at_k(test_ratings, predicted_scores, k)
print(f"NDCG@{k}: {ndcg}")


NDCG@10: 0.0


The last two metrics have a score of 0, however this may be becaise the data we have is very sparse. We may find that with more data these scores increase.

## Building a User-based Collaborative filter

We now try and builda neural network hat will suggest movies based on the preferences of other similar users, rather than the watching history of the user ( which we did in the previous model)

In [532]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.layers import Input, Embedding, Flatten, Concatenate, Dense, Dropout
from tensorflow.keras.models import Model
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, MinMaxScaler

In [533]:
num_users = 943
num_items = 1682
num_occupations = 21  # Number of unique occupations
num_genders = 2        # Gender (Male or Female)

In [534]:
# Encoding UserID, ItemID, Gender, Occupation
user_encoder = LabelEncoder()
item_encoder = LabelEncoder()
gender_encoder = LabelEncoder()
occupation_encoder = LabelEncoder()

train_df['User ID'] = user_encoder.fit_transform(train_df['User ID'])
train_df['Item ID'] = item_encoder.fit_transform(train_df['Item ID'])
train_df['Gender'] = gender_encoder.fit_transform(train_df['Gender'])
train_df['Occupation'] = occupation_encoder.fit_transform(train_df['Occupation'])

test_df['User ID'] = user_encoder.fit_transform(test_df['User ID'])
test_df['Item ID'] = item_encoder.fit_transform(test_df['Item ID'])
test_df['Gender'] = gender_encoder.fit_transform(test_df['Gender'])
test_df['Occupation'] = occupation_encoder.fit_transform(test_df['Occupation'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_df['User ID'] = user_encoder.fit_transform(train_df['User ID'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_df['Item ID'] = item_encoder.fit_transform(train_df['Item ID'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_df['Gender'] = gender_encoder.fit_transform(train_df['Gende

In [555]:
# Normalize the exact age feature
scaler = MinMaxScaler()
train_df['Age'] = scaler.fit_transform(train_df[['Age']])  # Normalize age between 0 and 1
# Normalize the exact age feature
scaler = MinMaxScaler()
test_df['Age'] = scaler.fit_transform(test_df[['Age']])  # Normalize age between 0 and 1


#Now normalize ratings for each user
train_df['Rating'] = scaler.fit_transform(train_df[['Rating']])  # Normalize age between 0 and 1
test_df['Rating'] = scaler.fit_transform(test_df[['Rating']])  # Normalize age between 0 and 1


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_df['Age'] = scaler.fit_transform(train_df[['Age']])  # Normalize age between 0 and 1
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_df['Rating'] = scaler.fit_transform(train_df[['Rating']])  # Normalize age between 0 and 1


In [556]:
# Prepare Input Features
train_user_input = train_df[['User ID', 'Age', 'Gender', 'Occupation']].values
train_item_input = train_df['Item ID'].values
train_ratings = train_df['Rating'].values

test_user_input = test_df[['User ID', 'Age', 'Gender', 'Occupation']].values
test_item_input = test_df['Item ID'].values
test_ratings = test_df['Rating'].values

In [557]:
# Model Architecture
# Inputs
user_input = Input(shape=(4,), name='User_Input')  # 4 features: UserID, Age, Gender, Occupation
item_input = Input(shape=(1,), name='Item_Input')


In [558]:
# Embedding layers for user and item
user_embedding = Embedding(num_users, 50, name='User_Embedding')(user_input[:, 0])  # UserID
age_input = user_input[:, 1:]  # Age, Gender, Occupation (no embedding for continuous age)
age_vec = Dense(10, activation='relu')(age_input)  # Dense layer to process age, gender, occupation inputs
item_embedding = Embedding(num_items, 50, name='Item_Embedding')(item_input)


In [559]:
# Flatten embeddings
user_vec = Flatten()(user_embedding)
item_vec = Flatten()(item_embedding)

In [560]:
# Concatenate all embeddings (user, item, age, gender, occupation)
concat = Concatenate()([user_vec, age_vec, item_vec])


In [561]:
# Dense layers
dense1 = Dense(128, activation='relu')(concat)
dropout1 = Dropout(0.3)(dense1)
dense2 = Dense(64, activation='relu')(dropout1)
output = Dense(1, activation='linear')(dense2)  # Rating is a continuous value, so use linear activation


In [562]:
# Model definition
model = Model(inputs=[user_input, item_input], outputs=output)
model.compile(optimizer='adam', loss='mse', metrics=['mae'])


In [563]:
model.summary()

Model: "model_15"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 User_Input (InputLayer)        [(None, 4)]          0           []                               
                                                                                                  
 tf.__operators__.getitem_19 (S  (None,)             0           ['User_Input[0][0]']             
 licingOpLambda)                                                                                  
                                                                                                  
 Item_Input (InputLayer)        [(None, 1)]          0           []                               
                                                                                                  
 User_Embedding (Embedding)     (None, 50)           47150       ['tf.__operators__.getitem

In [564]:
# Training the model
history = model.fit(
    [train_user_input, train_item_input],
    train_ratings,
    validation_data=([test_user_input, test_item_input], test_ratings),
    epochs=10,
    batch_size=64
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [565]:
user_embedding_weights = model.get_layer('User_Embedding').get_weights()[0]
print(user_embedding_weights)


[[-0.03560853  0.01323607  0.0347047  ... -0.01349515  0.07017625
   0.03562173]
 [-0.08358413 -0.10854271 -0.03075899 ...  0.05906777  0.00916468
   0.0049558 ]
 [ 0.11670762  0.03522892  0.15495305 ... -0.1689883  -0.07446649
  -0.11847503]
 ...
 [-0.06660049  0.03320914 -0.10743857 ...  0.0952898   0.05160312
   0.09754574]
 [ 0.02742307 -0.01105282  0.02913474 ... -0.00212242 -0.01340621
   0.07760171]
 [-0.04358456  0.00753317  0.04807278 ...  0.02119211  0.01697386
   0.03496857]]


In [566]:
# Predicting Recommendations for a User
user_id = 3 # Example: User ID for which to recommend
user_age = train_df.loc[train_df['User ID'] == user_id, 'Age'].values[0]
user_gender = train_df.loc[train_df['User ID'] == user_id, 'Gender'].values[0]
user_occupation = train_df.loc[train_df['User ID'] == user_id, 'Occupation'].values[0]

user_data = np.array([[user_id, user_age, user_gender, user_occupation]])  # Example: Age=0.5 (normalized), Gender=0, Occupation=0 for simplicity (you would use actual data)
user_movies = np.array(range(num_items))  # All movies


In [567]:
print('Age:',user_age)
print('Gender',user_gender)
print('Occupaion',user_occupation)

Age: 0.2575757575757575
Gender 1
Occupaion 19


In [568]:
# Predict scores
predicted_scores = model.predict([np.tile(user_data, (num_items, 1)), user_movies])




In [577]:
# Recommend Top-5 Movies
recommended_movies = np.argsort(-predicted_scores.flatten())[:10]
print("Top 10 recommended movies for User ID", user_id, ":", recommended_movies)

Top 10 recommended movies for User ID 3 : [566 431 413 563 523 464 575 577 377 574]


In [578]:
recommended_movies

array([566, 431, 413, 563, 523, 464, 575, 577, 377, 574], dtype=int64)

In [579]:
movie_info = combined_df.copy()


In [580]:
filtered_user_df = train_df[train_df['Item ID'].isin(recommended_movies)]

In [581]:
filtered_user_df.drop_duplicates(subset='Item ID', inplace=True)
filtered_user_df.drop(columns=['Gender','Rating'],inplace=True)

filtered_user_df

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_user_df.drop_duplicates(subset='Item ID', inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_user_df.drop(columns=['Gender','Rating'],inplace=True)


Unnamed: 0,User ID,Item ID,timestamp,Age,Occupation,zip code,Movie Title,Release Date,URL,Unknown,...,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western,Raing
5723,183,377,889911749,0.454545,10,76013,Anna (1996),13-Nov-1996,http://us.imdb.com/M/title-exact?Anna%20(1996),0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
7400,233,413,892333573,0.80303,15,94702,Pather Panchali (1955),22-Mar-1996,http://us.imdb.com/M/title-exact?Pather%20Panc...,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.75
7706,243,431,880605553,0.318182,19,80525,"Saint of Fort Washington, The (1993)",01-Jan-1993,http://us.imdb.com/M/title-exact?Saint%20of%20...,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
9123,277,464,875306613,0.393939,14,85251,Santa with Muscles (1996),08-Nov-1996,http://us.imdb.com/M/title-exact?Santa%20with%...,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
16044,435,523,880142614,0.30303,13,20009,Someone Elses America (1995)|10-May-1996||http...,0,0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
23477,653,563,887650483,0.651515,7,60657,Bitter Sugar (Azucar Amargo) (1996),22-Nov-1996,http://us.imdb.com/M/title-exact?Bitter%20Suga...,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.75
23517,653,566,888474934,0.651515,7,60657,Some Mothers Son (1996)|27-Dec-1996||http://us...,0,0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.75
23564,653,575,891913500,0.651515,7,60657,"Spanish Prisoner, The (1997)",27-Mar-1998,"http://us.imdb.com/Title?Spanish+Prisoner,+The...",0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.75
23569,653,574,892871225,0.651515,7,60657,"Butcher Boy, The (1998)",01-Jan-1998,http://us.imdb.com/M/title-exact?imdb-title-11...,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.75
24192,673,577,889489913,0.409091,13,28814,Entertaining Angels: The Dorothy Day Story (1996),27-Sep-1996,http://us.imdb.com/M/title-exact?Entertaining%...,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [582]:
# Function to get column names with positive entries for each row
def get_positive_columns(row):
    return [col for col in row.index if row[col] == 1]


# Apply the function to each row
positive_columns_per_row_user = filtered_user_df.apply(get_positive_columns, axis=1)

# Show the result
for idx, positive_cols in enumerate(positive_columns_per_row_user):
    print(f"Row {idx} has positive values in columns: {positive_cols}")

Row 0 has positive values in columns: ['Drama', 'Raing']
Row 1 has positive values in columns: ['Drama']
Row 2 has positive values in columns: ['Drama', 'Raing']
Row 3 has positive values in columns: ['Comedy', 'Raing']
Row 4 has positive values in columns: ['Comedy', 'Raing']
Row 5 has positive values in columns: ['Drama']
Row 6 has positive values in columns: ['Comedy']
Row 7 has positive values in columns: ['Drama', 'Thriller']
Row 8 has positive values in columns: ['Drama']
Row 9 has positive values in columns: ['Drama', 'Raing']


In [583]:
true_user_ratings = test_df.loc[test_df['User ID'] == user_id].sort_values(by=['Rating'], ascending=False).head(5)

true_user_ratings.drop(columns=['User ID','URL','Rating','timestamp','Age','Gender','Occupation','zip code','Release Date'], inplace = True)

true_user_ratings

Unnamed: 0,Item ID,Movie Title,Unknown,Action,Adeventure,Animation,Childrens,Comedy,Crime,Documentary,...,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
28,10,Seven (Se7en) (1995),0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0


In [584]:
# Apply the function to each row
positive_columns_per_row = true_user_ratings.apply(get_positive_columns, axis=1)

# Show the result
for idx, positive_cols in enumerate(positive_columns_per_row):
    print(f"Row {idx} has positive values in columns: {positive_cols}")

Row 0 has positive values in columns: ['Crime', 'Thriller']
