In [None]:
# libraries
import pandas as pd
import ast
import numpy as np
import tensorflow as tf
import gc
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, MinMaxScaler, StandardScaler, MultiLabelBinarizer
from tensorflow.keras.layers import Input, Embedding, Flatten, Dense, Dot, concatenate
from tensorflow.keras.models import Model

# Report how many GPUs TensorFlow can see in this runtime.
gpu_devices = tf.config.list_physical_devices('GPU')
print("Num GPUs Available: ", len(gpu_devices))


# Load the dataframe from file

In [2]:
# The dataframe file must already be in your Google Drive; mounting the drive
# makes it readable from this runtime.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


In [None]:
# Location of the merged dataframe on the mounted Google Drive.
MERGED_DF_PATH = "/content/drive/MyDrive/merged_df.parquet"
merged_df = pd.read_parquet(MERGED_DF_PATH)

# If the cell above doesn't work, uncomment the code in the next cell and run it instead:

In [None]:
# Fallback loader (disabled): reads the parquet file one row group at a time,
# converts each group to a pandas DataFrame and concatenates them into
# merged_df. Uncomment the whole cell to use it.
# import pyarrow.parquet as pq
# import pandas as pd

# file_path = "/content/drive/MyDrive/merged_df.parquet"
# parquet_file = pq.ParquetFile(file_path)

# # Initialize an empty list to store DataFrames
# dataframes = []

# # Iterate through each row group
# for i in range(parquet_file.num_row_groups):
#     row_group = parquet_file.read_row_group(i)
#     df = row_group.to_pandas()  # Convert row group to Pandas DataFrame
#     dataframes.append(df)  # Add the DataFrame to the list
#     print(f"Loaded row group {i + 1}/{parquet_file.num_row_groups}, shape: {df.shape}")

# # Concatenate all row groups into a single DataFrame
# merged_df = pd.concat(dataframes, ignore_index=True)

# # Free up memory
# del dataframes

# # Display the final DataFrame shape
# print(f"Final DataFrame shape: {merged_df.shape}")


Loaded row group 1/11, shape: (1048576, 51)
Loaded row group 2/11, shape: (1048576, 51)
Loaded row group 3/11, shape: (1048576, 51)
Loaded row group 4/11, shape: (1048576, 51)
Loaded row group 5/11, shape: (1048576, 51)
Loaded row group 6/11, shape: (1048576, 51)
Loaded row group 7/11, shape: (1048576, 51)
Loaded row group 8/11, shape: (1048576, 51)
Loaded row group 9/11, shape: (1048576, 51)
Loaded row group 10/11, shape: (1048576, 51)
Loaded row group 11/11, shape: (951877, 51)
Final DataFrame shape: (11437637, 51)


In [None]:
# Plain-text preview of the first rows of the merged table.
head_preview = merged_df.head()
print(head_preview)

   userId  movieId  rating  budget original_language          original_title  \
0       1      110     1.0       0                fr  Trois couleurs : Rouge   
1      11      110     3.5       0                fr  Trois couleurs : Rouge   
2      22      110     5.0       0                fr  Trois couleurs : Rouge   
3      24      110     5.0       0                fr  Trois couleurs : Rouge   
4      29      110     3.0       0                fr  Trois couleurs : Rouge   

   popularity  year  revenue  duration  ... Italiano  Português  Pусский  \
0    7.832755  1994      0.0        99  ...        0          0        0   
1    7.832755  1994      0.0        99  ...        0          0        0   
2    7.832755  1994      0.0        99  ...        0          0        0   
3    7.832755  1994      0.0        99  ...        0          0        0   
4    7.832755  1994      0.0        99  ...        0          0        0   

   suomi  svenska  العربية  हिन्दी  日本語  普通话  Other_language  

In [None]:
# List every column available in the merged table.
column_index = merged_df.columns
print(column_index)

Index(['userId', 'movieId', 'rating', 'budget', 'original_language',
       'original_title', 'popularity', 'year', 'revenue', 'duration', 'title',
       'vote_average', 'vote_count', 'Action', 'Adventure', 'Animation',
       'Comedy', 'Crime', 'Documentary', 'Drama', 'Family', 'Fantasy',
       'Foreign', 'History', 'Horror', 'Music', 'Mystery', 'Romance',
       'Science Fiction', 'TV Movie', 'Thriller', 'War', 'Western', 'Deutsch',
       'English', 'Español', 'Français', 'Italiano', 'Português', 'Pусский',
       'suomi', 'svenska', 'العربية', 'हिन्दी', '日本語', '普通话',
       'Other_language'],
      dtype='object')


In [None]:
# The Embedding layers are indexed directly with the raw userId / movieId
# values, so each table needs one row per possible id (0..max id) rather than
# one row per distinct id. Sizing with nunique() leaves every id >= nunique()
# out of range whenever the ids are not exactly 0..n-1.
num_users = int(merged_df['userId'].max()) + 1
num_movies = int(merged_df['movieId'].max()) + 1
embedding_size = 8  # Size of the embedding vectors

In [None]:
# Define User Tower: a single user id -> embedding lookup -> flat vector.
user_input = Input(shape=(1,), name='user_input')
user_embedding_layer = Embedding(input_dim=num_users, output_dim=embedding_size)
user_embedding = user_embedding_layer(user_input)
# Drop the length-1 sequence axis so the vector has shape (batch, embedding_size).
user_vector = Flatten()(user_embedding)

In [None]:
# Define Content Tower: a single movie id -> embedding lookup -> flat vector.
movies_input = Input(shape=(1,), name='movies_input')
movie_embedding_layer = Embedding(input_dim=num_movies, output_dim=embedding_size)
movie_embedding = movie_embedding_layer(movies_input)
# Drop the length-1 sequence axis so the vector has shape (batch, embedding_size).
movie_vector = Flatten()(movie_embedding)

# First model. Adam optimizer, mean squared error loss and MAE metric. Linear activation function

In [None]:
# Combine Towers: the user/movie affinity is the dot product of the two
# embedding vectors, passed through a single linear unit.
dot_layer = Dot(axes=1)
dot_product = dot_layer([user_vector, movie_vector])
rating_head = Dense(1, activation='linear')
output = rating_head(dot_product)

In [None]:
# Build Model: two id inputs in, one predicted rating out.
model = Model(
    inputs=[user_input, movies_input],
    outputs=output,
)
# Adam optimizer, MSE as the training loss, MAE reported as a metric.
model.compile(
    optimizer='adam',
    loss='mean_squared_error',
    metrics=['mae'],
)

In [None]:
# (userId, movieId) pairs are the model inputs; the ratings are the targets.
user_movie_pairs = merged_df[['userId', 'movieId']].values
ratings = (merged_df['rating']).astype(float).values

# Split into train and test sets
X_train, X_test, y_train, y_test = train_test_split(user_movie_pairs, ratings, test_size=0.2, random_state=42)
# Cast both splits to an integer dtype for the Embedding lookups. Assigning
# `X[:, i].astype(int)` back into a column cannot change the array's dtype, so
# the cast has to be applied to the whole array.
X_train = X_train.astype('int64', copy=False)
X_test = X_test.astype('int64', copy=False)
# Train the Model
model.fit([X_train[:, 0], X_train[:, 1]], y_train, epochs=10, batch_size=256)

# Evaluate the Model
loss, mae = model.evaluate([X_test[:, 0], X_test[:, 1]], y_test)
print(f"Test Loss: {loss}, Test MAE: {mae}")


Epoch 1/10
[1m35743/35743[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1106s[0m 31ms/step - loss: 2.7518 - mae: 1.2219
Epoch 2/10
[1m11506/35743[0m [32m━━━━━━[0m[37m━━━━━━━━━━━━━━[0m [1m11:40[0m 29ms/step - loss: 0.6999 - mae: 0.6358

# Second model. Adam optimizer, mean squared error loss and MAE metric. Sigmoid activation function

In [None]:
# Combine Towers
# Give this model its own embedding layers. Building it on `user_vector` /
# `movie_vector` would share the embedding weights with every other model built
# on those tensors, so it would start from weights another model already trained.
user_vector2 = Flatten()(Embedding(input_dim=num_users, output_dim=embedding_size)(user_input))
movie_vector2 = Flatten()(Embedding(input_dim=num_movies, output_dim=embedding_size)(movies_input))
dot_product = Dot(axes=1)([user_vector2, movie_vector2])

# Sigmoid only produces values in (0, 1), which cannot reach the rating targets,
# so its output is stretched onto the observed rating range.
rating_min = float(merged_df['rating'].min())
rating_max = float(merged_df['rating'].max())
sigmoid_score = Dense(1, activation='sigmoid')(dot_product)
output2 = tf.keras.layers.Rescaling(scale=rating_max - rating_min, offset=rating_min)(sigmoid_score)

In [None]:
# Build Model: two id inputs in, the sigmoid head's prediction out.
model2 = Model(
    inputs=[user_input, movies_input],
    outputs=output2,
)
# Adam optimizer, MSE as the training loss, MAE reported as a metric.
model2.compile(
    optimizer='adam',
    loss='mean_squared_error',
    metrics=['mae'],
)

In [None]:
# (userId, movieId) pairs are the model inputs; the ratings are the targets.
user_movie_pairs = merged_df[['userId', 'movieId']].values
ratings = (merged_df['rating']).astype(float).values

# Split into train and test sets
X_train, X_test, y_train, y_test = train_test_split(user_movie_pairs, ratings, test_size=0.2, random_state=42)
# Cast both splits to an integer dtype for the Embedding lookups. Assigning
# `X[:, i].astype(int)` back into a column cannot change the array's dtype, so
# the cast has to be applied to the whole array.
X_train = X_train.astype('int64', copy=False)
X_test = X_test.astype('int64', copy=False)
# Train the Model
model2.fit([X_train[:, 0], X_train[:, 1]], y_train, epochs=10, batch_size=256)

# Evaluate the Model
loss, mae = model2.evaluate([X_test[:, 0], X_test[:, 1]], y_test)
print(f"Test Loss: {loss}, Test MAE: {mae}")

# Third model. Adam optimizer, mean squared error. MAE metric. Tanh activation function

In [None]:
# Combine Towers
# Give this model its own embedding layers. Building it on `user_vector` /
# `movie_vector` would share the embedding weights with every other model built
# on those tensors, so it would start from weights another model already trained.
user_vector3 = Flatten()(Embedding(input_dim=num_users, output_dim=embedding_size)(user_input))
movie_vector3 = Flatten()(Embedding(input_dim=num_movies, output_dim=embedding_size)(movies_input))
dot_product = Dot(axes=1)([user_vector3, movie_vector3])

# Tanh only produces values in (-1, 1), which cannot reach the rating targets,
# so its output is mapped linearly onto the observed rating range.
rating_min = float(merged_df['rating'].min())
rating_max = float(merged_df['rating'].max())
tanh_score = Dense(1, activation='tanh')(dot_product)
output3 = tf.keras.layers.Rescaling(
    scale=(rating_max - rating_min) / 2.0,
    offset=(rating_max + rating_min) / 2.0,
)(tanh_score)

In [None]:
# Build Model: two id inputs in, the tanh head's prediction out.
model3 = Model(
    inputs=[user_input, movies_input],
    outputs=output3,
)
# Adam optimizer, MSE as the training loss, MAE reported as a metric.
model3.compile(
    optimizer='adam',
    loss='mean_squared_error',
    metrics=['mae'],
)

In [None]:
# (userId, movieId) pairs are the model inputs; the ratings are the targets.
user_movie_pairs = merged_df[['userId', 'movieId']].values
ratings = (merged_df['rating']).astype(float).values

# Split into train and test sets
X_train, X_test, y_train, y_test = train_test_split(user_movie_pairs, ratings, test_size=0.2, random_state=42)
# Cast both splits to an integer dtype for the Embedding lookups. Assigning
# `X[:, i].astype(int)` back into a column cannot change the array's dtype, so
# the cast has to be applied to the whole array.
X_train = X_train.astype('int64', copy=False)
X_test = X_test.astype('int64', copy=False)
# Train the Model
model3.fit([X_train[:, 0], X_train[:, 1]], y_train, epochs=10, batch_size=256)

# Evaluate the Model
loss, mae = model3.evaluate([X_test[:, 0], X_test[:, 1]], y_test)
print(f"Test Loss: {loss}, Test MAE: {mae}")

# Fourth model. Adam optimizer, mean squared error loss, MAE metric. ReLU activation function

In [None]:
# Combine Towers
# Give this model its own embedding layers. Building it on `user_vector` /
# `movie_vector` would share the embedding weights with every other model built
# on those tensors, so it would start from weights another model already trained.
user_vector4 = Flatten()(Embedding(input_dim=num_users, output_dim=embedding_size)(user_input))
movie_vector4 = Flatten()(Embedding(input_dim=num_movies, output_dim=embedding_size)(movies_input))
dot_product = Dot(axes=1)([user_vector4, movie_vector4])
output4 = Dense(1, activation='relu')(dot_product)

In [None]:
# Build Model: two id inputs in, the ReLU head's prediction out.
model4 = Model(
    inputs=[user_input, movies_input],
    outputs=output4,
)
# Adam optimizer, MSE as the training loss, MAE reported as a metric.
model4.compile(
    optimizer='adam',
    loss='mean_squared_error',
    metrics=['mae'],
)

In [None]:
# (userId, movieId) pairs are the model inputs; the ratings are the targets.
user_movie_pairs = merged_df[['userId', 'movieId']].values
ratings = (merged_df['rating']).astype(float).values

# Split into train and test sets
X_train, X_test, y_train, y_test = train_test_split(user_movie_pairs, ratings, test_size=0.2, random_state=42)
# Cast both splits to an integer dtype for the Embedding lookups. Assigning
# `X[:, i].astype(int)` back into a column cannot change the array's dtype, so
# the cast has to be applied to the whole array.
X_train = X_train.astype('int64', copy=False)
X_test = X_test.astype('int64', copy=False)
# Train the Model
model4.fit([X_train[:, 0], X_train[:, 1]], y_train, epochs=10, batch_size=256)

# Evaluate the Model
loss, mae = model4.evaluate([X_test[:, 0], X_test[:, 1]], y_test)
print(f"Test Loss: {loss}, Test MAE: {mae}")

# Fifth model. SGD optimizer, mean squared error loss, MAE metric. ReLU activation function

In [None]:
# Combine Towers
# Give this model its own embedding layers. Building it on `user_vector` /
# `movie_vector` would share the embedding weights with every other model built
# on those tensors, so the optimizer comparison would start from weights another
# model already trained.
user_vector5 = Flatten()(Embedding(input_dim=num_users, output_dim=embedding_size)(user_input))
movie_vector5 = Flatten()(Embedding(input_dim=num_movies, output_dim=embedding_size)(movies_input))
dot_product = Dot(axes=1)([user_vector5, movie_vector5])
# The name `output4` is kept because the following Build Model cell reads it.
output4 = Dense(1, activation='relu')(dot_product)

In [None]:
# Build Model: two id inputs in, the most recently built ReLU head (`output4`) out.
model5 = Model(
    inputs=[user_input, movies_input],
    outputs=output4,
)
# SGD optimizer, MSE as the training loss, MAE reported as a metric.
model5.compile(
    optimizer='sgd',
    loss='mean_squared_error',
    metrics=['mae'],
)

In [None]:
# (userId, movieId) pairs are the model inputs; the ratings are the targets.
user_movie_pairs = merged_df[['userId', 'movieId']].values
ratings = (merged_df['rating']).astype(float).values

# Split into train and test sets
X_train, X_test, y_train, y_test = train_test_split(user_movie_pairs, ratings, test_size=0.2, random_state=42)
# Cast both splits to an integer dtype for the Embedding lookups. Assigning
# `X[:, i].astype(int)` back into a column cannot change the array's dtype, so
# the cast has to be applied to the whole array.
X_train = X_train.astype('int64', copy=False)
X_test = X_test.astype('int64', copy=False)
# Train the Model
model5.fit([X_train[:, 0], X_train[:, 1]], y_train, epochs=10, batch_size=256)

# Evaluate the Model
loss, mae = model5.evaluate([X_test[:, 0], X_test[:, 1]], y_test)
print(f"Test Loss: {loss}, Test MAE: {mae}")

# Sixth model. RMSprop optimizer, mean squared error loss, MAE metric. ReLU activation function

In [None]:
# Combine Towers
# Give this model its own embedding layers. Building it on `user_vector` /
# `movie_vector` would share the embedding weights with every other model built
# on those tensors, so the optimizer comparison would start from weights another
# model already trained.
user_vector6 = Flatten()(Embedding(input_dim=num_users, output_dim=embedding_size)(user_input))
movie_vector6 = Flatten()(Embedding(input_dim=num_movies, output_dim=embedding_size)(movies_input))
dot_product = Dot(axes=1)([user_vector6, movie_vector6])
# The name `output4` is kept because the following Build Model cell reads it.
output4 = Dense(1, activation='relu')(dot_product)

In [None]:
# Build Model: two id inputs in, the most recently built ReLU head (`output4`) out.
model6 = Model(
    inputs=[user_input, movies_input],
    outputs=output4,
)
# RMSprop optimizer, MSE as the training loss, MAE reported as a metric.
model6.compile(
    optimizer='rmsprop',
    loss='mean_squared_error',
    metrics=['MAE'],
)

In [None]:
# (userId, movieId) pairs are the model inputs; the ratings are the targets.
user_movie_pairs = merged_df[['userId', 'movieId']].values
ratings = (merged_df['rating']).astype(float).values

# Split into train and test sets
X_train, X_test, y_train, y_test = train_test_split(user_movie_pairs, ratings, test_size=0.2, random_state=42)
# Cast both splits to an integer dtype for the Embedding lookups. Assigning
# `X[:, i].astype(int)` back into a column cannot change the array's dtype, so
# the cast has to be applied to the whole array.
X_train = X_train.astype('int64', copy=False)
X_test = X_test.astype('int64', copy=False)
# Train the Model
model6.fit([X_train[:, 0], X_train[:, 1]], y_train, epochs=10, batch_size=256)

# Evaluate the Model
loss, mae = model6.evaluate([X_test[:, 0], X_test[:, 1]], y_test)
print(f"Test Loss: {loss}, Test MAE: {mae}")

# Seventh model. Adagrad optimizer, mean squared error loss, MAE metric. ReLU activation function

In [None]:
# Combine Towers
# Give this model its own embedding layers. Building it on `user_vector` /
# `movie_vector` would share the embedding weights with every other model built
# on those tensors, so the optimizer comparison would start from weights another
# model already trained.
user_vector7 = Flatten()(Embedding(input_dim=num_users, output_dim=embedding_size)(user_input))
movie_vector7 = Flatten()(Embedding(input_dim=num_movies, output_dim=embedding_size)(movies_input))
dot_product = Dot(axes=1)([user_vector7, movie_vector7])
# The name `output4` is kept because the following Build Model cell reads it.
output4 = Dense(1, activation='relu')(dot_product)

In [None]:
# Build Model: two id inputs in, the most recently built ReLU head (`output4`) out.
model7 = Model(
    inputs=[user_input, movies_input],
    outputs=output4,
)
# Adagrad optimizer, MSE as the training loss, MAE reported as a metric.
model7.compile(
    optimizer='adagrad',
    loss='mean_squared_error',
    metrics=['mae'],
)

In [None]:
# (userId, movieId) pairs are the model inputs; the ratings are the targets.
user_movie_pairs = merged_df[['userId', 'movieId']].values
ratings = (merged_df['rating']).astype(float).values

# Split into train and test sets
X_train, X_test, y_train, y_test = train_test_split(user_movie_pairs, ratings, test_size=0.2, random_state=42)
# Cast both splits to an integer dtype for the Embedding lookups. Assigning
# `X[:, i].astype(int)` back into a column cannot change the array's dtype, so
# the cast has to be applied to the whole array.
X_train = X_train.astype('int64', copy=False)
X_test = X_test.astype('int64', copy=False)
# Train the Model
model7.fit([X_train[:, 0], X_train[:, 1]], y_train, epochs=10, batch_size=256)

# Evaluate the Model
loss, mae = model7.evaluate([X_test[:, 0], X_test[:, 1]], y_test)
print(f"Test Loss: {loss}, Test MAE: {mae}")

# Model with movie features instead of just movieId

In [None]:
# Show the first rows of merged_df as this cell's output.
merged_df.head()

In [None]:
# Show the column index of merged_df as this cell's output.
merged_df.columns

In [None]:
import kagglehub
import pandas as pd
import ast
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, MinMaxScaler, StandardScaler
from tensorflow.keras.layers import Input, Embedding, Flatten, Dense, Dot, concatenate
from tensorflow.keras.models import Model
import tensorflow as tf
import gc


# Encode the original-language code as integer ids.
language_encoder = LabelEncoder()
merged_df['language_encoded'] = language_encoder.fit_transform(merged_df['original_language'])

# One scaler per numeric movie feature: min-max scaling for year and duration,
# standardisation for budget, popularity, vote statistics and revenue.
year_scaler = MinMaxScaler()
budget_scaler = StandardScaler()
popularity_scaler = StandardScaler()
duration_scaler = MinMaxScaler()
vote_average_scaler = StandardScaler()
vote_count_scaler = StandardScaler()
revenue_scaler = StandardScaler()

# (source column, scaled column, scaler), listed in the order the scaled
# columns are appended to merged_df.
_scaling_plan = [
    ('year', 'year_scaled', year_scaler),
    ('budget', 'budget_scaled', budget_scaler),
    ('popularity', 'popularity_scaled', popularity_scaler),
    ('duration', 'duration_scaled', duration_scaler),
    ('vote_average', 'vote_average_scaled', vote_average_scaler),
    ('vote_count', 'vote_count_scaled', vote_count_scaler),
    ('revenue', 'revenue_scaled', revenue_scaler),
]
for _source_col, _scaled_col, _scaler in _scaling_plan:
    # Double brackets keep the input 2-D, as the scalers require.
    merged_df[_scaled_col] = _scaler.fit_transform(merged_df[[_source_col]])


# Model parameters
# The user Embedding is indexed directly with the raw userId values, so the
# table needs one row per possible id (0..max id) rather than one row per
# distinct id. Sizing with nunique() leaves every id >= nunique() out of range
# whenever the ids are not exactly 0..n-1. num_movies is sized the same way.
num_users = int(merged_df['userId'].max()) + 1
num_movies = int(merged_df['movieId'].max()) + 1
# language_encoded comes from a LabelEncoder, so its ids are already 0..n-1.
num_languages = merged_df['language_encoded'].nunique()
genre_columns = ['Action', 'Adventure', 'Animation', 'Comedy', 'Crime', 'Documentary',
     'Drama', 'Family', 'Fantasy', 'Foreign', 'History', 'Horror', 'Music',
     'Mystery', 'Romance', 'Science Fiction', 'TV Movie', 'Thriller', 'War',
     'Western']
language_columns = ['Deutsch', 'English', 'Español', 'Français', 'Italiano',
     'Português', 'Pусский', 'suomi', 'svenska', 'العربية', 'हिन्दी',
     '日本語', '普通话', 'Other_language']
# Count the genre / spoken-language indicator columns actually present in merged_df.
num_genres = sum(col in genre_columns for col in merged_df.columns)
num_language_columns = sum(col in language_columns for col in merged_df.columns)
embedding_size = 8

# Define User Tower: a single user id -> embedding lookup -> flat vector.
user_input = Input(shape=(1,), name='user_input')
user_embedding = Embedding(input_dim=num_users, output_dim=embedding_size)(user_input)
user_vector = Flatten()(user_embedding)


def _scalar_feature_branch(input_name):
    """Build the branch for one scalar movie feature.

    Creates a single-value Input named `input_name` and applies a ReLU Dense
    layer of width `embedding_size` to it.

    Returns:
        (input tensor, dense output tensor)
    """
    feature_input = Input(shape=(1,), name=input_name)
    feature_dense = Dense(embedding_size, activation='relu')(feature_input)
    return feature_input, feature_dense


# Year input (normalized)
movie_year_input, movie_year_dense = _scalar_feature_branch('movie_year_input')

# The main language is an integer id, so it goes through an embedding instead of a Dense layer.
main_language_input = Input(shape=(1,), name='main_language_input')
main_language_embedding = Embedding(input_dim=num_languages, output_dim=embedding_size)(main_language_input)
main_language_vector = Flatten()(main_language_embedding)

# Remaining scalar movie features, one branch each.
budget_input, budget_dense = _scalar_feature_branch("movie_budget_input")
popularity_input, popularity_dense = _scalar_feature_branch("movie_popularity_input")
duration_input, duration_dense = _scalar_feature_branch("movie_duration_input")
vote_average_input, vote_average_dense = _scalar_feature_branch('vote_average_input')
vote_count_input, vote_count_dense = _scalar_feature_branch('vote_count_input')
revenue_input, revenue_dense = _scalar_feature_branch("revenue_input")

# NOTE: genre and languages are not currently used in the model. We could not get this to work.
# The inputs and dense layers we wanted to use are defined below.

# Genre indicators: feature matrix, model input and ReLU projection.
genre_features = merged_df[genre_columns].values
genre_input = Input(shape=(num_genres,), name='genre_input')
genre_dense = Dense(embedding_size, activation='relu')(genre_input)

# Spoken-language indicators: feature matrix, model input and ReLU projection.
language_features = merged_df[language_columns].values
language_input = Input(shape=(num_language_columns,), name='language_input')
language_dense = Dense(embedding_size, activation='relu')(language_input)


In [None]:
# Combine Movie Features
# The language branch is `main_language_vector`, the flattened embedding built
# from `main_language_input`; every other entry is a per-feature Dense output.
movie_features = concatenate([
    movie_year_dense,
    main_language_vector,
    budget_dense,
    popularity_dense,
    duration_dense,
    vote_average_dense,
    vote_count_dense,
    revenue_dense,
])
# Project the concatenated features down to embedding_size so they can be
# dotted with the user vector.
movie_features_dense = Dense(embedding_size, activation='relu')(movie_features)


In [None]:
# Prepare Training Data
# Column order matters: the training cell slices features out by position.
_model_input_columns = [
    'userId',
    'year_scaled',
    'language_encoded',
    'budget_scaled',
    'popularity_scaled',
    'duration_scaled',
    'vote_average_scaled',
    'vote_count_scaled',
    'revenue_scaled',
]
user_movie_pairs = merged_df[_model_input_columns].values
ratings = (merged_df['rating']).astype(float).values

In [None]:
# Combine Towers: dot the user vector with the projected movie features, then
# pass the result through a single ReLU unit.
dot_layer_final = Dot(axes=1)
dot_product = dot_layer_final([user_vector, movie_features_dense])
rating_head_final = Dense(1, activation='relu')
output_final_model = rating_head_final(dot_product)

In [None]:
# Build Model
# The inputs are listed in the same order as the arrays passed to fit() and
# evaluate(). The language input is `main_language_input`, the Input that
# feeds the language embedding.
model_final = Model(
    inputs=[
        user_input,
        movie_year_input,
        main_language_input,
        budget_input,
        popularity_input,
        duration_input,
        vote_average_input,
        vote_count_input,
        revenue_input,
    ],
    outputs=output_final_model,
)
# Adam optimizer, MSE as the training loss, MAE reported as a metric.
model_final.compile(optimizer='adam', loss='mse', metrics=['MAE'])

In [None]:
# Split into train and test sets
X_train, X_test, y_train, y_test = train_test_split(user_movie_pairs, ratings, test_size=0.2, random_state=42)

# Slice each of the nine feature columns out as an (n, 1) array, in the same
# order as the columns selected into user_movie_pairs.
(X_train_user, X_train_year, X_train_language, X_train_budget, X_train_popularity,
 X_train_duration, X_train_vote_average, X_train_vote_count, X_train_revenue) = [
    X_train[:, col].reshape(-1, 1) for col in range(9)
]

(X_test_user, X_test_year, X_test_language, X_test_budget, X_test_popularity,
 X_test_duration, X_test_vote_average, X_test_vote_count, X_test_revenue) = [
    X_test[:, col].reshape(-1, 1) for col in range(9)
]

# Train the Model
# Every model input gets its own array, in the order the inputs were declared.
train_inputs = [
    X_train_user, X_train_year, X_train_language, X_train_budget, X_train_popularity,
    X_train_duration, X_train_vote_average, X_train_vote_count, X_train_revenue,
]
model_final.fit(train_inputs, y_train, epochs=10, batch_size=256)

# Evaluate the Model
# The held-out split is passed in the same input order.
test_inputs = [
    X_test_user, X_test_year, X_test_language, X_test_budget, X_test_popularity,
    X_test_duration, X_test_vote_average, X_test_vote_count, X_test_revenue,
]
loss, MAE = model_final.evaluate(test_inputs, y_test)
print(f"Test Loss: {loss}, Test MAE: {MAE}")

Epoch 1/10
[1m35718/35718[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m105s[0m 3ms/step - MAE: 1.1419 - loss: 2.4617
Epoch 2/10
[1m35718/35718[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m100s[0m 3ms/step - MAE: 0.7577 - loss: 0.9498
Epoch 3/10
[1m35718/35718[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m100s[0m 3ms/step - MAE: 0.7448 - loss: 0.9287
Epoch 4/10
[1m35718/35718[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m102s[0m 3ms/step - MAE: 0.7357 - loss: 0.9143
Epoch 5/10
[1m35718/35718[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m141s[0m 3ms/step - MAE: 0.7281 - loss: 0.9027
Epoch 6/10
[1m35718/35718[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m144s[0m 3ms/step - MAE: 0.7219 - loss: 0.8924
Epoch 7/10
[1m35718/35718[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m139s[0m 3ms/step - MAE: 0.7168 - loss: 0.8840
Epoch 8/10
[1m35718/35718[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m100s[0m 3ms/step - MAE: 0.7119 - loss: 0.8754
Epoch 9/10
[1m35718/35718[0m 