In [None]:
import pandas as pd
import numpy as np
import re
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime


In [None]:

# scikit-learn: preprocessing utilities and the regression error metrics used
# in the evaluation cell at the end of the notebook.
from sklearn import metrics, preprocessing
from sklearn.metrics import mean_squared_error, mean_absolute_error

from tensorflow.keras import models, layers, utils, optimizers, callbacks

In [None]:
# Load the three '::'-delimited tables. Column names are supplied explicitly
# through `names`; the python engine is selected for the two-character separator.
movies_df = pd.read_csv(
    'data/movies.dat',
    sep='::',
    engine='python',
    names=['movieId', 'title', 'genres'],
    encoding='latin1',
)
users_df = pd.read_csv(
    'data/users.dat',
    sep='::',
    engine='python',
    names=['userId', 'gender', 'age', 'occupation', 'zip-code'],
)
ratings_df = pd.read_csv(
    'data/ratings.dat',
    sep='::',
    engine='python',
    names=['userId', 'movieId', 'rating', 'timestamp'],
)

In [None]:
# Drop movies without a genres string. The explicit copy makes the column
# assignments below act on an independent frame instead of a slice of the
# loaded data (avoids pandas' SettingWithCopyWarning).
movies_df = movies_df[movies_df["genres"].notna()].copy()
# Dense 0..n-1 id; the rest of the notebook uses it in place of movieId.
movies_df["product"] = range(len(movies_df))
# Title with every "(...)" / "[...]" group removed. The pattern is a raw string
# because "\(" and "\[" are invalid escape sequences in a plain literal.
movies_df["name"] = (
    movies_df["title"].str.replace(r"[\(\[].*?[\)\]]", "", regex=True).str.strip()
)
# Release year = trailing "(YYYY)" group. Titles without one become NaN and
# are then given the sentinel 9999 instead of raising during int() parsing.
movies_df["date"] = pd.to_numeric(
    movies_df["title"].str.extract(r"\((\d{4})\)\s*$", expand=False)
).fillna(9999)
# 1 for movies dated before 2000, else 0 (the 9999 sentinel therefore maps to 0).
movies_df["old"] = (movies_df["date"] < 2000).astype("int64")


In [None]:
# Zero-based user id.
ratings_df["user"] = ratings_df["userId"] - 1
# NOTE: datetime.fromtimestamp converts to the local timezone of the machine
# running the notebook, so the "daytime"/"weekend" flags below depend on it.
ratings_df["timestamp"] = pd.to_datetime(
    ratings_df["timestamp"].apply(datetime.fromtimestamp)
)
# daytime = 1 when the rating hour is strictly between 6 and 20 (i.e. 07..19).
ratings_df["daytime"] = ratings_df["timestamp"].dt.hour.between(7, 19).astype("int64")
# weekend = 1 for Saturday (5) or Sunday (6).
ratings_df["weekend"] = ratings_df["timestamp"].dt.weekday.isin([5, 6]).astype("int64")
# Attach the dense product id; the join key is spelled out rather than left to
# whatever columns the two frames happen to share.
ratings_df = ratings_df.merge(movies_df[["movieId", "product"]], how="left", on="movieId")
# The rating is the prediction target from here on.
ratings_df = ratings_df.rename(columns={"rating": "y"})


In [None]:
# Keep only the columns used downstream; movies become indexed by "product".
movies_df = movies_df.loc[:, ["product", "name", "old", "genres"]].set_index("product")
ratings_df = ratings_df.loc[:, ["user", "product", "daytime", "weekend", "y"]]

In [None]:
# Per-rating context flags, keyed by (user, product).
context_df = ratings_df.loc[:, ["user", "product", "daytime", "weekend"]]

In [None]:
# Genre tokens of every distinct "|"-separated genres string.
tags = [genre_string.split("|") for genre_string in movies_df["genres"].unique()]
# Sorted so the one-hot column order (and with it the model's feature order) is
# identical on every run; iterating a bare set of strings is not order-stable
# across interpreter sessions. The placeholder genre is excluded.
columns = sorted(
    {genre for genre_list in tags for genre in genre_list} - {'(no genres listed)'}
)

# One 0/1 indicator column per genre, matched against whole tokens rather than
# as a substring of the raw genres string.
genre_lists = movies_df["genres"].str.split("|")
for col in columns:
    movies_df[col] = genre_lists.apply(lambda genres, col=col: 1 if col in genres else 0)



In [None]:
# Heatmap of the boolean mask "cell equals 0" over the movies table.
fig, ax = plt.subplots(figsize=(20, 5))
sns.heatmap(movies_df == 0, vmin=0, vmax=1, cbar=False, ax=ax)
ax.set_title("Movies x Features")
plt.show()

In [None]:
# Users x products matrix of ratings (NaN where a user has not rated a product).
tmp = ratings_df.copy()
ratings_df = tmp.pivot_table(index="user", columns="product", values="y")
# Products that exist in movies_df (indexed by product id) but received no
# rating. These must come from the product ids, not from the user index:
# using the user index would add user ids as bogus all-NaN product columns.
missing_cols = sorted(set(movies_df.index) - set(ratings_df.columns))
# Add the missing products as all-NaN columns and sort all columns in a single
# reindex; the columns' name is kept so a later stack()/reset_index() still
# yields a "product" column.
all_products = pd.Index(
    sorted(set(ratings_df.columns) | set(missing_cols)),
    name=ratings_df.columns.name,
)
ratings_df = ratings_df.reindex(columns=all_products)

In [None]:
# Min-max scale every column into [0.5, 1] and rebuild the frame with the
# original row and column labels.
ratings_df = pd.DataFrame(
    data=preprocessing.MinMaxScaler(feature_range=(0.5, 1)).fit_transform(ratings_df.to_numpy()),
    index=ratings_df.index,
    columns=ratings_df.columns,
)

In [None]:
# Hold out the last 20% of product columns for testing. The split is positional
# (iloc), so it does not rely on the column labels being exactly 0..n-1 the way
# a label-based slice would.
split = int(0.8 * ratings_df.shape[1])
train_df = ratings_df.iloc[:, :split]
test_df = ratings_df.iloc[:, split:]

In [None]:
# Product feature columns: everything in movies_df except the two text columns.
features = movies_df.drop(columns=["genres", "name"]).columns
print(features)
# Context columns: everything in context_df except the two key columns.
context = context_df.drop(columns=["user", "product"]).columns
print(context)

In [None]:
# Long-format training table: one row per non-missing cell of train_df, with
# the cell value in "y", then product features (joined on the movies_df index)
# and context (joined on the columns shared with context_df).
train = (
    train_df.stack(dropna=True)
    .reset_index()
    .rename(columns={0: "y"})
    .merge(movies_df[features], how="left", left_on="product", right_index=True)
    .merge(context_df, how="left")
)

In [None]:
# Long-format test table: one row per non-missing cell of test_df, with the
# cell value in "y", then product features (joined on the movies_df index) and
# context (joined on the columns shared with context_df).
test = (
    test_df.stack(dropna=True)
    .reset_index()
    .rename(columns={0: "y"})
    .merge(movies_df[features], how="left", left_on="product", right_index=True)
    .merge(context_df, how="left")
)

In [None]:
# Sizes used to dimension the network's inputs and embedding tables.
embeddings_size = 50
# Rows of ratings_df are users, columns are products.
usr, prd = ratings_df.shape
feat, ctx = len(features), len(context)

## Collaborative Filtering

In [None]:
# Input layers: one scalar id per sample for the user and for the product.
xusers_in = layers.Input(name="xusers_in", shape=(1,))
xproducts_in = layers.Input(name="xproducts_in", shape=(1,))

# A) Matrix Factorization
## user embedding, reshaped to a flat vector of length embeddings_size
cf_xusers_emb = layers.Embedding(
    name="cf_xusers_emb", input_dim=usr, output_dim=embeddings_size
)(xusers_in)
cf_xusers = layers.Reshape(name='cf_xusers', target_shape=(embeddings_size,))(cf_xusers_emb)
## product embedding, reshaped the same way
cf_xproducts_emb = layers.Embedding(
    name="cf_xproducts_emb", input_dim=prd, output_dim=embeddings_size
)(xproducts_in)
cf_xproducts = layers.Reshape(name='cf_xproducts', target_shape=(embeddings_size,))(cf_xproducts_emb)
## normalized dot product of the user and product vectors
cf_xx = layers.Dot(name='cf_xx', normalize=True, axes=1)([cf_xusers, cf_xproducts])

# B) Neural Network
## separate user embedding for the dense branch
nn_xusers_emb = layers.Embedding(
    name="nn_xusers_emb", input_dim=usr, output_dim=embeddings_size
)(xusers_in)
nn_xusers = layers.Reshape(name='nn_xusers', target_shape=(embeddings_size,))(nn_xusers_emb)
## separate product embedding for the dense branch
nn_xproducts_emb = layers.Embedding(
    name="nn_xproducts_emb", input_dim=prd, output_dim=embeddings_size
)(xproducts_in)
nn_xproducts = layers.Reshape(name='nn_xproducts', target_shape=(embeddings_size,))(nn_xproducts_emb)
## concatenate both vectors and pass them through one ReLU dense layer
nn_xx = layers.Concatenate()([nn_xusers, nn_xproducts])
nn_xx = layers.Dense(name="nn_xx", units=embeddings_size // 2, activation='relu')(nn_xx)

## Content Based

In [None]:
# Product features: a `feat`-wide input followed by a ReLU dense layer of the
# same width.
features_in = layers.Input(name="features_in", shape=(feat,))
features_x = layers.Dense(
    name="features_x",
    units=feat,
    activation='relu',
)(features_in)

## Knowledge Based

In [None]:
# Context: a `ctx`-wide input followed by a ReLU dense layer of the same width.
contexts_in = layers.Input(name="contexts_in", shape=(ctx,))
context_x = layers.Dense(
    name="context_x",
    units=ctx,
    activation='relu',
)(contexts_in)

## Output

In [None]:
# Concatenate the four branch outputs and map them to a single linear unit.
y_out = layers.Concatenate()([cf_xx, nn_xx, features_x, context_x])
y_out = layers.Dense(
    name="y_out",
    units=1,
    activation='linear',
)(y_out)

In [None]:
# Build the four-input model and compile it with Adam, an MAE loss and MAPE as
# the reported metric.
model = models.Model(
    inputs=[xusers_in, xproducts_in, features_in, contexts_in],
    outputs=y_out,
    name="Hybrid_Model",
)
model.compile(
    optimizer='adam',
    loss='mean_absolute_error',
    metrics=['mean_absolute_percentage_error'],
)

In [None]:
# Fit silently (verbose=0) for 100 epochs; 30% of the training rows are held
# out for validation. Inputs are passed in the order the model declares them.
training = model.fit(
    x=[train["user"], train["product"], train[features], train[context]],
    y=train["y"],
    epochs=100,
    batch_size=128,
    shuffle=True,
    verbose=0,
    validation_split=0.3,
)

# Rebind `model` to the model attached to the returned training history.
model = training.model

In [None]:
# Predict on the held-out table and store the predictions next to the targets.
test["yhat"] = model.predict(
    [test["user"], test["product"], test[features], test[context]]
)

In [None]:
# Mean absolute error between the held-out targets and the predictions.
mae = mean_absolute_error(y_true=test['y'], y_pred=test['yhat'])
print(f'Mean Absolute Error: {mae:.4f}')

# Root mean squared error: square root of the mean squared error.
rmse = np.sqrt(
    mean_squared_error(y_true=test['y'], y_pred=test['yhat'])
)
print(f'Root Mean Squared Error: {rmse:.4f}')
