In [None]:
%pip install sklearn

In [None]:
import pandas as pd
import numpy as np
import re
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime


In [None]:

from sklearn import metrics, preprocessing

from tensorflow.keras import models, layers, utils, optimizers, callbacks

In [None]:
# Read the three '::'-delimited files. They carry no header row, so column names
# are supplied explicitly; the two-character separator is why engine='python' is used.
movies_df = pd.read_csv(
    'data/movies.dat',
    sep='::',
    engine='python',
    names=['movieId', 'title', 'genres'],
    encoding='latin1',
)
users_df = pd.read_csv(
    'data/users.dat',
    sep='::',
    engine='python',
    names=['userId', 'gender', 'age', 'occupation', 'zip-code'],
)
ratings_df = pd.read_csv(
    'data/ratings.dat',
    sep='::',
    engine='python',
    names=['userId', 'movieId', 'rating', 'timestamp'],
)

In [None]:
# Keep only movies that have a genre string. The explicit .copy() makes the result an
# independent frame, so the column assignments below do not raise SettingWithCopyWarning.
movies_df = movies_df[movies_df["genres"].notna()].copy()

# Dense 0..n-1 identifier for each remaining movie.
movies_df["product"] = range(len(movies_df))

# Title with every (...) or [...] part removed. Raw string: "\(" is not a valid
# escape sequence in a normal Python string literal.
movies_df["name"] = (
    movies_df["title"]
    .str.replace(r"[\(\[].*?[\)\]]", "", regex=True)
    .str.strip()
)

# Release year taken from a trailing "(YYYY)" -- presumably every title ends that way;
# titles that do not match get the sentinel 9999 instead of raising ValueError.
movies_df["date"] = (
    movies_df["title"]
    .str.extract(r"\((\d{4})\)\s*$", expand=False)
    .astype(float)
    .fillna(9999)
    .astype(int)
)

# 1 for movies dated before 2000, 0 otherwise (the 9999 sentinel therefore maps to 0).
movies_df["old"] = (movies_df["date"] < 2000).astype(int)


In [None]:
# Shift user ids down by one (presumably the raw ids start at 1 -- confirm against the data).
ratings_df["user"] = ratings_df["userId"] - 1

# Interpret the epoch-second timestamps as UTC. datetime.fromtimestamp() would use the
# local timezone of whichever machine runs the notebook, making the two flags below
# differ from one machine to another.
ratings_df["timestamp"] = pd.to_datetime(ratings_df["timestamp"], unit="s")

# daytime = 1 when the hour is strictly between 6 and 20, i.e. 07..19 inclusive.
ratings_df["daytime"] = ratings_df["timestamp"].dt.hour.between(7, 19).astype(int)

# weekend = 1 for Saturday (5) and Sunday (6).
ratings_df["weekend"] = ratings_df["timestamp"].dt.weekday.isin([5, 6]).astype(int)

# Attach the dense product id; the join key is stated explicitly so that it does not
# depend on which column names the two frames happen to share.
ratings_df = ratings_df.merge(movies_df[["movieId", "product"]], how="left", on="movieId")

# The rating is the prediction target and is called "y" from here on.
ratings_df = ratings_df.rename(columns={"rating": "y"})


In [None]:
# Index the movie table by product id and keep only the name, age flag and genre string.
movies_df = movies_df.set_index("product")[["name", "old", "genres"]]

# Reduce the ratings table to the ids, the two time flags and the target.
ratings_df = ratings_df.loc[:, ["user", "product", "daytime", "weekend", "y"]]

In [None]:
# Context table: the user/product pair of every rating with its time flags, without the target.
context_df = ratings_df.loc[:, ["user", "product", "daytime", "weekend"]]

In [None]:
# One list of genre names per distinct "A|B|C" genre string.
tags = [genre_string.split("|") for genre_string in movies_df["genres"].unique()]

# Distinct genre names, sorted so that the feature columns come out in the same order
# on every run (iterating a set of strings is not stable across interpreter restarts).
columns = sorted({genre for genre_list in tags for genre in genre_list})

# The placeholder for "no genre" is not a feature.
if '(no genres listed)' in columns:
    columns.remove('(no genres listed)')

# One 0/1 column per genre. Membership is tested against the split token list rather
# than the raw string, so a genre name can never match as a substring of another one.
genre_lists = movies_df["genres"].str.split("|")
for col in columns:
    movies_df[col] = genre_lists.apply(lambda genres, col=col: int(col in genres))



In [None]:
# Heatmap of the boolean mask "cell equals 0" over the whole movie feature table.
fig, ax = plt.subplots(figsize=(20, 5))
sns.heatmap(movies_df.eq(0), vmin=0, vmax=1, cbar=False, ax=ax)
ax.set_title("Movies x Features")
plt.show()

In [None]:
# Keep the long-format ratings under `tmp`, then pivot to a users x products matrix
# of ratings (NaN where a user has not rated a product).
tmp = ratings_df.copy()
ratings_df = tmp.pivot_table(index="user", columns="product", values="y")

# Products that appear in the movie table but received no rating at all. These must be
# compared against the movie index (product ids), not against the user index.
missing_cols = sorted(set(movies_df.index) - set(ratings_df.columns))

# Add the unrated products as all-NaN columns and put every column in ascending
# product order in a single step (avoids growing the frame one column at a time).
ratings_df = ratings_df.reindex(columns=sorted(set(ratings_df.columns) | set(missing_cols)))

# `dtf_users` is kept as a second name for the same sorted matrix, so code that reads
# either name sees identical data.
dtf_users = ratings_df

In [None]:
# Rescale the rating matrix into the range [0.5, 1]. MinMaxScaler operates column by
# column, so each product's ratings are scaled independently of the others.
rating_scaler = preprocessing.MinMaxScaler(feature_range=(0.5, 1))
scaled_values = rating_scaler.fit_transform(ratings_df.values)
ratings_df = pd.DataFrame(
    scaled_values,
    index=ratings_df.index,
    columns=ratings_df.columns,
)

In [None]:
# Hold out the last 20% of the product columns for testing. `split` is a column COUNT,
# so the frame is sliced by position (.iloc); label-based .loc slicing would only give
# the same result if the column labels were exactly 0..n-1 in ascending order.
split = int(0.8 * ratings_df.shape[1])
train_df = ratings_df.iloc[:, :split]
test_df = ratings_df.iloc[:, split:]