## 1 ml - Split Added IMDb context

In [1]:
import json
import os

import numpy as np
import pandas as pd
from joblib import dump
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, MaxAbsScaler

In [2]:
# Relative prefix prepended to every project path built in this notebook
# (config file, dataset folder, training-data folder).
file_depth = "../.."

In [3]:
# Load the JSON configuration that holds the dataset / training-data locations.
config_path = file_depth + '/config/data_1m_config.json'
with open(config_path) as config_file:
    config = json.load(config_file)

In [4]:
# Read the context feature table from the dataset folder named in the config.
dataset_path = config['dataset_path']
imdb_context_csv = file_depth + dataset_path + 'added_imdb_context/added_imdb_context.csv'
added_imdb_context = pd.read_csv(imdb_context_csv)
added_imdb_context

Unnamed: 0,userId,movieId,day,isWeekday,season,partOfDay,holiday,movieYear,titleType,isAdult,...,genreMystery,genreNews,genreReality-tv,genreRomance,genreSci-fi,genreShort,genreSport,genreThriller,genreWar,genreWestern
0,1,296,3,1,1,3,no_holiday,1994,movie,0,...,0,0,0,0,0,0,0,1,0,0
1,1,306,3,1,1,2,no_holiday,1994,movie,0,...,0,0,0,0,0,0,0,0,0,0
2,1,307,3,1,1,2,no_holiday,1993,movie,0,...,0,0,0,0,0,0,0,0,0,0
3,1,665,3,1,1,3,no_holiday,1995,movie,0,...,0,0,0,0,0,0,0,0,1,0
4,1,899,3,1,1,2,no_holiday,1952,movie,0,...,0,0,0,1,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
530334,6040,3917,2,1,4,4,no_holiday,1987,movie,0,...,0,0,0,0,0,0,0,0,0,0
530335,6040,3932,2,1,4,4,no_holiday,1933,movie,0,...,0,0,0,0,1,0,0,0,0,0
530336,6040,3948,3,1,4,1,no_holiday,2000,movie,0,...,0,0,0,0,0,0,0,0,0,0
530337,6040,3951,6,0,3,1,no_holiday,2000,movie,0,...,0,0,0,0,0,0,0,0,0,0


In [5]:
# Read the rating targets (columns: ratingId, rating) stored next to the context table.
ratings_target_csv = file_depth + dataset_path + 'added_imdb_context/ratings_target.csv'
ratings_target = pd.read_csv(ratings_target_csv)
ratings_target

Unnamed: 0,ratingId,rating
0,0,5.0
1,1,3.5
2,2,5.0
3,3,5.0
4,4,3.5
...,...,...
530334,530334,4.0
530335,530335,4.0
530336,530336,4.0
530337,530337,4.0


In [6]:
# Attach the rating to every context row: the row index is exposed as a
# temporary `ratingId` key, used for a left join against ratings_target,
# and removed again once the `rating` column has been added.
added_imdb_context = (
    added_imdb_context
    .assign(ratingId=lambda frame: frame.index)
    .merge(ratings_target, on='ratingId', how='left')
    .drop(columns=['ratingId'])
)
added_imdb_context

Unnamed: 0,userId,movieId,day,isWeekday,season,partOfDay,holiday,movieYear,titleType,isAdult,...,genreNews,genreReality-tv,genreRomance,genreSci-fi,genreShort,genreSport,genreThriller,genreWar,genreWestern,rating
0,1,296,3,1,1,3,no_holiday,1994,movie,0,...,0,0,0,0,0,0,1,0,0,5.0
1,1,306,3,1,1,2,no_holiday,1994,movie,0,...,0,0,0,0,0,0,0,0,0,3.5
2,1,307,3,1,1,2,no_holiday,1993,movie,0,...,0,0,0,0,0,0,0,0,0,5.0
3,1,665,3,1,1,3,no_holiday,1995,movie,0,...,0,0,0,0,0,0,0,1,0,5.0
4,1,899,3,1,1,2,no_holiday,1952,movie,0,...,0,0,1,0,0,0,0,0,0,3.5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
530334,6040,3917,2,1,4,4,no_holiday,1987,movie,0,...,0,0,0,0,0,0,0,0,0,4.0
530335,6040,3932,2,1,4,4,no_holiday,1933,movie,0,...,0,0,0,1,0,0,0,0,0,4.0
530336,6040,3948,3,1,4,1,no_holiday,2000,movie,0,...,0,0,0,0,0,0,0,0,0,4.0
530337,6040,3951,6,0,3,1,no_holiday,2000,movie,0,...,0,0,0,0,0,0,0,0,0,4.0


In [7]:
# Replace each categorical column with integer codes, keeping the fitted
# encoder per column so it can be saved later.
categorical_columns = ['holiday', 'titleType', 'directors', 'actor']
label_encoders = {}

# Only columns that actually exist in the frame are encoded.
present_columns = [name for name in categorical_columns if name in added_imdb_context.columns]

for name in present_columns:
    print(f"Column {name} is in context_data")
    encoder = LabelEncoder()
    added_imdb_context[name] = encoder.fit_transform(added_imdb_context[name])
    label_encoders[name] = encoder

added_imdb_context

Column holiday is in context_data
Column titleType is in context_data
Column directors is in context_data
Column actor is in context_data


Unnamed: 0,userId,movieId,day,isWeekday,season,partOfDay,holiday,movieYear,titleType,isAdult,...,genreNews,genreReality-tv,genreRomance,genreSci-fi,genreShort,genreSport,genreThriller,genreWar,genreWestern,rating
0,1,296,3,1,1,3,2,1994,0,0,...,0,0,0,0,0,0,1,0,0,5.0
1,1,306,3,1,1,2,2,1994,0,0,...,0,0,0,0,0,0,0,0,0,3.5
2,1,307,3,1,1,2,2,1993,0,0,...,0,0,0,0,0,0,0,0,0,5.0
3,1,665,3,1,1,3,2,1995,0,0,...,0,0,0,0,0,0,0,1,0,5.0
4,1,899,3,1,1,2,2,1952,0,0,...,0,0,1,0,0,0,0,0,0,3.5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
530334,6040,3917,2,1,4,4,2,1987,0,0,...,0,0,0,0,0,0,0,0,0,4.0
530335,6040,3932,2,1,4,4,2,1933,0,0,...,0,0,0,1,0,0,0,0,0,4.0
530336,6040,3948,3,1,4,1,2,2000,0,0,...,0,0,0,0,0,0,0,0,0,4.0
530337,6040,3951,6,0,3,1,2,2000,0,0,...,0,0,0,0,0,0,0,0,0,4.0


In [8]:
# 80% of the rows go to train; the held-out 20% is halved into validation
# and test (10% each). Both splits share one seed so the result is reproducible.
split_seed = 42
train, temp = train_test_split(added_imdb_context, test_size=0.2, random_state=split_seed)
val, test = train_test_split(temp, test_size=0.5, random_state=split_seed)

In [9]:
# Features: every column except the `rating` target.
train_data = train.drop(columns=['rating'])
val_data = val.drop(columns=['rating'])
test_data = test.drop(columns=['rating'])

# Targets: the `rating` column of each split as a NumPy array.
train_target_ratings = train['rating'].to_numpy()
val_target_ratings = val['rating'].to_numpy()
test_target_ratings = test['rating'].to_numpy()

In [10]:
# Scale every feature column by its maximum absolute value (MaxAbsScaler).
#
# The scaler is fitted on the training split ONLY and then applied unchanged to
# the validation and test splits. Re-fitting on val/test would scale each split
# by its own maxima (inconsistent feature ranges across splits) and would leave
# `scaler` fitted on the test split instead of on the data that produced
# `train_data`.
scaler = MaxAbsScaler()
train_data = scaler.fit_transform(train_data)
val_data = scaler.transform(val_data)
test_data = scaler.transform(test_data)

In [11]:
# Save the fitted scaler, the label encoders and the train/val/test arrays to disk

training_data_path = config['training_data']
folder = 'data_1m/added_imdb_context_max_abs_scaler/'
output_dir = file_depth + training_data_path + folder

# joblib.dump and np.save do not create missing directories, so make sure the
# target folder exists before the first write.
os.makedirs(output_dir, exist_ok=True)

dump(scaler, output_dir + "1m_added_imdb_context_scaler.pkl")

# label_encoders only contains the categorical columns that were actually
# encoded, so iterating it directly writes the same files as looking up every
# entry of categorical_columns and ignoring the missing ones.
for column, label_encoder in label_encoders.items():
    dump(label_encoder, output_dir + f"{column}_label_encoder.pkl")

# File name -> array; one .npy file is written per entry.
arrays_to_save = {
    "train_data.npy": train_data,
    "val_data.npy": val_data,
    "test_data.npy": test_data,
    "train_target_ratings.npy": train_target_ratings,
    "val_target_ratings.npy": val_target_ratings,
    "test_target_ratings.npy": test_target_ratings,
}
for file_name, array in arrays_to_save.items():
    np.save(output_dir + file_name, array)