### Create no-context training data from the context data (25-million dataset)

In [1]:
from sklearn.preprocessing import LabelEncoder, MaxAbsScaler
from sklearn.model_selection import train_test_split
from joblib import dump
import pandas as pd
import numpy as np
import json

In [2]:
# Relative path from this file up to the project root.
file_depth = '../..'

# Read the JSON configuration for the 25m dataset.
config_path = file_depth + '/config/data_25m_config.json'
with open(config_path) as config_file:
    config = json.load(config_file)

# The configured CSV path is appended to file_depth before loading.
new_context_data = config['new_context_data']
context_csv_path = file_depth + new_context_data
context_data = pd.read_csv(context_csv_path)
context_data

In [None]:
# Title-level metadata columns that are removed from the context data.
metadata_columns = ['titleType', 'isAdult', 'runtimeMinutes', 'directors', 'actor']

# Genre indicator columns that are removed from the context data.
genre_columns = [
    'genreAdult', 'genreBiography', 'genreFamily', 'genreHistory', 'genreImax',
    'genreMusic', 'genreNews', 'genreReality-tv', 'genreShort', 'genreSport',
]

columns_to_drop = metadata_columns + genre_columns

# drop() returns a new frame; context_data itself is left untouched here.
movielens_context_data = context_data.drop(columns_to_drop, axis=1)
movielens_context_data

In [None]:
# Load the ratings table from the path configured under 'target_data'.
target_data = config['target_data']
target_ratings = pd.read_csv(f"{file_depth}{target_data}")
target_ratings

In [None]:
# Use the row index of the context data as the join key.
# NOTE(review): assumes 'ratingId' in target_ratings corresponds to the row
# index of the context CSV - confirm against how target_data was produced.
movielens_context_data['ratingId'] = context_data.index
del context_data

# Left join keeps every context row, attaching the matching rating columns.
movielens_context_data = movielens_context_data.merge(target_ratings, on='ratingId', how='left')

# Release the ratings DataFrame now that it has been merged. The previous
# code deleted `target_data`, which is only the path string, so the large
# DataFrame stayed in memory.
del target_ratings

# The join key is not a feature; remove it again.
movielens_context_data = movielens_context_data.drop(['ratingId'], axis=1)
movielens_context_data

In [None]:
# Integer-encode the categorical columns, keeping one fitted encoder per
# column so it can be persisted later.
categorical_columns = ['holiday']
label_encoders = {}

for column in categorical_columns:
    # Columns that are not present in the frame are skipped silently.
    if column not in movielens_context_data.columns:
        continue
    print(f"Column {column} is in context_data")
    label_encoder = LabelEncoder()
    encoded_values = label_encoder.fit_transform(movielens_context_data[column])
    movielens_context_data[column] = encoded_values
    label_encoders[column] = label_encoder

movielens_context_data

In [None]:
# 80% train; the remaining 20% is halved into validation and test (10% each).
# Both splits use the same fixed seed for reproducibility.
train, temp = train_test_split(
    movielens_context_data,
    test_size=0.2,
    random_state=42,
)
val, test = train_test_split(
    temp,
    test_size=0.5,
    random_state=42,
)

In [None]:
# Split each partition into its 'rating' target array and a feature frame
# with the 'rating' column removed.
train_data = train.drop(columns=['rating'])
train_target_ratings = train['rating'].values

val_data = val.drop(columns=['rating'])
val_target_ratings = val['rating'].values

test_data = test.drop(columns=['rating'])
test_target_ratings = test['rating'].values

In [None]:
# Scale every feature by its maximum absolute value.
# The scaler is fit on the training split only and then applied unchanged to
# the validation and test splits. Calling fit_transform on each split (as
# before) refit the scaler every time, so the three splits were scaled with
# different factors and the scaler object left in memory was the one fit on
# the test split.
scaler = MaxAbsScaler()
train_data = scaler.fit_transform(train_data)
val_data = scaler.transform(val_data)
test_data = scaler.transform(test_data)

In [None]:
# Persist the fitted scaler, the label encoders and all split arrays.
training_data_path = config['training_data']
folder = 'data_25m/movielens_context_max_abs_scaler/'

# Every artifact is written below this common prefix.
output_prefix = file_depth + training_data_path + folder

dump(scaler, output_prefix + "25m_movielens_context_scaler.pkl")

# Columns without a stored encoder (or with an empty-dict placeholder) are skipped.
for column in categorical_columns:
    encoder = label_encoders.get(column, {})
    if encoder != {}:
        dump(encoder, output_prefix + f"{column}_label_encoder.pkl")

# Feature matrices first, then the target vectors.
for file_name, array in (
    ("train_data.npy", train_data),
    ("val_data.npy", val_data),
    ("test_data.npy", test_data),
    ("train_target_ratings.npy", train_target_ratings),
    ("val_target_ratings.npy", val_target_ratings),
    ("test_target_ratings.npy", test_target_ratings),
):
    np.save(output_prefix + file_name, array)

-----
#### Same pipeline as above (split, scale, save), but more memory efficient

In [None]:
# Same 80/10/10 split with a fixed seed, but each source frame is deleted
# as soon as it has been split.
train, temp = train_test_split(
    movielens_context_data,
    test_size=0.2,
    random_state=42,
)
del movielens_context_data

val, test = train_test_split(
    temp,
    test_size=0.5,
    random_state=42,
)
del temp

In [None]:
# Write the test split (still unscaled and still containing 'rating') to a
# CSV in the current working directory, without the index column.
test_split_csv = 'test_split_25ml_movielens_context.csv'
test.to_csv(test_split_csv, index=False)

In [None]:
# Output location: configured base path plus the dataset-specific sub-folder.
training_data_path = config['training_data']
folder = 'data_25m/movielens_context_max_abs_scaler/'

# A fresh, unfitted scaler shared by the cells that follow.
scaler = MaxAbsScaler()

In [None]:
# Validation split: separate targets, scale features, save, free memory.
val_target_ratings = val['rating'].values
val_data = val.drop(['rating'], axis=1)
del val

# Fit the scaler on the TRAINING features and only transform the validation
# features. The previous fit_transform(val_data) fit the scaler on the
# validation split itself, so validation data was scaled with different
# factors than the training data and the scaler that is persisted.
# NOTE(review): this materialises a temporary copy of the training features;
# `train` must still exist when this cell runs.
scaler.fit(train.drop(['rating'], axis=1))
val_data = scaler.transform(val_data)

np.save(file_depth + training_data_path + folder + "val_data.npy", val_data)
del val_data
np.save(file_depth + training_data_path + folder + "val_target_ratings.npy", val_target_ratings)
del val_target_ratings

In [None]:
# Test split: separate targets, scale features, save, free memory.
test_target_ratings = test['rating'].values
test_data = test.drop(['rating'], axis=1)
del test

# Fit the scaler on the TRAINING features and only transform the test
# features. The previous fit_transform(test_data) fit the scaler on the test
# split itself, so test data was scaled with different factors than the
# training data and the scaler that is persisted.
# NOTE(review): this materialises a temporary copy of the training features;
# `train` must still exist when this cell runs.
scaler.fit(train.drop(['rating'], axis=1))
test_data = scaler.transform(test_data)

np.save(file_depth + training_data_path + folder + "test_data.npy", test_data)
del test_data
np.save(file_depth + training_data_path + folder + "test_target_ratings.npy", test_target_ratings)
del test_target_ratings

In [None]:
# Training split: separate targets, fit the scaler on the features while
# scaling them, save both arrays, and free each object once it is written.
output_prefix = file_depth + training_data_path + folder

train_data = train.drop(columns=['rating'])
train_target_ratings = train['rating'].values
del train

# After this call the scaler holds the statistics of the training features.
train_data = scaler.fit_transform(train_data)

np.save(output_prefix + "train_data.npy", train_data)
del train_data
np.save(output_prefix + "train_target_ratings.npy", train_target_ratings)
del train_target_ratings

In [None]:
# Persist the scaler in its current fitted state together with any label encoders.
artifact_prefix = file_depth + training_data_path + folder

dump(scaler, artifact_prefix + "25m_movielens_context_scaler.pkl")

# Columns without a stored encoder (or with an empty-dict placeholder) are skipped.
for column in categorical_columns:
    encoder = label_encoders.get(column, {})
    if encoder != {}:
        dump(encoder, artifact_prefix + f"{column}_label_encoder.pkl")