## 1 ml - Split MovieLens context

In [1]:
import json
import os

import numpy as np
import pandas as pd
from joblib import dump
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, MaxAbsScaler

In [2]:
# Relative prefix prepended to every path this notebook reads or writes
# (config file, input CSVs and saved artifacts).
file_depth = "../.."

In [3]:
# Load the JSON configuration used for the 1M data set.
config_path = file_depth + '/config/data_1m_config.json'
with open(config_path) as config_file:
    config = json.load(config_file)

# Path fragment (appended to file_depth) under which context_data.csv is read.
new_context_path = config['new_context_path']

In [4]:
# Read the context feature table; one row per rating event.
context_csv_path = file_depth + new_context_path + 'context_data.csv'
context_data = pd.read_csv(context_csv_path)
context_data

Unnamed: 0,user_id,movie_id,day,is_weekday,season,part_of_day,holiday,user_gender,user_age,user_occupation,...,genre_film-noir,genre_forchildren,genre_horror,genre_musical,genre_mystery,genre_romance,genre_sci-fi,genre_thriller,genre_war,genre_western
0,1,1193,7,0,4,4,new_years,F,1,10,...,0,0,0,0,0,0,0,0,0,0
1,1,661,7,0,4,4,new_years,F,1,10,...,0,1,0,1,0,0,0,0,0,0
2,1,914,7,0,4,4,new_years,F,1,10,...,0,0,0,1,0,1,0,0,0,0
3,1,3408,7,0,4,4,new_years,F,1,10,...,0,0,0,0,0,0,0,0,0,0
4,1,2355,7,0,4,4,no_holiday,F,1,10,...,0,1,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1000204,6040,1091,3,1,1,4,no_holiday,M,25,6,...,0,0,0,0,0,0,0,0,0,0
1000205,6040,1094,3,1,1,4,no_holiday,M,25,6,...,0,0,0,0,0,1,0,0,1,0
1000206,6040,562,3,1,1,4,no_holiday,M,25,6,...,0,0,0,0,0,0,0,0,0,0
1000207,6040,1096,3,1,1,4,no_holiday,M,25,6,...,0,0,0,0,0,0,0,0,0,0


In [5]:
# Replace the string-valued categorical columns with integer codes.
# Each fitted encoder is kept in `label_encoders`, keyed by column name,
# so it can be saved later in the notebook.
categorical_columns = ['holiday', 'user_gender']
label_encoders = {}

for column in categorical_columns:
    # Columns missing from the frame are skipped silently.
    if column not in context_data.columns:
        continue
    print(f"Column {column} is in context_data")
    label_encoders[column] = LabelEncoder()
    context_data[column] = label_encoders[column].fit_transform(context_data[column])

context_data

Column holiday is in context_data
Column user_gender is in context_data


Unnamed: 0,user_id,movie_id,day,is_weekday,season,part_of_day,holiday,user_gender,user_age,user_occupation,...,genre_film-noir,genre_forchildren,genre_horror,genre_musical,genre_mystery,genre_romance,genre_sci-fi,genre_thriller,genre_war,genre_western
0,1,1193,7,0,4,4,1,0,1,10,...,0,0,0,0,0,0,0,0,0,0
1,1,661,7,0,4,4,1,0,1,10,...,0,1,0,1,0,0,0,0,0,0
2,1,914,7,0,4,4,1,0,1,10,...,0,0,0,1,0,1,0,0,0,0
3,1,3408,7,0,4,4,1,0,1,10,...,0,0,0,0,0,0,0,0,0,0
4,1,2355,7,0,4,4,2,0,1,10,...,0,1,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1000204,6040,1091,3,1,1,4,2,1,25,6,...,0,0,0,0,0,0,0,0,0,0
1000205,6040,1094,3,1,1,4,2,1,25,6,...,0,0,0,0,0,1,0,0,1,0
1000206,6040,562,3,1,1,4,2,1,25,6,...,0,0,0,0,0,0,0,0,0,0
1000207,6040,1096,3,1,1,4,2,1,25,6,...,0,0,0,0,0,0,0,0,0,0


---------------------

### Split data and save them

Load target (ratings)

In [6]:
# Load the rating targets; the file location comes from the project config.
target_data_path = config['target_data']
target_csv_path = file_depth + target_data_path
target_ratings = pd.read_csv(target_csv_path)
target_ratings

Unnamed: 0,rating_id,rating
0,0,5
1,1,3
2,2,3
3,3,4
4,4,5
...,...,...
1000204,1000204,1
1000205,1000205,5
1000206,1000206,5
1000207,1000207,4


Merge ratings with context_data

In [7]:
# Attach the rating to every context row.
# The row index of context_data is used as a temporary `rating_id` join key
# against target_ratings; the key is dropped again once the merge is done.
context_data = (
    context_data
    .assign(rating_id=context_data.index)
    .merge(target_ratings, how='left', on='rating_id')
    .drop(columns=['rating_id'])
)
context_data

Unnamed: 0,user_id,movie_id,day,is_weekday,season,part_of_day,holiday,user_gender,user_age,user_occupation,...,genre_forchildren,genre_horror,genre_musical,genre_mystery,genre_romance,genre_sci-fi,genre_thriller,genre_war,genre_western,rating
0,1,1193,7,0,4,4,1,0,1,10,...,0,0,0,0,0,0,0,0,0,5
1,1,661,7,0,4,4,1,0,1,10,...,1,0,1,0,0,0,0,0,0,3
2,1,914,7,0,4,4,1,0,1,10,...,0,0,1,0,1,0,0,0,0,3
3,1,3408,7,0,4,4,1,0,1,10,...,0,0,0,0,0,0,0,0,0,4
4,1,2355,7,0,4,4,2,0,1,10,...,1,0,0,0,0,0,0,0,0,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1000204,6040,1091,3,1,1,4,2,1,25,6,...,0,0,0,0,0,0,0,0,0,1
1000205,6040,1094,3,1,1,4,2,1,25,6,...,0,0,0,0,1,0,0,1,0,5
1000206,6040,562,3,1,1,4,2,1,25,6,...,0,0,0,0,0,0,0,0,0,5
1000207,6040,1096,3,1,1,4,2,1,25,6,...,0,0,0,0,0,0,0,0,0,4


In [8]:
# 80 % of the rows go to training; the remaining 20 % is halved into
# validation and test (10 % each). A fixed seed keeps the split reproducible.
split_seed = 42
train, temp = train_test_split(context_data, random_state=split_seed, test_size=0.2)
val, test = train_test_split(temp, random_state=split_seed, test_size=0.5)

In [9]:
# Separate each split into its feature frame (everything except `rating`)
# and its target vector (the `rating` column as a NumPy array).
train_data = train.drop(columns=['rating'])
train_target_ratings = train['rating'].to_numpy()

val_data = val.drop(columns=['rating'])
val_target_ratings = val['rating'].to_numpy()

test_data = test.drop(columns=['rating'])
test_target_ratings = test['rating'].to_numpy()

In [10]:
# Scale every feature column by its maximum absolute value.
# The scaler is fitted on the training split ONLY; validation and test are then
# transformed with those same training maxima. Re-fitting on each split would
# (a) leak held-out statistics into preprocessing, (b) put the three splits on
# different scales, and (c) leave `scaler` fitted on the test split, which is
# the object saved to disk later in this notebook.
# Note: training values end up in [-1, 1]; validation/test values may fall
# slightly outside that range when they exceed the training maxima.
scaler = MaxAbsScaler()
train_data = scaler.fit_transform(train_data)
val_data = scaler.transform(val_data)
test_data = scaler.transform(test_data)

In [11]:
# Save the data to disk

training_data_path = config['training_data']
folder = 'data_1m/movielens_context_max_abs_scaler/'

# All artifacts of this notebook go into one directory. Build the prefix once
# and make sure the directory exists so the writes below do not fail when the
# folder has not been created yet.
output_dir = file_depth + training_data_path + folder
os.makedirs(output_dir, exist_ok=True)

# Fitted scaler, so the same scaling can be applied outside this notebook.
dump(scaler, output_dir + "1m_movielens_context_scaler.pkl")

# One pickle per categorical column that was actually encoded. Iterating the
# dict directly avoids a per-column lookup wrapped in try/except KeyError.
for column, label_encoder in label_encoders.items():
    dump(label_encoder, output_dir + f"{column}_label_encoder.pkl")

# Feature matrices and rating targets of every split, one .npy file each.
arrays_to_save = {
    "train_data": train_data,
    "val_data": val_data,
    "test_data": test_data,
    "train_target_ratings": train_target_ratings,
    "val_target_ratings": val_target_ratings,
    "test_target_ratings": test_target_ratings,
}
for array_name, array in arrays_to_save.items():
    np.save(output_dir + f"{array_name}.npy", array)