### Create data for user profiles

In [1]:
from sklearn.preprocessing import LabelEncoder, MaxAbsScaler
from sklearn.model_selection import train_test_split
from joblib import dump
import pandas as pd
import numpy as np
import json

In [2]:
# depth of this file in the project
file_depth = '../..'

with open(file_depth + '/config/data_25m_config.json') as config_file:
    config = json.load(config_file)

In [3]:
top_rating_users = pd.read_csv(file_depth + '/recommender/eval/user_ids_in_train_test_split_25ml.csv')
top_rating_users = top_rating_users.sort_values(by='trainCount', ascending=False)
top_rating_users

Unnamed: 0,userId,trainCount,testCount
57,72315,25518,3312
35,80974,7392,898
34,137293,7110,929
23,33844,6353,847
41,20055,5955,739
14,109731,5322,667
7,92046,5190,694
50,49403,5169,690
12,30879,4582,545
52,110971,4532,539


In [4]:
user_ids = [72315, 80974, 107650]

print(f"User ID 1: {user_ids[0]} (trainCount: {top_rating_users[top_rating_users['userId'] == user_ids[0]]['trainCount'].values[0]} testCount: {top_rating_users[top_rating_users['userId'] == user_ids[0]]['testCount'].values[0]})")
print(f"User ID 2: {user_ids[1]} (trainCount: {top_rating_users[top_rating_users['userId'] == user_ids[1]]['trainCount'].values[0]} testCount: {top_rating_users[top_rating_users['userId'] == user_ids[1]]['testCount'].values[0]})")
print(f"User ID 3: {user_ids[2]} (trainCount: {top_rating_users[top_rating_users['userId'] == user_ids[2]]['trainCount'].values[0]} testCount: {top_rating_users[top_rating_users['userId'] == user_ids[2]]['testCount'].values[0]})")

del top_rating_users

User ID 1: 72315 (trainCount: 25518 testCount: 3312)
User ID 2: 80974 (trainCount: 7392 testCount: 898)
User ID 3: 107650 (trainCount: 3440 testCount: 478)


In [5]:
added_imdb_context = pd.read_csv(file_depth + '/dataset/ml-25m/new_context/added_imdb_context_with_ratings.csv')
added_imdb_context

Unnamed: 0,userId,movieId,day,isWeekday,season,partOfDay,holiday,movieYear,titleType,isAdult,...,genreNews,genreReality-tv,genreRomance,genreSci-fi,genreShort,genreSport,genreThriller,genreWar,genreWestern,rating
0,1,296,3,1,1,3,no_holiday,1994,movie,0,...,0,0,0,0,0,0,1,0,0,5.0
1,1,306,3,1,1,2,no_holiday,1994,movie,0,...,0,0,0,0,0,0,0,0,0,3.5
2,1,307,3,1,1,2,no_holiday,1993,movie,0,...,0,0,0,0,0,0,0,0,0,5.0
3,1,665,3,1,1,3,no_holiday,1995,movie,0,...,0,0,0,0,0,0,0,1,0,5.0
4,1,899,3,1,1,2,no_holiday,1952,movie,0,...,0,0,1,0,0,0,0,0,0,3.5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
24983464,162541,50872,2,1,1,4,no_holiday,2007,movie,0,...,0,0,0,0,0,0,0,0,0,4.5
24983465,162541,55768,2,1,1,4,no_holiday,2007,movie,0,...,0,0,0,0,0,0,0,0,0,2.5
24983466,162541,56176,2,1,1,4,no_holiday,2007,movie,0,...,0,0,0,0,0,0,0,0,0,2.0
24983467,162541,58559,2,1,1,4,no_holiday,2008,movie,0,...,0,0,0,0,0,0,0,0,0,4.0


In [6]:
user_ratings_1 = added_imdb_context[added_imdb_context['userId'] == user_ids[0]]
user_ratings_1

Unnamed: 0,userId,movieId,day,isWeekday,season,partOfDay,holiday,movieYear,titleType,isAdult,...,genreNews,genreReality-tv,genreRomance,genreSci-fi,genreShort,genreSport,genreThriller,genreWar,genreWestern,rating
11111801,72315,1,2,1,4,1,no_holiday,1995,movie,0,...,0,0,0,0,0,0,0,0,0,4.0
11111802,72315,2,3,1,3,1,no_holiday,1995,movie,0,...,0,0,0,0,0,0,0,0,0,3.0
11111803,72315,5,4,1,2,1,summer_holiday,1995,movie,0,...,0,0,0,0,0,0,0,0,0,2.5
11111804,72315,6,2,1,4,1,no_holiday,1995,movie,0,...,0,0,0,0,0,0,1,0,0,4.0
11111805,72315,7,4,1,2,1,summer_holiday,1995,movie,0,...,0,0,1,0,0,0,0,0,0,3.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11143968,72315,206795,5,1,3,1,no_holiday,2019,movie,0,...,0,0,0,0,0,0,0,0,0,2.5
11143969,72315,206881,5,1,3,1,no_holiday,2006,tvEpisode,0,...,0,0,0,0,0,0,0,0,0,2.5
11143970,72315,206947,5,1,3,1,no_holiday,2018,movie,0,...,0,0,1,0,0,0,0,0,0,2.0
11143971,72315,206949,5,1,3,1,no_holiday,2019,movie,0,...,0,0,0,0,0,0,0,0,0,3.5


In [7]:
user_ratings_2 = added_imdb_context[added_imdb_context['userId'] == user_ids[1]]
user_ratings_2

Unnamed: 0,userId,movieId,day,isWeekday,season,partOfDay,holiday,movieYear,titleType,isAdult,...,genreNews,genreReality-tv,genreRomance,genreSci-fi,genreShort,genreSport,genreThriller,genreWar,genreWestern,rating
12505190,80974,1,7,0,2,4,summer_holiday,1995,movie,0,...,0,0,0,0,0,0,0,0,0,4.0
12505191,80974,2,2,1,2,4,summer_holiday,1995,movie,0,...,0,0,0,0,0,0,0,0,0,4.0
12505192,80974,3,2,1,2,4,summer_holiday,1995,movie,0,...,0,0,1,0,0,0,0,0,0,3.0
12505193,80974,4,2,1,2,4,summer_holiday,1995,movie,0,...,0,0,1,0,0,0,0,0,0,3.0
12505194,80974,5,2,1,2,4,summer_holiday,1995,movie,0,...,0,0,0,0,0,0,0,0,0,3.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12514357,80974,129514,3,1,1,4,no_holiday,2008,tvSpecial,0,...,0,0,0,0,0,0,0,0,0,2.5
12514358,80974,135224,4,1,4,4,christmas,2015,movie,0,...,0,0,1,0,0,0,0,0,0,0.5
12514359,80974,135955,6,0,2,3,summer_holiday,1947,movie,0,...,0,0,0,0,0,0,0,0,1,3.0
12514360,80974,136542,6,0,2,3,summer_holiday,1983,short,0,...,0,0,0,0,0,0,0,0,0,4.0


In [8]:
user_ratings_3 = added_imdb_context[added_imdb_context['userId'] == user_ids[2]]
del added_imdb_context
user_ratings_3

Unnamed: 0,userId,movieId,day,isWeekday,season,partOfDay,holiday,movieYear,titleType,isAdult,...,genreNews,genreReality-tv,genreRomance,genreSci-fi,genreShort,genreSport,genreThriller,genreWar,genreWestern,rating
16581372,107650,1,7,0,3,3,no_holiday,1995,movie,0,...,0,0,0,0,0,0,0,0,0,2.5
16581373,107650,2,2,1,3,3,no_holiday,1995,movie,0,...,0,0,0,0,0,0,0,0,0,3.5
16581374,107650,3,7,0,4,3,no_holiday,1995,movie,0,...,0,0,1,0,0,0,0,0,0,3.0
16581375,107650,4,6,0,4,4,no_holiday,1995,movie,0,...,0,0,1,0,0,0,0,0,0,2.0
16581376,107650,5,1,1,4,4,christmas,1995,movie,0,...,0,0,0,0,0,0,0,0,0,2.5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16585717,107650,201646,6,0,3,2,no_holiday,2019,movie,0,...,0,0,0,0,0,0,0,0,0,4.5
16585718,107650,201811,6,0,3,2,no_holiday,2019,movie,0,...,0,0,1,0,0,0,0,0,0,3.5
16585719,107650,205383,6,0,3,2,no_holiday,2019,movie,0,...,0,0,0,0,0,0,1,0,0,3.0
16585720,107650,206093,1,1,3,2,no_holiday,2019,movie,0,...,0,0,0,0,0,0,0,0,0,2.0


In [9]:
users = [user_ratings_1, user_ratings_2, user_ratings_3]

In [11]:
for i, user_ratings in enumerate(users):

    categorical_columns = ['holiday', 'titleType', 'directors', 'actor']
    label_encoders = {}

    for column in categorical_columns:
        if column in user_ratings.columns.to_list():
            label_encoder = LabelEncoder()
            user_ratings[column] = label_encoder.fit_transform(user_ratings[column])
            label_encoders[column] = label_encoder

    train, temp = train_test_split(user_ratings, test_size=0.2, random_state=42)
    val, test = train_test_split(temp, test_size=0.5, random_state=42)
    del temp

    train_target_ratings = train['rating'].values
    val_target_ratings = val['rating'].values
    test_target_ratings = test['rating'].values

    train_data = train.drop(['rating'], axis=1)
    val_data = val.drop(['rating'], axis=1)
    test_data = test.drop(['rating'], axis=1)

    scaler = MaxAbsScaler()

    train_data = scaler.fit_transform(train_data)
    val_data = scaler.fit_transform(val_data)
    test_data = scaler.fit_transform(test_data)

    dump(scaler, f"profile_{i+1}/scaler_profile_{i+1}.pkl")

    for column in categorical_columns:
        try:
            if label_encoders[column] != {}:
                dump(label_encoders[column], f"profile_{i+1}/{column}_label_encoder.pkl")
        except KeyError:
            pass

    test.to_csv(f"profile_{i+1}/test_split_profile_{i+1}.csv", index=False)
    np.save(f"profile_{i+1}/train_data.npy", train_data)
    np.save(f"profile_{i+1}/val_data.npy", val_data)
    np.save(f"profile_{i+1}/test_data.npy", test_data)
    np.save(f"profile_{i+1}/train_target_ratings.npy", train_target_ratings)
    np.save(f"profile_{i+1}/val_target_ratings.npy", val_target_ratings)
    np.save(f"profile_{i+1}/test_target_ratings.npy", test_target_ratings)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  user_ratings[column] = label_encoder.fit_transform(user_ratings[column])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  user_ratings[column] = label_encoder.fit_transform(user_ratings[column])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  user_ratings[column] = label_encoder.fit_transform(user_rat