# Preprocessing

### Import libraries

In [63]:
import pandas as pd
import numpy as np
from scipy import sparse
import os
import csv

from utilities import pretty_print_progress, save_sparse_matrix

In [64]:
from src.Data_manager.split_functions.split_train_validation_random_holdout import split_train_in_two_percentage_global_sample

### Prepare original dataset

In [65]:
# Start from a clean slate: remove any previously extracted dataset directory.
!rm -rf data
# Recreate the (now empty) target directory.
!mkdir -p data
# Extract data.zip; the archive members already carry the data/ prefix,
# so the CSV files end up under ./data.
!unzip data

Archive:  data.zip
  inflating: data/data_ICM_length.csv  
  inflating: data/data_ICM_type.csv  
  inflating: data/data_target_users_test.csv  
  inflating: data/interactions_and_impressions.csv  
  inflating: data/alg_sample_submission.csv  


### Read dataset

In [66]:
# Directory holding the extracted CSV files; every dataset path below is built from it.
base_path = "data"

In [67]:
# Build the location of every input CSV inside the dataset directory
# (interactions, item length ICM, item type ICM, users to recommend).
(
    interactions_df_path,
    items_length_df_path,
    items_type_df_path,
    users_df_path,
) = (
    os.path.join(base_path, csv_name)
    for csv_name in (
        "interactions_and_impressions.csv",
        "data_ICM_length.csv",
        "data_ICM_type.csv",
        "data_target_users_test.csv",
    )
)

In [68]:
# Interactions: dtypes are keyed by column position; the third column is kept
# as text and default NaN detection is disabled so its empty cells stay
# empty strings instead of becoming NaN.
dtype = {0: int, 1: int, 2: str, 3: int}
interactions_df = pd.read_csv(interactions_df_path, dtype=dtype, keep_default_na=False)

# Both item-content files are read with three integer columns.
dtype = {0: int, 1: int, 2: int}
items_length_df, items_types_df = (
    pd.read_csv(icm_path, dtype=dtype)
    for icm_path in (items_length_df_path, items_type_df_path)
)

# Users for which recommendations must be produced (dtypes inferred by pandas).
users_df = pd.read_csv(users_df_path)

### IDs mapping

In [69]:
# Items are known either through the item-type table or through interactions;
# the item universe is the sorted union of both sets (np.unique also sorts).
items_with_feature_ids = items_types_df["item_id"].unique()
items_with_interaction_ids = interactions_df["item_id"].unique()
items_ids = np.unique(np.append(items_with_feature_ids, items_with_interaction_ids))

# Users and features are taken, sorted and de-duplicated, from the tables that define them.
users_ids = interactions_df["user_id"].sort_values().unique()
features_ids = items_types_df["feature_id"].sort_values().unique()

num_users = users_ids.shape[0]
num_items = items_ids.shape[0]
# Count only the items that appear in the item-type table. Using the size of
# items_ids here would always report the same value as num_items.
num_items_with_feature = items_with_feature_ids.shape[0]
num_items_with_interaction = items_with_interaction_ids.shape[0]
num_features = features_ids.shape[0]
num_users_to_recommend = users_df['user_id'].shape[0]

In [70]:
# Report the dataset dimensions computed in the previous cell.
users_summary = "Found {} users with interactions and {} to recommend"
items_summary = "Found {} items, {} with interactions and {} with {} features"

print(users_summary.format(num_users, num_users_to_recommend))
print(items_summary.format(
    num_items,
    num_items_with_interaction,
    num_items_with_feature,
    num_features,
))

Found 41629 users with interactions and 41116 to recommend
Found 27968 items, 24507 with interactions and 27968 with 5 features


In [71]:
# factorize yields one integer code per entry plus the array of distinct values;
# items_ids is already sorted and unique, so codes follow the sorted id order.
items_mapped_ids, items_original_ids = pd.factorize(items_ids)

# Lookup tables in both directions: original id -> matrix index and back.
item_original_id_to_mapped_id = pd.Series(data=items_mapped_ids, index=items_original_ids)
item_mapped_id_to_original_id = pd.Series(data=items_original_ids, index=items_mapped_ids)

In [72]:
# factorize yields one integer code per entry plus the array of distinct values;
# users_ids is already sorted and unique, so codes follow the sorted id order.
users_mapped_ids, users_original_ids = pd.factorize(users_ids)

# Lookup tables in both directions: original id -> matrix index and back.
user_original_id_to_mapped_id = pd.Series(data=users_mapped_ids, index=users_original_ids)
user_mapped_id_to_original_id = pd.Series(data=users_original_ids, index=users_mapped_ids)

In [73]:
# factorize yields one integer code per entry plus the array of distinct values;
# features_ids is already sorted and unique, so codes follow the sorted id order.
features_mapped_ids, features_original_ids = pd.factorize(features_ids)

# Lookup tables in both directions: original id -> matrix column and back.
feature_original_id_to_mapped_id = pd.Series(data=features_mapped_ids, index=features_original_ids)
feature_mapped_id_to_original_id = pd.Series(data=features_original_ids, index=features_mapped_ids)

### Generate URM

In [74]:
# The URM is implicit: a cell is 1 when the user interacted with the item at
# least once. Only the distinct (user_id, item_id) pairs are therefore needed;
# repeated interactions of the same pair collapse into a single entry.
ratings_df = (
    interactions_df[['user_id', 'item_id']]
    .drop_duplicates()
    .reset_index(drop=True)
)

# Translate original ids into matrix indices in bulk. Label-based .loc raises
# KeyError if an id has no mapping, so a broken mapping cannot go unnoticed.
URM_rows = user_original_id_to_mapped_id.loc[ratings_df['user_id'].to_numpy()].to_numpy()
URM_cols = item_original_id_to_mapped_id.loc[ratings_df['item_id'].to_numpy()].to_numpy()

# Build the sparse matrix straight from its coordinates instead of filling a
# dense (num_users x num_items) array first: only the observed pairs are stored.
# float32 is used because float16 is not among the dtypes scipy.sparse supports.
URM = sparse.csr_matrix(
    (np.ones(URM_rows.shape[0], dtype=np.float32), (URM_rows, URM_cols)),
    shape=(num_users, num_items),
)

# Save matrix to external file
save_sparse_matrix(URM, filename='urm.npz')

Calculating URM finished!                                                                           
Saved urm.npz


### Generate ICM

In [75]:
# ICM layout: one column per item-type feature, followed by two extra columns
# derived from the 'data' value of the item-length table.
ICM = np.zeros((num_items, num_features + 2), dtype=np.int8)

# --- Item types: set a 1 for every (item, feature) row of the type table.
# Label-based .loc raises KeyError if an id has no mapping.
type_rows = item_original_id_to_mapped_id.loc[items_types_df['item_id'].to_numpy()].to_numpy()
type_cols = feature_original_id_to_mapped_id.loc[items_types_df['feature_id'].to_numpy()].to_numpy()
ICM[type_rows, type_cols] = 1

# --- Item length, bucketed into the two extra columns:
#       data == 0 -> no column set
#       data == 1 -> column num_features
#       otherwise -> column num_features + 1
# Every item id is mapped (also those with data == 0) so that an unknown id
# is still reported with a KeyError.
length_rows = item_original_id_to_mapped_id.loc[items_length_df['item_id'].to_numpy()].to_numpy()
lengths = items_length_df['data'].to_numpy()
has_length = lengths != 0
length_cols = np.where(lengths[has_length] == 1, num_features, num_features + 1)
ICM[length_rows[has_length], length_cols] = 1

# Save matrix to external file
ICM = sparse.csr_matrix(ICM)
save_sparse_matrix(ICM, "icm.npz")

Calculating ICM with types finished!                                                                
Calculating ICM with items length finished!                                                         
Saved icm.npz


### Generate the URM splits (train / validation / test)

In [76]:
# Fix the seed so that re-running the notebook regenerates the same splits.
# NOTE(review): this assumes the split helper samples through NumPy's global
# random state - confirm against its implementation.
SPLIT_SEED = 42
np.random.seed(SPLIT_SEED)

# First hold out 20% of the interactions as test set, then hold out 20% of the
# remaining interactions as validation set.
URM_train_val, URM_test = split_train_in_two_percentage_global_sample(URM, train_percentage = 0.8)
URM_train, URM_val = split_train_in_two_percentage_global_sample(URM_train_val, train_percentage = 0.8)



In [77]:
# Persist every split next to the full URM, one file per matrix.
for split_matrix, split_filename in (
    (URM_test, "urm_test.npz"),
    (URM_train_val, "urm_train_val.npz"),
    (URM_train, "urm_train.npz"),
    (URM_val, "urm_val.npz"),
):
    save_sparse_matrix(split_matrix, split_filename)

Saved urm_test.npz
Saved urm_train_val.npz
Saved urm_train.npz
Saved urm_val.npz
