# Preprocessing

### Import libraries

In [1]:
import pandas as pd
import numpy as np
from scipy import sparse
import os
import csv

from utilities import pretty_print_progress, save_sparse_matrix

### Read dataset

In [2]:
# Directory holding the input CSV files; every dataset path below is built by
# joining a file name onto it (relative path, so it depends on the working dir).
base_path = "data"

In [3]:
# Locations of the four input CSVs, all resolved against `base_path`:
# interactions, item lengths (ICM), item types (ICM) and the target users.
(
    interactions_df_path,
    items_length_df_path,
    items_type_df_path,
    users_df_path,
) = (
    os.path.join(base_path, csv_name)
    for csv_name in (
        "interactions_and_impressions.csv",
        "data_ICM_length.csv",
        "data_ICM_type.csv",
        "data_target_users_test.csv",
    )
)

In [None]:
# Interactions: column dtypes are given by position (int, int, str, int).
# Default NaN parsing is switched off so that no field is turned into NaN.
dtype = {0: int, 1: int, 2: str, 3: int}
interactions_df = pd.read_csv(
    interactions_df_path, dtype=dtype, keep_default_na=False
)

# Both ICM files (length first, then type) are read as three integer columns.
dtype = {0: int, 1: int, 2: int}
items_length_df, items_types_df = (
    pd.read_csv(csv_path, dtype=dtype)
    for csv_path in (items_length_df_path, items_type_df_path)
)

# Target users: dtypes are left to pandas' inference.
users_df = pd.read_csv(users_df_path)

### IDs mapping

In [None]:
# Item universe: the sorted, de-duplicated union of every item id that occurs in
# the type ICM, the length ICM or the interactions. The length file is included
# because the ICM cell looks each of its item ids up in the item mapping, which
# would fail for an id that is missing from this union.
items_ids = np.unique(np.concatenate([
    items_types_df["item_id"].to_numpy(),
    items_length_df["item_id"].to_numpy(),
    interactions_df["item_id"].to_numpy(),
]))  # np.unique also sorts

# Users come from the interactions only and features from the type ICM only;
# both arrays are sorted and free of duplicates.
users_ids = np.unique(interactions_df["user_id"].to_numpy())
features_ids = np.unique(items_types_df["feature_id"].to_numpy())

num_users = users_ids.shape[0]
num_items = items_ids.shape[0]
# Items with at least one row in the type ICM -- deliberately NOT the size of
# the whole item universe, otherwise this counter would just repeat `num_items`.
num_items_with_feature = items_types_df["item_id"].unique().shape[0]
num_items_with_interaction = interactions_df["item_id"].unique().shape[0]
num_features = features_ids.shape[0]
# One row per target user is assumed here (rows are counted, not distinct ids).
num_users_to_recommend = users_df['user_id'].shape[0]

In [None]:
# Human-readable summary of the counters computed in the previous cell.
users_summary = "Found {} users with interactions and {} to recommend"
items_summary = "Found {} items, {} with interactions and {} with {} features"

print(users_summary.format(num_users, num_users_to_recommend))
print(items_summary.format(
    num_items,
    num_items_with_interaction,
    num_items_with_feature,
    num_features,
))

In [None]:
# Encode the item ids as integer codes; `items_original_ids` holds the distinct
# ids, positioned by the code each one received.
items_mapped_ids, items_original_ids = pd.factorize(items_ids)

# Lookup tables in both directions: original id -> code and code -> original id.
item_original_id_to_mapped_id = pd.Series(
    data=items_mapped_ids, index=items_original_ids)
item_mapped_id_to_original_id = pd.Series(
    data=items_original_ids, index=items_mapped_ids)

In [None]:
# Encode the user ids as integer codes; `users_original_ids` holds the distinct
# ids, positioned by the code each one received.
users_mapped_ids, users_original_ids = pd.factorize(users_ids)

# Lookup tables in both directions: original id -> code and code -> original id.
user_original_id_to_mapped_id = pd.Series(
    data=users_mapped_ids, index=users_original_ids)
user_mapped_id_to_original_id = pd.Series(
    data=users_original_ids, index=users_mapped_ids)

In [None]:
# Encode the feature ids as integer codes; `features_original_ids` holds the
# distinct ids, positioned by the code each one received.
features_mapped_ids, features_original_ids = pd.factorize(features_ids)

# Lookup tables in both directions: original id -> code and code -> original id.
feature_original_id_to_mapped_id = pd.Series(
    data=features_mapped_ids, index=features_original_ids)
feature_mapped_id_to_original_id = pd.Series(
    data=features_original_ids, index=features_mapped_ids)

### Generate URM

In [None]:
# Total 'data' per (user, item) pair, one row per distinct pair. Only the two
# key columns are needed to fill the URM; the summed column is kept so that
# `ratings_df` has the same shape of content for any later cell that reads it.
ratings_df = interactions_df.groupby(
    ['user_id', 'item_id'], as_index=False
)['data'].sum()

# Translate original ids into matrix coordinates. `.loc` raises KeyError on an
# id that is missing from the mapping instead of silently producing NaN.
urm_rows = user_original_id_to_mapped_id.loc[
    ratings_df['user_id'].to_numpy()].to_numpy()
urm_cols = item_original_id_to_mapped_id.loc[
    ratings_df['item_id'].to_numpy()].to_numpy()

# Binary URM: a cell is 1 when the user has at least one interaction with the
# item, regardless of the summed 'data' value. A single fancy-index assignment
# replaces the per-user scan of the whole frame (users x rows comparisons).
# NOTE(review): the matrix is still allocated dense (num_users x num_items,
# float16) because that is what the original passes to `save_sparse_matrix`;
# if that helper also accepts scipy.sparse input, build a csr_matrix from
# (urm_rows, urm_cols) instead to avoid the dense allocation -- confirm.
URM = np.zeros((num_users, num_items), dtype=np.float16)
URM[urm_rows, urm_cols] = 1

save_sparse_matrix(URM, "./data", 'urm.npz')

### Generate ICM

In [None]:
# ICM layout: one row per mapped item; columns [0, num_features) are the type
# features, followed by two extra columns that encode the item length.
ICM = np.zeros((num_items, num_features + 2), dtype=np.int8)

# --- Type features -----------------------------------------------------------
# Read the type ICM explicitly rather than whatever a previous cell left in a
# shared `df` variable (which has no 'feature_id' column). `.loc` raises
# KeyError on an id that is missing from a mapping.
type_rows = item_original_id_to_mapped_id.loc[
    items_types_df['item_id'].to_numpy()].to_numpy()
type_cols = feature_original_id_to_mapped_id.loc[
    items_types_df['feature_id'].to_numpy()].to_numpy()
ICM[type_rows, type_cols] = 1

# --- Length features ---------------------------------------------------------
# length == 0 -> no length feature set
# length == 1 -> column `num_features`
# otherwise   -> column `num_features + 1`
lengths = items_length_df['data'].to_numpy()
length_rows = item_original_id_to_mapped_id.loc[
    items_length_df['item_id'].to_numpy()].to_numpy()
length_cols = np.where(lengths == 1, num_features, num_features + 1)
has_length = lengths != 0
ICM[length_rows[has_length], length_cols[has_length]] = 1

save_sparse_matrix(ICM, "./data", "icm.npz")