In [None]:
import pandas as pd
import json
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

In [None]:
# Read the JSON registry of dataset locations; `paths` is indexed below by
# dataset name and then by 'raw' / 'processed'.
with open('../dataset_path.json', 'r') as path_registry:
    paths = json.load(path_registry)

In [None]:
# Directory holding the raw Book-Crossing CSVs, and the directory the
# processed train/test CSVs are written to.
book_crossing_paths = paths['book_crossing']
raw_data_path, processed_data_path = (
    book_crossing_paths['raw'],
    book_crossing_paths['processed'],
)

In [None]:
# Book metadata keyed by ISBN; the three cover-image URL columns are dropped.
# NOTE(review): this file name uses an underscore ('BX_Books.csv') while the
# ratings file uses hyphens -- confirm it matches the file on disk.
item_feature_df = pd.read_csv(f'{raw_data_path}/BX_Books.csv', delimiter=';', encoding='ISO-8859-1')
item_feature_df = item_feature_df.drop(['Image-URL-M', 'Image-URL-S', 'Image-URL-L'], axis=1)

# Ratings table; raw User-ID values are label-encoded into integer user ids.
interaction = pd.read_csv(f'{raw_data_path}/BX-Book-Ratings.csv', delimiter=';', encoding='ISO-8859-1')
user_embedder = LabelEncoder()
interaction['user_id'] = user_embedder.fit_transform(interaction['User-ID'])

# Left merge keeps every rating; a rating whose ISBN is absent from the book
# table gets NaN in all metadata columns.
interaction = interaction.merge(item_feature_df, on='ISBN', how='left')

# Item ids are fitted on the ISBNs found in the ratings, so books without
# metadata still receive an id.
item_embedder = LabelEncoder()
interaction['item_id'] = item_embedder.fit_transform(interaction['ISBN'])
interaction = interaction.drop(['ISBN', 'User-ID'], axis=1)

# Raw column name -> short name used in the processed files.
mapp = {'Book-Title': 'title',
        'Book-Author': 'author',
        'Year-Of-Publication': 'year',
        'Publisher': 'publisher',
        'Book-Rating': 'rating'}

interaction = interaction.rename(mapp, axis=1)
interaction = interaction[['user_id', 'item_id', 'rating', 'title', 'author', 'year', 'publisher']]

interaction['rating'] = interaction['rating'].astype(int)

# Fill missing years with the median and assign the result back to the column.
# Calling fillna(inplace=True) on `interaction['year']` operates on an
# intermediate object: it warns on pandas 2.x and leaves the frame untouched
# under Copy-on-Write, after which the integer cast fails on the remaining NaN.
interaction['year'] = interaction['year'].fillna(interaction['year'].median()).astype(int)

In [None]:
# Number of ratings per encoded user id. The pandas method is `value_counts`;
# `values_count` does not exist and raises AttributeError.
print(interaction['user_id'].value_counts())

In [None]:
# encode item_features
# Ratings without a matching book carry NaN in these columns after the left
# merge. A column mixing str and float NaN cannot be sorted by LabelEncoder on
# scikit-learn releases without missing-value support, so missing entries are
# given one explicit placeholder label and everything is cast to str first.
missing_label = '<missing>'
str_features = ['title', 'author', 'publisher']
for col in str_features:
    encoder = LabelEncoder()
    filled_col = interaction[col].fillna(missing_label).astype(str)
    interaction[col] = encoder.fit_transform(filled_col)

# Random 80/20 row-level split with a fixed seed for reproducibility.
interact_train, interact_test = train_test_split(interaction, test_size=0.2, random_state=42)
# interact_val, interact_test = train_test_split(interact_test, test_size=0.5, random_state=42)

In [None]:
# Write each split to its own CSV under the processed-data directory,
# without the DataFrame index.
split_outputs = {
    f'{processed_data_path}/book_crossing_train.csv': interact_train,
    f'{processed_data_path}/book_crossing_test.csv': interact_test,
}
for csv_path, split_frame in split_outputs.items():
    split_frame.to_csv(csv_path, index=False)

In [None]:
# Ratings per encoded user id over the full (unsplit) interaction table.
# `value_counts` is the pandas method; `values_count` raises AttributeError.
print(interaction['user_id'].value_counts())