In [13]:
import pickle
import pandas as pd
from rectools import Columns
from rectools.dataset import Dataset

# Directory prefixes (relative paths, each ending in '/') that the rest of
# the script concatenates file names onto.
data_raw_dir = '../data/raw/'
data_interim_dir = '../data/interim/'
data_benchmark_dir = '../benchmark/data/'

# One '<tag>.base' / '<tag>.test' pair per split tag, in tag order:
# u1.base, u1.test, u2.base, ..., ub.base, ub.test.
data_filenames = [
    f'u{tag}.{part}'
    for tag in '12345ab'
    for part in ('base', 'test')
]
data_filenames
['u1.base',
 'u1.test',
 'u2.base',
 'u2.test',
 'u3.base',
 'u3.test',
 'u4.base',
 'u4.test',
 'u5.base',
 'u5.test',
 'ua.base',
 'ua.test',
 'ub.base',
 'ub.test']

# Column names assigned to the pipe-separated user file, in file order.
user_features_names = [Columns.User, 'age', 'gender', 'occupation', 'zip_code']

# Load the user table; the zip code column is discarded right away.
user_features = pd.read_csv(
    data_raw_dir + 'u.user',
    sep='|',
    names=user_features_names,
).drop(columns='zip_code')

# Scale age by the largest age in the table, so the maximum becomes 1.0.
user_features['age'] = user_features['age'] / user_features['age'].max()

# Every remaining column except the user id is treated as a feature.
final_user_features_names = [
    column for column in user_features.columns if column != Columns.User
]

# Genre names are read from the pipe-separated genre file; they are reused
# below as the names of the trailing columns of the item file.
genres = pd.read_csv(
    data_raw_dir + 'u.genre',
    sep='|',
    names=['genre', 'genre_id'],
)
genre_names = genres.genre.unique()
item_dates = ['release_date', 'video_release_date']
# Full column layout of the item file, in file order.
item_features_names = ['id', 'title', *item_dates, 'IMDB_URL', *genre_names]

# Only the item id and the genre columns are kept as item features, so load
# just those. The title, URL and both date columns are skipped at read time
# instead of being loaded (and date-parsed) only to be dropped afterwards.
item_features = pd.read_csv(
    data_raw_dir + 'u.item',
    encoding='latin-1',
    sep='|',
    names=item_features_names,
    usecols=['id', *genre_names],
)
final_item_features_names = genre_names
item_features.head()

# For every split file: convert the interactions to CSV, then write the user
# and item features restricted to the users/items present in that split.
for filename in data_filenames:
    # Tab-separated file; column names are supplied here, so every line is
    # read as data. The datetime column is deliberately NOT listed in
    # parse_dates: it is converted below as epoch seconds, and asking
    # read_csv to parse it first would treat those numbers as date strings.
    df = pd.read_csv(
        data_raw_dir + filename,
        sep='\t',
        names=[*Columns.Interactions],
    )
    # Interpret the raw values as seconds since the Unix epoch.
    df[Columns.Datetime] = pd.to_datetime(df[Columns.Datetime], unit='s')
    df[Columns.Weight] = df[Columns.Weight].astype(float)

    # 'ua.*' / 'ub.*' files go to the benchmark directory, everything else
    # to the interim directory.
    output_dir = data_benchmark_dir if filename[1] in ['a', 'b'] else data_interim_dir
    # Common prefix of all three output files for this split; building the
    # sidecar names from it avoids rewriting '.csv' elsewhere in the path.
    output_stem = output_dir + filename
    df_path_to_csv = output_stem + '.csv'
    df.to_csv(df_path_to_csv, index=False)

    # Modify and save user features: keep only users seen in this split.
    user_features_modified = user_features.loc[user_features[Columns.User].isin(df[Columns.User])].copy()

    # Squeeze the wide table into long format: one (id, value, feature) row
    # per user and feature, stacked feature by feature.
    user_features_frames = []
    for feature in final_user_features_names:
        feature_frame = user_features_modified.reindex(columns=[Columns.User, feature])
        feature_frame.columns = ["id", "value"]
        feature_frame["feature"] = feature
        user_features_frames.append(feature_frame)
    user_features_modified = pd.concat(user_features_frames)

    user_features_path = output_stem + '_user_features.csv'
    user_features_modified.to_csv(user_features_path, index=False)

    # Modify and save item features: keep only items seen in this split
    # (written as-is, without the long-format squeeze).
    item_features_modified = item_features.loc[item_features['id'].isin(df[Columns.Item])].copy()
    item_features_path = output_stem + '_item_features.csv'
    item_features_modified.to_csv(item_features_path, index=False)