In [7]:
import pickle
import pandas as pd
from rectools import Columns
from rectools.dataset import Dataset

In [19]:
# Directory layout, relative to the notebook's location.
# The loading/export cells in this notebook read these paths as
# `raw_data_dir`, `interim_data_dir` and `benchmark_data_dir`, so those are
# the canonical names; defining them here keeps "Restart & Run All" working.
raw_data_dir = '../data/raw/'
interim_data_dir = '../data/interim/'
benchmark_data_dir = '../benchmark/data/'
# Earlier spellings of the same paths, kept as aliases so that any code still
# using them continues to work.
data_raw_dir = raw_data_dir
data_interim_dir = interim_data_dir
data_benchmark_dir = benchmark_data_dir
# Filename patterns: every split prefix combined with its base/test part,
# e.g. 'u1.base', 'u1.test', ..., 'ub.base', 'ub.test'.
file_patterns = [
    f'u{user_group}.{data_type}'
    for user_group in ['1', '2', '3', '4', '5', 'a', 'b']
    for data_type in ['base', 'test']
]

In [9]:
# Load user features.
# 'u.user' is pipe-separated and has no header row, so column names are
# supplied explicitly. zip_code is discarded and age is rescaled by the
# maximum age found in the file.
user_cols = [Columns.User, 'age', 'gender', 'occupation', 'zip_code']
user_data = (
    pd.read_csv(raw_data_dir + 'u.user', sep='|', names=user_cols)
    .drop(columns=['zip_code'])
    .assign(age=lambda users: users['age'] / users['age'].max())
)
# Every remaining column except the user id is treated as a feature.
user_feature_cols = user_data.columns.drop(Columns.User)

In [11]:
# Display the processed user table (zip_code dropped, age divided by the max age).
user_data

Unnamed: 0,user_id,age,gender,occupation
0,1,0.328767,M,technician
1,2,0.726027,F,other
2,3,0.315068,M,writer
3,4,0.328767,M,technician
4,5,0.452055,F,other
...,...,...,...,...
938,939,0.356164,F,student
939,940,0.438356,M,administrator
940,941,0.273973,M,student
941,942,0.657534,F,librarian


In [17]:
# Load item genres.
# 'u.genre' is pipe-separated without a header: genre name, then genre id.
genres = pd.read_csv(
    raw_data_dir + 'u.genre',
    sep='|',
    names=['genre', 'genre_id'],
)
# Distinct genre names in order of first appearance; reused below as the
# names of the genre columns of the item table.
genre_list = pd.unique(genres['genre'])
genre_list

array(['unknown', 'Action', 'Adventure', 'Animation', "Children's",
       'Comedy', 'Crime', 'Documentary', 'Drama', 'Fantasy', 'Film-Noir',
       'Horror', 'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller',
       'War', 'Western'], dtype=object)

In [18]:
# Load item features.
# 'u.item' is pipe-separated without a header: an id, four descriptive
# columns, then one column per genre (named after genre_list).
item_cols = ['id', 'title', 'release_date', 'video_release_date', 'IMDB_URL', *genre_list]
item_data = (
    pd.read_csv(raw_data_dir + 'u.item', sep='|', names=item_cols, encoding='latin-1')
    # Only the id and the genre columns are kept.
    .drop(columns=['title', 'release_date', 'video_release_date', 'IMDB_URL'])
)
item_data

Unnamed: 0,id,unknown,Action,Adventure,Animation,Children's,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,0,0,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0
1,2,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
2,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
3,4,0,1,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0
4,5,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1677,1678,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
1678,1679,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0
1679,1680,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0
1680,1681,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0


In [23]:
# For every split file: convert the interactions to CSV and write the user and
# item features restricted to the users/items that occur in that split.
for file_name in file_patterns:
    print(file_name)
    # The datetime column holds Unix epoch seconds, so it is read as plain
    # numbers and converted explicitly below. `parse_dates` is intended for
    # date strings rather than epoch numbers, so it is not used here.
    interactions = pd.read_csv(raw_data_dir + file_name, sep='\t', names=list(Columns.Interactions))
    interactions[Columns.Datetime] = pd.to_datetime(interactions[Columns.Datetime], unit='s')
    interactions[Columns.Weight] = interactions[Columns.Weight].astype(float)

    # Only the 'ua.*' / 'ub.*' splits belong to the benchmark directory.
    # Test the filename prefix: a substring check for 'a' or 'b' also matches
    # the letters inside the "base" suffix and would misroute every *.base file.
    is_benchmark_split = file_name.startswith(('ua.', 'ub.'))
    save_path = benchmark_data_dir if is_benchmark_split else interim_data_dir
    interactions.to_csv(f"{save_path}{file_name}.csv", index=False)

    # User features, in long format: one row per (user, feature) pair.
    relevant_users = user_data[user_data[Columns.User].isin(interactions[Columns.User])]
    # Name the melted columns directly so the feature name ends up under
    # 'feature' and its content under 'value' (melt's own output order is
    # id, variable, value).
    user_features = pd.melt(
        relevant_users,
        id_vars=[Columns.User],
        value_vars=user_feature_cols,
        var_name='feature',
        value_name='value',
    ).rename(columns={Columns.User: 'id'})
    user_features.to_csv(f"{save_path}{file_name}_user_features.csv", index=False)

    # Item features, kept in wide format: one row per item, one column per genre.
    relevant_items = item_data[item_data['id'].isin(interactions[Columns.Item])]
    relevant_items.to_csv(f"{save_path}{file_name}_item_features.csv", index=False)

u1.base
u1.test
u2.base
u2.test
u3.base
u3.test
u4.base
u4.test
u5.base
u5.test
ua.base
ua.test
ub.base
ub.test
