In [2]:
import pandas as pd
from rectools import Columns

def read_user_data(path):
    user_feature_cols = [Columns.User, 'age', 'gender', 'occupation', 'zip_code']
    user_data = pd.read_csv(path, sep='|', names=user_feature_cols)
    user_data.drop(columns=['zip_code'], inplace=True)
    user_data['age'] = user_data['age'] / user_data['age'].max()
    return user_data

def read_item_data(path, genres_list):
    item_feature_cols = ['movie_id', 'title', 'release_date', 'video_release_date', 'IMDB_URL'] + list(genres_list)
    item_info = pd.read_csv(path, sep='|', names=item_feature_cols, encoding='latin-1')
    item_info.drop(columns=['title', 'release_date', 'video_release_date', 'IMDB_URL'], inplace=True)
    return item_info

def process_interaction_data(path, file):
    interaction_data = pd.read_csv(path + file, sep='\t', names=[*Columns.Interactions], parse_dates=[Columns.Datetime])
    interaction_data[Columns.Datetime] = pd.to_datetime(interaction_data[Columns.Datetime], unit='s')
    interaction_data[Columns.Weight] = interaction_data[Columns.Weight].astype(float)
    return interaction_data

def save_to_csv(data, path):
    data.to_csv(path, index=False)

raw_data_dir = '../data/raw/'
interim_data_dir = '../data/interim/'
benchmark_data_dir = '../benchmark/data/'

data_split_filenames = [f'u{version}.{part}' for version in ['1', '2', '3', '4', '5', 'a', 'b'] for part in ['base', 'test']]

user_data = read_user_data(raw_data_dir + 'u.user')
selected_user_features = user_data.columns.drop(Columns.User).tolist()

genre_data = pd.read_csv(raw_data_dir + 'u.genre', sep='|', names=['genre', 'genre_id'])
genres_list = genre_data['genre'].unique()

item_info = read_item_data(raw_data_dir + 'u.item', genres_list)

# Save user and item data to CSV
save_to_csv(user_data, interim_data_dir + 'user_data.csv')
save_to_csv(item_info, interim_data_dir + 'item_data.csv')

for file in data_split_filenames:
    interaction_data = process_interaction_data(raw_data_dir, file)
    output_dir = benchmark_data_dir if 'a' == file[1] or 'b' == file[1] else interim_data_dir
    print(file, output_dir)
    save_to_csv(interaction_data, output_dir + file + '.csv')

u1.base ../data/interim/
u1.test ../data/interim/
u2.base ../data/interim/
u2.test ../data/interim/
u3.base ../data/interim/
u3.test ../data/interim/
u4.base ../data/interim/
u4.test ../data/interim/
u5.base ../data/interim/
u5.test ../data/interim/
ua.base ../benchmark/data/
ua.test ../benchmark/data/
ub.base ../benchmark/data/
ub.test ../benchmark/data/


In [3]:
user_data_mod = user_data.loc[user_data[Columns.User].isin(df[Columns.User])].copy()
item_features_modified = item_features.loc[item_features['id'].isin(df[Columns.Item])].copy()

Unnamed: 0,user_id,age,gender,occupation
0,1,0.328767,M,technician
1,2,0.726027,F,other
2,3,0.315068,M,writer
3,4,0.328767,M,technician
4,5,0.452055,F,other
...,...,...,...,...
938,939,0.356164,F,student
939,940,0.438356,M,administrator
940,941,0.273973,M,student
941,942,0.657534,F,librarian


In [6]:
from rectools.dataset import Dataset

dataset = Dataset.construct(df,
                                user_features_df=user_data,
                                cat_user_features=['gender', 'occupation'],  # Will be one-hot encoded
                                item_features_df=item_info,
                                make_dense_item_features=True,  # Since all features are numeric
                                )

NameError: name 'df' is not defined