In [13]:
import pickle
import pandas as pd
from rectools import Columns
from rectools.dataset import Dataset

# Locations of the raw inputs and of the two output folders used by this notebook.
data_raw_dir = '../data/raw/'
data_interim_dir = '../data/interim/'
data_benchmark_dir = '../benchmark/data/'

# Every split is stored as a `<tag>.base` / `<tag>.test` pair of files.
split_tags = ['1', '2', '3', '4', '5', 'a', 'b']
split_parts = ['base', 'test']
data_filenames = [
    f'u{tag}.{part}'
    for tag in split_tags
    for part in split_parts
]
data_filenames
['u1.base',
 'u1.test',
 'u2.base',
 'u2.test',
 'u3.base',
 'u3.test',
 'u4.base',
 'u4.test',
 'u5.base',
 'u5.test',
 'ua.base',
 'ua.test',
 'ub.base',
 'ub.test']

# Column names assigned to the pipe-separated `u.user` file when it is read.
user_features_names = [Columns.User, 'age', 'gender', 'occupation', 'zip_code']

# Read the user table, discard the zip code and rescale age by the largest
# age present in the file.
user_features = (
    pd.read_csv(data_raw_dir + 'u.user', sep='|', names=user_features_names)
    .drop(columns='zip_code')
    .assign(age=lambda frame: frame['age'] / frame['age'].max())
)

# Every remaining column except the user id is exported as a user feature.
final_user_features_names = [
    column for column in user_features.columns if column != Columns.User
]

# Genre vocabulary: `u.genre` is read as pipe-separated (genre, genre_id) pairs.
genres = pd.read_csv(data_raw_dir + 'u.genre', sep='|', names=['genre', 'genre_id'])
genre_names = genres['genre'].unique()

# Column names for `u.item`: id, title, the two date columns, the IMDB URL,
# then one column per genre name.
item_dates = ['release_date', 'video_release_date']
item_features_names = ['id', 'title'] + item_dates + ['IMDB_URL'] + list(genre_names)

# Load the item table and keep only the item id plus the per-genre columns.
# The title, both date columns and the IMDB URL are dropped right away, so the
# date columns are deliberately NOT parsed on read: parsing them would be work
# whose result is thrown away by the drop below.
item_features = pd.read_csv(
    data_raw_dir + 'u.item',
    encoding='latin-1',
    sep='|',
    names=item_features_names,
).drop(columns=['title', *item_dates, 'IMDB_URL'])

# The exported item features are exactly the genre columns.
final_item_features_names = genre_names
item_features.head()

# For every split file: convert the raw interactions to CSV and write, next to
# it, the user features (long format) and item features (wide format) limited
# to the users/items that occur in that split.
for filename in data_filenames:
    # The timestamp column is converted explicitly with unit='s' below, so
    # read_csv is not asked to guess a date format for it as well
    # (`parse_dates` on that column was redundant with the explicit conversion).
    df = pd.read_csv(
        data_raw_dir + filename,
        sep='\t',
        names=[*Columns.Interactions],
    )
    df[Columns.Datetime] = pd.to_datetime(df[Columns.Datetime], unit='s')
    df[Columns.Weight] = df[Columns.Weight].astype(float)

    # 'ua.*' / 'ub.*' files go to the benchmark folder, the numbered ones to
    # the interim folder.
    output_dir = data_benchmark_dir if filename[1] in ['a', 'b'] else data_interim_dir
    df_path_to_csv = output_dir + filename + '.csv'
    df.to_csv(df_path_to_csv, index=False)

    # User features: keep only users present in this split, then reshape from
    # one column per feature to long (id, value, feature) rows. `melt` emits
    # all rows of the first feature, then the second, and so on.
    users_in_split = user_features[Columns.User].isin(df[Columns.User])
    user_features_modified = (
        user_features.loc[users_in_split]
        .melt(
            id_vars=[Columns.User],
            value_vars=final_user_features_names,
            var_name='feature',
            value_name='value',
        )
        .rename(columns={Columns.User: 'id'})
        .loc[:, ['id', 'value', 'feature']]
    )

    user_features_path = output_dir + filename + '_user_features.csv'
    user_features_modified.to_csv(user_features_path, index=False)

    # Item features: keep only items present in this split; the table stays
    # wide (id + one column per genre).
    items_in_split = item_features['id'].isin(df[Columns.Item])
    item_features_modified = item_features.loc[items_in_split].copy()
    item_features_path = output_dir + filename + '_item_features.csv'
    item_features_modified.to_csv(item_features_path, index=False)


In [26]:
import pandas as pd
from rectools import Columns
from rectools.dataset import Dataset

# Rebuild one rectools Dataset per split file from the CSVs written by the
# export step. The folder layout mirrors the one used when saving.
data_interim_dir = '../data/interim/'
data_benchmark_dir = '../benchmark/data/'
data_filenames = [f'u{t}.{split}' for t in ['1', '2', '3', '4', '5', 'a', 'b'] for split in ['base', 'test']]

datasets = {}

for filename in data_filenames:
    # 'ua.*' / 'ub.*' files live in the benchmark folder, the rest in interim.
    df_path = data_benchmark_dir if filename[1] in ['a', 'b'] else data_interim_dir
    interactions_path = df_path + filename + '.csv'
    user_features_path = df_path + filename + '_user_features.csv'
    item_features_path = df_path + filename + '_item_features.csv'

    # Read the three CSVs that belong to this split.
    interactions_df = pd.read_csv(interactions_path)
    user_features_df = pd.read_csv(user_features_path)
    item_features_df = pd.read_csv(item_features_path)

    # User features were saved in long (id, value, feature) form; gender and
    # occupation are declared categorical here, which leaves age as the only
    # non-categorical user feature. Item features were saved as a wide table
    # (id + one column per genre), hence make_dense_item_features=True.
    dataset = Dataset.construct(
        interactions_df,
        user_features_df=user_features_df,
        cat_user_features=['gender', 'occupation'],
        item_features_df=item_features_df,
        make_dense_item_features=True,
    )
    print(filename)

    # Key by the full split name ('u1.base', 'u1.test', ...). Keying on
    # filename.split('.')[0] collapsed both files of a pair onto the same key,
    # so the 'test' dataset silently replaced the 'base' one.
    key = filename
    datasets[key] = dataset

u1.base
u1.test
u2.base
u2.test
u3.base
u3.test
u4.base
u4.test
u5.base
u5.test
ua.base
ua.test
ub.base
ub.test


In [30]:
import pandas as pd

# Build datasets[user_group][split] = (Dataset, interactions DataFrame) for the
# five numbered splits stored in the interim folder.
data_interim_dir = '../data/interim/'
user_groups = ['u1', 'u2', 'u3', 'u4', 'u5']
data_splits = ['base', 'test']

datasets = {}

for user_group in user_groups:
    for split in data_splits:
        # Files written by the export step for this (group, split) pair.
        file_prefix = f"{data_interim_dir}{user_group}.{split}"
        interactions_path = f"{file_prefix}.csv"
        user_features_path = f"{file_prefix}_user_features.csv"
        item_features_path = f"{file_prefix}_item_features.csv"

        # The interactions frame is kept alongside the Dataset, so restore the
        # datetime dtype that the CSV round trip turned into plain strings.
        interactions_df = pd.read_csv(interactions_path, parse_dates=[Columns.Datetime])
        user_features_df = pd.read_csv(user_features_path)
        item_features_df = pd.read_csv(item_features_path)

        # User features are in long (id, value, feature) form with gender and
        # occupation declared categorical; item features are a wide table
        # (id + one column per genre), hence make_dense_item_features=True.
        dataset = Dataset.construct(
            interactions_df,
            user_features_df=user_features_df,
            cat_user_features=['gender', 'occupation'],
            item_features_df=item_features_df,
            make_dense_item_features=True,
        )

        # Create the per-group dict on first use, then store the pair.
        datasets.setdefault(user_group, {})[split] = (dataset, interactions_df)


In [41]:
# Each entry is a (Dataset, interactions DataFrame) pair; display the
# interactions frame stored for the u1 "base" split.
u1_base_entry = datasets['u1']['base']
u1_base_entry[1]

Unnamed: 0,user_id,item_id,weight,datetime
0,1,1,5.0,1997-09-22 22:02:38
1,1,2,3.0,1997-10-15 05:26:11
2,1,3,4.0,1997-11-03 07:42:40
3,1,4,3.0,1997-10-15 05:25:19
4,1,5,3.0,1998-03-13 01:15:12
...,...,...,...,...
79995,943,1067,2.0,1997-09-29 02:55:56
79996,943,1074,4.0,1998-02-28 04:30:50
79997,943,1188,3.0,1998-02-28 04:30:50
79998,943,1228,3.0,1998-02-28 04:31:15


In [49]:
from rectools.models import ImplicitItemKNNWrapperModel

OSError: dlopen(/usr/local/lib/python3.10/site-packages/torchaudio/lib/libtorchaudio.so, 0x0006): Symbol not found: __ZN2at4_ops6conv1d4callERKNS_6TensorES4_RKN3c108optionalIS2_EENS5_8ArrayRefIxEESB_SB_x
  Referenced from: <0BD220B6-B664-3675-8235-BDD16C8D9117> /usr/local/lib/python3.10/site-packages/torchaudio/lib/libtorchaudio.so
  Expected in:     <60EF05C3-3932-3EF3-BD03-10DEA2E89EF3> /usr/local/lib/python3.10/site-packages/torch/lib/libtorch_cpu.dylib

In [47]:


# Disabled metric setup, kept for reference.
# k = 10
# ndcg = NDCG(k=k, log_base=3)
# acc = Accuracy(k=k)
# mmap = MAP(k=k)

# Walk over every user group and unpack the (Dataset, interactions DataFrame)
# pair stored for its 'base' and 'test' splits. Nothing else happens yet: the
# evaluation draft below is still commented out.
for ug, splits in datasets.items():
    base_ds, base_df = splits['base']
    test_ds, test_df = splits['test']
# NOTE(review): the draft below refers to `base` and `name`, which this loop
# does not define (it binds `base_ds` and `ug`); adapt before re-enabling.
#     base, test, base_df, test_df = datasets
#     model = RandomModel()
#     model.fit(base)
    
#     recos = model.recommend(
#         users=base_df[Columns.User].unique(),
#         dataset=base,
#         k=10,
#         filter_viewed=True,
#     )
#     print(f'RandomModel on {name} split')
#     print('MAP: ', mmap.calc(reco=recos, interactions=test_df))
#     print("Accuracy: ", acc.calc(reco=recos, interactions=test_df, catalog=base_df[Columns.Item]))
#     print("NDCG: ", ndcg.calc(reco=recos, interactions=test_df))
#     print()

       user_id  item_id  weight             datetime
0            1        6     5.0  1998-02-14 04:52:53
1            1       10     3.0  1997-10-01 08:05:18
2            1       12     5.0  1997-11-03 07:42:40
3            1       14     5.0  1997-09-22 22:01:46
4            1       17     3.0  1997-09-24 03:53:18
...        ...      ...     ...                  ...
19995      458      648     4.0  1998-02-02 05:04:59
19996      458     1101     4.0  1998-02-02 05:38:51
19997      459      934     3.0  1997-11-15 03:13:59
19998      460       10     3.0  1997-12-23 21:26:11
19999      462      682     5.0  1998-02-01 20:33:51

[20000 rows x 4 columns]
       user_id  item_id  weight             datetime
0            1        1     5.0  1997-09-22 22:02:38
1            1        2     3.0  1997-10-15 05:26:11
2            1        8     1.0  1997-09-24 03:41:24
3            1        9     5.0  1997-11-03 07:52:21
4            1       21     1.0  1997-11-03 07:39:32
...        ...      