# RecBole Research

## Atomic files and datasets

### Atomic files using datasets obtained with `generate_data.ipynb`

In [2]:
import pandas as pd
import numpy as np
from datetime import datetime

# Name shared by the RecBole dataset sub-folder and the atomic files written into it.
LASTFM_RB = 'lastfm_recbole'
# Input directory, relative to the notebook's working directory.
# The trailing slash is significant: file names are appended to it directly.
DATA_DIR = '../../data/recsys_data/'

# Tab-separated ratings table that the following cells turn into RecBole atomic files.
all_ratings = pd.read_csv(f'{DATA_DIR}all_tracks_ratings_full.csv', sep='\t')

In [8]:
# Build the RecBole `.inter` atomic file from the ratings table.

def _to_epoch_seconds(value):
    """Convert one timestamp cell to POSIX seconds.

    Strings are parsed with the format "%Y-%m-%d %H:%M:%S". Missing values stay NaN.
    Values that are already numeric are passed through unchanged, so re-running this
    cell after the column has been converted does not raise.
    """
    if pd.isna(value):
        # pd.isna catches every NaN/None, not only the `np.nan` singleton object.
        return np.nan
    if isinstance(value, str):
        # NOTE: the parsed datetime is naive, so .timestamp() interprets it in the
        # local timezone of the machine running the notebook.
        return datetime.strptime(value, "%Y-%m-%d %H:%M:%S").timestamp()
    return float(value)


# RecBole column headers use the `name:type` form. The rename is positional, so it
# assumes the loaded table has exactly these four columns in this order.
all_ratings.columns = ['user_id:token', 'track_id:token', 'rating:float', 'timestamp:float']
all_ratings['timestamp:float'] = all_ratings['timestamp:float'].apply(_to_epoch_seconds)

all_ratings.to_csv(f'{DATA_DIR}/{LASTFM_RB}/{LASTFM_RB}.inter', index=False, sep='\t')
all_ratings


Unnamed: 0,user_id:token,track_id:token,rating:float,timestamp:float
0,1,35151,5.0,
1,1,82497,3.3,
2,1,97967,4.2,
3,1,105492,3.5,
4,1,124021,3.9,
...,...,...,...,...
2200756,52829,711172,4.2,
2200757,52829,745290,1.0,1.668483e+09
2200758,52829,776707,1.0,1.668444e+09
2200759,52829,781335,1.0,1.668444e+09


In [4]:
# Build the RecBole `.user` atomic file: one row per distinct user id.
# The user column is called 'user_id' right after loading and 'user_id:token' once the
# `.inter` cell above has renamed the ratings columns. Accept either name so the
# notebook also runs top to bottom on a fresh kernel instead of raising KeyError.
user_col = 'user_id:token' if 'user_id:token' in all_ratings.columns else 'user_id'
all_users = all_ratings[user_col].drop_duplicates().rename('user_id:token')
all_users.to_csv(f'{DATA_DIR}/{LASTFM_RB}/{LASTFM_RB}.user', index=False, sep='\t')

all_users

0              1
20             2
60             3
119            4
148            5
           ...  
2200552    52825
2200587    52826
2200624    52827
2200663    52828
2200703    52829
Name: user_id:token, Length: 52780, dtype: int64

In [6]:
# Tab-separated table with one 'Track' column plus the score columns listed below.
all_vads = pd.read_csv(f'{DATA_DIR}all_tracks_vads.csv', sep='\t')

# Original score columns, and their normalized counterparts (same names + '_Norm').
OG_COLS = ['V', 'A', 'D', 'StSc']
NORM_COLS = [f'{col}_Norm' for col in OG_COLS]

# Header of the `.item` atomic file: the track id and a space-separated float sequence.
RECBOLE_COLS = ['track_id:token', 'vadst:float_seq']

#### Option 1: Original VAD + StSc

In [4]:
# Option 1: write the `.item` atomic file from the original (non-normalized) columns.
# Each row's values are stringified and joined with single spaces into one field.
# NOTE: rows with missing scores are serialized as the literal text 'nan nan nan nan'.
og_joined = all_vads[OG_COLS].astype(str).apply(' '.join, axis=1)
og_vads = all_vads[['Track']].assign(VADStSc=og_joined)
og_vads.columns = RECBOLE_COLS

# NOTE: the normalized option further down writes to this same `.item` path, so
# whichever of the two cells ran last decides the file's content.
og_vads.to_csv(f'{DATA_DIR}/{LASTFM_RB}/{LASTFM_RB}.item', index=False, sep='\t')
og_vads


Unnamed: 0,track_id:token,vadst:float
0,1,nan nan nan nan
1,2,0.6678796252023386 0.4404606082762304 0.568477...
2,3,0.587845238095238 0.4901428571428571 0.5043333...
3,4,0.6319867998279273 0.4767079600804765 0.578362...
4,5,0.6231669245647969 0.3807249516441006 0.507896...
...,...,...
815626,815627,0.6775663607732008 0.4722983474411399 0.600010...
815627,815628,0.6514011030677977 0.4618827553321515 0.575568...
815628,815629,nan nan nan nan
815629,815630,0.6084758620689655 0.4409655172413793 0.554662...


#### Option 2: Normalized VAD + StSc

In [7]:
# Option 2: write the `.item` atomic file from the normalized columns.
# Each row's values are stringified and joined with single spaces into one field.
# NOTE: rows with missing scores are serialized as the literal text 'nan nan nan nan'.
norm_joined = all_vads[NORM_COLS].astype(str).apply(' '.join, axis=1)
norm_vads = all_vads[['Track']].assign(VADStSc=norm_joined)
norm_vads.columns = RECBOLE_COLS

# NOTE: this overwrites the `.item` file produced by the Option 1 cell above,
# because both cells write to the same path.
norm_vads.to_csv(f'{DATA_DIR}/{LASTFM_RB}/{LASTFM_RB}.item', index=False, sep='\t')
norm_vads

Unnamed: 0,track_id:token,vadst:float_seq
0,1,nan nan nan nan
1,2,0.5715571724783821 0.4159363290920476 0.474975...
2,3,0.4619585595278849 0.4888376480452782 0.363160...
3,4,0.5224057512193457 0.4691239326199215 0.492206...
4,5,0.5103278665728133 0.3282831278710206 0.369371...
...,...,...
815626,815627,0.5848221304665535 0.4626534812049008 0.529942...
815627,815628,0.5489915824276587 0.447370147222526 0.4873355...
815628,815629,nan nan nan nan
815629,815630,0.4902100131036843 0.4166772079844157 0.450892...


## Custom model

In [None]:
from lastfm_recbole import *

