# RecBole Research

## Atomic files and datasets

##### Atomic files using datasets obtained with `generate_data.ipynb`

### __Interactions (`.inter`) atomic file__

In [1]:
import pandas as pd
import numpy as np
from time import mktime
from datetime import datetime

# Name of the dataset; reused as both the output sub-directory and the
# file stem of every atomic file written by this notebook.
LASTFM_RB = 'lastfm_recbole'

# Root folder of the input archives. The trailing slash is relied upon by the
# cells that build paths with plain `DATA_DIR + filename` concatenation.
DATA_DIR = '../../data/recsys_data/'

In [2]:
# Raw ratings table, read as tab-separated text from the zipped archive.
ratings_archive = DATA_DIR + 'all_tracks_ratings_full.zip'
ratings = pd.read_csv(ratings_archive, sep='\t')

In [11]:
def _to_epoch_seconds(raw_timestamp):
    """Convert a ``'YYYY-MM-DD HH:MM:SS'`` string to POSIX seconds.

    Missing values (NaN, None, pd.NA, NaT) are returned as ``np.nan`` so the
    resulting column stays numeric.
    """
    # pd.isna recognises every missing-value marker. The previous
    # `x is not np.nan` identity test only matched the np.nan singleton, so any
    # other NaN object or None would have been passed to strptime and raised.
    if pd.isna(raw_timestamp):
        return np.nan
    # NOTE(review): strptime returns a naive datetime, and .timestamp() on a
    # naive datetime interprets it in the machine's local timezone, so the
    # written values depend on where the notebook runs — confirm this is intended.
    return datetime.strptime(raw_timestamp, "%Y-%m-%d %H:%M:%S").timestamp()


# Work on a copy so the raw `ratings` frame stays available to later cells.
all_ratings = ratings.copy()

# Headers use the `name:type` form written to the atomic files. The assignment
# is positional, so it assumes `ratings` has exactly these four columns in
# this order (user, track, rating, timestamp).
all_ratings.columns = ['user_id:token', 'track_id:token', 'rating:float', 'timestamp:float']
all_ratings['timestamp:float'] = all_ratings['timestamp:float'].apply(_to_epoch_seconds)

all_ratings

Unnamed: 0,user_id:token,track_id:token,rating:float,timestamp:float
0,1,35151,5.0,
1,1,82497,3.3,
2,1,97967,4.2,
3,1,105492,3.5,
4,1,124021,3.9,
...,...,...,...,...
2200756,52829,711172,4.2,
2200757,52829,745290,1.0,1.668483e+09
2200758,52829,776707,1.0,1.668444e+09
2200759,52829,781335,1.0,1.668444e+09


In [9]:
# Write the interactions as tab-separated text, leaving out the DataFrame index.
inter_file = f'{DATA_DIR}/{LASTFM_RB}/{LASTFM_RB}.inter'
all_ratings.to_csv(inter_file, sep='\t', index=False)

### __Users atomic file__

In [4]:
# One entry per distinct user id found in the ratings, renamed to the
# `name:type` header used in the users atomic file.
unique_user_ids = ratings['user_id'].drop_duplicates()
all_users = unique_user_ids.rename('user_id:token')

# Tab-separated output without the (non-contiguous) original row index.
user_file = f'{DATA_DIR}/{LASTFM_RB}/{LASTFM_RB}.user'
all_users.to_csv(user_file, sep='\t', index=False)

all_users

0              1
20             2
60             3
119            4
148            5
           ...  
2200552    52825
2200587    52826
2200624    52827
2200663    52828
2200703    52829
Name: user_id:token, Length: 52780, dtype: int64

### __Items atomic file__

Choose columns to load by editing `config.yaml`

In [20]:
# Column names of the original VAD + StSc values in the VAD archive.
OG_COLS = [
    'V',
    'A',
    'D',
    'StSc',
]
# Column names of the normalised counterparts in the same archive.
NORM_COLS = [
    'V_Norm',
    'A_Norm',
    'D_Norm',
    'StSc_Norm',
]

# Headers (in `name:type` form) used for the tag and VAD columns of the item file.
RECBOLE_TAG_COL = 'tags:token_seq'
RECBOLE_VAD_COLS = [
    'v:float',
    'a:float',
    'd:float',
    'stsc:float',
]

# Track metadata, tab-separated inside a zip archive. album_id is read with
# the nullable Int64 dtype so missing albums do not turn the ids into floats.
tracks_archive = DATA_DIR + "track_full_data.zip"
track_data = pd.read_csv(tracks_archive, sep='\t', dtype={'album_id': 'Int64'})

track_data

Unnamed: 0,track_id,tags,name,artist_id,album_id
0,1,folk folk folk folk folk folk folk folk countr...,twentyseven,136900,227929
1,2,christmas christmas christmas christmas christ...,Happy Xmas (War Is Over) (2003 Mix),82136,
2,3,wrong wrong wrong wrong wrong wrong wrong wrong,Don’t Shut Me Down,127107,
3,4,melodicdeathmetal melodicdeathmetal melodicdea...,Tirpitz,57668,144260
4,5,slowcore slowcore slowcore slowcore slowcore s...,You Have Cum in Your Hair and Your Dick is Han...,117533,99037
...,...,...,...,...,...
815626,815627,frenchpop frenchpop frenchpop frenchpop french...,D-1,71909,144316
815627,815628,hiphop hiphop hiphop hiphop hiphop hiphop hiph...,Now or Never,125328,
815628,815629,hiphop hiphop hiphop hiphop hiphop hiphop hiph...,Los Zurdos Mueren Antes,142059,
815629,815630,indie indie indie indie indie indie indie indie,Oh Mama,99271,92757


#### Load all Data with NaNs

In [21]:
# Columns of the track metadata that go into the item file.
item_source_cols = ['track_id', 'tags', 'artist_id', 'album_id']

# Every column gets the ':token' suffix, except the tags column, which takes
# its dedicated sequence-typed header instead.
item_headers = {
    col: RECBOLE_TAG_COL if col == 'tags' else col + ':token'
    for col in item_source_cols
}
track_all = track_data[item_source_cols].rename(columns=item_headers)

track_all

Unnamed: 0,track_id:token,tags:token_seq,artist_id:token,album_id:token
0,1,folk folk folk folk folk folk folk folk countr...,136900,227929
1,2,christmas christmas christmas christmas christ...,82136,
2,3,wrong wrong wrong wrong wrong wrong wrong wrong,127107,
3,4,melodicdeathmetal melodicdeathmetal melodicdea...,57668,144260
4,5,slowcore slowcore slowcore slowcore slowcore s...,117533,99037
...,...,...,...,...
815626,815627,frenchpop frenchpop frenchpop frenchpop french...,71909,144316
815627,815628,hiphop hiphop hiphop hiphop hiphop hiphop hiph...,125328,
815628,815629,hiphop hiphop hiphop hiphop hiphop hiphop hiph...,142059,
815629,815630,indie indie indie indie indie indie indie indie,99271,92757


In [None]:
# Write the current `track_all` frame as the tab-separated item file (no index column).
item_file = f'{DATA_DIR}/{LASTFM_RB}/{LASTFM_RB}.item'
track_all.to_csv(item_file, sep='\t', index=False)

### __Include VAD values__

#### Option A: Original VAD + StSc

In [16]:
# Full VAD table, tab-separated inside a zip archive.
vads_archive = DATA_DIR + 'all_tracks_vads.zip'
all_vads = pd.read_csv(vads_archive, sep='\t')

# Option A: keep the track key plus the original V/A/D/StSc columns.
vads = all_vads.loc[:, ['Track', *OG_COLS]].copy()
vads

Unnamed: 0,Track,V,A,D,StSc
0,1,,,,
1,2,0.667880,0.440461,0.568477,0.720910
2,3,0.587845,0.490143,0.504333,0.555556
3,4,0.631987,0.476708,0.578362,0.696299
4,5,0.623167,0.380725,0.507896,0.654302
...,...,...,...,...,...
815626,815627,0.677566,0.472298,0.600010,0.875615
815627,815628,0.651401,0.461883,0.575568,0.751494
815628,815629,,,,
815629,815630,0.608476,0.440966,0.554662,0.634783


#### Option B: Normalized VAD + StSc

In [22]:
# Full VAD table, tab-separated inside a zip archive.
vads_archive = DATA_DIR + 'all_tracks_vads.zip'
all_vads = pd.read_csv(vads_archive, sep='\t')

# Option B: keep the track key plus the normalised V/A/D/StSc columns.
vads = all_vads.loc[:, ['Track', *NORM_COLS]].copy()
vads

Unnamed: 0,Track,V_Norm,A_Norm,D_Norm,StSc_Norm
0,1,,,,
1,2,0.571557,0.415936,0.474975,0.860455
2,3,0.461959,0.488838,0.363161,0.777778
3,4,0.522406,0.469124,0.492206,0.848150
4,5,0.510328,0.328283,0.369372,0.827151
...,...,...,...,...,...
815626,815627,0.584822,0.462653,0.529943,0.937808
815627,815628,0.548992,0.447370,0.487336,0.875747
815628,815629,,,,
815629,815630,0.490210,0.416677,0.450893,0.817391


#### Merge Track Data with VAD + StSc

In [23]:
# Use whichever VAD columns the chosen option left in `vads` (Option A keeps
# OG_COLS, Option B keeps NORM_COLS). Hard-coding NORM_COLS here raised a
# KeyError whenever Option A had been run.
vad_source_cols = [col for col in vads.columns if col != 'Track']
if len(vad_source_cols) != len(RECBOLE_VAD_COLS):
    raise ValueError(
        f'Expected {len(RECBOLE_VAD_COLS)} VAD columns in `vads`, found {vad_source_cols}'
    )

# Rename the key and the value columns to their item-file headers; the order of
# the value columns (V, A, D, StSc) matches the order of RECBOLE_VAD_COLS.
vad_headers = {'Track': 'track_id:token', **dict(zip(vad_source_cols, RECBOLE_VAD_COLS))}
vad_features = vads.rename(columns=vad_headers)

# Join on the track id instead of relying on both frames sharing the same row
# order. Dropping any VAD columns from a previous run keeps the cell
# re-runnable; `many_to_one` fails loudly if a track has several VAD rows.
track_all = (
    track_all
    .drop(columns=RECBOLE_VAD_COLS, errors='ignore')
    .merge(vad_features, on='track_id:token', how='left', validate='many_to_one')
)
track_all

Unnamed: 0,track_id:token,tags:token_seq,artist_id:token,album_id:token,v:float,a:float,d:float,stsc:float
0,1,folk folk folk folk folk folk folk folk countr...,136900,227929,,,,
1,2,christmas christmas christmas christmas christ...,82136,,0.571557,0.415936,0.474975,0.860455
2,3,wrong wrong wrong wrong wrong wrong wrong wrong,127107,,0.461959,0.488838,0.363161,0.777778
3,4,melodicdeathmetal melodicdeathmetal melodicdea...,57668,144260,0.522406,0.469124,0.492206,0.848150
4,5,slowcore slowcore slowcore slowcore slowcore s...,117533,99037,0.510328,0.328283,0.369372,0.827151
...,...,...,...,...,...,...,...,...
815626,815627,frenchpop frenchpop frenchpop frenchpop french...,71909,144316,0.584822,0.462653,0.529943,0.937808
815627,815628,hiphop hiphop hiphop hiphop hiphop hiphop hiph...,125328,,0.548992,0.447370,0.487336,0.875747
815628,815629,hiphop hiphop hiphop hiphop hiphop hiphop hiph...,142059,,,,,
815629,815630,indie indie indie indie indie indie indie indie,99271,92757,0.490210,0.416677,0.450893,0.817391


In [7]:
# Write `track_all`, now including the VAD columns, as the tab-separated item
# file (no index column).
item_file = f'{DATA_DIR}/{LASTFM_RB}/{LASTFM_RB}.item'
track_all.to_csv(item_file, sep='\t', index=False)

#### __Optional 1__: Replace VAD NaN values

In [28]:
# Replace missing values with 0.0 in the VAD columns only; every other column
# (e.g. album ids) keeps its NaNs. fillna returns a new frame, so the previous
# `track_all` object is not modified.
vad_fill_values = {col: 0.0 for col in RECBOLE_VAD_COLS}
track_all = track_all.fillna(vad_fill_values)
track_all

Unnamed: 0,track_id:token,tags:token_seq,artist_id:token,album_id:token,v:float,a:float,d:float,stsc:float
0,1,folk folk folk folk folk folk folk folk countr...,136900,227929,0.000000,0.000000,0.000000,0.000000
1,2,christmas christmas christmas christmas christ...,82136,,0.571557,0.415936,0.474975,0.860455
2,3,wrong wrong wrong wrong wrong wrong wrong wrong,127107,,0.461959,0.488838,0.363161,0.777778
3,4,melodicdeathmetal melodicdeathmetal melodicdea...,57668,144260,0.522406,0.469124,0.492206,0.848150
4,5,slowcore slowcore slowcore slowcore slowcore s...,117533,99037,0.510328,0.328283,0.369372,0.827151
...,...,...,...,...,...,...,...,...
815626,815627,frenchpop frenchpop frenchpop frenchpop french...,71909,144316,0.584822,0.462653,0.529943,0.937808
815627,815628,hiphop hiphop hiphop hiphop hiphop hiphop hiph...,125328,,0.548992,0.447370,0.487336,0.875747
815628,815629,hiphop hiphop hiphop hiphop hiphop hiphop hiph...,142059,,0.000000,0.000000,0.000000,0.000000
815629,815630,indie indie indie indie indie indie indie indie,99271,92757,0.490210,0.416677,0.450893,0.817391


In [8]:
# Write the current `track_all` frame as the tab-separated item file (no index column).
item_file = f'{DATA_DIR}/{LASTFM_RB}/{LASTFM_RB}.item'
track_all.to_csv(item_file, sep='\t', index=False)

#### __Optional 2__: Drop rows containing NaN values (in any column)

In [29]:
# Keep only rows with no missing value in ANY column — this includes the
# album id, not just the VAD columns. If the NaN-filling cell above has
# already run, the VAD columns contain no NaNs left to drop on.
track_all = track_all.dropna(axis='index', how='any').copy()

track_all

Unnamed: 0,track_id:token,tags:token_seq,artist_id:token,album_id:token,v:float,a:float,d:float,stsc:float
0,1,folk folk folk folk folk folk folk folk countr...,136900,227929,0.000000,0.000000,0.000000,0.000000
3,4,melodicdeathmetal melodicdeathmetal melodicdea...,57668,144260,0.522406,0.469124,0.492206,0.848150
4,5,slowcore slowcore slowcore slowcore slowcore s...,117533,99037,0.510328,0.328283,0.369372,0.827151
5,6,broadwayobc broadwayobc broadwayobc broadwayob...,95498,383880,0.438683,0.512938,0.416816,0.738462
6,7,melodicdeathmetal melodicdeathmetal melodicdea...,38330,249562,0.446873,0.483686,0.434426,0.736056
...,...,...,...,...,...,...,...,...
815623,815624,poppunk poppunk poppunk poppunk poppunk poppun...,29854,222810,0.000000,0.000000,0.000000,0.000000
815624,815625,canadian canadian canadian canadian canadian c...,95517,262444,0.553930,0.460565,0.524655,0.880367
815625,815626,rock rock rock rock rock rock rock rock female...,30287,12976,0.583113,0.461535,0.512472,0.899099
815626,815627,frenchpop frenchpop frenchpop frenchpop french...,71909,144316,0.584822,0.462653,0.529943,0.937808


In [None]:
# Write the current `track_all` frame as the tab-separated item file (no index column).
item_file = f'{DATA_DIR}/{LASTFM_RB}/{LASTFM_RB}.item'
track_all.to_csv(item_file, sep='\t', index=False)