# RecBole Research

## Atomic files and datasets

### Atomic files using datasets obtained with `generate_data.ipynb`

#### __Inters atomic file__

In [1]:
import pandas as pd
import numpy as np
from datetime import datetime

LASTFM_RB = 'lastfm_recbole'
DATA_DIR = f'../../data/recsys_data/'

In [4]:
ratings = pd.read_csv(DATA_DIR + 'all_tracks_ratings_full.zip', sep='\t')

In [8]:
all_ratings = ratings.copy()

all_ratings.columns = ['user_id:token', 'track_id:token', 'rating:float', 'timestamp:float']
all_ratings['timestamp:float'] = all_ratings['timestamp:float'].apply(
    lambda x: datetime.strptime(x, "%Y-%m-%d %H:%M:%S").timestamp() if x is not np.nan else np.nan)

all_ratings.to_csv(f'{DATA_DIR}/{LASTFM_RB}/{LASTFM_RB}.inter', index=False, sep='\t')
all_ratings


Unnamed: 0,user_id:token,track_id:token,rating:float,timestamp:float
0,1,35151,5.0,
1,1,82497,3.3,
2,1,97967,4.2,
3,1,105492,3.5,
4,1,124021,3.9,
...,...,...,...,...
2200756,52829,711172,4.2,
2200757,52829,745290,1.0,1.668483e+09
2200758,52829,776707,1.0,1.668444e+09
2200759,52829,781335,1.0,1.668444e+09


#### __Users atomic file__

In [4]:
all_users = ratings['user_id'].drop_duplicates().rename('user_id:token')
all_users.to_csv(f'{DATA_DIR}/{LASTFM_RB}/{LASTFM_RB}.user', index=False, sep='\t')

all_users

0              1
20             2
60             3
119            4
148            5
           ...  
2200552    52825
2200587    52826
2200624    52827
2200663    52828
2200703    52829
Name: user_id:token, Length: 52780, dtype: int64

#### __Items atomic file__

In [2]:
OG_COLS = ['V', 'A', 'D', 'StSc']
NORM_COLS = ['V_Norm', 'A_Norm', 'D_Norm', 'StSc_Norm']

RECBOLE_TAG_COLS = ['track_id:token', 'tags:token_seq']
RECBOLE_VAD_COLS = ['track_id:token', 'vadst:float_seq']

track_data = pd.read_csv(DATA_DIR + "track_full_data.zip", sep='\t')

track_data

Unnamed: 0,track_id,tags,name,artist_id,album_id
0,1,folk folk folk folk folk folk folk folk countr...,twentyseven,136900,227929.0
1,2,christmas christmas christmas christmas christ...,Happy Xmas (War Is Over) (2003 Mix),82136,
2,3,wrong wrong wrong wrong wrong wrong wrong wrong,Don’t Shut Me Down,127107,
3,4,melodicdeathmetal melodicdeathmetal melodicdea...,Tirpitz,57668,144260.0
4,5,slowcore slowcore slowcore slowcore slowcore s...,You Have Cum in Your Hair and Your Dick is Han...,117533,99037.0
...,...,...,...,...,...
815626,815627,frenchpop frenchpop frenchpop frenchpop french...,D-1,71909,144316.0
815627,815628,hiphop hiphop hiphop hiphop hiphop hiphop hiph...,Now or Never,125328,
815628,815629,hiphop hiphop hiphop hiphop hiphop hiphop hiph...,Los Zurdos Mueren Antes,142059,
815629,815630,indie indie indie indie indie indie indie indie,Oh Mama,99271,92757.0


#### Option 1: Tag Sequences

In [3]:
grouped_tags = track_data[['track_id', 'tags']]
grouped_tags.columns = RECBOLE_TAG_COLS

grouped_tags.to_csv(f'{DATA_DIR}/{LASTFM_RB}/{LASTFM_RB}.item', index=False, sep='\t')

grouped_tags

Unnamed: 0,track_id:token,tags:token_seq
0,1,folk folk folk folk folk folk folk folk countr...
1,2,christmas christmas christmas christmas christ...
2,3,wrong wrong wrong wrong wrong wrong wrong wrong
3,4,melodicdeathmetal melodicdeathmetal melodicdea...
4,5,slowcore slowcore slowcore slowcore slowcore s...
...,...,...
815626,815627,frenchpop frenchpop frenchpop frenchpop french...
815627,815628,hiphop hiphop hiphop hiphop hiphop hiphop hiph...
815628,815629,hiphop hiphop hiphop hiphop hiphop hiphop hiph...
815629,815630,indie indie indie indie indie indie indie indie


#### Option 2A: Artists

No sparsity since all tracks must have an artist

In [11]:
track_artists = track_data[['track_id', 'artist_id']]
track_artists = track_artists.rename(lambda x: x + ':token', axis=1)

track_artists.to_csv(f'{DATA_DIR}/{LASTFM_RB}/{LASTFM_RB}.item', index=False, sep='\t')

track_artists

Unnamed: 0,track_id:token,artist_id:token
0,1,136900
1,2,82136
2,3,127107
3,4,57668
4,5,117533
...,...,...
815626,815627,71909
815627,815628,125328
815628,815629,142059
815629,815630,99271


#### Option 2B: Artists + Albums

Only albums would not improve because of lack of data and sparsity

In [5]:
track_artists_album = track_data[['track_id', 'artist_id', 'album_id']]
track_artists_album = track_artists_album.rename(lambda x: x + ':token', axis=1)

track_artists_album.to_csv(f'{DATA_DIR}/{LASTFM_RB}/{LASTFM_RB}.item', index=False, sep='\t')

track_artists_album

Unnamed: 0,track_id:token,artist_id:token,album_id:token
0,1,136900,227929.0
1,2,82136,
2,3,127107,
3,4,57668,144260.0
4,5,117533,99037.0
...,...,...,...
815626,815627,71909,144316.0
815627,815628,125328,
815628,815629,142059,
815629,815630,99271,92757.0


#### Option 3: Tag Sequences + Artists + Albums

In [6]:
track_all_no_vads = track_data[['track_id', 'tags', 'artist_id', 'album_id']]
track_all_no_vads = track_all_no_vads.rename(lambda x: x + ':token', axis=1).rename(columns={'tags:token': RECBOLE_TAG_COLS[1]})

track_all_no_vads.to_csv(f'{DATA_DIR}/{LASTFM_RB}/{LASTFM_RB}.item', index=False, sep='\t')

track_all_no_vads

Unnamed: 0,track_id:token,tags:token_seq,artist_id:token,album_id:token
0,1,folk folk folk folk folk folk folk folk countr...,136900,227929.0
1,2,christmas christmas christmas christmas christ...,82136,
2,3,wrong wrong wrong wrong wrong wrong wrong wrong,127107,
3,4,melodicdeathmetal melodicdeathmetal melodicdea...,57668,144260.0
4,5,slowcore slowcore slowcore slowcore slowcore s...,117533,99037.0
...,...,...,...,...
815626,815627,frenchpop frenchpop frenchpop frenchpop french...,71909,144316.0
815627,815628,hiphop hiphop hiphop hiphop hiphop hiphop hiph...,125328,
815628,815629,hiphop hiphop hiphop hiphop hiphop hiphop hiph...,142059,
815629,815630,indie indie indie indie indie indie indie indie,99271,92757.0


#### Option 4A: Original VAD + StSc

In [4]:
all_vads = pd.read_csv(DATA_DIR + 'all_tracks_vads.zip', sep='\t')

og_vads = all_vads[['Track'] + OG_COLS].copy()
og_vads['VADStSc'] = og_vads[OG_COLS].astype(str).apply(lambda x: ' '.join(x), axis=1)
og_vads.drop(columns=OG_COLS, inplace=True)
og_vads.columns = RECBOLE_VAD_COLS

og_vads.to_csv(f'{DATA_DIR}/{LASTFM_RB}/{LASTFM_RB}.item', index=False, sep='\t')
og_vads


Unnamed: 0,track_id:token,vadst:float
0,1,nan nan nan nan
1,2,0.6678796252023386 0.4404606082762304 0.568477...
2,3,0.587845238095238 0.4901428571428571 0.5043333...
3,4,0.6319867998279273 0.4767079600804765 0.578362...
4,5,0.6231669245647969 0.3807249516441006 0.507896...
...,...,...
815626,815627,0.6775663607732008 0.4722983474411399 0.600010...
815627,815628,0.6514011030677977 0.4618827553321515 0.575568...
815628,815629,nan nan nan nan
815629,815630,0.6084758620689655 0.4409655172413793 0.554662...


#### Option 4B: Normalized VAD + StSc

In [7]:
all_vads = pd.read_csv(DATA_DIR + 'all_tracks_vads.zip', sep='\t')

norm_vads = all_vads[['Track'] + NORM_COLS].copy()
norm_vads['VADStSc'] = norm_vads[NORM_COLS].astype(str).apply(lambda x: ' '.join(x), axis=1)
norm_vads.drop(columns=NORM_COLS, inplace=True)
norm_vads.columns = RECBOLE_VAD_COLS

norm_vads.to_csv(f'{DATA_DIR}/{LASTFM_RB}/{LASTFM_RB}.item', index=False, sep='\t')
norm_vads

Unnamed: 0,track_id:token,vadst:float_seq
0,1,nan nan nan nan
1,2,0.5715571724783821 0.4159363290920476 0.474975...
2,3,0.4619585595278849 0.4888376480452782 0.363160...
3,4,0.5224057512193457 0.4691239326199215 0.492206...
4,5,0.5103278665728133 0.3282831278710206 0.369371...
...,...,...
815626,815627,0.5848221304665535 0.4626534812049008 0.529942...
815627,815628,0.5489915824276587 0.447370147222526 0.4873355...
815628,815629,nan nan nan nan
815629,815630,0.4902100131036843 0.4166772079844157 0.450892...


#### Option 5: Tag Sequences + Artists + Albums + Normalized VADs + StSc

In [9]:
all_vads = pd.read_csv(DATA_DIR + 'all_tracks_vads.zip', sep='\t')

norm_vads = all_vads[NORM_COLS].astype(str).apply(lambda x: ' '.join(x), axis=1)

track_all_vads = track_data[['track_id', 'tags', 'artist_id', 'album_id']]
track_all_vads = track_all_no_vads.rename(lambda x: x + ':token', axis=1).rename(columns={'tags:token': RECBOLE_TAG_COLS[1]})
track_all_vads['vadst:float_seq'] = norm_vads

track_all_vads.to_csv(f'{DATA_DIR}/{LASTFM_RB}/{LASTFM_RB}.item', index=False, sep='\t')

track_all_vads

Unnamed: 0,track_id:token:token,tags:token_seq:token,artist_id:token:token,album_id:token:token,vadst:float_seq
0,1,folk folk folk folk folk folk folk folk countr...,136900,227929.0,nan nan nan nan
1,2,christmas christmas christmas christmas christ...,82136,,0.5715571724783821 0.4159363290920476 0.474975...
2,3,wrong wrong wrong wrong wrong wrong wrong wrong,127107,,0.4619585595278849 0.4888376480452782 0.363160...
3,4,melodicdeathmetal melodicdeathmetal melodicdea...,57668,144260.0,0.5224057512193457 0.4691239326199215 0.492206...
4,5,slowcore slowcore slowcore slowcore slowcore s...,117533,99037.0,0.5103278665728133 0.3282831278710206 0.369371...
...,...,...,...,...,...
815626,815627,frenchpop frenchpop frenchpop frenchpop french...,71909,144316.0,0.5848221304665535 0.4626534812049008 0.529942...
815627,815628,hiphop hiphop hiphop hiphop hiphop hiphop hiph...,125328,,0.5489915824276587 0.447370147222526 0.4873355...
815628,815629,hiphop hiphop hiphop hiphop hiphop hiphop hiph...,142059,,nan nan nan nan
815629,815630,indie indie indie indie indie indie indie indie,99271,92757.0,0.4902100131036843 0.4166772079844157 0.450892...
