# Dataset Generation

In [1]:
from db_utils import *

DATA_PATH = '../data/recsys_data/'

## Generation of User-Track-Rating Dataset

### 1. User's Top Tracks

In [2]:
top_tracks = all_top_tracks()
top_tracks

Unnamed: 0,user_id,track_id,rank
0,1,35151,1
1,1,560787,2
2,1,231167,3
3,1,302447,4
4,1,156880,5
...,...,...,...
1054551,52829,616532,16
1054552,52829,298170,17
1054553,52829,149950,18
1054554,52829,162448,19


In [None]:
PATH = f"{DATA_PATH}/top_tracks_ratings.zip"

# Convert rank into a rating from 5 to 3.1 / 5
import numpy as np

ratings = np.linspace(5, 3.1, 20, dtype=np.float32)
print(f"Ratings by rank: {ratings}")

top_track_ratings = top_tracks[['user_id', 'track_id']]
top_track_ratings['rating'] = top_tracks['rank'].apply(lambda x: ratings[x-1])

top_track_ratings

Ratings by rank: [5.  4.9 4.8 4.7 4.6 4.5 4.4 4.3 4.2 4.1 4.  3.9 3.8 3.7 3.6 3.5 3.4 3.3
 3.2 3.1]


Unnamed: 0,user_id,track_id,rating
0,1,35151,5.0
1,1,560787,4.9
2,1,231167,4.8
3,1,302447,4.7
4,1,156880,4.6
...,...,...,...
1054551,52829,616532,3.5
1054552,52829,298170,3.4
1054553,52829,149950,3.3
1054554,52829,162448,3.2


In [None]:
top_track_ratings.to_csv(PATH, index=False, sep='\t')

### 2. User's Recent Tracks

In [None]:
recent_tracks = all_recent_tracks()
recent_tracks

Unnamed: 0,user_id,track_id,listen_at
0,2,785090,2022-11-12 09:22:07
1,2,273675,2022-11-12 09:27:09
2,2,462697,2022-11-12 09:31:14
3,2,9158,2022-11-12 09:36:13
4,2,180323,2022-11-12 09:39:16
...,...,...,...
1019975,52829,554368,2022-11-15 04:27:34
1019976,52829,745290,2022-11-15 04:32:42
1019977,52829,539622,2022-11-15 04:37:47
1019978,52829,677435,2022-11-15 04:44:16


In [None]:
user_track_counts = recent_tracks.drop(columns='listen_at').groupby('user_id').value_counts()
avg_repeat = user_track_counts.mean()
max_repeat = user_track_counts.max()

print(f"Average repeat listens: {avg_repeat.round(3)}")
print(f"Most repeat listens: {max_repeat}")

Average repeat listens: 1.183
Most repeat listens: 20


In [None]:
import numpy as np
from tqdm import tqdm

PATH = f"{DATA_PATH}/recent_tracks_ratings.zip"

# Convert heard songs into a rating from 1 to 5 based on listen count
recent_track_ratings = recent_tracks[['user_id', 'track_id']]

# Listen count intervals to assign ratings
limits = list(np.linspace(max_repeat // 2, 1, 5, dtype=int)) + [0]
limits[1], limits[0] = limits[0], max_repeat
ratings = [5, 4, 3, 2, 1]
print(f"Listen count intervals: {limits}")

lim_ratings = list(enumerate(zip(limits[:-1], ratings)))

for user in tqdm(recent_tracks['user_id'].unique(), desc="Assigning ratings per user"):
    user_listen_count = user_track_counts[user]
    user_filter = recent_track_ratings['user_id'].values == user

    for i, (lim, r) in lim_ratings:
        track_ids = user_listen_count[(user_listen_count.values <= lim) & (user_listen_count.values > limits[i+1])].index
        track_intval_filter = user_filter & np.isin(recent_track_ratings['track_id'], track_ids)
        recent_track_ratings.loc[track_intval_filter, 'rating'] = r

recent_track_ratings['timestamp'] = recent_tracks['listen_at']

# Keep only one listen for repeats, the latest one
recent_track_ratings.drop_duplicates(subset=['user_id', 'track_id'], inplace=True, keep='last')

recent_track_ratings

Listen count intervals: [20, 10, 5, 3, 1, 0]


Assigning ratings per user: 100%|██████████| 51129/51129 [11:24<00:00, 74.66it/s] 


Unnamed: 0,user_id,track_id,rating,timestamp
0,2,785090,1.0,2022-11-12 09:22:07
1,2,273675,1.0,2022-11-12 09:27:09
2,2,462697,1.0,2022-11-12 09:31:14
3,2,9158,1.0,2022-11-12 09:36:13
4,2,180323,1.0,2022-11-12 09:39:16
...,...,...,...,...
1019975,52829,554368,1.0,2022-11-15 04:27:34
1019976,52829,745290,1.0,2022-11-15 04:32:42
1019977,52829,539622,1.0,2022-11-15 04:37:47
1019978,52829,677435,1.0,2022-11-15 04:44:16


In [None]:
recent_track_ratings.to_csv(PATH, index=False, sep='\t')

### 3. User's Loved Tracks

In [None]:
loved_tracks = all_loved_tracks()
loved_tracks

Unnamed: 0,user_id,track_id,love_at
0,3,762797,2022-06-27 11:29:18
1,3,169241,2022-06-27 11:29:20
2,3,237435,2022-06-27 11:29:29
3,3,708715,2022-06-27 11:29:31
4,3,695146,2022-07-05 09:50:14
...,...,...,...
377621,52829,186736,2022-11-01 14:14:07
377622,52829,30097,2022-11-01 14:14:14
377623,52829,42110,2022-11-01 14:15:02
377624,52829,59138,2022-11-01 14:19:24


In [None]:
PATH = f"{DATA_PATH}/loved_tracks_ratings.zip"

# Convert heard songs into a rating of 5 / 5
loved_track_ratings = loved_tracks[['user_id', 'track_id']]
loved_track_ratings['rating'] = 5
loved_track_ratings['timestamp'] = loved_tracks['love_at']

loved_track_ratings

Unnamed: 0,user_id,track_id,rating,timestamp
0,3,762797,5,2022-06-27 11:29:18
1,3,169241,5,2022-06-27 11:29:20
2,3,237435,5,2022-06-27 11:29:29
3,3,708715,5,2022-06-27 11:29:31
4,3,695146,5,2022-07-05 09:50:14
...,...,...,...,...
377621,52829,186736,5,2022-11-01 14:14:07
377622,52829,30097,5,2022-11-01 14:14:14
377623,52829,42110,5,2022-11-01 14:15:02
377624,52829,59138,5,2022-11-01 14:19:24


In [None]:
loved_track_ratings.to_csv(PATH, index=False, sep='\t')

### 4. Merge All Tracks

#### Discarding tracks listened only once

Merging all three datasets by concatenation. Duplicates are dropped by keeping the highest rating.

In [None]:
PATH = f"{DATA_PATH}/all_tracks_ratings.zip"
MIN_PLAYCOUNT = 2 # If only 1, useless in colab. filtering recommendation

final_ratings_min = merge_tracks([top_track_ratings, recent_track_ratings, loved_track_ratings], MIN_PLAYCOUNT)
final_ratings_min

Unnamed: 0,user_id,track_id,rating,timestamp
0,1,35151,5.0,NaT
1,1,82497,3.3,NaT
2,1,97967,4.2,NaT
3,1,105492,3.5,NaT
4,1,124021,3.9,NaT
...,...,...,...,...
1655429,52829,685715,5.0,2022-11-01 14:19:38
1655430,52829,687347,5.0,2022-11-01 14:10:16
1655431,52829,688480,4.9,NaT
1655432,52829,745290,1.0,2022-11-15 04:32:42


In [None]:
final_ratings_min.to_csv(PATH, index=False, sep='\t')

#### Including all tracks

In [None]:
PATH = f"{DATA_PATH}/all_tracks_ratings_full.zip"

final_ratings_full = merge_tracks([top_track_ratings, recent_track_ratings, loved_track_ratings])
final_ratings_full

Unnamed: 0,user_id,track_id,rating,timestamp
0,1,35151,5.0,NaT
1,1,82497,3.3,NaT
2,1,97967,4.2,NaT
3,1,105492,3.5,NaT
4,1,124021,3.9,NaT
...,...,...,...,...
2200756,52829,711172,4.2,NaT
2200757,52829,745290,1.0,2022-11-15 04:32:42
2200758,52829,776707,1.0,2022-11-14 17:46:56
2200759,52829,781335,1.0,2022-11-14 17:37:05


In [None]:
final_ratings_full.to_csv(PATH, index=False, sep='\t')

## Generation of Item-VAD Dataset

In [None]:
track_vad = get_item_vad(TRACK, 'Track')
track_vad

Unnamed: 0,Track,Name,V,A,D,StSc
0,1,twentyseven,,,,
1,2,Happy Xmas (War Is Over) (2003 Mix),0.667880,0.440461,0.568477,0.720910
2,3,Don’t Shut Me Down,0.587845,0.490143,0.504333,0.555556
3,4,Tirpitz,,,,
4,5,You Have Cum in Your Hair and Your Dick is Han...,0.623167,0.380725,0.507896,0.654302
...,...,...,...,...,...,...
815626,815627,D-1,0.677566,0.472298,0.600010,0.875615
815627,815628,Now or Never,,,,
815628,815629,Los Zurdos Mueren Antes,,,,
815629,815630,Oh Mama,0.608476,0.440966,0.554662,0.634783


In [None]:
t_artist_vad = get_track_artist_vads()
t_artist_vad

Unnamed: 0,Track,V,A,D,StSc
0,2,0.662364,0.453781,0.582089,0.769586
1,4,0.561424,0.483773,0.533182,0.378588
2,8,0.613225,0.473498,0.544577,0.590839
3,9,0.631987,0.476708,0.578362,0.696299
4,12,0.623932,0.446357,0.546906,0.670236
...,...,...,...,...,...
815626,815592,0.655076,0.472305,0.558858,0.744371
815627,815597,0.651401,0.461883,0.575568,0.751494
815628,815606,0.592959,0.474800,0.563727,0.583555
815629,815615,0.636870,0.457241,0.574222,0.684615


In [None]:
t_album_vad = get_track_album_vads()
t_album_vad

Unnamed: 0,Track,V,A,D,StSc
0,12,0.632310,0.444328,0.556632,0.718256
1,19,,,,
2,25,0.584914,0.483083,0.550804,0.454361
3,30,,,,
4,44,,,,
...,...,...,...,...,...
604199,815586,0.628460,0.463568,0.570361,0.705924
604200,815588,,,,
604201,815589,,,,
604202,815601,,,,


In [None]:
print(f"Tracks with VAD: {track_vad['V'].notna().sum()}")
print(f"Tracks with artist's VAD: {t_artist_vad['V'].notna().sum()}")
print(f"Tracks with album's VAD: {t_album_vad['V'].notna().sum()}")

Tracks with VAD: 361091
Tracks with artist's VAD: 759499
Tracks with album's VAD: 318036


Not worth it to use albums to fill this time, so use only artists

#### Use Artist VADs to fill NaN values for Track

In [None]:
PATH = f"{DATA_PATH}/all_tracks_vads.zip"

track_isna = track_vad['V'].isna()
nan_tracks = track_vad[track_isna]['Track'].values

merged_tracks = track_vad.copy()
replace_cols = ['V', 'A', 'D', 'StSc']
merged_tracks.loc[merged_tracks['Track'].isin(
    nan_tracks), replace_cols] = t_artist_vad.loc[t_artist_vad['Track'].isin(nan_tracks), replace_cols]

print(f"Final tracks with VAD: {merged_tracks['V'].notna().sum()}")

merged_tracks


Final tracks with VAD: 583710


Unnamed: 0,Track,Name,V,A,D,StSc
0,1,twentyseven,,,,
1,2,Happy Xmas (War Is Over) (2003 Mix),0.667880,0.440461,0.568477,0.720910
2,3,Don’t Shut Me Down,0.587845,0.490143,0.504333,0.555556
3,4,Tirpitz,0.631987,0.476708,0.578362,0.696299
4,5,You Have Cum in Your Hair and Your Dick is Han...,0.623167,0.380725,0.507896,0.654302
...,...,...,...,...,...,...
815626,815627,D-1,0.677566,0.472298,0.600010,0.875615
815627,815628,Now or Never,0.651401,0.461883,0.575568,0.751494
815628,815629,Los Zurdos Mueren Antes,,,,
815629,815630,Oh Mama,0.608476,0.440966,0.554662,0.634783


#### Normalize with Tag VADs

In [None]:
tags_vad = get_item_vad(TAG, 'Tag')

# Get normalized columns
merged_tracks['V_Norm'] = normalize(merged_tracks['V'], kind='minmax', usecol=tags_vad['V'])
merged_tracks['A_Norm'] = normalize(merged_tracks['A'], kind='minmax', usecol=tags_vad['A'])
merged_tracks['D_Norm'] = normalize(merged_tracks['D'], kind='minmax', usecol=tags_vad['D'])
merged_tracks['StSc_Norm'] = normalize(merged_tracks['StSc'], kind='minmax', usecol=tags_vad['StSc'])

merged_tracks

Unnamed: 0,Track,Name,V,A,D,StSc,V_Norm,A_Norm,D_Norm,StSc_Norm
0,1,twentyseven,,,,,,,,
1,2,Happy Xmas (War Is Over) (2003 Mix),0.667880,0.440461,0.568477,0.720910,0.571557,0.415936,0.474975,0.860455
2,3,Don’t Shut Me Down,0.587845,0.490143,0.504333,0.555556,0.461959,0.488838,0.363161,0.777778
3,4,Tirpitz,0.631987,0.476708,0.578362,0.696299,0.522406,0.469124,0.492206,0.848150
4,5,You Have Cum in Your Hair and Your Dick is Han...,0.623167,0.380725,0.507896,0.654302,0.510328,0.328283,0.369372,0.827151
...,...,...,...,...,...,...,...,...,...,...
815626,815627,D-1,0.677566,0.472298,0.600010,0.875615,0.584822,0.462653,0.529943,0.937808
815627,815628,Now or Never,0.651401,0.461883,0.575568,0.751494,0.548992,0.447370,0.487336,0.875747
815628,815629,Los Zurdos Mueren Antes,,,,,,,,
815629,815630,Oh Mama,0.608476,0.440966,0.554662,0.634783,0.490210,0.416677,0.450893,0.817391


In [None]:
merged_tracks.to_csv(PATH, index=False, sep='\t')

## Generation of Tag Sequences

#### Option 1: Using Tracks own Tags (More specific, less tracks, more sparse)

In [None]:
raw_track_tags = get_track_own_tags()
print(f"Unique tracks: {len(raw_track_tags.track_id.unique())}")
raw_track_tags.head(5)

Unique tracks: 363140


Unnamed: 0,track_id,tag,rank
0,2,christmas,1
1,2,xmas,2
2,2,john lennon,3
3,2,tinsel,4
4,2,x-mas,5


#### Option 2: Using Track Artist's Tags (Less specific, more tracks, less sparse)

In [None]:
raw_track_tags = get_track_artist_tags()
print(f"Unique tracks: {len(raw_track_tags.track_id.unique())}")
raw_track_tags.head(5)

Unique tracks: 759923


Unnamed: 0,track_id,tag,rank
0,1,folk,1
1,1,country,2
2,1,psychedelic folk,3
3,1,rock,4
4,1,american,5


#### Option 3: Using Track Album's Tags (Middle ground, though less tracks)

In [None]:
raw_track_tags = get_track_album_tags()
print(f"Unique tracks: {len(raw_track_tags.track_id.unique())}")
raw_track_tags.head(5)

Unique tracks: 319156


Unnamed: 0,track_id,tag,rank
0,4,melodic death metal,1
1,4,melodic deathcore,2
2,4,best albums 2020,3
3,5,albini,1
4,5,pop topp 30 1996,2


#### Option 4: Merging Track and Track Artist/Album's Tags

In [16]:
def merge_tracks_tags():
    track_own_tags = get_track_own_tags()
    track_artist_tags = get_track_artist_tags()
    track_album_tags = get_track_album_tags()

    # Remove irrelevant tags
    track_own_tags = track_own_tags[['track' not in t for t in track_own_tags.tag]]
    track_album_tags = track_album_tags[['album' not in t for t in track_album_tags.tag]]
    track_artist_tags = track_artist_tags[['artist' not in t for t in track_artist_tags.tag]]

    # Merge tags not in set by priority -> track > album > artist
    new_tracks_album = track_album_tags.loc[~track_album_tags.track_id.isin(track_own_tags.track_id)]
    raw_track_tags = pd.concat([track_own_tags, new_tracks_album])

    new_tracks_artist = track_artist_tags.loc[~track_artist_tags.track_id.isin(raw_track_tags.track_id)]
    raw_track_tags = pd.concat([raw_track_tags, new_tracks_artist]).reset_index(drop=True)

    return raw_track_tags

raw_track_tags = merge_tracks_tags()

print(f"Unique tracks: {len(raw_track_tags.track_id.unique())}")
raw_track_tags

Unique tracks: 761577


Unnamed: 0,track_id,tag,rank
0,2,christmas,1
1,2,xmas,2
2,2,john lennon,3
3,2,tinsel,4
4,2,x-mas,5
...,...,...,...
5564379,815629,spanish rap,6
5564380,815629,nach,7
5564381,815629,hiphop,8
5564382,815629,spanish hip hop,9


#### Pre-processing

In [17]:
REPETITION_BY_RANK = [8, 7, 5, 4, 3, 1, 1, 1, 1, 1]

def tags_repeated_by_rank(track_tags, repetition_by_rank: list = REPETITION_BY_RANK):
    # Strip spaces and hyphens and multiply words depending on rank
    track_tags = track_tags.copy()
    track_tags['tag'] = track_tags['tag'].apply(lambda x: x.replace(' ', '').replace('-', ''))

    # Remove tags with just one track
    track_tags = track_tags[track_tags.groupby('tag').tag.transform('count') > 1]  # NOTE: Keep for model testing

    # Repeat based on rank
    track_tags = track_tags.loc[track_tags.index.repeat(track_tags['rank'].apply(lambda x: repetition_by_rank[x-1]))]

    # Group by track id and aggregate into lists
    grouped_tags = track_tags.groupby(track_tags['track_id'])['tag'].apply(list).reset_index(name='tags')

    # Create soup of words for each track
    grouped_tags['tags'] = grouped_tags['tags'].apply(lambda x: ' '.join(x))

    # Set track_id as index
    grouped_tags = grouped_tags.reset_index(drop=True)

    return grouped_tags

grouped_tags = tags_repeated_by_rank(raw_track_tags)

grouped_tags

Unnamed: 0,track_id,tags
0,1,folk folk folk folk folk folk folk folk countr...
1,2,christmas christmas christmas christmas christ...
2,3,wrong wrong wrong wrong wrong wrong wrong wrong
3,4,melodicdeathmetal melodicdeathmetal melodicdea...
4,5,slowcore slowcore slowcore slowcore slowcore s...
...,...,...
757597,815627,frenchpop frenchpop frenchpop frenchpop french...
757598,815628,hiphop hiphop hiphop hiphop hiphop hiphop hiph...
757599,815629,hiphop hiphop hiphop hiphop hiphop hiphop hiph...
757600,815630,indie indie indie indie indie indie indie indie


In [18]:
def merge_tracks_data(df: pd.DataFrame):
    track_table = get_table_df('track')
    track_table.drop(columns='vad', inplace=True)  # Already generated previously in all_tracks_vads.zip
    track_table.rename(columns={'id': 'track_id'}, inplace=True)
    track_table['album_id'] = track_table['album_id']
    
    return df.merge(track_table, how='right', on='track_id')

track_data = merge_tracks_data(grouped_tags)

track_data

Unnamed: 0,track_id,tags,name,artist_id,album_id
0,1,folk folk folk folk folk folk folk folk countr...,twentyseven,136900,227929.0
1,2,christmas christmas christmas christmas christ...,Happy Xmas (War Is Over) (2003 Mix),82136,
2,3,wrong wrong wrong wrong wrong wrong wrong wrong,Don’t Shut Me Down,127107,
3,4,melodicdeathmetal melodicdeathmetal melodicdea...,Tirpitz,57668,144260.0
4,5,slowcore slowcore slowcore slowcore slowcore s...,You Have Cum in Your Hair and Your Dick is Han...,117533,99037.0
...,...,...,...,...,...
815626,815627,frenchpop frenchpop frenchpop frenchpop french...,D-1,71909,144316.0
815627,815628,hiphop hiphop hiphop hiphop hiphop hiphop hiph...,Now or Never,125328,
815628,815629,hiphop hiphop hiphop hiphop hiphop hiphop hiph...,Los Zurdos Mueren Antes,142059,
815629,815630,indie indie indie indie indie indie indie indie,Oh Mama,99271,92757.0


In [19]:
PATH = f"{DATA_PATH}/track_full_data.zip"

track_data.to_csv(PATH, index=False, sep='\t')