# __LastFM Dataset__

## Prepare Dataset

In [2]:
from db_utils import *


## Example Queries

In [2]:
""" Tracks with vaporwave tag

select * from tracktoptags tt join track on track.id = tt.track_id join tag on tag.id = tt.tag_id where tag.name like 'vaporwave' order by track.id;
"""
with session.begin() as s:
    display(s.query(TRACK_TAGS, TRACK.c.name)
            .join(TAG, TAG.c.id == TRACK_TAGS.c.tag_id)
            .join(TRACK, TRACK.c.id == TRACK_TAGS.c.track_id)
            .filter(TAG.c.name.like('vaporwave'))
            .order_by(TRACK.c.id).all()[:10])


[(402, 191613, 2, 'METAMATERIAL'),
 (2538, 191613, 2, 'Come Back Down'),
 (4120, 191613, 3, '発見'),
 (4539, 191613, 3, '上昇'),
 (6302, 191613, 2, 'あなたと一人で数分'),
 (9765, 191613, 3, 'Warmpop'),
 (10821, 191613, 2, 'Gonna Dream 2nite.'),
 (10829, 191613, 7, 'Ecco Chamber'),
 (11415, 191613, 3, 'Better'),
 (11523, 191613, 8, '着物')]

In [4]:
""" Tags with Valence > 0.7

select * from tag where tag.vad[1] > 0.7 order by tag.vad[1];
"""
with session.begin() as s:
    display(s.query(TAG)
            .filter(TAG.c.vad[1] > 0.7)
            .order_by(TAG.c.vad[1]).all()[:10])


[(44315, 'restored', [0.7000000000000001, 0.43836363636363634, 0.5716363636363636, 1.0]),
 (73764, 'born in seoul', [0.7000000000000001, 0.4136842105263158, 0.5490526315789473, 1.0]),
 (100234, 'flex dance music', [0.7000000000000001, 0.5177857142857143, 0.5286428571428571, 0.8333333333333334]),
 (101320, 'found on erins list', [0.7000000000000001, 0.5816666666666667, 0.5708333333333333, 0.6666666666666666]),
 (95169, 'erin', [0.7000000000000001, 0.5816666666666667, 0.5708333333333333, 0.6666666666666666]),
 (112347, 'hinário', [0.700001923076923, 0.458, 0.6197076923076923, 0.8305084745762712]),
 (112306, 'hinario', [0.700001923076923, 0.458, 0.6197076923076923, 0.8305084745762712]),
 (181233, 'teteu', [0.700001923076923, 0.458, 0.6197076923076923, 0.8305084745762712]),
 (170081, 'skakanje', [0.7000294117647059, 0.40458823529411764, 0.4942058823529412, 0.9333333333333333]),
 (145510, 'north', [0.7000307692307692, 0.46478461538461535, 0.6356461538461539, 0.9016393442622951])]

In [5]:
""" Tag frequency in artists

select t.name, count(t.name) as freq from tag t join artisttoptags att on att.tag_id = t.id join artist a on a.id = att.artist_id group by t.name order by freq desc;
"""
from sqlalchemy import func, desc

with session.begin() as s:
    display(s.query(TAG.c.name, func.count())
            .join(ARTIST_TAGS, ARTIST_TAGS.c.tag_id == TAG.c.id)
            .join(ARTIST, ARTIST.c.id == ARTIST_TAGS.c.artist_id)
            .group_by(TAG.c.name)
            .order_by(desc(func.count())).all()[:10])


[('seen live', 32838),
 ('electronic', 23526),
 ('rock', 22779),
 ('pop', 20615),
 ('indie', 16805),
 ('female vocalists', 13807),
 ('alternative', 12574),
 ('hip-hop', 10549),
 ('american', 10095),
 ('experimental', 9415)]

In [6]:
""" Tag frequency in albums

select t.name, count(t.name) as freq from tag t join tracktoptags ttt on ttt.tag_id = t.id join track tr on tr.id = ttt.track_id group by t.name order by freq desc;
"""
from sqlalchemy import func, desc

with session.begin() as s:
    display(s.query(TAG.c.name, func.count())
            .join(TRACK_TAGS, TRACK_TAGS.c.tag_id == TAG.c.id)
            .join(TRACK, TRACK.c.id == TRACK_TAGS.c.track_id)
            .group_by(TAG.c.name)
            .order_by(desc(func.count())).all()[:10])


[('rock', 85199),
 ('pop', 49831),
 ('alternative', 46980),
 ('indie', 38600),
 ('electronic', 36951),
 ('metal', 35737),
 ('female vocalists', 29737),
 ('alternative rock', 28935),
 ('indie rock', 19549),
 ('classic rock', 18676)]

In [4]:
""" Top tracks from USER_TOP_TRACKS + USER_RECENT_TRACKS + USER_LOVED_TRACKS

select t.name, a.name, count(t.name) from track t join usertoptracks ut on t.id = ut.track_id join artist a on t.artist_id = a.id group by t.name, a.name order by count desc;
"""
from sqlalchemy import func, desc
from sqlalchemy.orm import Query
from collections import Counter

with session.begin() as s:
    top = Query([TRACK.c.name, ARTIST.c.name, func.count()], session=s).join(USER_TOP_TRACKS, USER_TOP_TRACKS.c.track_id == TRACK.c.id).join(
        ARTIST, ARTIST.c.id == TRACK.c.artist_id).group_by(TRACK.c.name, ARTIST.c.name).order_by(desc(func.count())).all()

    recent = Query([TRACK.c.name, ARTIST.c.name, func.count()], session=s).join(USER_RECENT_TRACKS, USER_RECENT_TRACKS.c.track_id == TRACK.c.id).join(
        ARTIST, ARTIST.c.id == TRACK.c.artist_id).group_by(TRACK.c.name, ARTIST.c.name).order_by(desc(func.count())).all()
    
    loved = Query([TRACK.c.name, ARTIST.c.name, func.count()], session=s).join(USER_LOVED_TRACKS, USER_LOVED_TRACKS.c.track_id == TRACK.c.id).join(
        ARTIST, ARTIST.c.id == TRACK.c.artist_id).group_by(TRACK.c.name, ARTIST.c.name).order_by(desc(func.count())).all()

def merge_track_artist(tup):
    return (f"{tup[0]} - {tup[1]}", tup[2])

top = dict(map(merge_track_artist, top))
recent =  dict(map(merge_track_artist, recent))
loved = dict(map(merge_track_artist, loved))

total = Counter()
for d in (top, recent, loved):
    total.update(d)

total.most_common(20)

[('Anti-Hero - Taylor Swift', 1075),
 ('As It Was - Harry Styles', 791),
 ('Glimpse of Us - Joji', 765),
 ('Karma - Taylor Swift', 654),
 ("you're On Your OWn, KId - Taylor Swift", 613),
 ('lAveNDER hAZe - Taylor Swift', 610),
 ('Maroon - Taylor Swift', 609),
 ('Bejeweled - Taylor Swift', 605),
 ('sNOw on the beach (feat. lana del rey) - Taylor Swift', 587),
 ('No Surprises - Radiohead', 583),
 ('Midnight Rain - Taylor Swift', 546),
 ('Bad Habit - Steve Lacy', 545),
 ('505 - Arctic Monkeys', 521),
 ("Would've, Could've, Should've - Taylor Swift", 518),
 ('Die For You - Joji', 510),
 ('Everlong - Foo Fighters', 498),
 ('Heat Waves - Glass Animals', 494),
 ('Blinding Lights - The Weeknd', 487),
 ('Space Song - Beach House', 483),
 ('good 4 u - Olivia Rodrigo', 475)]

In [8]:
print(f'Total user-track interactions: {sum(total.values())}')

Total user-track interactions: 2452162


## Building Model

### Using SKLearn's CountVectorizer & Cosine Similarity

#### Option 1: Using Tracks own Tags (More specific, less tracks, more sparse)

In [3]:
raw_track_tags = get_track_own_tags()
print(f"Unique tracks: {len(raw_track_tags.track_id.unique())}")
raw_track_tags.head(5)

Unique tracks: 363140


Unnamed: 0,track_id,tag,rank
0,2,christmas,1
1,2,xmas,2
2,2,john lennon,3
3,2,tinsel,4
4,2,x-mas,5


#### Option 2: Using Track Artist's Tags (Less specific, more tracks, less sparse)

In [4]:
raw_track_tags = get_track_artist_tags()
print(f"Unique tracks: {len(raw_track_tags.track_id.unique())}")
raw_track_tags.head(5)

Unique tracks: 759923


Unnamed: 0,track_id,tag,rank
0,1,folk,1
1,1,country,2
2,1,psychedelic folk,3
3,1,rock,4
4,1,american,5


#### Option 3: Using Track Album's Tags (Middle ground, though less tracks)

In [5]:
raw_track_tags = get_track_album_tags()
print(f"Unique tracks: {len(raw_track_tags.track_id.unique())}")
raw_track_tags.head(5)

Unique tracks: 319156


Unnamed: 0,track_id,tag,rank
0,4,melodic death metal,1
1,4,melodic deathcore,2
2,4,best albums 2020,3
3,5,albini,1
4,5,pop topp 30 1996,2


#### Option 4: Merging Track and Track Artist/Album's Tags 

In [6]:
track_own_tags = get_track_own_tags()
track_artist_tags = get_track_artist_tags()
track_album_tags = get_track_album_tags()

In [7]:
# Remove irrelevant tags
track_own_tags = track_own_tags[['track' not in t for t in track_own_tags.tag]]
track_album_tags = track_album_tags[['album' not in t for t in track_album_tags.tag]]
track_artist_tags = track_artist_tags[['artist' not in t for t in track_artist_tags.tag]]

# Remove tags with just one track
track_own_tags = track_own_tags[track_own_tags.groupby('tag').tag.transform('count') > 1]
track_album_tags = track_album_tags[track_album_tags.groupby('tag').tag.transform('count') > 1]
track_artist_tags = track_artist_tags[track_artist_tags.groupby('tag').tag.transform('count') > 1]

# Merge tags not in set by priority -> track > album > artist
new_tracks_album = track_album_tags.loc[~track_album_tags.track_id.isin(track_own_tags.track_id)]
raw_track_tags = pd.concat([track_own_tags, new_tracks_album])

new_tracks_artist = track_artist_tags.loc[~track_artist_tags.track_id.isin(raw_track_tags.track_id)]
raw_track_tags = pd.concat([raw_track_tags, new_tracks_artist]).reset_index(drop=True)

print(f"Unique tracks: {len(raw_track_tags.track_id.unique())}")
raw_track_tags

Unique tracks: 761171


Unnamed: 0,track_id,tag,rank
0,2,christmas,1
1,2,xmas,2
2,2,john lennon,3
3,2,tinsel,4
4,2,x-mas,5
...,...,...,...
5495780,815629,spanish rap,6
5495781,815629,nach,7
5495782,815629,hiphop,8
5495783,815629,spanish hip hop,9


#### Pre-processing

In [8]:
# Strip spaces and multiply words depending on rank
track_tags = raw_track_tags.copy()
track_tags['tag'] = track_tags['tag'].apply(lambda x: x.replace(' ', ''))
track_tags = track_tags.loc[track_tags.index.repeat(track_tags['rank'].apply(lambda x: 11 - x))]

# Group by track id and aggregate into lists
grouped_tags = track_tags.groupby(track_tags['track_id'])['tag'].apply(list).reset_index(name='tags')

# Create soup of words for each track
grouped_tags['tags'] = grouped_tags['tags'].apply(lambda x: ' '.join(x))

grouped_tags

Unnamed: 0,track_id,tags
0,1,folk folk folk folk folk folk folk folk folk f...
1,2,christmas christmas christmas christmas christ...
2,3,wrong wrong wrong wrong wrong wrong wrong wron...
3,4,melodicdeathmetal melodicdeathmetal melodicdea...
4,5,slowcore slowcore slowcore slowcore slowcore s...
...,...,...
761166,815627,frenchpop frenchpop frenchpop frenchpop french...
761167,815628,hip-hop hip-hop hip-hop hip-hop hip-hop hip-ho...
761168,815629,hip-hop hip-hop hip-hop hip-hop hip-hop hip-ho...
761169,815630,indie indie indie indie indie indie indie indi...


#### Computing Vectors and Cosine Similarities

In [12]:
# Ref: https://www.datacamp.com/tutorial/recommender-systems-python

# Import CountVectorizer and create the count matrix
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

count = CountVectorizer()
count_matrix = count.fit_transform(grouped_tags['tags'])


In [13]:
track_count, vocab_size = count_matrix.shape

print(f"Vocabulary of size {vocab_size} for {track_count} tracks")

Vocabulary of size 72706 for 761171 tracks


In [15]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

def get_recommendations(username, tracks_getter, cutoff=10, user_track_cutoff=None):
    # Get recent tracks of a user
    r_tracks = tracks_getter(username)[:user_track_cutoff]

    # Use only tracks with tags
    usable_tracks = r_tracks[r_tracks.isin(grouped_tags['track_id'])]
    print(f"{r_tracks.size} tracks, from which {usable_tracks.size} have tags")

    # Filter with grouped_tags
    indices = pd.Series(grouped_tags.index, index=grouped_tags['track_id'])
    track_idx = indices.loc[usable_tracks]
    recommend_tags = grouped_tags.loc[track_idx]

    # Vectorize user tracks
    recommend_matrix = count.transform(recommend_tags['tags'])
    print(f"Recommendation matrix shape: {recommend_matrix.shape}")

    # Compute the Cosine Similarity matrix based on the count_matrix
    similarities = cosine_similarity(count_matrix, recommend_matrix)
    
    # Delete songs used for recommendation (highest similarity)
    similarities = np.delete(similarities, track_idx, axis=0)

    # Average similarity for each song
    agg_similarities = similarities.mean(axis=1)

    # Pairwise scores to each track, ordered by total aggregated similarity
    tup_scores = list(enumerate(agg_similarities))
    tup_scores.sort(key=lambda x: -x[1])
    rec_idx, scores = zip(*tup_scores[:cutoff])
    
    recomms = grouped_tags.loc[list(rec_idx)]
    recomms['tags'] = recomms['tags'].apply(lambda x: ' '.join(pd.Series(x.split(' ')).unique()))
    recomms['track_name'] = recomms['track_id'].apply(get_track_name)
    recomms['scores'] = scores
    
    return recomms

In [30]:
get_recommendations("Abbygoulding", user_recent_tracks, user_track_cutoff=4, cutoff=10)


4 tracks, from which 4 have tags
Recommendation matrix shape: (4, 72706)


Unnamed: 0,track_id,tags,track_name,scores
618542,662750,drumandbass jungle dnb drumnbass electronic se...,Show Me (feat. Sneaky Sound System) - Cause & ...,0.614125
672011,720038,trap brazil brazilian crushedtrap hip-hop rap ...,Mentiroza - Sidoka,0.596691
307361,329346,chillout electronic trap unitedstates futureba...,Black Hole Sun - Prismo,0.586888
618919,663156,hip-hop rap seenlive hiphop political german c...,Wie viel ist dein Outfit wert - Kummer,0.583466
22058,23619,k-pop electropop pop dance-pop electrohouse sy...,LOVE DIVE - IVE,0.576433
234924,251811,krautrock psychedelic progressiverock psychede...,Amboss - Ash Ra Tempel,0.576429
298239,319579,progressivehouse upliftinghouse,Bromance - Avicii's Arena Mix - Tim Berg,0.576022
498701,534387,poland,mē - Enkei,0.576022
107193,114985,kpop pop k-pop 2020 korean trap hiphop hip-hop...,How You Like That - BLACKPINK,0.575022
42342,45348,k-pop electronic electropop pop dance-pop hous...,PLAY - CHUNG HA,0.574193


recommend wtih tags for those tracks not in colab filtering

### Using Recbole