# __LastFM Dataset__

## Prepare Dataset

In [1]:
from db_utils import *


## Example Queries

In [2]:
""" Tracks with vaporwave tag

select * from tracktoptags tt join track on track.id = tt.track_id join tag on tag.id = tt.tag_id where tag.name like 'vaporwave' order by track.id;
"""
# Preview of tracks tagged 'vaporwave', lowest track id first.
# Only 10 rows are displayed, so the limit is applied in the query itself
# instead of fetching every match and slicing the resulting list.
with session.begin() as s:
    display(s.query(TRACK_TAGS, TRACK.c.name)
            .join(TAG, TAG.c.id == TRACK_TAGS.c.tag_id)
            .join(TRACK, TRACK.c.id == TRACK_TAGS.c.track_id)
            .filter(TAG.c.name.like('vaporwave'))
            .order_by(TRACK.c.id)
            .limit(10)
            .all())


[(402, 191613, 2, 'METAMATERIAL'),
 (2538, 191613, 2, 'Come Back Down'),
 (4120, 191613, 3, '発見'),
 (4539, 191613, 3, '上昇'),
 (6302, 191613, 2, 'あなたと一人で数分'),
 (9765, 191613, 3, 'Warmpop'),
 (10821, 191613, 2, 'Gonna Dream 2nite.'),
 (10829, 191613, 7, 'Ecco Chamber'),
 (11415, 191613, 3, 'Better'),
 (11523, 191613, 8, '着物')]

In [4]:
""" Tags with Valence > 0.7

select * from tag where tag.vad[1] > 0.7 order by tag.vad[1];
"""
# Preview of tags whose vad[1] component exceeds 0.7, lowest value first.
# Only 10 rows are displayed, so the limit is applied in the query itself
# instead of fetching every match and slicing the resulting list.
with session.begin() as s:
    display(s.query(TAG)
            .filter(TAG.c.vad[1] > 0.7)
            .order_by(TAG.c.vad[1])
            .limit(10)
            .all())


[(44315, 'restored', [0.7000000000000001, 0.43836363636363634, 0.5716363636363636, 1.0]),
 (73764, 'born in seoul', [0.7000000000000001, 0.4136842105263158, 0.5490526315789473, 1.0]),
 (100234, 'flex dance music', [0.7000000000000001, 0.5177857142857143, 0.5286428571428571, 0.8333333333333334]),
 (101320, 'found on erins list', [0.7000000000000001, 0.5816666666666667, 0.5708333333333333, 0.6666666666666666]),
 (95169, 'erin', [0.7000000000000001, 0.5816666666666667, 0.5708333333333333, 0.6666666666666666]),
 (112347, 'hinário', [0.700001923076923, 0.458, 0.6197076923076923, 0.8305084745762712]),
 (112306, 'hinario', [0.700001923076923, 0.458, 0.6197076923076923, 0.8305084745762712]),
 (181233, 'teteu', [0.700001923076923, 0.458, 0.6197076923076923, 0.8305084745762712]),
 (170081, 'skakanje', [0.7000294117647059, 0.40458823529411764, 0.4942058823529412, 0.9333333333333333]),
 (145510, 'north', [0.7000307692307692, 0.46478461538461535, 0.6356461538461539, 0.9016393442622951])]

In [5]:
""" Tag frequency in artists

select t.name, count(t.name) as freq from tag t join artisttoptags att on att.tag_id = t.id join artist a on a.id = att.artist_id group by t.name order by freq desc;
"""
from sqlalchemy import func, desc

# Number of ARTIST_TAGS rows per tag name, most frequent first.
# Only the 10 most frequent tags are displayed, so the limit is applied in
# the query itself instead of fetching every group and slicing the list.
with session.begin() as s:
    display(s.query(TAG.c.name, func.count())
            .join(ARTIST_TAGS, ARTIST_TAGS.c.tag_id == TAG.c.id)
            .join(ARTIST, ARTIST.c.id == ARTIST_TAGS.c.artist_id)
            .group_by(TAG.c.name)
            .order_by(desc(func.count()))
            .limit(10)
            .all())


[('seen live', 32838),
 ('electronic', 23526),
 ('rock', 22779),
 ('pop', 20615),
 ('indie', 16805),
 ('female vocalists', 13807),
 ('alternative', 12574),
 ('hip-hop', 10549),
 ('american', 10095),
 ('experimental', 9415)]

In [6]:
""" Tag frequency in albums

select t.name, count(t.name) as freq from tag t join tracktoptags ttt on ttt.tag_id = t.id join track tr on tr.id = ttt.track_id group by t.name order by freq desc;
"""
from sqlalchemy import func, desc

# Number of TRACK_TAGS rows per tag name, most frequent first.
# Only the 10 most frequent tags are displayed, so the limit is applied in
# the query itself instead of fetching every group and slicing the list.
with session.begin() as s:
    display(s.query(TAG.c.name, func.count())
            .join(TRACK_TAGS, TRACK_TAGS.c.tag_id == TAG.c.id)
            .join(TRACK, TRACK.c.id == TRACK_TAGS.c.track_id)
            .group_by(TAG.c.name)
            .order_by(desc(func.count()))
            .limit(10)
            .all())


[('rock', 85199),
 ('pop', 49831),
 ('alternative', 46980),
 ('indie', 38600),
 ('electronic', 36951),
 ('metal', 35737),
 ('female vocalists', 29737),
 ('alternative rock', 28935),
 ('indie rock', 19549),
 ('classic rock', 18676)]

In [4]:
""" Top tracks from USER_TOP_TRACKS + USER_RECENT_TRACKS + USER_LOVED_TRACKS

select t.name, a.name, count(t.name) from track t join usertoptracks ut on t.id = ut.track_id join artist a on t.artist_id = a.id group by t.name, a.name order by count desc;
"""
from sqlalchemy import func, desc
from sqlalchemy.orm import Query
from collections import Counter


def _track_interaction_counts(db, link_table):
    """Count the rows of one user-track table per (track name, artist name).

    Args:
        db: Open session the query is executed on.
        link_table: Table with a ``track_id`` column, e.g. USER_TOP_TRACKS.

    Returns:
        List of (track name, artist name, row count) rows, highest count first.
    """
    return (Query([TRACK.c.name, ARTIST.c.name, func.count()], session=db)
            .join(link_table, link_table.c.track_id == TRACK.c.id)
            .join(ARTIST, ARTIST.c.id == TRACK.c.artist_id)
            .group_by(TRACK.c.name, ARTIST.c.name)
            .order_by(desc(func.count()))
            .all())


# The same aggregate is run once per interaction table inside one transaction.
with session.begin() as s:
    top = _track_interaction_counts(s, USER_TOP_TRACKS)
    recent = _track_interaction_counts(s, USER_RECENT_TRACKS)
    loved = _track_interaction_counts(s, USER_LOVED_TRACKS)

def merge_track_artist(tup):
    """Turn a (track, artist, count, ...) sequence into a ("track - artist", count) pair."""
    track_name, artist_name, count = tup[0], tup[1], tup[2]
    label = "{} - {}".format(track_name, artist_name)
    return (label, count)

# Re-key every result list as {"track - artist": count}
top, recent, loved = [
    dict(merge_track_artist(row) for row in rows)
    for rows in (top, recent, loved)
]

# Add the three per-table counts together, key by key
total = Counter()
for per_table_counts in (top, recent, loved):
    total.update(per_table_counts)

total.most_common(20)

[('Anti-Hero - Taylor Swift', 1075),
 ('As It Was - Harry Styles', 791),
 ('Glimpse of Us - Joji', 765),
 ('Karma - Taylor Swift', 654),
 ("you're On Your OWn, KId - Taylor Swift", 613),
 ('lAveNDER hAZe - Taylor Swift', 610),
 ('Maroon - Taylor Swift', 609),
 ('Bejeweled - Taylor Swift', 605),
 ('sNOw on the beach (feat. lana del rey) - Taylor Swift', 587),
 ('No Surprises - Radiohead', 583),
 ('Midnight Rain - Taylor Swift', 546),
 ('Bad Habit - Steve Lacy', 545),
 ('505 - Arctic Monkeys', 521),
 ("Would've, Could've, Should've - Taylor Swift", 518),
 ('Die For You - Joji', 510),
 ('Everlong - Foo Fighters', 498),
 ('Heat Waves - Glass Animals', 494),
 ('Blinding Lights - The Weeknd', 487),
 ('Space Song - Beach House', 483),
 ('good 4 u - Olivia Rodrigo', 475)]

In [8]:
# Sum of the merged per-track counts held in `total`
interaction_total = sum(total.values())
print(f'Total user-track interactions: {interaction_total}')

Total user-track interactions: 2452162


## Model Selection

### General Recommendation: Using scikit-learn's TfidfVectorizer/CountVectorizer & Cosine Similarity

In [19]:
# Only the track id and its tag string are used, so the other columns of the
# archive are not parsed at all. The explicit column selection afterwards keeps
# the column order fixed regardless of the order in the file.
tag_columns = ['track_id', 'tags']
grouped_tags = (
    pd.read_csv("../data/recsys_data/track_full_data.zip", sep='\t', usecols=tag_columns)[tag_columns]
    .dropna()
    .reset_index(drop=True)
)

# track_id -> row label in grouped_tags (a fresh 0..n-1 index after reset_index)
track_to_idx = pd.Series(grouped_tags.index, index=grouped_tags['track_id'])

grouped_tags

Unnamed: 0,track_id,tags
0,1,folk folk folk folk folk folk folk folk countr...
1,2,christmas christmas christmas christmas christ...
2,3,wrong wrong wrong wrong wrong wrong wrong wrong
3,4,melodicdeathmetal melodicdeathmetal melodicdea...
4,5,slowcore slowcore slowcore slowcore slowcore s...
...,...,...
757597,815627,frenchpop frenchpop frenchpop frenchpop french...
757598,815628,hiphop hiphop hiphop hiphop hiphop hiphop hiph...
757599,815629,hiphop hiphop hiphop hiphop hiphop hiphop hiph...
757600,815630,indie indie indie indie indie indie indie indie


#### Computing Vectors and Cosine Similarities

In [20]:
# Ref: https://www.datacamp.com/tutorial/recommender-systems-python

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

# Tags found in fewer than this many tracks are left out of the vocabulary;
# at 1 nothing is dropped.
MIN_TAG_DOC_FREQ = 1

# TF-IDF over each track's tag string. binary=True makes the term-frequency
# part 0/1, so how often a tag is repeated inside one string is not used.
# Alternative that was tried here: CountVectorizer(min_df=2)
vec = TfidfVectorizer(binary=True, min_df=MIN_TAG_DOC_FREQ)

# One sparse row per grouped_tags row, one column per vocabulary term
tag_matrix = vec.fit_transform(grouped_tags['tags'])


In [21]:
# tag_matrix.shape is (rows, columns): rows are tracks, columns are vocabulary terms
(track_count, vocab_size) = tag_matrix.shape

shape_summary = f"Vocabulary of size {vocab_size} for {track_count} tracks"
print(shape_summary)

Vocabulary of size 70858 for 757602 tracks


In [22]:
# Tab-separated table of user/track ratings, read straight from the zip archive
ratings_archive = "../data/recsys_data/all_tracks_ratings_full.zip"
all_tracks_ratings = pd.read_csv(ratings_archive, sep='\t')

all_tracks_ratings

Unnamed: 0,user_id,track_id,rating,timestamp
0,1,35151,5.0,
1,1,82497,3.3,
2,1,97967,4.2,
3,1,105492,3.5,
4,1,124021,3.9,
...,...,...,...,...
2200756,52829,711172,4.2,
2200757,52829,745290,1.0,2022-11-15 04:32:42
2200758,52829,776707,1.0,2022-11-14 17:46:56
2200759,52829,781335,1.0,2022-11-14 17:37:05


In [23]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

def get_track_weights(username, track_ids):
    """Return one user's rating-based weights for the given tracks.

    Args:
        username: User name, resolved to an id with ``get_user_id``.
        track_ids: Collection of track ids to keep (anything ``Series.isin`` accepts).

    Returns:
        DataFrame with columns 'track_id' and 'weight', sorted by 'track_id'.
        It holds one row per matching row of ``all_tracks_ratings``, so tracks
        the user has no rating for are absent.
    """
    uid = get_user_id(username)

    # Narrow to the user first (cheap equality test), then to the requested
    # tracks, and copy once at the end so the new column is added to an
    # independent frame rather than to a slice of another one.
    of_user = all_tracks_ratings[all_tracks_ratings['user_id'] == uid]
    user_tracks = of_user[of_user['track_id'].isin(track_ids)].copy()

    # Normalized ratings as weights
    if len(user_tracks) > 1:
        # NOTE(review): `normalize` is defined outside this cell; presumably it
        # maps the lowest rating to 0, hence the +1 so that track still counts.
        user_tracks['weight'] = normalize(user_tracks['rating']) + 1  # Avoid cancelling minimum rating
    else:
        # Nothing to normalize against with zero or one rating
        user_tracks['weight'] = 1
    return user_tracks[['track_id', 'weight']].sort_values('track_id')

def get_recommendations(username, cutoff=10, user_track_type='Recent', user_track_cutoff=None, weighted=True):
    """Recommend tracks whose tags are most similar to a user's own tracks.

    The user's seed tracks are vectorized with the fitted ``vec`` and compared
    to every row of ``tag_matrix`` by cosine similarity. Each candidate's
    similarities to the seed tracks are then averaged, optionally weighted by
    the user's ratings.

    Args:
        username: User name passed to the ``user_*_tracks`` helpers and to
            ``get_track_weights``.
        cutoff: Number of recommendations to return.
        user_track_type: Which of the user's track lists to seed from:
            'Recent' (default), 'Top' or 'Loved'. Any other value (e.g. 'All')
            combines the three lists.
        user_track_cutoff: Keep only this many seed tracks, highest weight
            first. None keeps all of them.
        weighted: If True, average similarities using the track weights;
            otherwise use a plain mean.

    Returns:
        A copy of the recommended ``grouped_tags`` rows, best match first, with
        de-duplicated 'tags' and added 'track_name' and 'scores' columns.
        Returns None (after printing a message) when the user has no tracks of
        the requested type or none of them has tags.
    """
    # Get user tracks
    if user_track_type == 'Recent':
        r_tracks = user_recent_tracks(username)
    elif user_track_type == 'Top':
        r_tracks = user_top_tracks(username)
    elif user_track_type == 'Loved':
        r_tracks = user_loved_tracks(username)
    else:
        r_tracks = pd.concat([user_recent_tracks(username), user_top_tracks(username), user_loved_tracks(username)])

    if len(r_tracks) == 0:
        print(f"No tracks of type {user_track_type} from user {username}")
        return

    # Use only tracks with tags
    tagged_tracks = r_tracks[r_tracks.isin(grouped_tags['track_id'])]
    # Reported before any cutoff so the message really counts tagged tracks
    print(f"{r_tracks.size} tracks, from which {tagged_tracks.size} have tags")

    # Sorted, de-duplicated ids: the combined lists can name a track more than once
    usable_tracks = np.unique(tagged_tracks.values)
    if usable_tracks.size == 0:
        print(f"No tracks with tags of type {user_track_type} from user {username}")
        return

    # Look weights up by track id rather than by position, so tracks and
    # weights stay aligned even when a track has several rating rows (averaged)
    # or none at all (neutral weight 1, the same fallback get_track_weights
    # uses when it cannot normalize).
    rated = get_track_weights(username, usable_tracks)
    weight_by_track = rated.groupby('track_id')['weight'].mean()
    track_weights = weight_by_track.reindex(usable_tracks).fillna(1).to_numpy(dtype=float)

    # Top cutoff tracks by weight; a stable sort keeps ties in track-id order
    top_k_idx = np.argsort(-track_weights, kind='stable')[:user_track_cutoff]
    track_weights = track_weights[top_k_idx]
    usable_tracks = usable_tracks[top_k_idx]

    # Rows of grouped_tags (and therefore of tag_matrix) for the seed tracks
    track_idx = track_to_idx.loc[usable_tracks].to_numpy()
    recommend_tags = grouped_tags.loc[track_idx]

    # Vectorize user tracks
    recommend_matrix = vec.transform(recommend_tags['tags'])
    print(f"Recommendation matrix shape: {recommend_matrix.shape}")

    # Cosine similarity of every known track (rows) to every seed track (columns)
    similarities = cosine_similarity(tag_matrix, recommend_matrix)

    # Nullify songs used for recommendation (highest similarity)
    similarities[track_idx] = 0

    # Average similarity for each song, weighted by rating
    agg_similarities = np.average(similarities, weights=track_weights if weighted else None, axis=1)

    # Best `cutoff` rows by aggregated similarity; ties keep row order
    rec_idx = np.argsort(-agg_similarities, kind='stable')[:cutoff]

    recomms = grouped_tags.loc[rec_idx].copy()
    # Collapse the repeated tags while keeping first-seen order
    recomms['tags'] = recomms['tags'].apply(lambda tags: ' '.join(dict.fromkeys(tags.split(' '))))
    recomms['track_name'] = [get_track_name(track_id) for track_id in recomms['track_id']]
    recomms['scores'] = agg_similarities[rec_idx]

    return recomms

In [24]:
# Seed from all of the user's track lists, no seed cutoff, unweighted average,
# and show the 10 best matches
get_recommendations(
    "Abbygoulding",
    cutoff=10,
    user_track_type='All',
    user_track_cutoff=None,
    weighted=False,
)


40 tracks, from which 37 have tags
Recommendation matrix shape: (37, 70858)


Unnamed: 0,track_id,tags,track_name,scores
698679,752140,pop kpop korean femalevocalists electronic ele...,Outta My Head - JEON SOMI,0.233955
2421,2621,kpop pop korean electronic seenlive indiepop d...,Bend the Rules - CIX,0.20372
11101,11975,kpop pop korean electronic seenlive indiepop d...,"TESSERACT (Prod. HUI, Minit) - CIX",0.20372
26367,28381,kpop pop korean electronic seenlive indiepop d...,Here For You - CIX,0.20372
71051,76530,kpop pop korean electronic seenlive indiepop d...,Wondering - CIX,0.20372
77217,83185,kpop pop korean electronic seenlive indiepop d...,Confession - CIX,0.20372
79260,85423,kpop pop korean electronic seenlive indiepop d...,Cinema - Japanese Ver. - CIX,0.20372
79419,85593,kpop pop korean electronic seenlive indiepop d...,Off My Mind - CIX,0.20372
97753,105331,kpop pop korean electronic seenlive indiepop d...,In & Out - CIX,0.20372
148004,159449,kpop pop korean electronic seenlive indiepop d...,Move My Body - CIX,0.20372


### Model Libraries

We tried recommender systems from three libraries:
1. Surprise (`surprise_research.ipynb`)
2. Cornac (`cornac_research/`)
3. RecBole (`recbole_research/`)
