# __LastFM Dataset__

## Prepare Dataset

In [1]:
from sqlalchemy import create_engine, MetaData, Table
from sqlalchemy.orm import sessionmaker

db = create_engine(
    "postgresql://alumnodb:alumnodb@localhost:5432/lastfm_db", client_encoding="UTF-8")

metadata = MetaData()
metadata.reflect(bind=db)


In [2]:
def table(name) -> Table:
    return metadata.tables[name]

Tag = table('tag')
Album = table('album')
Artist = table('artist')
Track = table('track')
User = table('user_')

AlbumTags = table('albumtoptags')
ArtistTags = table('artisttoptags')
TrackTags = table('tracktoptags')
UserTags = table('usertoptags') # Empty, use most frequent?

UserTopAlbums = table('usertopalbums')
UserTopArtists = table('usertopartists')
UserTopTracks = table('usertoptracks')

UserRecentTracks = table('userrecenttracks')
UserLovedTracks = table('userlovedtracks')

session = sessionmaker(db)

## Example Queries

In [3]:
""" Tracks with vaporwave tag

select * from tracktoptags tt join track on track.id = tt.track_id join tag on tag.id = tt.tag_id where tag.name like 'vaporwave' order by track.id;
"""
with session.begin() as s:
    display(s.query(TrackTags, Track.c.name)
            .join(Tag, Tag.c.id == TrackTags.c.tag_id)
            .join(Track, Track.c.id == TrackTags.c.track_id)
            .filter(Tag.c.name.like('vaporwave'))
            .order_by(Track.c.id).all()[:10])


[(402, 191613, 2, 'METAMATERIAL'),
 (2538, 191613, 2, 'Come Back Down'),
 (4120, 191613, 3, '発見'),
 (4539, 191613, 3, '上昇'),
 (6302, 191613, 2, 'あなたと一人で数分'),
 (9765, 191613, 3, 'Warmpop'),
 (10821, 191613, 2, 'Gonna Dream 2nite.'),
 (10829, 191613, 7, 'Ecco Chamber'),
 (11415, 191613, 3, 'Better'),
 (11523, 191613, 8, '着物')]

In [4]:
""" Tags with Valence > 0.7

select * from tag where tag.vad[1] > 0.7 order by tag.vad[1];
"""
with session.begin() as s:
    display(s.query(Tag)
            .filter(Tag.c.vad[1] > 0.7)
            .order_by(Tag.c.vad[1]).all()[:10])


[(44315, 'restored', [0.7000000000000001, 0.43836363636363634, 0.5716363636363636, 1.0]),
 (73764, 'born in seoul', [0.7000000000000001, 0.4136842105263158, 0.5490526315789473, 1.0]),
 (100234, 'flex dance music', [0.7000000000000001, 0.5177857142857143, 0.5286428571428571, 0.8333333333333334]),
 (101320, 'found on erins list', [0.7000000000000001, 0.5816666666666667, 0.5708333333333333, 0.6666666666666666]),
 (95169, 'erin', [0.7000000000000001, 0.5816666666666667, 0.5708333333333333, 0.6666666666666666]),
 (112347, 'hinário', [0.700001923076923, 0.458, 0.6197076923076923, 0.8305084745762712]),
 (112306, 'hinario', [0.700001923076923, 0.458, 0.6197076923076923, 0.8305084745762712]),
 (181233, 'teteu', [0.700001923076923, 0.458, 0.6197076923076923, 0.8305084745762712]),
 (170081, 'skakanje', [0.7000294117647059, 0.40458823529411764, 0.4942058823529412, 0.9333333333333333]),
 (145510, 'north', [0.7000307692307692, 0.46478461538461535, 0.6356461538461539, 0.9016393442622951])]

In [5]:
""" Tag frequency in artists

select t.name, count(t.name) as freq from tag t join artisttoptags att on att.tag_id = t.id join artist a on a.id = att.artist_id group by t.name order by freq desc;
"""
from sqlalchemy import func, desc

with session.begin() as s:
    display(s.query(Tag.c.name, func.count())
            .join(ArtistTags, ArtistTags.c.tag_id == Tag.c.id)
            .join(Artist, Artist.c.id == ArtistTags.c.artist_id)
            .group_by(Tag.c.name)
            .order_by(desc(func.count())).all()[:10])


[('seen live', 32838),
 ('electronic', 23526),
 ('rock', 22779),
 ('pop', 20615),
 ('indie', 16805),
 ('female vocalists', 13807),
 ('alternative', 12574),
 ('hip-hop', 10549),
 ('american', 10095),
 ('experimental', 9415)]

In [6]:
""" Tag frequency in albums

select t.name, count(t.name) as freq from tag t join tracktoptags ttt on ttt.tag_id = t.id join track tr on tr.id = ttt.track_id group by t.name order by freq desc;
"""
from sqlalchemy import func, desc

with session.begin() as s:
    display(s.query(Tag.c.name, func.count())
            .join(TrackTags, TrackTags.c.tag_id == Tag.c.id)
            .join(Track, Track.c.id == TrackTags.c.track_id)
            .group_by(Tag.c.name)
            .order_by(desc(func.count())).all()[:10])


[('rock', 85199),
 ('pop', 49831),
 ('alternative', 46980),
 ('indie', 38600),
 ('electronic', 36951),
 ('metal', 35737),
 ('female vocalists', 29737),
 ('alternative rock', 28935),
 ('indie rock', 19549),
 ('classic rock', 18676)]

In [7]:
""" Top tracks from UserTopTracks + UserRecentTracks + UserLovedTracks

select t.name, a.name, count(t.name) from track t join usertoptracks ut on t.id = ut.track_id join artist a on t.artist_id = a.id group by t.name, a.name order by count desc;
"""
from sqlalchemy import func, desc
from sqlalchemy.orm import Query
from collections import Counter

with session.begin() as s:
    top = Query([Track.c.name, Artist.c.name, func.count()], session=s).join(UserTopTracks, UserTopTracks.c.track_id == Track.c.id).join(
        Artist, Artist.c.id == Track.c.artist_id).group_by(Track.c.name, Artist.c.name).order_by(desc(func.count())).all()

    recent = Query([Track.c.name, Artist.c.name, func.count()], session=s).join(UserRecentTracks, UserRecentTracks.c.track_id == Track.c.id).join(
        Artist, Artist.c.id == Track.c.artist_id).group_by(Track.c.name, Artist.c.name).order_by(desc(func.count())).all()
    
    loved = Query([Track.c.name, Artist.c.name, func.count()], session=s).join(UserLovedTracks, UserLovedTracks.c.track_id == Track.c.id).join(
        Artist, Artist.c.id == Track.c.artist_id).group_by(Track.c.name, Artist.c.name).order_by(desc(func.count())).all()

def merge_track_artist(tup):
    return (f"{tup[0]} - {tup[1]}", tup[2])

top = dict(map(merge_track_artist, top))
recent =  dict(map(merge_track_artist, recent))
loved = dict(map(merge_track_artist, loved))

total = Counter()
for d in (top, recent, loved):
    total.update(d)

total.most_common(20)

[('Anti-Hero - Taylor Swift', 1075),
 ('As It Was - Harry Styles', 791),
 ('Glimpse of Us - Joji', 765),
 ('Karma - Taylor Swift', 654),
 ("you're On Your OWn, KId - Taylor Swift", 613),
 ('lAveNDER hAZe - Taylor Swift', 610),
 ('Maroon - Taylor Swift', 609),
 ('Bejeweled - Taylor Swift', 605),
 ('sNOw on the beach (feat. lana del rey) - Taylor Swift', 587),
 ('No Surprises - Radiohead', 583),
 ('Midnight Rain - Taylor Swift', 546),
 ('Bad Habit - Steve Lacy', 545),
 ('505 - Arctic Monkeys', 521),
 ("Would've, Could've, Should've - Taylor Swift", 518),
 ('Die For You - Joji', 510),
 ('Everlong - Foo Fighters', 498),
 ('Heat Waves - Glass Animals', 494),
 ('Blinding Lights - The Weeknd', 487),
 ('Space Song - Beach House', 483),
 ('good 4 u - Olivia Rodrigo', 475)]

In [8]:
print(f'Total user-track interactions: {sum(total.values())}')

Total user-track interactions: 2452162


## Building Model

### Using SKLearn's CountVectorizer & Cosine Similarity

#### Option 1: Using Tracks own Tags (More specific, less tracks)

In [12]:
import pandas as pd

with session.begin() as s:
    raw_track_tags = (s.query(Track.c.id, Tag.c.name, TrackTags.c.rank)
                  .join(TrackTags, TrackTags.c.track_id == Track.c.id)
                  .join(Tag, Tag.c.id == TrackTags.c.tag_id)
                  .order_by(Track.c.id, TrackTags.c.rank).all())

raw_track_tags = pd.DataFrame(raw_track_tags, columns=['track_id', 'tag', 'rank'])

raw_track_tags.head(5)


Unnamed: 0,track_id,tag,priority
0,2,christmas,1
1,2,xmas,2
2,2,john lennon,3
3,2,tinsel,4
4,2,x-mas,5


#### Option 2: Using Track Artist's Tags (Less specific, more tracks)

In [3]:
import pandas as pd

with session.begin() as s:
    raw_track_tags = (s.query(Track.c.id, Tag.c.name, ArtistTags.c.rank)
                  .join(ArtistTags, ArtistTags.c.artist_id == Track.c.artist_id)
                  .join(Tag, Tag.c.id == ArtistTags.c.tag_id)
                  .order_by(Track.c.id, ArtistTags.c.rank).all())

raw_track_tags = pd.DataFrame(raw_track_tags, columns=['track_id', 'tag', 'rank'])

raw_track_tags.head(5)


Unnamed: 0,track_id,tag,priority
0,1,folk,1
1,1,country,2
2,1,psychedelic folk,3
3,1,rock,4
4,1,american,5


#### Pre-processing

In [4]:
# Strip spaces and multiply words depending on rank
track_tags = raw_track_tags.copy()
track_tags['tag'] = track_tags['tag'].apply(lambda x: x.replace(' ', ''))
track_tags = track_tags.loc[track_tags.index.repeat(track_tags['rank'].apply(lambda x: 11 - x))]

# Group by track id and aggregate into lists
grouped_tags = track_tags.groupby(track_tags['track_id'])['tag'].apply(list).reset_index(name='tags')

# Create soup of words for each track
grouped_tags['tags'] = grouped_tags['tags'].apply(lambda x: ' '.join(x))

grouped_tags

Unnamed: 0,track_id,tags
0,1,folk folk folk folk folk folk folk folk folk f...
1,2,classicrock classicrock classicrock classicroc...
2,3,pop pop pop pop pop pop pop pop pop pop disco ...
3,4,metalcore metalcore metalcore metalcore metalc...
4,5,alt-country alt-country alt-country alt-countr...
...,...,...
759918,815627,pop pop pop pop pop pop pop pop pop pop female...
759919,815628,hip-hop hip-hop hip-hop hip-hop hip-hop hip-ho...
759920,815629,hip-hop hip-hop hip-hop hip-hop hip-hop hip-ho...
759921,815630,indie indie indie indie indie indie indie indi...


#### Computing Vectors and Cosine Similarities

In [60]:
# Ref: https://www.datacamp.com/tutorial/recommender-systems-python

# Import CountVectorizer and create the count matrix
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

count = CountVectorizer()
count_matrix = count.fit_transform(grouped_tags['tags'])


In [61]:
track_count, vocab_size = count_matrix.shape

print(f"Vocabulary of size {vocab_size} for {track_count} tracks")

Vocabulary of size 50903 for 759923 tracks


In [62]:
def user_recent_tracks(username):
    with session.begin() as s:
        q = (s.query(UserRecentTracks.c.track_id)
            .join(User, User.c.id == UserRecentTracks.c.user_id)
            .filter(User.c.username == username)
            .order_by(UserRecentTracks.c.listen_at).all())
        return pd.DataFrame(q, columns=['track_id'])['track_id']

def user_top_tracks(username):
    with session.begin() as s:
        q = (s.query(UserTopTracks.c.track_id)
            .join(User, User.c.id == UserTopTracks.c.user_id)
            .filter(User.c.username == username).
            order_by(UserTopTracks.c.rank).all())
        return pd.DataFrame(q, columns=['track_id'])['track_id']

def user_loved_tracks(username):
    with session.begin() as s:
        q = (s.query(UserLovedTracks.c.track_id)
            .join(User, User.c.id == UserLovedTracks.c.user_id)
            .filter(User.c.username == username)
            .order_by(UserLovedTracks.c.love_at).all())
        return pd.DataFrame(q, columns=['track_id'])['track_id']

def get_track_name(id):
    with session.begin() as s:
        q = (s.query(Track.c.name, Artist.c.name)
                .join(Artist, Artist.c.id == Track.c.artist_id)
                .filter(Track.c.id == id).all())[0]
        return ' - '.join(q)


In [63]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

def get_recommendations(username, tracks_getter, cutoff=10, user_track_cutoff=None):
    # Get recent tracks of a user
    r_tracks = tracks_getter(username)[:user_track_cutoff]

    # Use only tracks with tags
    usable_tracks = r_tracks[r_tracks.isin(grouped_tags['track_id'])]
    print(f"{r_tracks.size} tracks, from which {usable_tracks.size} have tags")

    # Filter with grouped_tags
    indices = pd.Series(grouped_tags.index, index=grouped_tags['track_id'])
    track_idx = indices.loc[usable_tracks]
    recommend_tags = grouped_tags.loc[track_idx]

    # Vectorize user tracks
    recommend_matrix = count.transform(recommend_tags['tags'])
    print(f"Recommendation matrix shape: {recommend_matrix.shape}")

    # Compute the Cosine Similarity matrix based on the count_matrix
    similarities = cosine_similarity(count_matrix, recommend_matrix)
    
    # Delete songs used for recommendation (highest similarity)
    similarities = np.delete(similarities, track_idx, axis=0)

    # Average similarity for each song
    agg_similarities = similarities.mean(axis=1)

    # Pairwise scores to each track, ordered by total aggregated similarity
    tup_scores = list(enumerate(agg_similarities))
    tup_scores.sort(key=lambda x: -x[1])
    rec_idx, scores = zip(*tup_scores[:cutoff])
    
    recomms = grouped_tags.loc[list(rec_idx)]
    recomms['tags'] = recomms['tags'].apply(lambda x: ' '.join(pd.Series(x.split(' ')).unique()))
    recomms['track_name'] = recomms['track_id'].apply(get_track_name)
    recomms['scores'] = scores
    
    return recomms

In [64]:
get_recommendations("AngwinDental", user_loved_tracks, user_track_cutoff=None, cutoff=10)


20 tracks, from which 18 have tags
Recommendation matrix shape: (18, 50903)


Unnamed: 0,track_id,tags,track_name,scores
150844,161999,indie indierock indiepop australian upbeat ele...,Fire Alarm - Castlecomer,0.34497
159995,171819,seenlive indie indiepop pop rock british indie...,Powerlines - Cassia,0.34497
502814,539661,electronic seenlive alternative rock indie bel...,Slowdance - Soulwax,0.34497
526490,565051,electronic trance progressivehouse breakbeat h...,Killa - Way Out West,0.34497
5668,6106,rock pop classicrock singer-songwriter british...,Why Should I Cry For You? - Sting,0.342663
9303,10010,rock pop classicrock singer-songwriter british...,Fields Of Gold - My Songs Version - Sting,0.342663
15579,16740,rock pop classicrock singer-songwriter british...,A Thousand Years - Sting,0.342663
18488,19827,rock pop classicrock singer-songwriter british...,The Soul Cages - Sting,0.342663
20968,22486,rock pop classicrock singer-songwriter british...,The Pirate's Bride - Sting,0.342663
21072,22601,rock pop classicrock singer-songwriter british...,The End Of The Game - Sting,0.342663


### Using Surprise

In [None]:
import os

from surprise import BaselineOnly, Dataset, Reader
from surprise.model_selection import cross_validate

# path to dataset file
file_path = os.path.expanduser("../data/recsys_data/recenttracks.csv")

# As we're loading a custom dataset, we need to define a reader. In the
# movielens-100k dataset, each line has the following format:
# 'user item rating timestamp', separated by '\t' characters.
reader = Reader(line_format="user item rating timestamp", sep="\t")

data = Dataset.load_from_file(file_path, reader=reader)

# We can now use this dataset as we please, e.g. calling cross_validate
cross_validate(BaselineOnly(), data, verbose=True)

### Using Recbole