## Generation of User-Track interactions

In [2]:
import os

from sqlalchemy import create_engine, MetaData, Table
from sqlalchemy.orm import sessionmaker

# Connection URL for the Last.fm PostgreSQL database. It can be overridden
# through the LASTFM_DB_URL environment variable so that real credentials do
# not have to be stored in the notebook; the fallback keeps the original local
# connection string so existing setups keep working unchanged.
DB_URL = os.environ.get(
    "LASTFM_DB_URL",
    "postgresql://alumnodb:alumnodb@localhost:5432/lastfm_db",
)

db = create_engine(DB_URL, client_encoding="UTF-8")

# Reflect the existing schema so tables can be looked up by name instead of
# being declared by hand.
metadata = MetaData()
metadata.reflect(bind=db)


In [3]:
def table(name) -> Table:
    """Return the reflected table registered under ``name`` in ``metadata``.

    A name that was not reflected propagates the lookup error raised by
    ``metadata.tables``.
    """
    reflected = metadata.tables
    return reflected[name]

# Entity tables.
Tag, Album, Artist, Track, User = (
    table(name) for name in ('tag', 'album', 'artist', 'track', 'user_')
)

# Top-tag tables.
# UserTags was noted as empty by the original author, with the open question
# of whether to use the most frequent tags instead.
AlbumTags, ArtistTags, TrackTags, UserTags = (
    table(name)
    for name in ('albumtoptags', 'artisttoptags', 'tracktoptags', 'usertoptags')
)

# User top-item tables.
UserTopAlbums, UserTopArtists, UserTopTracks = (
    table(name) for name in ('usertopalbums', 'usertopartists', 'usertoptracks')
)

# User recent / loved track tables.
UserRecentTracks, UserLovedTracks = (
    table(name) for name in ('userrecenttracks', 'userlovedtracks')
)

# Session factory bound to the engine; the query helpers open a session with
# ``session.begin()``.
session = sessionmaker(db)

In [4]:
import pandas as pd

def all_top_tracks():
    """Load the top tracks of every user into a DataFrame.

    Returns a frame with the columns ``user_id``, ``track_id`` and ``rank``,
    ordered by user id and then by rank.
    """
    top = UserTopTracks.c
    with session.begin() as db_session:
        query = db_session.query(User.c.id, top.track_id, top.rank)
        query = query.join(UserTopTracks, User.c.id == top.user_id)
        query = query.order_by(User.c.id, top.rank)
        rows = query.all()
        return pd.DataFrame(rows, columns=['user_id', 'track_id', 'rank'])

def all_recent_tracks():
    """Load the recently played tracks of every user into a DataFrame.

    Returns a frame with the columns ``user_id``, ``track_id`` and
    ``listen_at``, ordered by user id and then by listen time.
    """
    recent = UserRecentTracks.c
    with session.begin() as db_session:
        query = db_session.query(User.c.id, recent.track_id, recent.listen_at)
        query = query.join(UserRecentTracks, User.c.id == recent.user_id)
        query = query.order_by(User.c.id, recent.listen_at)
        rows = query.all()
        return pd.DataFrame(rows, columns=['user_id', 'track_id', 'listen_at'])

def all_loved_tracks():
    """Load the loved tracks of every user into a DataFrame.

    Returns a frame with the columns ``user_id``, ``track_id`` and
    ``love_at``, ordered by user id and then by the time the track was loved.
    """
    loved = UserLovedTracks.c
    with session.begin() as db_session:
        query = db_session.query(User.c.id, loved.track_id, loved.love_at)
        query = query.join(UserLovedTracks, User.c.id == loved.user_id)
        query = query.order_by(User.c.id, loved.love_at)
        rows = query.all()
        return pd.DataFrame(rows, columns=['user_id', 'track_id', 'love_at'])


### 1. User's Top Tracks

In [5]:
# Fetch every (user_id, track_id, rank) row once; the rating cell below is
# derived from this frame.
top_tracks = all_top_tracks()
top_tracks

Unnamed: 0,user_id,track_id,rank
0,1,35151,1
1,1,560787,2
2,1,231167,3
3,1,302447,4
4,1,156880,5
...,...,...,...
1054551,52829,616532,16
1054552,52829,298170,17
1054553,52829,149950,18
1054554,52829,162448,19


In [40]:
PATH = "../data/recsys_data/top_tracks_ratings.csv"

# Convert rank into a rating from 10 to 8.1 / 10
import numpy as np

# One rating per rank position: rank 1 -> 10.0 down to rank 20 -> 8.1.
ratings = np.linspace(10, 8.1, 20, dtype=np.float32)
print(f"Ratings by rank: {ratings}")

ranks = top_tracks['rank'].to_numpy()
# Fail loudly on ranks outside the lookup table: a rank of 0 would otherwise
# wrap around to the last rating through negative indexing.
if ((ranks < 1) | (ranks > len(ratings))).any():
    raise ValueError(f"rank values must be between 1 and {len(ratings)}")

# Build a new frame with `assign` instead of writing a column into a column
# subset of `top_tracks` (which triggers SettingWithCopyWarning), and look the
# ratings up with a single vectorized index rather than a per-row lambda.
# The rating column takes the float32 dtype of `ratings`.
top_track_ratings = top_tracks[['user_id', 'track_id']].assign(
    rating=ratings[ranks - 1]
)

top_track_ratings

Ratings by rank: [10.   9.9  9.8  9.7  9.6  9.5  9.4  9.3  9.2  9.1  9.   8.9  8.8  8.7
  8.6  8.5  8.4  8.3  8.2  8.1]


Unnamed: 0,user_id,track_id,rating
0,1,35151,10.0
1,1,560787,9.9
2,1,231167,9.8
3,1,302447,9.7
4,1,156880,9.6
...,...,...,...
1054551,52829,616532,8.5
1054552,52829,298170,8.4
1054553,52829,149950,8.3
1054554,52829,162448,8.2


In [41]:
# Write the top-track ratings as a tab-separated file without the index column.
# NOTE: PATH is reassigned by several cells in this notebook; run the cell
# directly above first so this writes to the intended file.
top_track_ratings.to_csv(path_or_buf=PATH, sep='\t', index=False)

### 2. User's Recent Tracks

In [8]:
# Fetch every (user_id, track_id, listen_at) row once; the rating cell below
# is derived from this frame.
recent_tracks = all_recent_tracks()
recent_tracks

Unnamed: 0,user_id,track_id,listen_at
0,2,785090,2022-11-12 09:22:07
1,2,273675,2022-11-12 09:27:09
2,2,462697,2022-11-12 09:31:14
3,2,9158,2022-11-12 09:36:13
4,2,180323,2022-11-12 09:39:16
...,...,...,...
1019975,52829,554368,2022-11-15 04:27:34
1019976,52829,745290,2022-11-15 04:32:42
1019977,52829,539622,2022-11-15 04:37:47
1019978,52829,677435,2022-11-15 04:44:16


In [77]:
import numpy as np

PATH = "../data/recsys_data/recent_tracks_ratings.csv"

# Convert heard songs into a rating from 5 to 10 based on listen count.
# The count is the number of rows per track over the whole frame (all users
# together), not a per-user play count.
listen_count = recent_tracks['track_id'].value_counts()
max_listened = listen_count.max()

# Bin edges from the highest count down to 1, plus a trailing 0 so that tracks
# heard exactly once fall into the last (lowest) bin. Rating r applies to
# counts in the half-open interval (lower, upper].
limits = np.linspace(max_listened, 1, 6, dtype=int)
limits = np.append(limits, 0)
ratings = [10, 9, 8, 7, 6, 5]

# Rate each distinct track once and map the result back onto the rows; this
# avoids re-scanning the full frame with `isin` for every bin.
track_rating = pd.Series(np.nan, index=listen_count.index)
for upper, lower, r in zip(limits[:-1], limits[1:], ratings):
    track_rating[(listen_count > lower) & (listen_count <= upper)] = r

# Build a new frame with `assign` instead of writing columns into a column
# subset of `recent_tracks`, which triggers SettingWithCopyWarning.
recent_track_ratings = recent_tracks[['user_id', 'track_id']].assign(
    rating=recent_tracks['track_id'].map(track_rating),
    timestamp=recent_tracks['listen_at'],
)

recent_track_ratings

Unnamed: 0,user_id,track_id,rating,timestamp
0,2,785090,5.0,2022-11-12 09:22:07
1,2,273675,6.0,2022-11-12 09:27:09
2,2,462697,5.0,2022-11-12 09:31:14
3,2,9158,5.0,2022-11-12 09:36:13
4,2,180323,6.0,2022-11-12 09:39:16
...,...,...,...,...
1019975,52829,554368,6.0,2022-11-15 04:27:34
1019976,52829,745290,6.0,2022-11-15 04:32:42
1019977,52829,539622,6.0,2022-11-15 04:37:47
1019978,52829,677435,6.0,2022-11-15 04:44:16


In [78]:
# Write the recent-track ratings as a tab-separated file without the index column.
# NOTE: PATH is reassigned by several cells in this notebook; run the cell
# directly above first so this writes to the intended file.
recent_track_ratings.to_csv(path_or_buf=PATH, sep='\t', index=False)

### 3. User's Loved Tracks

In [79]:
# Fetch every (user_id, track_id, love_at) row once; the rating cell below is
# derived from this frame.
loved_tracks = all_loved_tracks()
loved_tracks

Unnamed: 0,user_id,track_id,love_at
0,3,762797,2022-06-27 11:29:18
1,3,169241,2022-06-27 11:29:20
2,3,237435,2022-06-27 11:29:29
3,3,708715,2022-06-27 11:29:31
4,3,695146,2022-07-05 09:50:14
...,...,...,...
377621,52829,186736,2022-11-01 14:14:07
377622,52829,30097,2022-11-01 14:14:14
377623,52829,42110,2022-11-01 14:15:02
377624,52829,59138,2022-11-01 14:19:24


In [80]:
PATH = "../data/recsys_data/loved_tracks_ratings.csv"

# Convert loved songs into a rating of 10 / 10.
# `assign` returns a new frame, so nothing is written into a column subset of
# `loved_tracks` (which triggers SettingWithCopyWarning).
loved_track_ratings = loved_tracks[['user_id', 'track_id']].assign(
    rating=10,
    timestamp=loved_tracks['love_at'],
)

loved_track_ratings

Unnamed: 0,user_id,track_id,rating,timestamp
0,3,762797,10,2022-06-27 11:29:18
1,3,169241,10,2022-06-27 11:29:20
2,3,237435,10,2022-06-27 11:29:29
3,3,708715,10,2022-06-27 11:29:31
4,3,695146,10,2022-07-05 09:50:14
...,...,...,...,...
377621,52829,186736,10,2022-11-01 14:14:07
377622,52829,30097,10,2022-11-01 14:14:14
377623,52829,42110,10,2022-11-01 14:15:02
377624,52829,59138,10,2022-11-01 14:19:24


In [81]:
# Write the loved-track ratings as a tab-separated file without the index column.
# NOTE: PATH is reassigned by several cells in this notebook; run the cell
# directly above first so this writes to the intended file.
loved_track_ratings.to_csv(path_or_buf=PATH, sep='\t', index=False)

### 4. Merge All Tracks

In [82]:
import numpy as np

PATH = "../data/recsys_data/all_tracks_ratings.csv"

# Stack the three rating sources on a fresh 0..n-1 index and store every
# rating as float32. The top-track frame has no timestamp column, so those
# rows end up with a missing timestamp after the concat.
rating_sources = [top_track_ratings, recent_track_ratings, loved_track_ratings]
merged_track_ratings = (
    pd.concat(rating_sources, ignore_index=True)
    .astype({'rating': np.float32})
)
merged_track_ratings

Unnamed: 0,user_id,track_id,rating,timestamp
0,1,35151,10.0,NaT
1,1,560787,9.9,NaT
2,1,231167,9.8,NaT
3,1,302447,9.7,NaT
4,1,156880,9.6,NaT
...,...,...,...,...
2452157,52829,186736,10.0,2022-11-01 14:14:07
2452158,52829,30097,10.0,2022-11-01 14:14:14
2452159,52829,42110,10.0,2022-11-01 14:15:02
2452160,52829,59138,10.0,2022-11-01 14:19:24


In [19]:
# Write the merged ratings as a tab-separated file without the index column.
# NOTE: PATH is reassigned by several cells in this notebook; run the cell
# directly above first so this writes to the intended file.
merged_track_ratings.to_csv(path_or_buf=PATH, sep='\t', index=False)