In [317]:
import csv
import pandas as pd
import numpy as np
inter_path = '/share/cp/datasets/LFM/LFM-2b/IPM/datasets/user_song_regexp_since_2016_pc_gt_1_user_gte_5_song_gte_5/user_song_regexp_since_2016_pc_gt_1_user_gte_5_song_gte_5.txt'
demo_path = '/share/cp/datasets/LFM/LFM-2b/IPM/datasets/user_song_regexp_since_2016_pc_gt_1_user_gte_5_song_gte_5/user_demographics.txt'
tracks_path = '/share/cp/datasets/LFM/LFM-2b/IPM/datasets/user_song_regexp_since_2016_pc_gt_1_user_gte_5_song_gte_5/song_ids.txt'

In [318]:
inter = pd.read_csv(inter_path, sep='\t', names=['user_id', 'track_id', 'play_count'])
demo = pd.read_csv(demo_path, sep='\t', names=['user_name', 'country', 'age', 'gender', 'timestamp'])
tracks = pd.read_csv(tracks_path,sep='\t',names=['track_artist','track_name'],na_values='',na_filter=False,error_bad_lines=False,quoting=csv.QUOTE_NONE)
print('Users {} - Items {}'.format(inter.user_id.nunique(),inter.track_id.nunique()))

Users 23272 - Items 1606686


### Sampling tracks uniformly at random

In [319]:
track_ids =  np.arange(inter.track_id.max()+1)

# Sampling procedure
sampled_items = 100000
np.random.seed(42)
np.random.shuffle(track_ids)
sampled_track_ids = track_ids[:sampled_items]
print(sampled_track_ids)
sub_inter = inter[inter.track_id.isin(set(sampled_track_ids))]

[ 488290  983489 1307794 ...  353035  405331 1401222]


In [320]:
def iterative_drop(inter_data,user_threshold,track_threshold):
    
    print('Users {} - Items {}'.format(inter_data.user_id.nunique(),inter_data.track_id.nunique()))
    while True:
        n = len(inter_data)
        # Filter by user
        user_count = inter_data.user_id.value_counts()
        user_count = user_count[user_count>=user_threshold]
        inter_data = inter_data[inter_data.user_id.isin(set(user_count.index))]
        print('Users {} - Items {}'.format(inter_data.user_id.nunique(),inter_data.track_id.nunique()))
        # Filter by track
        item_count = inter_data.track_id.value_counts()
        item_count = item_count[item_count>=track_threshold]
        inter_data = inter_data[inter_data.track_id.isin(set(item_count.index))]
        print('Users {} - Items {}'.format(inter_data.user_id.nunique(),inter_data.track_id.nunique()))
        new_n = len(inter_data)
        
        if new_n == n:
            break
            
        n = len(inter_data)
    
    return inter_data

In [63]:
filtered = iterative_drop(sub_inter,5,5)

Users 22402 - Items 100000
Users 19974 - Items 100000
Users 19974 - Items 99832
Users 19972 - Items 99832
Users 19972 - Items 99831
Users 19972 - Items 99831
Users 19972 - Items 99831


In [64]:
# Creating new indexes
new_user_ids = filtered.user_id.drop_duplicates().reset_index(drop=True).reset_index().rename(columns={'index':'new_user_id'})
new_track_ids = filtered.track_id.drop_duplicates().reset_index(drop=True).reset_index().rename(columns={'index':'new_track_id'})
filtered = filtered.merge(new_user_ids).merge(new_track_ids)
filtered

Unnamed: 0,user_id,track_id,play_count,new_user_id,new_track_id
0,0,23,2,0,0
1,122,23,3,108,0
2,146,23,6,130,0
3,437,23,2,392,0
4,579,23,2,522,0
...,...,...,...,...,...
2830536,20238,1606678,3,17445,99830
2830537,20516,1606678,3,17686,99830
2830538,22340,1606678,2,19211,99830
2830539,23192,1606678,2,19906,99830


In [44]:
new_inter = filtered.drop(columns=['user_id','track_id'])[['new_user_id','new_track_id','play_count']].sort_values('new_user_id')
new_inter

Unnamed: 0,new_user_id,new_track_id,play_count
0,0,0,2
49382,0,55,3
49351,0,54,3
49299,0,53,19
49280,0,52,20
...,...,...,...
2005197,19971,33328,3
404752,19971,2505,2
2455647,19971,58780,2
923317,19971,7334,3


In [45]:
new_demo = demo.reset_index().rename(columns={'index':'user_id'}).merge(new_user_ids).set_index('new_user_id').drop(columns=['user_id'])
new_demo

Unnamed: 0_level_0,user_name,country,age,gender,timestamp
new_user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,--DaVeR--,RU,27,m,2007-03-27 19:50:20
1,--Ryo--,IT,33,m,2006-06-18 21:07:33
2,--X--,,-1,m,2007-03-02 13:46:35
3,-2,BR,19,m,2010-01-14 05:55:11
4,-273C,RU,25,m,2007-10-12 18:42:00
...,...,...,...,...,...
19967,zvze,ES,-1,m,2008-10-02 18:06:06
19968,zwerg_verboten,RO,-1,f,2007-11-16 10:20:40
19969,zwzz,,20,f,2012-05-06 07:09:57
19970,zywulec,PL,-1,f,2009-06-09 11:29:49


In [112]:
new_tracks = tracks.reset_index().rename(columns={'index':'track_id'}).merge(new_track_ids).set_index('new_track_id').drop(columns=['track_id'])
new_tracks

Unnamed: 0_level_0,track_artist,track_name
new_track_id,Unnamed: 1_level_1,Unnamed: 2_level_1
0,The Stone Roses,All For One
1,Avenged Sevenfold,Almost Easy
2,Saliva,Always
3,Hollywood Undead,Been to Hell
4,Guano Apes,Break the Line
...,...,...
99826,King Creosote,Curtain Craft
99827,Mark de Clive-Lowe,The Golden Lady
99828,Willie Colón,Mi Sueño
99829,Superman Is Dead,Sunset Di Tanah Anarki


In [113]:
SAMP_DATA_PATH = '/share/cp/datasets/LFM/LFM-2b/IPM/datasets/user_song_regexp_since_2016_pc_gt_1_user_gte_5_song_gte_5/sampled_{}_items_inter.txt'.format(sampled_items)
SAMP_DEMO_PATH = '/share/cp/datasets/LFM/LFM-2b/IPM/datasets/user_song_regexp_since_2016_pc_gt_1_user_gte_5_song_gte_5/sampled_{}_items_demo.txt'.format(sampled_items)
SAMP_TRACKS_PATH = '/share/cp/datasets/LFM/LFM-2b/IPM/datasets/user_song_regexp_since_2016_pc_gt_1_user_gte_5_song_gte_5/sampled_{}_items_tracks.txt'.format(sampled_items)

In [114]:
# Saving
new_inter.to_csv(SAMP_DATA_PATH,sep='\t',index=False,header=False)
new_demo.to_csv(SAMP_DEMO_PATH,sep='\t',index=False,header=False)
new_tracks.to_csv(SAMP_TRACKS_PATH,sep='\t',index=False,header=False)

### Stats on sampled data

In [265]:
inter = pd.read_csv(SAMP_DATA_PATH, sep='\t', names=['user_id', 'track_id', 'play_count'])
demo = pd.read_csv(SAMP_DEMO_PATH, sep='\t',
                                    names=['user_name', 'country', 'age', 'gender', 'timestamp'])
tracks = pd.read_csv(SAMP_TRACKS_PATH,sep='\t',names=['track_artist','tracks_name'])

In [175]:
new_inter[['new_user_id','new_track_id']].groupby('new_user_id').count().describe().round()

Unnamed: 0,new_track_id
count,19972.0
mean,142.0
std,172.0
min,5.0
25%,29.0
50%,83.0
75%,194.0
max,3025.0


In [190]:
new_inter.groupby('new_user_id').sum()['play_count'].describe().round()

count    19972.0
mean       997.0
std       1571.0
min         10.0
25%        130.0
50%        465.0
75%       1262.0
max      40624.0
Name: play_count, dtype: float64

In [239]:
merged = new_inter.merge(new_demo.reset_index()).merge(new_tracks.reset_index())

In [240]:
merged

Unnamed: 0,new_user_id,new_track_id,play_count,user_name,country,age,gender,timestamp,track_artist,track_name
0,0,0,2,--DaVeR--,RU,27,m,2007-03-27 19:50:20,The Stone Roses,All For One
1,108,0,3,10TonChain,US,31,m,2005-12-10 19:05:06,The Stone Roses,All For One
2,130,0,6,1968,DE,29,m,2005-08-27 16:15:10,The Stone Roses,All For One
3,392,0,2,ARINOZ,UK,-1,f,2008-11-11 03:56:55,The Stone Roses,All For One
4,522,0,2,Acid3P,SK,26,m,2007-01-23 23:26:03,The Stone Roses,All For One
...,...,...,...,...,...,...,...,...,...,...
2830536,17445,99830,3,lanafrommoscow,RU,23,f,2005-07-29 17:30:39,The Shadow Ring,Tiny Creatures
2830537,17686,99830,3,llhp,HR,27,m,2005-05-30 23:42:05,The Shadow Ring,Tiny Creatures
2830538,19211,99830,2,sethtisue,US,40,m,2004-04-11 03:45:20,The Shadow Ring,Tiny Creatures
2830539,19906,99830,2,yourdoom,RU,-1,m,2012-03-24 13:22:28,The Shadow Ring,Tiny Creatures


In [242]:
males =merged[merged.gender=='m']
females =merged[merged.gender=='f']

In [289]:
merged.nunique()

user_id         19972
track_id        99831
play_count        729
user_name       19972
country           163
age               101
gender              2
timestamp       19970
track_artist    40182
tracks_name     86892
dtype: int64

In [290]:
merged.play_count.sum()

19906272

In [295]:
#track-user
merged[['user_id','track_id']].groupby('user_id').count().describe().round()

Unnamed: 0,track_id
count,19972.0
mean,142.0
std,172.0
min,5.0
25%,29.0
50%,83.0
75%,194.0
max,3025.0


In [298]:
#artist-user
merged[['user_id','track_artist']].drop_duplicates().groupby('user_id').count().describe().round()

Unnamed: 0,track_artist
count,19972.0
mean,128.0
std,150.0
min,2.0
25%,28.0
50%,77.0
75%,176.0
max,2599.0


In [301]:
#LE-User
merged[['user_id','play_count']].groupby('user_id').sum().describe().round()

Unnamed: 0,play_count
count,19972.0
mean,997.0
std,1571.0
min,10.0
25%,130.0
50%,465.0
75%,1262.0
max,40624.0


In [303]:
males.nunique()

user_id         15557
track_id        99810
play_count        673
user_name       15557
country           153
age                96
gender              1
timestamp       15557
track_artist    40176
tracks_name     86876
dtype: int64

In [304]:
males.play_count.sum()

16508962

In [305]:
#track-user
males[['user_id','track_id']].groupby('user_id').count().describe().round()

Unnamed: 0,track_id
count,15557.0
mean,153.0
std,182.0
min,5.0
25%,33.0
50%,94.0
75%,211.0
max,3025.0


In [306]:
#artist-user
males[['user_id','track_artist']].drop_duplicates().groupby('user_id').count().describe().round()

Unnamed: 0,track_artist
count,15557.0
mean,138.0
std,158.0
min,2.0
25%,31.0
50%,87.0
75%,191.0
max,2599.0


In [307]:
#LE-User
males[['user_id','play_count']].groupby('user_id').sum().describe().round()

Unnamed: 0,play_count
count,15557.0
mean,1061.0
std,1664.0
min,10.0
25%,142.0
50%,511.0
75%,1349.0
max,40624.0


In [308]:
females.nunique()

user_id          4415
track_id        70980
play_count        423
user_name        4415
country           112
age                67
gender              1
timestamp        4415
track_artist    32414
tracks_name     63184
dtype: int64

In [309]:
females.play_count.sum()

3397310

In [310]:
#track-user
females[['user_id','track_id']].groupby('user_id').count().describe().round()

Unnamed: 0,track_id
count,4415.0
mean,101.0
std,121.0
min,5.0
25%,21.0
50%,56.0
75%,136.0
max,1312.0


In [311]:
#artist-user
females[['user_id','track_artist']].drop_duplicates().groupby('user_id').count().describe().round()

Unnamed: 0,track_artist
count,4415.0
mean,93.0
std,110.0
min,3.0
25%,20.0
50%,53.0
75%,126.0
max,1199.0


In [312]:
#LE-User
females[['user_id','play_count']].groupby('user_id').sum().describe().round()

Unnamed: 0,play_count
count,4415.0
mean,769.0
std,1158.0
min,10.0
25%,93.0
50%,330.0
75%,982.0
max,22718.0
