In [336]:
import pandas as pd
import numpy as np

from utils.data_splitter import DataSplitter
from conf import DEMO_PATH,DATA_PATH

In [337]:
# Raw user-track interactions (tab-separated, no header row in the file)
inter = pd.read_csv(
    DATA_PATH,
    sep='\t',
    names=['user_id', 'track_id', 'play_count'],
)
# Demographics; the positional row index is labelled as the user id
demo = pd.read_csv(
    DEMO_PATH,
    sep='\t',
    names=['user_name', 'country', 'age', 'gender', 'timestamp'],
).rename_axis('user_id')
print('Users {} - Items {}'.format(inter['user_id'].nunique(), inter['track_id'].nunique()))

Users 19972 - Items 99831


In [338]:
# Demographic attribute along which the users are grouped
trait = 'gender'
ds = DataSplitter(DATA_PATH, DEMO_PATH)
user_groups = ds.get_user_groups(trait)

print('Statistics for demographic trait <{}>'.format(trait))

# Running extremes of the group sizes; they stay at +/-inf if there are no groups
max_users, min_users = -np.inf, np.inf
for user_group in user_groups:
    n_users = len(user_group.uids)
    print('{} - {:3d} users'.format(user_group.name, n_users))
    max_users = max(max_users, n_users)
    min_users = min(min_users, n_users)


Statistics for demographic trait <gender>
m - 15557 users
f - 4415 users


#### Downsampling

In [339]:
def down_sample(user_group,min_value,inter,demo,seed=None):
    """Randomly remove users of ``user_group`` until ``min_value`` of them remain.

    The users that survive (from every group, not only ``user_group``) are
    re-numbered with consecutive ids ``0..n-1`` in order of first appearance in
    ``inter``; both returned frames are ordered by that new id.

    Parameters
    ----------
    user_group : object
        Anything exposing ``uids``, a sized collection of user ids to sample from.
    min_value : int
        Number of users of ``user_group`` that must remain after sampling.
    inter : pandas.DataFrame
        Interactions; must contain a ``user_id`` column. The remaining columns
        are carried over unchanged, after ``user_id``.
    demo : pandas.DataFrame
        Demographics keyed by user id, either through an index/column named
        ``user_id`` or, failing that, through its (single-level) index.
    seed : int, optional
        When given, the users to drop are drawn from a private
        ``numpy.random.RandomState(seed)``; the result is then identical to
        calling ``np.random.seed(seed)`` right before an unseeded call, but the
        global NumPy random state is left untouched. When ``None`` (default)
        the global state is used, exactly as before.

    Returns
    -------
    (down_inter, down_demo) : tuple of pandas.DataFrame
        ``down_inter`` has the new ``user_id`` as first column followed by the
        other columns of ``inter``. ``down_demo`` has the columns of ``demo``
        with one row per surviving user, in new-id order (the id itself is
        dropped so that the row position is the id).

    Raises
    ------
    AssertionError
        If ``user_group`` does not have more than ``min_value`` users.
    """
    n_samp = len(user_group.uids) - min_value

    assert n_samp > 0, 'Cannot down_sample below the min_value!'

    # Sampling randomly without replacement uids from the specified user_group
    if seed is None:
        chosen = np.random.choice(user_group.uids,n_samp,replace=False)
    else:
        chosen = np.random.RandomState(seed).choice(user_group.uids,n_samp,replace=False)

    # Dropping the sampled users from the interactions
    kept_inter = inter[~inter.user_id.isin(chosen)]

    # Assigning a new, gap-free user_id in order of first appearance
    id_map = kept_inter[['user_id']].drop_duplicates().copy()
    id_map['new_user_id'] = range(len(id_map))

    # A stable sort keeps each user's interactions in their original relative order
    down_inter = (
        kept_inter.merge(id_map, on='user_id')
        .sort_values('new_user_id', kind='mergesort')
        .drop(columns='user_id')
        .rename(columns={'new_user_id':'user_id'})
    )
    # The renamed id column ends up last; move it back to the front
    down_inter = down_inter[['user_id'] + list(down_inter.columns[:-1])]

    # Do not depend on the caller having named the index 'user_id' beforehand
    demo_flat = demo.reset_index()
    if 'user_id' not in demo_flat.columns:
        demo_flat = demo.rename_axis('user_id').reset_index()

    down_demo = (
        pd.merge(demo_flat, id_map, on='user_id')
        .sort_values('new_user_id')
        .drop(columns=['user_id','new_user_id'])
    )

    return down_inter,down_demo

In [130]:
# Fix the global NumPy seed so the set of removed users is reproducible
np.random.seed(42)
# user_groups[0] is the group being shrunk to the size of the smallest group
down_inter, down_demo = down_sample(
    user_group=user_groups[0],
    min_value=min_users,
    inter=inter,
    demo=demo,
)
print('Users {} - Items {}'.format(down_inter['user_id'].nunique(), down_inter['track_id'].nunique()))

Users 8830 - Items 96896


In [131]:
# Persist both down-sampled tables as headerless, index-free TSV files
down_inter.to_csv(
    '/share/cp/datasets/LFM/LFM-2b/IPM/datasets/user_song_regexp_since_2016_pc_gt_1_user_gte_5_song_gte_5/down_sampled_inter.txt',
    sep='\t',
    index=False,
    header=False,
)
down_demo.to_csv(
    '/share/cp/datasets/LFM/LFM-2b/IPM/datasets/user_song_regexp_since_2016_pc_gt_1_user_gte_5_song_gte_5/down_sampled_demo.txt',
    sep='\t',
    index=False,
    header=False,
)

### Stats for downsampled data

In [313]:
# Down-sampled interactions (note: rebinds `inter`, previously the full data set)
inter = pd.read_csv(
    '/share/cp/datasets/LFM/LFM-2b/IPM/datasets/user_song_regexp_since_2016_pc_gt_1_user_gte_5_song_gte_5/down_sampled_inter.txt',
    sep='\t',
    names=['user_id', 'track_id', 'play_count'],
)
# Down-sampled demographics (note: rebinds `demo` as well)
demo = pd.read_csv(
    '/share/cp/datasets/LFM/LFM-2b/IPM/datasets/user_song_regexp_since_2016_pc_gt_1_user_gte_5_song_gte_5/down_sampled_demo.txt',
    sep='\t',
    names=['user_name', 'country', 'age', 'gender', 'timestamp'],
)
# Track metadata: artist and title, one row per track
tracks = pd.read_csv(
    '/share/cp/datasets/LFM/LFM-2b/IPM/datasets/user_song_regexp_since_2016_pc_gt_1_user_gte_5_song_gte_5/sampled_100000_items_tracks.txt',
    sep='\t',
    names=['track_artist', 'track_name'],
)

In [314]:
# Label the positional indices so that reset_index() exposes them as join keys
tracks.index.names = ['track_id']
demo.index.names = ['user_id']

In [316]:
# Attach track metadata, then user demographics, to every interaction
# (both joins use the columns the two frames have in common)
merged = pd.merge(
    pd.merge(inter, tracks.reset_index()),
    demo.reset_index(),
)

In [319]:
# Per-gender views of the merged table
males = merged.loc[merged['gender'].eq('m')]
females = merged.loc[merged['gender'].eq('f')]

In [320]:
# Number of distinct values in each column of the merged table
merged.nunique()

user_id          8830
track_id        96896
play_count        556
track_artist    39440
track_name      84519
user_name        8830
country           133
age                83
gender              2
timestamp        8830
dtype: int64

In [321]:
# Total play count over all interactions
merged['play_count'].sum()

8222476

In [322]:
# Tracks per user: number of interaction rows of each user, summarized
merged.groupby('user_id')[['track_id']].count().describe().round()

Unnamed: 0,track_id
count,8830.0
mean,128.0
std,162.0
min,5.0
25%,26.0
50%,72.0
75%,175.0
max,3025.0


In [323]:
# Artists per user: distinct (user, artist) pairs counted per user, summarized
(
    merged
    .drop_duplicates(subset=['user_id', 'track_artist'])
    .groupby('user_id')[['track_artist']]
    .count()
    .describe()
    .round()
)

Unnamed: 0,track_artist
count,8830.0
mean,117.0
std,142.0
min,3.0
25%,25.0
50%,68.0
75%,160.0
max,2599.0


In [324]:
# Play count per user ("LE" presumably stands for listening events), summarized
merged.groupby('user_id')[['play_count']].sum().describe().round()

Unnamed: 0,play_count
count,8830.0
mean,931.0
std,1569.0
min,10.0
25%,114.0
50%,415.0
75%,1165.0
max,40624.0


In [325]:
# Number of distinct values in each column, restricted to the rows with gender 'f'
females.nunique()

user_id          4415
track_id        70980
play_count        423
track_artist    32414
track_name      63184
user_name        4415
country           112
age                67
gender              1
timestamp        4415
dtype: int64

In [326]:
# Total play count over the interactions of female users
females['play_count'].sum()

3397310

In [327]:
# Tracks per female user: number of interaction rows of each user, summarized
females.groupby('user_id')[['track_id']].count().describe().round()

Unnamed: 0,track_id
count,4415.0
mean,101.0
std,121.0
min,5.0
25%,21.0
50%,56.0
75%,136.0
max,1312.0


In [328]:
# Artists per female user: distinct (user, artist) pairs counted per user, summarized
(
    females
    .drop_duplicates(subset=['user_id', 'track_artist'])
    .groupby('user_id')[['track_artist']]
    .count()
    .describe()
    .round()
)

Unnamed: 0,track_artist
count,4415.0
mean,93.0
std,110.0
min,3.0
25%,20.0
50%,53.0
75%,126.0
max,1199.0


In [330]:
# Play count per female user ("LE" presumably stands for listening events), summarized
females.groupby('user_id')[['play_count']].sum().describe().round()

Unnamed: 0,play_count
count,4415.0
mean,769.0
std,1158.0
min,10.0
25%,93.0
50%,330.0
75%,982.0
max,22718.0


In [331]:
# Number of distinct values in each column, restricted to the rows with gender 'm'
males.nunique()

user_id          4415
track_id        91013
play_count        461
track_artist    37907
track_name      79724
user_name        4415
country           105
age                68
gender              1
timestamp        4415
dtype: int64

In [332]:
# Total play count over the interactions of male users
males['play_count'].sum()

4825166

In [333]:
# Tracks per male user: number of interaction rows of each user, summarized
males.groupby('user_id')[['track_id']].count().describe().round()

Unnamed: 0,track_id
count,4415.0
mean,156.0
std,190.0
min,5.0
25%,33.0
50%,96.0
75%,214.0
max,3025.0


In [334]:
# Artists per male user: distinct (user, artist) pairs counted per user, summarized
(
    males
    .drop_duplicates(subset=['user_id', 'track_artist'])
    .groupby('user_id')[['track_artist']]
    .count()
    .describe()
    .round()
)

Unnamed: 0,track_artist
count,4415.0
mean,141.0
std,164.0
min,3.0
25%,31.0
50%,87.0
75%,194.0
max,2599.0


In [335]:
# Play count per male user ("LE" presumably stands for listening events), summarized
males.groupby('user_id')[['play_count']].sum().describe().round()

Unnamed: 0,play_count
count,4415.0
mean,1093.0
std,1879.0
min,10.0
25%,140.0
50%,525.0
75%,1344.0
max,40624.0
