In [1]:
import time


import numpy as np
import os
import pandas as pd # noqa
import dask.dataframe as dd # noqa
from dask.diagnostics import ProgressBar 
from pathlib import Path

In [2]:
out_folder = Path("./data/apc")

out_folder.mkdir(exist_ok= True)


In [3]:
cols = ['user', 'artist', 'album', 'track', 'timestamp']
full_le = dd.read_csv('data/LFM-1b/LFM-1b_LEs.txt', sep="\t", names = cols)

In [4]:
full_le.head()

Unnamed: 0,user,artist,album,track,timestamp
0,31435741,2,4,4,1385212958
1,31435741,2,4,4,1385212642
2,31435741,2,4,4,1385212325
3,31435741,2,4,4,1385209508
4,31435741,2,4,4,1385209191


Calculating Artist Play Counts

In [None]:
artist_play_counts = full_le.groupby("artist").count()
with ProgressBar():
    artist_play_counts_computed = artist_play_counts.compute()

[#                                       ] | 2% Completed |  7.2s


KeyboardInterrupt: 

In [None]:
artist_play_counts_computed.to_csv(out_folder / Path("artist_apc.csv"))

Calculate APC per User


In [12]:
artist_play_counts_per_user = full_le.groupby(["artist", "user"]).count()
with ProgressBar():
    artist_play_counts_per_user_computed = artist_play_counts_per_user.compute()

[########################################] | 100% Completed |  5min 55.4s


In [19]:
artist_play_counts_per_user_computed["album"].to_csv(out_folder / Path("artist_apc_u.csv"))

In [20]:
artist_play_counts_per_user_computed.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,album,track,timestamp
artist,user,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,2115525,41,41,41
1,3256665,1,1,1
1,4812967,391,391,391
1,5855962,2,2,2
1,7438644,29,29,29


In [18]:
artist_play_counts_per_user_computed["album"].head()

artist  user   
1       2115525     41
        3256665      1
        4812967    391
        5855962      2
        7438644     29
Name: album, dtype: int64

## Calculating G_R_APC

In [58]:
artist_play_counts_per_user_computed = dd.read_csv(out_folder / Path("artist_apc_u.csv"))
artist_play_counts_computed = dd.read_csv(out_folder / Path("artist_apc.csv"))

In [60]:
print(artist_play_counts_per_user_computed.columns)
artist_play_counts_per_user_computed.head()

Index(['artist', 'user', 'album'], dtype='object')


Unnamed: 0,artist,user,album
0,1,2115525,41
1,1,3256665,1
2,1,4812967,391
3,1,5855962,2
4,1,7438644,29


In [10]:
artist_play_counts_computed.head()

Unnamed: 0,artist,user,album,track,timestamp
0,1,1151161,1151161,1151161,1151161
1,2,91452,91452,91452,91452
2,3,1444213,1444213,1444213,1444213
3,4,832547,832547,832547,832547
4,5,36469,36469,36469,36469


Removing unsused columns & rename columns

In [62]:
artist_play_counts_computed = artist_play_counts_computed.rename(columns= {"timestamp" : "apc"}).drop(columns= ["album", "track", "user"])
artist_play_counts_computed.head()

Unnamed: 0,artist,apc
0,1,1151161
1,2,91452
2,3,1444213
3,4,832547
4,5,36469


In [61]:
artist_play_counts_per_user_computed = artist_play_counts_per_user_computed.rename(columns = {"album" : "apc_u"})
artist_play_counts_per_user_computed.head()

Unnamed: 0,artist,user,apc_u
0,1,2115525,41
1,1,3256665,1
2,1,4812967,391
3,1,5855962,2
4,1,7438644,29


Calculate Rank

In [63]:
artist_play_counts_ordered = artist_play_counts_computed.sort_values("apc", ascending=False)

with ProgressBar():
    artist_play_counts_ordered = artist_play_counts_ordered.compute()

[########################################] | 100% Completed |  2.2s


In [64]:
artist_play_counts_ranked = artist_play_counts_ordered.reset_index(drop=True).reset_index().rename(columns = {"index" : "rank"})
artist_play_counts_ranked["rank"] +=1

artist_play_counts_ranked.head()

Unnamed: 0,rank,artist,apc
0,1,368,3838611
1,2,1602,3437327
2,3,54,2990318
3,4,46,2576390
4,5,320,2523539


In [75]:

artist_play_counts_per_user_computed = artist_play_counts_per_user_computed.reset_index(drop = True)
artist_play_counts_per_user_computed = artist_play_counts_per_user_computed.categorize(columns= ["user"])


In [76]:
with ProgressBar():
    artist_play_counts_per_user_pivo = artist_play_counts_per_user_computed.pivot_table(columns = "user", values= ["apc_u"], index = "artist")

In [88]:
with ProgressBar():
    artist_play_counts_per_user_pivo.compute()

[####                                    ] | 10% Completed |  6.2s


MemoryError: Unable to allocate 167. GiB for an array with shape (44870359918,) and data type int32

In [118]:
# foreach user --> compute artis rank
# compare with given artis rank
with ProgressBar():
    artis_play_counts_user_x = artist_play_counts_per_user_computed[artist_play_counts_per_user_computed["user"] == 17102068]

In [119]:
with ProgressBar():
    artis_play_counts_user_x = artis_play_counts_user_x.compute()

[########################################] | 100% Completed |  8.7s


In [120]:
artis_play_counts_user_x

Unnamed: 0,artist,user,apc_u
1027585,8,17102068,593
1027642,15,17102068,242
1028028,59,17102068,376
1028088,69,17102068,267
1028230,92,17102068,124
...,...,...,...
1074422,467330,17102068,2
1074815,513362,17102068,1
1075074,562130,17102068,2
1075075,562134,17102068,2


In [121]:
with ProgressBar():
    artis_play_counts_user_x_sorted = artis_play_counts_user_x.sort_values("apc_u", ascending= False)

In [122]:
#artis_play_counts_user_x_sorted = artis_play_counts_user_x_sorted.drop(columns= ["rank"])
artis_play_counts_user_x_sorted = artis_play_counts_user_x_sorted.reset_index(drop=True).reset_index().rename(columns = {"index" : "rank"})
artis_play_counts_user_x_sorted["rank"] += 1
artis_play_counts_user_x_sorted = artis_play_counts_user_x_sorted.sort_values("artist", ascending= False)
artis_play_counts_user_x_sorted.head()


Unnamed: 0,rank,artist,user,apc_u
66,67,619010,17102068,2
46,47,562134,17102068,2
53,54,562130,17102068,2
72,73,513362,17102068,1
47,48,467330,17102068,2


In [123]:
artist_play_counts_ranked = artist_play_counts_ranked.sort_values("artist", ascending= False)

artist_play_counts_ranked.head()

Unnamed: 0,rank,artist,apc
3123495,3123496,3190370,1
2159518,2159519,3190369,1
2028640,2028641,3190368,2
2159345,2159346,3190367,1
2158805,2158806,3190366,1


In [127]:
import numpy as np

def normalised_kendall_tau_distance(values1, values2):
    """Compute the Kendall tau distance."""
    n = len(values1)
    assert len(values2) == n, "Both lists have to be of equal length"
    i, j = np.meshgrid(np.arange(n), np.arange(n))
    a = np.argsort(values1)
    b = np.argsort(values2)
    ndisordered = np.logical_or(np.logical_and(a[i] < a[j], b[i] > b[j]), np.logical_and(a[i] > a[j], b[i] < b[j])).sum()
    return ndisordered / (n * (n - 1))

In [128]:
with ProgressBar():
    print(artist_play_counts_ranked["apc"].corr(artis_play_counts_user_x_sorted["apc_u"], method = "kendall"))
    print(normalised_kendall_tau_distance(artist_play_counts_ranked["apc"],artis_play_counts_user_x_sorted["apc_u"]))

0.9247873943761031


AssertionError: Both lists have to be of equal length

In [143]:
artist_joined = artist_play_counts_ranked.join(artis_play_counts_user_x_sorted.loc[:,["artist", "apc_u"]], on = "artist", how = "left", rsuffix = "_right")
artist_joined = artist_joined.drop(columns = "artist_right")
artist_joined = artist_joined.fillna(0)

In [157]:
print(artist_joined[artist_joined["artist"] == 619010])


    rank_u    rank  artist  apc  apc_u
51      52  218603  619010  132    2.0


In [158]:
print(artis_play_counts_user_x_sorted[artis_play_counts_user_x_sorted["artist"] == 619010])

    rank  artist      user  apc_u
66    67  619010  17102068      2


In [154]:
artist_joined = artist_play_counts_ranked.merge(artis_play_counts_user_x_sorted.loc[:,["artist", "apc_u"]], left_on = "artist", right_on = "artist", how = "left")
artist_joined = artist_joined.fillna(0)

In [155]:
artist_joined.head()

Unnamed: 0,rank,artist,apc,apc_u
0,3123496,3190370,1,0.0
1,2159519,3190369,1,0.0
2,2028641,3190368,2,0.0
3,2159346,3190367,1,0.0
4,2158806,3190366,1,0.0


In [156]:
artist_joined = artist_joined.sort_values("apc_u", ascending= False).reset_index(drop= True).reset_index()
artist_joined = artist_joined.rename(columns = {"index" : "rank_u"})
artist_joined["rank_u"] += 1
artist_joined.head(50)

Unnamed: 0,rank_u,rank,artist,apc,apc_u
0,1,1099,9058,154568,965.0
1,2,1346,8945,127921,782.0
2,3,348,6122,374287,663.0
3,4,38,171,1339469,650.0
4,5,70,105,1025936,622.0
5,6,115,8,763135,593.0
6,7,1623,4011,106298,483.0
7,8,15,283,1713020,400.0
8,9,378,6779,355917,390.0
9,10,154,59,660235,376.0


In [89]:
artist_play_counts_per_user_computed.head()

Unnamed: 0,artist,user,apc_u
0,1,2115525,41
1,1,3256665,1
2,1,4812967,391
3,1,5855962,2
4,1,7438644,29


In [97]:
with ProgressBar():
    artist_play_counts_per_user_grouped = artist_play_counts_per_user_computed.sort_values("apc_u", ascending= False).groupby("user")


[########################################] | 100% Completed | 17.2s


In [98]:
artist_play_counts_per_user_grouped.head()

AttributeError: 'Column not found: head'

In [94]:
with ProgressBar():
    artist_play_counts_per_user_grouped = artist_play_counts_per_user_grouped.compute()

AttributeError: 'Column not found: compute'

In [159]:
artist_joined.head()

Unnamed: 0,rank_u,rank,artist,apc,apc_u
0,1,1099,9058,154568,965.0
1,2,1346,8945,127921,782.0
2,3,348,6122,374287,663.0
3,4,38,171,1339469,650.0
4,5,70,105,1025936,622.0


In [167]:
artis_play_counts_user_x_sorted.shape

Unnamed: 0,rank,artist,user,apc_u
66,67,619010,17102068,2
46,47,562134,17102068,2
53,54,562130,17102068,2
72,73,513362,17102068,1
47,48,467330,17102068,2
...,...,...,...,...
18,19,92,17102068,124
11,12,69,17102068,267
9,10,59,17102068,376
12,13,15,17102068,242


In [168]:
artist_joined["rank_u2"] = artist_joined["rank_u"].copy()

In [171]:
artist_joined.loc[artist_joined["rank_u2"] > 92 ,"rank_u2"] = 93

In [174]:
x = artist_joined["apc"].map_partitions(pd.rank)
x.head()

AttributeError: 'Series' object has no attribute 'map_partitions'

In [173]:
artist_joined["rank"].corr(artist_joined["rank_u2"], method = "kendall")

0.007161075381418699

In [181]:
#artist_joined_df = pd.DataFrame(artist_joined.compute())
artist_joined["artist_rank_new"] = artist_joined["apc"].rank(ascending = False)

In [182]:
artist_joined["artist_u_rank_new"] = artist_joined["apc_u"].rank(ascending = False)

In [183]:
artist_joined.head()

Unnamed: 0,rank_u,rank,artist,apc,apc_u,rank_u2,artist_rank_new,artist_u_rank_new
0,1,1099,9058,154568,965.0,1,1099.0,1.0
1,2,1346,8945,127921,782.0,2,1346.0,2.0
2,3,348,6122,374287,663.0,3,348.0,3.0
3,4,38,171,1339469,650.0,4,38.0,4.0
4,5,70,105,1025936,622.0,5,70.0,5.0


In [188]:
artist_joined["artist_u_rank_new"].corr(artist_joined["artist_rank_new"], method = "spearman")

0.008916231794730375

In [189]:
artist_joined_only_user = artist_joined[artist_joined["apc_u"] != 0.0]

In [190]:
artist_joined_only_user["artist_u_rank_new"].corr(artist_joined_only_user["artist_rank_new"], method = "kendall")


0.6327780908886321

In [191]:
artist_joined_only_user["rank_u"].corr(artist_joined_only_user["rank"], method = "kendall")

0.595795508838987

In [179]:
#print(normalised_kendall_tau_distance(artist_joined["apc"],artist_joined["apc_u"]))
print(normalised_kendall_tau_distance(artist_joined["artist_rank_new"],artist_joined["artist_u_rank_new"]))

MemoryError: Unable to allocate 35.5 TiB for an array with shape (3123496, 3123496) and data type int32