In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from datetime import datetime
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import ndcg_score
import dask.dataframe as dd
from dask.dataframe import from_pandas

In [2]:
df_node= pd.read_csv("/content/drive/MyDrive/Colab Notebooks/Erdos_tgbn_2024/data/tgbn-genre_node_labels.csv")
df_node['ts']= pd.to_datetime(df_node['ts'], unit= 's') # timestamp to datetime
df_node['ts'] = pd.to_datetime(df_node['ts'].dt.strftime('%Y-%m-%d')) # dont need time because the timestamp is reset to the beginning of day in order to calculate the frequency vector label
df_node['user_id']=df_node['user_id'].str.slice(5).astype('Int64') #convert user_id to integers
df_node # remember weight is the frequency vector entries, not the actual weight

Unnamed: 0,ts,user_id,genre,weight
0,2005-02-15,54,chillout,0.015835
1,2005-02-15,54,female vocalist,0.015330
2,2005-02-15,54,downtempo,0.008128
3,2005-02-15,54,electronic,0.072162
4,2005-02-15,54,reggae,0.021465
...,...,...,...,...
2741930,2009-06-12,651,katy perry,0.004864
2741931,2009-06-12,724,bebop,0.041841
2741932,2009-06-12,724,jazz,0.958159
2741933,2009-06-12,802,Britney Spears,0.359504


#Train-Val-Test Split

In [3]:
np.quantile(range(2741935), [0.7,0.85]) #70-15-15 split of the rows. Would have been more sensible to split the timestamp
# But to follow the paper' result we split like this

array([1919353.8, 2330643.9])

In [4]:
print(df_node['ts'][1919354])
print(df_node['ts'][2330644])

2008-06-27 00:00:00
2008-12-07 00:00:00


In [5]:
train= df_node.loc[df_node['ts']<datetime(2008,6,28)]
val= df_node.loc[(datetime(2008,6,28)<=df_node['ts']) & (df_node['ts']<=datetime(2008,12,7))]
test= df_node.loc[datetime(2008,12,7)<df_node['ts']]

In [6]:
train

Unnamed: 0,ts,user_id,genre,weight
0,2005-02-15,54,chillout,0.015835
1,2005-02-15,54,female vocalist,0.015330
2,2005-02-15,54,downtempo,0.008128
3,2005-02-15,54,electronic,0.072162
4,2005-02-15,54,reggae,0.021465
...,...,...,...,...
1920443,2008-06-27,999,juno,0.007919
1920444,2008-06-27,999,Soundtrack,0.007285
1920445,2008-06-27,999,post rock,0.020162
1920446,2008-06-27,999,icelandic,0.018164


# Some data preprocessing

In [7]:
set(df_node['genre'])-set(train['genre'])

{'jazmine sullivan'}

We will augment an auxillary row for processing

In [8]:
row = {'ts':datetime(2008,6,27), 'user_id': 999, 'genre':'jazmine sullivan','weight':0.0}
train = train._append(row, ignore_index = True)
train

Unnamed: 0,ts,user_id,genre,weight
0,2005-02-15,54,chillout,0.015835
1,2005-02-15,54,female vocalist,0.015330
2,2005-02-15,54,downtempo,0.008128
3,2005-02-15,54,electronic,0.072162
4,2005-02-15,54,reggae,0.021465
...,...,...,...,...
1920444,2008-06-27,999,Soundtrack,0.007285
1920445,2008-06-27,999,post rock,0.020162
1920446,2008-06-27,999,icelandic,0.018164
1920447,2008-06-27,999,ambient,0.007128


In [9]:
df_node['genre'].nunique()==train['genre'].nunique()

True

# Model :  Rolling average with window size k
We take the average of the last k labels where k is the window size

Firstly we want to know what are the possible window sizes

In [10]:
df_node.groupby(['user_id','genre']).size().describe()

count    122560.000000
mean         22.372185
std          57.450838
min           1.000000
25%           1.000000
50%           5.000000
75%          17.000000
max        1201.000000
dtype: float64

Note that there are entries with only one user genre pair and many as 1201 user genre entries. The average number of entries for a user-genre pair is 22.372185

# Model 4: Window size k=7

In [11]:
train_labels= train.groupby(['user_id','genre']).apply(lambda x: x.tail(7).mean()).reset_index() #note tail=7
train_labels = train_labels.drop('ts', axis=1)
train_labels.rename(columns={'weight':'weight_7'}, inplace=True)
train_labels # we will use this to make prediction

Unnamed: 0,user_id,genre,weight_7
0,1,80s,0.271609
1,1,Coldplay,0.054871
2,1,Drum and bass,0.094912
3,1,Grunge,0.136364
4,1,Hip-Hop,0.382575
...,...,...,...
96157,1000,swing,0.005391
96158,1000,techno,0.007496
96159,1000,trip hop,0.021676
96160,1000,turntablism,0.011806


In [12]:
train_labels[train_labels.isnull().any(axis=1)] # There are no null entries

Unnamed: 0,user_id,genre,weight_7


In [13]:
# testing if the above code is working. checking the first entry
df_node[(df_node['user_id']==1) & (df_node['genre']=='80s')].tail(7)

Unnamed: 0,ts,user_id,genre,weight
676454,2006-11-20,1,80s,0.333333
678575,2006-11-21,1,80s,0.333333
680554,2006-11-22,1,80s,0.102574
692137,2006-11-28,1,80s,0.354545
702754,2006-12-03,1,80s,0.301158
704915,2006-12-04,1,80s,0.076322
719704,2006-12-11,1,80s,0.4


In [14]:
print(df_node[(df_node['user_id']==1) & (df_node['genre']=='80s')].tail(7)['weight'].mean())
print( "Note that this is the first weight entry in the train_labels column")

0.2716094756428934
Note that this is the first weight entry in the train_labels column


Note that not all users have weight for all 513 genres. We need to fill the missing ones with 0.0.

In [15]:
train_labels= train_labels.groupby(["user_id",'genre'])[["weight_7"]].first().unstack(fill_value=0.0).stack().reset_index()
train_labels # note 841*513=431433

Unnamed: 0,user_id,genre,weight_7
0,1,00s,0.0
1,1,1970s,0.0
2,1,1980s,0.0
3,1,1990s,0.0
4,1,2000s,0.0
...,...,...,...
431428,1000,westlife,0.0
431429,1000,whitney houston,0.0
431430,1000,world,0.0
431431,1000,wu-tang,0.0


## Inference

Preparing the validation and test sets. We need to make sure that each timestamp, user pair has 513 genre entries. We put 0.0 for entries not in the dataset. 0.0 means no interaction.

In [16]:
val_labels= val.groupby(["ts", "user_id",'genre'])["weight"].first().unstack(fill_value=0).stack().reset_index()
val_labels.columns = [*val.columns[:-1], 'weight']
val_labels

Unnamed: 0,ts,user_id,genre,weight
0,2008-06-28,2,00s,0.0
1,2008-06-28,2,1970s,0.0
2,2008-06-28,2,1980s,0.0
3,2008-06-28,2,1990s,0.0
4,2008-06-28,2,2000s,0.0
...,...,...,...,...
18845563,2008-12-07,1000,westlife,0.0
18845564,2008-12-07,1000,whitney houston,0.0
18845565,2008-12-07,1000,world,0.0
18845566,2008-12-07,1000,wu-tang,0.0


In [17]:
test_labels= test.groupby(["ts", "user_id",'genre'])["weight"].first().unstack(fill_value=0).stack().reset_index()
test_labels.columns = [*test.columns[:-1], 'weight']
test_labels

Unnamed: 0,ts,user_id,genre,weight
0,2008-12-08,1,00s,0.0
1,2008-12-08,1,1970s,0.0
2,2008-12-08,1,1980s,0.0
3,2008-12-08,1,1990s,0.0
4,2008-12-08,1,2000s,0.0
...,...,...,...,...
18684994,2009-06-12,802,westlife,0.0
18684995,2009-06-12,802,whitney houston,0.0
18684996,2009-06-12,802,world,0.0
18684997,2009-06-12,802,wu-tang,0.0


In [18]:
# need to use dask for merging. pandas cannot handle large datasets with current colab memory.
train_labels_dask = from_pandas(train_labels, npartitions=10)
val_labels_dask = from_pandas(val_labels, npartitions=10)
test_labels_dask = from_pandas(test_labels, npartitions=10)

In [19]:
# left join val_labels and train_labels on user_id, genre pair
df_val= dd.merge(val_labels_dask, train_labels_dask, how='left', on=['user_id','genre'])
df_test = dd.merge(test_labels_dask, train_labels_dask, how='left', on=['user_id','genre'])

In [20]:
# from dask back to pandas
df_pred_val= df_val.compute()
df_pred_val

Unnamed: 0,ts,user_id,genre,weight,weight_7
0,2008-06-28,2,Avant-Garde,0.0,0.000000
1,2008-06-28,2,Avant-garde Metal,0.0,0.000000
2,2008-06-28,2,Eminem,0.0,0.000000
3,2008-06-28,2,Grime,0.0,0.000000
4,2008-06-28,2,Grunge,0.0,0.010346
...,...,...,...,...,...
1874741,2008-12-07,1000,trumpet,0.0,0.000000
1874742,2008-12-07,1000,upbeat,0.0,0.000000
1874743,2008-12-07,1000,vocal,0.0,0.000000
1874744,2008-12-07,1000,whitney houston,0.0,0.000000


In [21]:
# from dask back to pandas
df_pred_test = df_test.compute()
df_pred_test

Unnamed: 0,ts,user_id,genre,weight,weight_7
0,2008-12-25,681,amazing,0.0,0.032967
1,2008-12-25,681,atmospheric,0.0,0.000000
2,2008-12-25,681,australian,0.0,0.015575
3,2008-12-25,681,beyonce,0.0,0.000000
4,2008-12-25,681,bjm radio,0.0,0.018766
...,...,...,...,...,...
1868421,2009-06-12,802,prda,0.0,0.000000
1868422,2009-06-12,802,psychobilly,0.0,0.053121
1868423,2009-06-12,802,sexy,0.0,0.009157
1868424,2009-06-12,802,swedish,0.0,0.068519


In [22]:
df_pred_val['weight_7'] = df_pred_val['weight_7'].fillna(0.0) # fill null values with 0.0
df_pred_test['weight_7'] = df_pred_test['weight_7'].fillna(0.0) # fill null values with 0.0

# NDCG Scores

In [23]:
print("The ndcg score on validation set where prediction is the rolling average (window =7) of observed node labels in training set.")
print(ndcg_score(np.array(df_pred_val['weight']).reshape(-1,513),np.array(df_pred_val['weight_7']).reshape(-1,513), k=10))

The ndcg score on validation set where prediction is the rolling average (window =7) of observed node labels in training set.
0.17564313942318133


In [24]:
print("The ndcg score on test set where prediction is the rolling average (window =7) of observed node labels in training set.")
print(ndcg_score(np.array(df_pred_test['weight']).reshape(-1,513),np.array(df_pred_test['weight_7']).reshape(-1,513), k=10))

The ndcg score on test set where prediction is the rolling average (window =7) of observed node labels in training set.
0.16267624046773704


# Model 5: Window =14

In [25]:
train_labels= train.groupby(['user_id','genre']).apply(lambda x: x.tail(14).mean()).reset_index() #note tail=14
train_labels = train_labels.drop('ts', axis=1)
train_labels.rename(columns={'weight':'weight_14'}, inplace=True)
train_labels # we will use this to make prediction

Unnamed: 0,user_id,genre,weight_14
0,1,80s,0.292930
1,1,Coldplay,0.054871
2,1,Drum and bass,0.094912
3,1,Grunge,0.136364
4,1,Hip-Hop,0.317396
...,...,...,...
96157,1000,swing,0.005391
96158,1000,techno,0.007496
96159,1000,trip hop,0.021676
96160,1000,turntablism,0.011806


In [26]:
train_labels[train_labels.isnull().any(axis=1)] # There are no null entries

Unnamed: 0,user_id,genre,weight_14


In [27]:
print(df_node[(df_node['user_id']==1) & (df_node['genre']=='80s')].tail(14)['weight'].mean())
print( "Note that this is the first weight entry in the train_labels column")

0.29292987551570904
Note that this is the first weight entry in the train_labels column


Note that not all users have weight for all 513 genres. We need to fill the missing ones with 0.0.

In [28]:
train_labels= train_labels.groupby(["user_id",'genre'])[["weight_14"]].first().unstack(fill_value=0.0).stack().reset_index()
train_labels # note 841*513=431433

Unnamed: 0,user_id,genre,weight_14
0,1,00s,0.0
1,1,1970s,0.0
2,1,1980s,0.0
3,1,1990s,0.0
4,1,2000s,0.0
...,...,...,...
431428,1000,westlife,0.0
431429,1000,whitney houston,0.0
431430,1000,world,0.0
431431,1000,wu-tang,0.0


In [29]:
# need to use dask for merging. pandas cannot handle large datasets with current colab memory.
train_labels_dask = from_pandas(train_labels, npartitions=10)

In [30]:
# left join val_labels and train_labels on user_id, genre pair
df_val= dd.merge(val_labels_dask, train_labels_dask, how='left', on=['user_id','genre'])
df_test = dd.merge(test_labels_dask, train_labels_dask, how='left', on=['user_id','genre'])

In [31]:
# from dask back to pandas
df_pred_val= df_val.compute()
df_pred_val

Unnamed: 0,ts,user_id,genre,weight,weight_14
0,2008-06-28,2,Avant-Garde,0.0,0.000000
1,2008-06-28,2,Avant-garde Metal,0.0,0.000000
2,2008-06-28,2,Eminem,0.0,0.000000
3,2008-06-28,2,Grime,0.0,0.000000
4,2008-06-28,2,Grunge,0.0,0.023756
...,...,...,...,...,...
1874741,2008-12-07,1000,trumpet,0.0,0.000000
1874742,2008-12-07,1000,upbeat,0.0,0.000000
1874743,2008-12-07,1000,vocal,0.0,0.000000
1874744,2008-12-07,1000,whitney houston,0.0,0.000000


In [32]:
# from dask back to pandas
df_pred_test = df_test.compute()
df_pred_test

Unnamed: 0,ts,user_id,genre,weight,weight_14
0,2008-12-08,1,3 Doors Down,0.0,0.000000
1,2008-12-08,1,77davez-all-tracks,0.0,0.000000
2,2008-12-08,1,90s,0.0,0.000000
3,2008-12-08,1,Alternative Punk,0.0,0.000000
4,2008-12-08,1,Awesome,0.0,0.000000
...,...,...,...,...,...
1868421,2009-06-12,802,prda,0.0,0.000000
1868422,2009-06-12,802,psychobilly,0.0,0.053121
1868423,2009-06-12,802,sexy,0.0,0.008373
1868424,2009-06-12,802,swedish,0.0,0.060047


In [33]:
df_pred_val['weight_14'] = df_pred_val['weight_14'].fillna(0.0) # fill null values with 0.0
df_pred_test['weight_14'] = df_pred_test['weight_14'].fillna(0.0) # fill null values with 0.0

# NDCG Scores

In [34]:
print("The ndcg score on validation set where prediction is the rolling average (window =14) of observed node labels in training set.")
print(ndcg_score(np.array(df_pred_val['weight']).reshape(-1,513),np.array(df_pred_val['weight_14']).reshape(-1,513), k=10))

The ndcg score on validation set where prediction is the rolling average (window =14) of observed node labels in training set.
0.17976270400071742


In [35]:
print("The ndcg score on test set where prediction is the rolling average (window =14) of observed node labels in training set.")
print(ndcg_score(np.array(df_pred_test['weight']).reshape(-1,513),np.array(df_pred_test['weight_14']).reshape(-1,513), k=10))

The ndcg score on test set where prediction is the rolling average (window =14) of observed node labels in training set.
0.16837764621875076


# Model 6: Window =21

In [36]:
train_labels= train.groupby(['user_id','genre']).apply(lambda x: x.tail(14).mean()).reset_index() #note tail=21
train_labels = train_labels.drop('ts', axis=1)
train_labels.rename(columns={'weight':'weight_21'}, inplace=True)
train_labels # we will use this to make prediction

Unnamed: 0,user_id,genre,weight_21
0,1,80s,0.292930
1,1,Coldplay,0.054871
2,1,Drum and bass,0.094912
3,1,Grunge,0.136364
4,1,Hip-Hop,0.317396
...,...,...,...
96157,1000,swing,0.005391
96158,1000,techno,0.007496
96159,1000,trip hop,0.021676
96160,1000,turntablism,0.011806


In [37]:
train_labels[train_labels.isnull().any(axis=1)] # There are no null entries

Unnamed: 0,user_id,genre,weight_21


In [38]:
print(df_node[(df_node['user_id']==1) & (df_node['genre']=='80s')].tail(21)['weight'].mean())
print( "Note that this is the first weight entry in the train_labels column")

0.29292987551570904
Note that this is the first weight entry in the train_labels column


Note that not all users have weight for all 513 genres. We need to fill the missing ones with 0.0.

In [39]:
train_labels= train_labels.groupby(["user_id",'genre'])[["weight_21"]].first().unstack(fill_value=0.0).stack().reset_index()
train_labels # note 841*513=431433

Unnamed: 0,user_id,genre,weight_21
0,1,00s,0.0
1,1,1970s,0.0
2,1,1980s,0.0
3,1,1990s,0.0
4,1,2000s,0.0
...,...,...,...
431428,1000,westlife,0.0
431429,1000,whitney houston,0.0
431430,1000,world,0.0
431431,1000,wu-tang,0.0


In [40]:
# need to use dask for merging. pandas cannot handle large datasets with current colab memory.
train_labels_dask = from_pandas(train_labels, npartitions=10)

In [41]:
# left join val_labels and train_labels on user_id, genre pair
df_val= dd.merge(val_labels_dask, train_labels_dask, how='left', on=['user_id','genre'])
df_test = dd.merge(test_labels_dask, train_labels_dask, how='left', on=['user_id','genre'])
df_pred_val= df_val.compute()
df_pred_test = df_test.compute()

In [42]:
df_pred_val['weight_21'] = df_pred_val['weight_21'].fillna(0.0) # fill null values with 0.0
df_pred_test['weight_21'] = df_pred_test['weight_21'].fillna(0.0) # fill null values with 0.0

# NDCG Scores

In [43]:
print("The ndcg score on validation set where prediction is the rolling average (window =21) of observed node labels in training set.")
print(ndcg_score(np.array(df_pred_val['weight']).reshape(-1,513),np.array(df_pred_val['weight_21']).reshape(-1,513), k=10))

The ndcg score on validation set where prediction is the rolling average (window =21) of observed node labels in training set.
0.17976270400071742


In [44]:
print("The ndcg score on test set where prediction is the rolling average (window =21) of observed node labels in training set.")
print(ndcg_score(np.array(df_pred_test['weight']).reshape(-1,513),np.array(df_pred_test['weight_21']).reshape(-1,513), k=10))

The ndcg score on test set where prediction is the rolling average (window =21) of observed node labels in training set.
0.16818096092718438
