In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from datetime import datetime
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import ndcg_score
import dask.dataframe as dd
from dask.dataframe import from_pandas

In [2]:
df_node= pd.read_csv("/content/drive/MyDrive/Colab Notebooks/Erdos_tgbn_2024/data/tgbn-genre_node_labels.csv")
df_node['ts']= pd.to_datetime(df_node['ts'], unit= 's') # timestamp to datetime
df_node['ts'] = pd.to_datetime(df_node['ts'].dt.strftime('%Y-%m-%d')) # dont need time because the timestamp is reset to the beginning of day in order to calculate the frequency vector label
df_node['user_id']=df_node['user_id'].str.slice(5).astype('Int64') #convert user_id to integers
df_node # remember weight is the frequency vector entries, not the actual weight

Unnamed: 0,ts,user_id,genre,weight
0,2005-02-15,54,chillout,0.015835
1,2005-02-15,54,female vocalist,0.015330
2,2005-02-15,54,downtempo,0.008128
3,2005-02-15,54,electronic,0.072162
4,2005-02-15,54,reggae,0.021465
...,...,...,...,...
2741930,2009-06-12,651,katy perry,0.004864
2741931,2009-06-12,724,bebop,0.041841
2741932,2009-06-12,724,jazz,0.958159
2741933,2009-06-12,802,Britney Spears,0.359504


In [3]:
df_node.user_id.nunique()

974

In [4]:
df_node.genre.nunique()

513

# Train Validation Test Split

In [5]:
np.quantile(range(2741935), [0.7,0.85]) #70-15-15 split of the rows. Would have been more sensible to split the timestamp
# But to follow the paper' result we split like this

array([1919353.8, 2330643.9])

In [6]:
print(df_node['ts'][1919354])
print(df_node['ts'][2330644])

2008-06-27 00:00:00
2008-12-07 00:00:00


In [7]:
train= df_node.loc[df_node['ts']<datetime(2008,6,28)]
val= df_node.loc[(datetime(2008,6,28)<=df_node['ts']) & (df_node['ts']<=datetime(2008,12,7))]
test= df_node.loc[datetime(2008,12,7)<df_node['ts']]

In [8]:
train

Unnamed: 0,ts,user_id,genre,weight
0,2005-02-15,54,chillout,0.015835
1,2005-02-15,54,female vocalist,0.015330
2,2005-02-15,54,downtempo,0.008128
3,2005-02-15,54,electronic,0.072162
4,2005-02-15,54,reggae,0.021465
...,...,...,...,...
1920443,2008-06-27,999,juno,0.007919
1920444,2008-06-27,999,Soundtrack,0.007285
1920445,2008-06-27,999,post rock,0.020162
1920446,2008-06-27,999,icelandic,0.018164


In [9]:
print("The number of unique users in training set is {}".format(train['user_id'].nunique()))
print("The number of unique users in validation set is {}".format(val['user_id'].nunique()))
print("The number of unique users in test set is {}".format(test['user_id'].nunique()))

The number of unique users in training set is 841
The number of unique users in validation set is 556
The number of unique users in test set is 579


In [10]:
print("The validation users are in the training set: {}".format(set(val['user_id']).issubset(set(train['user_id']))))
print("The test users are in the training set: {}".format(set(test['user_id']).issubset(set(train['user_id']))))
# this is a problem

The validation users are in the training set: False
The test users are in the training set: False


In [11]:
print("The number of users in validation set but not in the training set is {}".format(len(set(val['user_id'])-set(train['user_id']))))
print("The number of users in test set but not in the training set is {}".format(len(set(test['user_id'])-set(train['user_id']))))
print("The number of users in test set but not in the validation set is {}".format(len(set(test['user_id'])-set(val['user_id']))))

The number of users in validation set but not in the training set is 68
The number of users in test set but not in the training set is 127
The number of users in test set but not in the validation set is 90


# Some data preprocessing

In [12]:
print(train['genre'].nunique())
print(df_node['genre'].nunique())
# This is a problem

512
513


In [13]:
set(df_node['genre'])-set(train['genre'])

{'jazmine sullivan'}

We will augment an auxillary row for processing

In [14]:
row = {'ts':datetime(2008,6,27), 'user_id': 999, 'genre':'jazmine sullivan','weight':0.0}
train = train._append(row, ignore_index = True)
train

Unnamed: 0,ts,user_id,genre,weight
0,2005-02-15,54,chillout,0.015835
1,2005-02-15,54,female vocalist,0.015330
2,2005-02-15,54,downtempo,0.008128
3,2005-02-15,54,electronic,0.072162
4,2005-02-15,54,reggae,0.021465
...,...,...,...,...
1920444,2008-06-27,999,Soundtrack,0.007285
1920445,2008-06-27,999,post rock,0.020162
1920446,2008-06-27,999,icelandic,0.018164
1920447,2008-06-27,999,ambient,0.007128


In [15]:
df_node['genre'].nunique()==train['genre'].nunique()

True

# Model 1:  Baseline prediction
We use the last seen user label vector in the training set to predict the validation and test labels

In [16]:
train_labels= train.groupby(['user_id','genre']).agg({'weight': 'last'}).reset_index() #last will give the latest weight
train_labels.rename(columns={'weight':'weight_latest'}, inplace=True)
train_labels # we will use this to make prediction

Unnamed: 0,user_id,genre,weight_latest
0,1,80s,0.400000
1,1,Coldplay,0.081038
2,1,Drum and bass,0.050248
3,1,Grunge,0.136364
4,1,Hip-Hop,0.112609
...,...,...,...
96157,1000,swing,0.005391
96158,1000,techno,0.007496
96159,1000,trip hop,0.023990
96160,1000,turntablism,0.011806


In [17]:
train_labels_mean= train.groupby(['user_id','genre']).agg({'weight': 'mean'}).reset_index() #last will give the latest weight
train_labels_median= train.groupby(['user_id','genre']).agg({'weight': 'median'}).reset_index()
train_labels_mean.rename(columns={'weight':'weight_mean'}, inplace=True)
train_labels_median.rename(columns={'weight':'weight_median'}, inplace=True)
train_labels['weight_mean']= train_labels_mean['weight_mean']
train_labels['weight_median']= train_labels_median['weight_median']
train_labels

Unnamed: 0,user_id,genre,weight_latest,weight_mean,weight_median
0,1,80s,0.400000,0.292930,0.333333
1,1,Coldplay,0.081038,0.054871,0.052144
2,1,Drum and bass,0.050248,0.094912,0.066023
3,1,Grunge,0.136364,0.136364,0.136364
4,1,Hip-Hop,0.112609,0.317396,0.112609
...,...,...,...,...,...
96157,1000,swing,0.005391,0.005391,0.005391
96158,1000,techno,0.007496,0.007496,0.007496
96159,1000,trip hop,0.023990,0.021676,0.021676
96160,1000,turntablism,0.011806,0.011806,0.011806


# Note that not all users have weight for all 513 genres. We need to fill the missing ones with 0.0.

In [18]:
train_labels= train_labels.groupby(["user_id",'genre'])[["weight_latest","weight_mean", "weight_median"]].first().unstack(fill_value=0.0).stack().reset_index()
train_labels # note 841*512=431433

Unnamed: 0,user_id,genre,weight_latest,weight_mean,weight_median
0,1,00s,0.0,0.0,0.0
1,1,1970s,0.0,0.0,0.0
2,1,1980s,0.0,0.0,0.0
3,1,1990s,0.0,0.0,0.0
4,1,2000s,0.0,0.0,0.0
...,...,...,...,...,...
431428,1000,westlife,0.0,0.0,0.0
431429,1000,whitney houston,0.0,0.0,0.0
431430,1000,world,0.0,0.0,0.0
431431,1000,wu-tang,0.0,0.0,0.0


#Model 1: Baseline
Using last seen label in the training set to make prediction on test set

Preparing the validation and test sets. We need to make sure that each timestamp, user pair has 513 genre entries. We put 0.0 for entries not in the dataset. 0.0 means no interaction.

In [19]:
val_labels= val.groupby(["ts", "user_id",'genre'])["weight"].first().unstack(fill_value=0).stack().reset_index()
val_labels.columns = [*val.columns[:-1], 'weight']
val_labels

Unnamed: 0,ts,user_id,genre,weight
0,2008-06-28,2,00s,0.0
1,2008-06-28,2,1970s,0.0
2,2008-06-28,2,1980s,0.0
3,2008-06-28,2,1990s,0.0
4,2008-06-28,2,2000s,0.0
...,...,...,...,...
18845563,2008-12-07,1000,westlife,0.0
18845564,2008-12-07,1000,whitney houston,0.0
18845565,2008-12-07,1000,world,0.0
18845566,2008-12-07,1000,wu-tang,0.0


In [20]:
test_labels= test.groupby(["ts", "user_id",'genre'])["weight"].first().unstack(fill_value=0).stack().reset_index()
test_labels.columns = [*test.columns[:-1], 'weight']
test_labels

Unnamed: 0,ts,user_id,genre,weight
0,2008-12-08,1,00s,0.0
1,2008-12-08,1,1970s,0.0
2,2008-12-08,1,1980s,0.0
3,2008-12-08,1,1990s,0.0
4,2008-12-08,1,2000s,0.0
...,...,...,...,...
18684994,2009-06-12,802,westlife,0.0
18684995,2009-06-12,802,whitney houston,0.0
18684996,2009-06-12,802,world,0.0
18684997,2009-06-12,802,wu-tang,0.0


In [21]:
# need to use dask for merging. pandas cannot handle large datasets with current colab memory.
train_labels_dask = from_pandas(train_labels, npartitions=10)
val_labels_dask = from_pandas(val_labels, npartitions=10)
test_labels_dask = from_pandas(test_labels, npartitions=10)

In [22]:
# left join val_labels and train_labels on user_id, genre pair 
df_val= dd.merge(val_labels_dask, train_labels_dask, how='left', on=['user_id','genre'])
df_test = dd.merge(test_labels_dask, train_labels_dask, how='left', on=['user_id','genre'])

In [23]:
# from dask back to pandas
df_pred_val= df_val.compute()
df_pred_val

Unnamed: 0,ts,user_id,genre,weight,weight_latest,weight_mean,weight_median
0,2008-06-28,2,Avant-Garde,0.0,0.000000,0.000000,0.000000
1,2008-06-28,2,Avant-garde Metal,0.0,0.000000,0.000000,0.000000
2,2008-06-28,2,Eminem,0.0,0.000000,0.000000,0.000000
3,2008-06-28,2,Grime,0.0,0.000000,0.000000,0.000000
4,2008-06-28,2,Grunge,0.0,0.002218,0.023756,0.005706
...,...,...,...,...,...,...,...
1874741,2008-12-07,1000,trumpet,0.0,0.000000,0.000000,0.000000
1874742,2008-12-07,1000,upbeat,0.0,0.000000,0.000000,0.000000
1874743,2008-12-07,1000,vocal,0.0,0.000000,0.000000,0.000000
1874744,2008-12-07,1000,whitney houston,0.0,0.000000,0.000000,0.000000


In [24]:
# from dask back to pandas
df_pred_test = df_test.compute()
df_pred_test

Unnamed: 0,ts,user_id,genre,weight,weight_latest,weight_mean,weight_median
0,2008-12-25,681,amazing,0.0,0.032967,0.032967,0.032967
1,2008-12-25,681,atmospheric,0.0,0.000000,0.000000,0.000000
2,2008-12-25,681,australian,0.0,0.005625,0.015575,0.015450
3,2008-12-25,681,beyonce,0.0,0.000000,0.000000,0.000000
4,2008-12-25,681,bjm radio,0.0,0.002825,0.018766,0.013160
...,...,...,...,...,...,...,...
1868421,2009-06-12,802,prda,0.0,0.000000,0.000000,0.000000
1868422,2009-06-12,802,psychobilly,0.0,0.053121,0.053121,0.053121
1868423,2009-06-12,802,sexy,0.0,0.033983,0.008373,0.005677
1868424,2009-06-12,802,swedish,0.0,0.026089,0.045749,0.022063


In [25]:
df_pred_val['weight_latest'] = df_pred_val['weight_latest'].fillna(0.0) # fill null values with 0.0
df_pred_val['weight_mean'] = df_pred_val['weight_mean'].fillna(0.0) # fill null values with 0.0
df_pred_val['weight_median'] = df_pred_val['weight_median'].fillna(0.0) # fill null values with 0.0
df_pred_test['weight_latest'] = df_pred_test['weight_latest'].fillna(0.0) # fill null values with 0.0
df_pred_test['weight_mean'] = df_pred_test['weight_mean'].fillna(0.0) # fill null values with 0.0
df_pred_test['weight_median'] = df_pred_test['weight_median'].fillna(0.0) # fill null values with 0.0

# NDCG Scores

Baseline: Latest node lable

In [26]:
print("The ndcg score on validation set where prediction is the latest observed node label in training set.")
print(ndcg_score(np.array(df_pred_val['weight']).reshape(-1,513),np.array(df_pred_val['weight_latest']).reshape(-1,513), k=10))

The ndcg score on validation set where prediction is the latest observed node label in training set.
0.13992136544825043


In [27]:
print("The ndcg score on test set where prediction is the latest observed node label in training set.")
print(ndcg_score(np.array(df_pred_test['weight']).reshape(-1,513),np.array(df_pred_test['weight_latest']).reshape(-1,513), k=10))

The ndcg score where prediction is the latest observed node label in training set.
0.13143477238735699


Model 2: Mean node label

In [28]:
print("The ndcg score on validation set where prediction is mean observed node label in training set.")
print(ndcg_score(np.array(df_pred_val['weight']).reshape(-1,513),np.array(df_pred_val['weight_mean']).reshape(-1,513), k=10))

The ndcg score on validation set where prediction is mean observed node label in training set.
0.17574126985144523


In [29]:
print("The ndcg score where prediction is mean of the observed node labels in training set.")
print(ndcg_score(np.array(df_pred_test['weight']).reshape(-1,513),np.array(df_pred_test['weight_mean']).reshape(-1,513), k=10))

The ndcg score where prediction is mean of the observed node labels in training set.
0.1674474915028928


Model 3: Median node label

In [30]:
print("The ndcg score on validation set where prediction is median observed node label in training set.")
print(ndcg_score(np.array(df_pred_val['weight']).reshape(-1,513),np.array(df_pred_val['weight_median']).reshape(-1,513), k=10))

The ndcg score on validation set where prediction is median observed node label in training set.
0.16354703818554217


In [31]:
print("The ndcg score where prediction is the median of the observed node labels in training set.")
print(ndcg_score(np.array(df_pred_test['weight']).reshape(-1,513),np.array(df_pred_test['weight_median']).reshape(-1,513), k=10))

The ndcg score where prediction is the median of the observed node labels in training set.
0.15654098634367597
