In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from datetime import datetime
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import ndcg_score
import dask.dataframe as dd
from dask.dataframe import from_pandas

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
df_node= pd.read_csv("/content/drive/MyDrive/Colab Notebooks/Erdos_tgbn_2024/data/tgbn-genre_node_labels.csv")
df_node['ts']= pd.to_datetime(df_node['ts'], unit= 's') # timestamp to datetime
df_node['ts'] = pd.to_datetime(df_node['ts'].dt.strftime('%Y-%m-%d')) # dont need time because the timestamp is reset to the beginning of day in order to calculate the frequency vector label
df_node['user_id']=df_node['user_id'].str.slice(5).astype('Int64') #convert user_id to integers
df_node # remember weight is the frequency vector entries, not the actual weight

Unnamed: 0,ts,user_id,genre,weight
0,2005-02-15,54,chillout,0.015835
1,2005-02-15,54,female vocalist,0.015330
2,2005-02-15,54,downtempo,0.008128
3,2005-02-15,54,electronic,0.072162
4,2005-02-15,54,reggae,0.021465
...,...,...,...,...
2741930,2009-06-12,651,katy perry,0.004864
2741931,2009-06-12,724,bebop,0.041841
2741932,2009-06-12,724,jazz,0.958159
2741933,2009-06-12,802,Britney Spears,0.359504


In [4]:
df_node.user_id.nunique()

974

In [5]:
df_node.genre.nunique()

513

# Train Validation Test Split

In [6]:
np.quantile(df_node['ts'].unique(), [0.7,0.85]) #70-15-15 split of the timestamps

array(['2008-02-24T14:23:59.999999993', '2008-10-18T07:11:59.999999996'],
      dtype='datetime64[ns]')

In [7]:
train= df_node.loc[df_node['ts']<datetime(2008,2,25)]
val= df_node.loc[(datetime(2008,2,25)<=df_node['ts']) & (df_node['ts']<=datetime(2008,10,18))]
test= df_node.loc[datetime(2008,10,18)<df_node['ts']]

In [8]:
train

Unnamed: 0,ts,user_id,genre,weight
0,2005-02-15,54,chillout,0.015835
1,2005-02-15,54,female vocalist,0.015330
2,2005-02-15,54,downtempo,0.008128
3,2005-02-15,54,electronic,0.072162
4,2005-02-15,54,reggae,0.021465
...,...,...,...,...
1638965,2008-02-24,995,pop,0.017661
1638966,2008-02-24,995,acoustic,0.146607
1638967,2008-02-24,995,soul,0.088105
1638968,2008-02-24,995,jazz,0.050220


In [9]:
print("The number of unique users in training set is {}".format(train['user_id'].nunique()))
print("The number of unique users in validation set is {}".format(val['user_id'].nunique()))
print("The number of unique users in test set is {}".format(test['user_id'].nunique()))

The number of unique users in training set is 787
The number of unique users in validation set is 585
The number of unique users in test set is 600


In [10]:
print("The validation users are in the training set: {}".format(set(val['user_id']).issubset(set(train['user_id']))))
print("The test users are in the training set: {}".format(set(test['user_id']).issubset(set(train['user_id']))))
# this is a problem

The validation users are in the training set: False
The test users are in the training set: False


In [11]:
print("The number of users in validation set but not in the training set is {}".format(len(set(val['user_id'])-set(train['user_id']))))
print("The number of users in test set but not in the training set is {}".format(len(set(test['user_id'])-set(train['user_id']))))
print("The number of users in test set but not in the validation set is {}".format(len(set(test['user_id'])-set(val['user_id']))))

The number of users in validation set but not in the training set is 103
The number of users in test set but not in the training set is 176
The number of users in test set but not in the validation set is 107


# Some data preprocessing

In [12]:
print(train['genre'].nunique())
print(df_node['genre'].nunique())
# This is a problem

512
513


In [13]:
set(df_node['genre'])-set(train['genre'])

{'jazmine sullivan'}

We will augment an auxillary row for processing

In [14]:
row = {'ts':datetime(2008,2,24), 'user_id': 995, 'genre':'jazmine sullivan','weight':0.0}
train = train._append(row, ignore_index = True)
train

Unnamed: 0,ts,user_id,genre,weight
0,2005-02-15,54,chillout,0.015835
1,2005-02-15,54,female vocalist,0.015330
2,2005-02-15,54,downtempo,0.008128
3,2005-02-15,54,electronic,0.072162
4,2005-02-15,54,reggae,0.021465
...,...,...,...,...
1638966,2008-02-24,995,acoustic,0.146607
1638967,2008-02-24,995,soul,0.088105
1638968,2008-02-24,995,jazz,0.050220
1638969,2008-02-24,995,female vocalist,0.044934


In [15]:
df_node['genre'].nunique()==train['genre'].nunique()

True

# Model 1:  Baseline prediction
We use the last seen user label vector in the training set to predict the validation and test labels

In [16]:
train_labels= train.groupby(['user_id','genre']).agg({'weight': 'last'}).reset_index() #last will give the latest weight
train_labels.rename(columns={'weight':'weight_latest'}, inplace=True)
train_labels # we will use this to make prediction

Unnamed: 0,user_id,genre,weight_latest
0,1,80s,0.400000
1,1,Coldplay,0.040319
2,1,Drum and bass,0.050248
3,1,Grunge,0.136364
4,1,Lo-Fi,0.069268
...,...,...,...
86661,1000,soul,0.259816
86662,1000,swing,0.005391
86663,1000,trip hop,0.019362
86664,1000,turntablism,0.011806


In [17]:
train_labels_mean= train.groupby(['user_id','genre']).agg({'weight': 'mean'}).reset_index() #last will give the latest weight
train_labels_median= train.groupby(['user_id','genre']).agg({'weight': 'median'}).reset_index()
train_labels_mean.rename(columns={'weight':'weight_mean'}, inplace=True)
train_labels_median.rename(columns={'weight':'weight_median'}, inplace=True)
train_labels['weight_mean']= train_labels_mean['weight_mean']
train_labels['weight_median']= train_labels_median['weight_median']
train_labels

Unnamed: 0,user_id,genre,weight_latest,weight_mean,weight_median
0,1,80s,0.400000,0.292930,0.333333
1,1,Coldplay,0.040319,0.055873,0.052144
2,1,Drum and bass,0.050248,0.094912,0.066023
3,1,Grunge,0.136364,0.136364,0.136364
4,1,Lo-Fi,0.069268,0.069268,0.069268
...,...,...,...,...,...
86661,1000,soul,0.259816,0.200897,0.200897
86662,1000,swing,0.005391,0.005391,0.005391
86663,1000,trip hop,0.019362,0.019362,0.019362
86664,1000,turntablism,0.011806,0.011806,0.011806


In [18]:
#normalize all the weights. note that for each user they have to sum up to 1
train_labels['weight_latest']= train_labels['weight_latest'].div(train_labels.groupby(['user_id'])['weight_latest'].transform('sum'))
train_labels['weight_mean']= train_labels['weight_mean'].div(train_labels.groupby(['user_id'])['weight_mean'].transform('sum'))
train_labels['weight_median']= train_labels['weight_median'].div(train_labels.groupby(['user_id'])['weight_median'].transform('sum'))

# Note that not all users have weight for all 513 genres. We need to fill the missing ones with 0.0.

In [19]:
train_labels= train_labels.groupby(["user_id",'genre'])[["weight_latest","weight_mean", "weight_median"]].first().unstack(fill_value=0.0).stack().reset_index()
train_labels # note 747*513= 403731

Unnamed: 0,user_id,genre,weight_latest,weight_mean,weight_median
0,1,00s,0.0,0.0,0.0
1,1,1970s,0.0,0.0,0.0
2,1,1980s,0.0,0.0,0.0
3,1,1990s,0.0,0.0,0.0
4,1,2000s,0.0,0.0,0.0
...,...,...,...,...,...
403726,1000,westlife,0.0,0.0,0.0
403727,1000,whitney houston,0.0,0.0,0.0
403728,1000,world,0.0,0.0,0.0
403729,1000,wu-tang,0.0,0.0,0.0


#Model 1: Baseline
Using last seen label in the training set to make prediction on test set

Preparing the validation and test sets. We need to make sure that each timestamp, user pair has 513 genre entries. We put 0.0 for entries not in the dataset. 0.0 means no interaction.

In [20]:
val_labels= val.groupby(["ts", "user_id",'genre'])["weight"].first().unstack(fill_value=0).stack().reset_index()
val_labels.columns = [*val.columns[:-1], 'weight']
val_labels

Unnamed: 0,ts,user_id,genre,weight
0,2008-02-25,2,00s,0.0
1,2008-02-25,2,1970s,0.0
2,2008-02-25,2,1980s,0.0
3,2008-02-25,2,1990s,0.0
4,2008-02-25,2,2000s,0.0
...,...,...,...,...
26127598,2008-10-18,1000,westlife,0.0
26127599,2008-10-18,1000,whitney houston,0.0
26127600,2008-10-18,1000,world,0.0
26127601,2008-10-18,1000,wu-tang,0.0


In [21]:
test_labels= test.groupby(["ts", "user_id",'genre'])["weight"].first().unstack(fill_value=0).stack().reset_index()
test_labels.columns = [*test.columns[:-1], 'weight']
test_labels

Unnamed: 0,ts,user_id,genre,weight
0,2008-10-19,1,00s,0.0
1,2008-10-19,1,1970s,0.0
2,2008-10-19,1,1980s,0.0
3,2008-10-19,1,1990s,0.0
4,2008-10-19,1,2000s,0.0
...,...,...,...,...
24818935,2009-06-12,802,westlife,0.0
24818936,2009-06-12,802,whitney houston,0.0
24818937,2009-06-12,802,world,0.0
24818938,2009-06-12,802,wu-tang,0.0


In [22]:
# need to use dask for merging. pandas cannot handle large datasets with current colab memory.
train_labels_dask = from_pandas(train_labels, npartitions=10)
val_labels_dask = from_pandas(val_labels, npartitions=10)
test_labels_dask = from_pandas(test_labels, npartitions=10)

In [23]:
# left join val_labels and train_labels on user_id, genre pair
df_val= dd.merge(val_labels_dask, train_labels_dask, how='left', on=['user_id','genre'])
df_test = dd.merge(test_labels_dask, train_labels_dask, how='left', on=['user_id','genre'])

In [24]:
# from dask back to pandas
df_pred_val= df_val.compute()
df_pred_val

Unnamed: 0,ts,user_id,genre,weight,weight_latest,weight_mean,weight_median
0,2008-09-04,712,proto-punk,0.0,0.000000,0.000000,0.000000
1,2008-09-04,712,psychedelic,0.0,0.033276,0.053348,0.047546
2,2008-09-04,712,queer as folk,0.0,0.000000,0.000000,0.000000
3,2008-09-04,712,radiohead,0.0,0.000000,0.000000,0.000000
4,2008-09-04,712,rap,0.0,0.000000,0.000000,0.000000
...,...,...,...,...,...,...,...
2601813,2008-10-18,1000,trumpet,0.0,0.000000,0.000000,0.000000
2601814,2008-10-18,1000,upbeat,0.0,0.000000,0.000000,0.000000
2601815,2008-10-18,1000,vocal,0.0,0.000000,0.000000,0.000000
2601816,2008-10-18,1000,whitney houston,0.0,0.000000,0.000000,0.000000


In [25]:
# from dask back to pandas
df_pred_test = df_test.compute()
df_pred_test

Unnamed: 0,ts,user_id,genre,weight,weight_latest,weight_mean,weight_median
0,2009-03-26,178,00s,0.0,0.002119,0.003705,0.002518
1,2009-03-26,178,1990s,0.0,0.000000,0.000000,0.000000
2,2009-03-26,178,2008,0.0,0.000000,0.000000,0.000000
3,2009-03-26,178,80s,0.0,0.035482,0.008663,0.008011
4,2009-03-26,178,Alt-country,0.0,0.000000,0.000000,0.000000
...,...,...,...,...,...,...,...
2479579,2009-06-12,802,prda,0.0,0.000000,0.000000,0.000000
2479580,2009-06-12,802,psychobilly,0.0,0.004875,0.008990,0.013630
2479581,2009-06-12,802,sexy,0.0,0.003119,0.001417,0.001457
2479582,2009-06-12,802,swedish,0.0,0.022539,0.007776,0.005455


In [26]:
del df_node
del train
del val
del test
del train_labels
del val_labels
del test_labels
import gc
gc.collect()

0

In [27]:
df_pred_val['weight_latest'] = df_pred_val['weight_latest'].fillna(0.0) # fill null values with 0.0
df_pred_val['weight_mean'] = df_pred_val['weight_mean'].fillna(0.0) # fill null values with 0.0
df_pred_val['weight_median'] = df_pred_val['weight_median'].fillna(0.0) # fill null values with 0.0
df_pred_test['weight_latest'] = df_pred_test['weight_latest'].fillna(0.0) # fill null values with 0.0
df_pred_test['weight_mean'] = df_pred_test['weight_mean'].fillna(0.0) # fill null values with 0.0
df_pred_test['weight_median'] = df_pred_test['weight_median'].fillna(0.0) # fill null values with 0.0

# NDCG Scores

Baseline: Latest node lable

In [28]:
print("The ndcg score on validation set where prediction is the latest observed node label in training set.")
print(ndcg_score(np.array(df_pred_val['weight']).reshape(-1,513),np.array(df_pred_val['weight_latest']).reshape(-1,513), k=10))

The ndcg score on validation set where prediction is the latest observed node label in training set.
0.18068112195932068


In [29]:
print("The ndcg score on test set where prediction is the latest observed node label in training set.")
print(ndcg_score(np.array(df_pred_test['weight']).reshape(-1,513),np.array(df_pred_test['weight_latest']).reshape(-1,513), k=10))

The ndcg score on test set where prediction is the latest observed node label in training set.
0.15747918592709997


Model 2: Mean node label

In [30]:
print("The ndcg score on validation set where prediction is mean observed node label in training set.")
print(ndcg_score(np.array(df_pred_val['weight']).reshape(-1,513),np.array(df_pred_val['weight_mean']).reshape(-1,513), k=10))

The ndcg score on validation set where prediction is mean observed node label in training set.
0.2309569477820598


In [31]:
print("The ndcg score where prediction is mean of the observed node labels in training set.")
print(ndcg_score(np.array(df_pred_test['weight']).reshape(-1,513),np.array(df_pred_test['weight_mean']).reshape(-1,513), k=10))

The ndcg score where prediction is mean of the observed node labels in training set.
0.20335101858341198


Model 3: Median node label

In [32]:
print("The ndcg score on validation set where prediction is median observed node label in training set.")
print(ndcg_score(np.array(df_pred_val['weight']).reshape(-1,513),np.array(df_pred_val['weight_median']).reshape(-1,513), k=10))

The ndcg score on validation set where prediction is median observed node label in training set.
0.2139516207975767


In [33]:
print("The ndcg score where prediction is the median of the observed node labels in training set.")
print(ndcg_score(np.array(df_pred_test['weight']).reshape(-1,513),np.array(df_pred_test['weight_median']).reshape(-1,513), k=10))

The ndcg score where prediction is the median of the observed node labels in training set.
0.18883186283315886
