In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from datetime import datetime
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import ndcg_score
import dask.dataframe as dd
from dask.dataframe import from_pandas

In [2]:
df_node= pd.read_csv("/content/drive/MyDrive/Colab Notebooks/Erdos_tgbn_2024/data/tgbn-genre_node_labels.csv")
df_node['ts']= pd.to_datetime(df_node['ts'], unit= 's') # timestamp to datetime
df_node['ts'] = pd.to_datetime(df_node['ts'].dt.strftime('%Y-%m-%d')) # dont need time because the timestamp is reset to the beginning of day in order to calculate the frequency vector label
df_node['user_id']=df_node['user_id'].str.slice(5).astype('Int64') #convert user_id to integers
df_node # remember weight is the frequency vector entries, not the actual weight

Unnamed: 0,ts,user_id,genre,weight
0,2005-02-15,54,chillout,0.015835
1,2005-02-15,54,female vocalist,0.015330
2,2005-02-15,54,downtempo,0.008128
3,2005-02-15,54,electronic,0.072162
4,2005-02-15,54,reggae,0.021465
...,...,...,...,...
2741930,2009-06-12,651,katy perry,0.004864
2741931,2009-06-12,724,bebop,0.041841
2741932,2009-06-12,724,jazz,0.958159
2741933,2009-06-12,802,Britney Spears,0.359504


#Train Validation Test Split

In [3]:
np.quantile(df_node['ts'].unique(), [0.7,0.85]) #70-15-15 split of the timestamps

array(['2008-02-24T14:23:59.999999993', '2008-10-18T07:11:59.999999996'],
      dtype='datetime64[ns]')

In [4]:
train= df_node.loc[df_node['ts']<datetime(2008,2,25)]
val= df_node.loc[(datetime(2008,2,25)<=df_node['ts']) & (df_node['ts']<=datetime(2008,10,18))]
test= df_node.loc[datetime(2008,10,18)<df_node['ts']]

# Some data preprocessing

In [5]:
set(df_node['genre'])-set(train['genre'])

{'jazmine sullivan'}

In [6]:
row = {'ts':datetime(2008,2,24), 'user_id': 995, 'genre':'jazmine sullivan','weight':0.0}
train = train._append(row, ignore_index = True)
train

Unnamed: 0,ts,user_id,genre,weight
0,2005-02-15,54,chillout,0.015835
1,2005-02-15,54,female vocalist,0.015330
2,2005-02-15,54,downtempo,0.008128
3,2005-02-15,54,electronic,0.072162
4,2005-02-15,54,reggae,0.021465
...,...,...,...,...
1638966,2008-02-24,995,acoustic,0.146607
1638967,2008-02-24,995,soul,0.088105
1638968,2008-02-24,995,jazz,0.050220
1638969,2008-02-24,995,female vocalist,0.044934


In [7]:
df_node['genre'].nunique()==train['genre'].nunique()

True

#Exponential Smoothing ($\alpha=0.8$)

In [64]:
train_labels=train.groupby(['user_id','genre']).apply(lambda x: x['weight'].ewm(alpha=0.8).mean().iloc[-1]).reset_index()
train_labels.rename(columns={0:'weight_.8'}, inplace=True)
train_labels # we will use this to make prediction

Unnamed: 0,user_id,genre,weight_.8
0,1,80s,0.344359
1,1,Coldplay,0.043350
2,1,Drum and bass,0.071254
3,1,Grunge,0.136364
4,1,Lo-Fi,0.069268
...,...,...,...
86661,1000,soul,0.240176
86662,1000,swing,0.005391
86663,1000,trip hop,0.019362
86664,1000,turntablism,0.011806


Note that not all users have weight for all 513 genres. We need to fill the missing ones with 0.0.

In [65]:
train_labels= train_labels.groupby(["user_id",'genre'])[["weight_.8"]].first().unstack(fill_value=0.0).stack().reset_index()
train_labels # note 787*513=431433

Unnamed: 0,user_id,genre,weight_.8
0,1,00s,0.0
1,1,1970s,0.0
2,1,1980s,0.0
3,1,1990s,0.0
4,1,2000s,0.0
...,...,...,...
403726,1000,westlife,0.0
403727,1000,whitney houston,0.0
403728,1000,world,0.0
403729,1000,wu-tang,0.0


## Inference
Preparing the validation and test sets. We need to make sure that each timestamp, user pair has 513 genre entries. We put 0.0 for entries not in the dataset. 0.0 means no interaction.

In [8]:
val_labels= val.groupby(["ts", "user_id",'genre'])["weight"].first().unstack(fill_value=0).stack().reset_index()
val_labels.columns = [*val.columns[:-1], 'weight']
val_labels

Unnamed: 0,ts,user_id,genre,weight
0,2008-02-25,2,00s,0.0
1,2008-02-25,2,1970s,0.0
2,2008-02-25,2,1980s,0.0
3,2008-02-25,2,1990s,0.0
4,2008-02-25,2,2000s,0.0
...,...,...,...,...
26127598,2008-10-18,1000,westlife,0.0
26127599,2008-10-18,1000,whitney houston,0.0
26127600,2008-10-18,1000,world,0.0
26127601,2008-10-18,1000,wu-tang,0.0


In [9]:
test_labels= test.groupby(["ts", "user_id",'genre'])["weight"].first().unstack(fill_value=0).stack().reset_index()
test_labels.columns = [*test.columns[:-1], 'weight']
test_labels

Unnamed: 0,ts,user_id,genre,weight
0,2008-10-19,1,00s,0.0
1,2008-10-19,1,1970s,0.0
2,2008-10-19,1,1980s,0.0
3,2008-10-19,1,1990s,0.0
4,2008-10-19,1,2000s,0.0
...,...,...,...,...
24818935,2009-06-12,802,westlife,0.0
24818936,2009-06-12,802,whitney houston,0.0
24818937,2009-06-12,802,world,0.0
24818938,2009-06-12,802,wu-tang,0.0


In [68]:
# need to use dask for merging. pandas cannot handle large datasets with current colab memory.
train_labels_dask = from_pandas(train_labels, npartitions=3)

In [10]:
val_labels_dask = from_pandas(val_labels, npartitions=3)
test_labels_dask = from_pandas(test_labels, npartitions=3)

In [69]:
# left join val_labels and train_labels on user_id, genre pair
df_val= dd.merge(val_labels_dask, train_labels_dask, how='left', on=['user_id','genre'])
df_test = dd.merge(test_labels_dask, train_labels_dask, how='left', on=['user_id','genre'])

In [70]:
# from dask back to pandas
df_pred_val= df_val.compute()
df_pred_val

Unnamed: 0,ts,user_id,genre,weight,weight_.8
0,2008-02-25,2,00s,0.0,0.000000
1,2008-02-25,2,2000s,0.0,0.012630
2,2008-02-25,2,2009,0.0,0.000000
3,2008-02-25,2,3 Doors Down,0.0,0.302756
4,2008-02-25,2,30 seconds to mars,0.0,0.006932
...,...,...,...,...,...
8666011,2008-10-18,1000,viking metal,0.0,0.000000
8666012,2008-10-18,1000,violin,0.0,0.000000
8666013,2008-10-18,1000,vocal jazz,0.0,0.000000
8666014,2008-10-18,1000,westlife,0.0,0.000000


In [71]:
# from dask back to pandas
df_pred_test = df_test.compute()
df_pred_test

Unnamed: 0,ts,user_id,genre,weight,weight_.8
0,2008-10-19,1,00s,0.0,0.0
1,2008-10-19,1,1970s,0.0,0.0
2,2008-10-19,1,1980s,0.0,0.0
3,2008-10-19,1,2000s,0.0,0.0
4,2008-10-19,1,2007,0.0,0.0
...,...,...,...,...,...
8232870,2009-06-12,802,violin,0.0,0.0
8232871,2009-06-12,802,vocal jazz,0.0,0.0
8232872,2009-06-12,802,whitney houston,0.0,0.0
8232873,2009-06-12,802,wu-tang,0.0,0.0


In [None]:
df_pred_val['weight_.8'] = df_pred_val['weight_.8'].fillna(0.0) # fill null values with 0.0
df_pred_test['weight_.8'] = df_pred_test['weight_.8'].fillna(0.0) # fill null values with 0.0

# NDCG Scores

In [74]:
print("The ndcg score on validation set where prediction is the rolling average (window =7) of observed node labels in training set.")
print(ndcg_score(np.array(df_pred_val['weight']).reshape(-1,513),np.array(df_pred_val['weight_.8']).reshape(-1,513), k=10))

The ndcg score on validation set where prediction is the rolling average (window =7) of observed node labels in training set.
0.16418500996248844


In [75]:
print("The ndcg score on test set where prediction is the rolling average (window =7) of observed node labels in training set.")
print(ndcg_score(np.array(df_pred_test['weight']).reshape(-1,513),np.array(df_pred_test['weight_.8']).reshape(-1,513), k=10))

The ndcg score on test set where prediction is the rolling average (window =7) of observed node labels in training set.
0.14534663008921228


#Exponential Smoothing ($\alpha=0.4$)

In [11]:
train_labels=train.groupby(['user_id','genre']).apply(lambda x: x['weight'].ewm(alpha=0.4).mean().iloc[-1]).reset_index()
train_labels.rename(columns={0:'weight_.4'}, inplace=True)
train_labels # we will use this to make prediction

Unnamed: 0,user_id,genre,weight_.4
0,1,80s,0.286170
1,1,Coldplay,0.050338
2,1,Drum and bass,0.093572
3,1,Grunge,0.136364
4,1,Lo-Fi,0.069268
...,...,...,...
86661,1000,soul,0.215627
86662,1000,swing,0.005391
86663,1000,trip hop,0.019362
86664,1000,turntablism,0.011806


In [12]:
train_labels= train_labels.groupby(["user_id",'genre'])[["weight_.4"]].first().unstack(fill_value=0.0).stack().reset_index()
train_labels # note 787*513=431433

Unnamed: 0,user_id,genre,weight_.4
0,1,00s,0.0
1,1,1970s,0.0
2,1,1980s,0.0
3,1,1990s,0.0
4,1,2000s,0.0
...,...,...,...
403726,1000,westlife,0.0
403727,1000,whitney houston,0.0
403728,1000,world,0.0
403729,1000,wu-tang,0.0


## Inference
Preparing the validation and test sets. We need to make sure that each timestamp, user pair has 513 genre entries. We put 0.0 for entries not in the dataset. 0.0 means no interaction.

In [13]:
# need to use dask for merging. pandas cannot handle large datasets with current colab memory.
train_labels_dask = from_pandas(train_labels, npartitions=3)

In [14]:
# left join val_labels and train_labels on user_id, genre pair
df_val= dd.merge(val_labels_dask, train_labels_dask, how='left', on=['user_id','genre'])
df_test = dd.merge(test_labels_dask, train_labels_dask, how='left', on=['user_id','genre'])

In [15]:
# from dask back to pandas
df_pred_val= df_val.compute()
df_pred_test = df_test.compute()

In [18]:
df_pred_val['weight_.4'] = df_pred_val['weight_.4'].fillna(0.0) # fill null values with 0.0
df_pred_test['weight_.4'] = df_pred_test['weight_.4'].fillna(0.0) # fill null values with 0.0

## NDCG Scores

In [19]:
print("The ndcg score on validation set where prediction is the rolling average (window =7) of observed node labels in training set.")
print(ndcg_score(np.array(df_pred_val['weight']).reshape(-1,513),np.array(df_pred_val['weight_.4']).reshape(-1,513), k=10))

The ndcg score on validation set where prediction is the rolling average (window =7) of observed node labels in training set.
0.18272469851787315


In [20]:
print("The ndcg score on test set where prediction is the rolling average (window =7) of observed node labels in training set.")
print(ndcg_score(np.array(df_pred_test['weight']).reshape(-1,513),np.array(df_pred_test['weight_.4']).reshape(-1,513), k=10))

The ndcg score on test set where prediction is the rolling average (window =7) of observed node labels in training set.
0.1619109434315392
