In [68]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from datetime import datetime
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import ndcg_score
import dask.dataframe as dd
from dask.dataframe import from_pandas

In [35]:
df_node= pd.read_csv("/content/drive/MyDrive/Colab Notebooks/Erdos_tgbn_2024/data/tgbn-genre_node_labels.csv")
df_node['ts']= pd.to_datetime(df_node['ts'], unit= 's') # timestamp to datetime
df_node['ts'] = pd.to_datetime(df_node['ts'].dt.strftime('%Y-%m-%d')) # dont need time because the timestamp is reset to the beginning of day in order to calculate the frequency vector label
df_node['user_id']=df_node['user_id'].str.slice(5).astype('Int64') #convert user_id to integers
df_node # remember weight is the frequency vector entries, not the actual weight

Unnamed: 0,ts,user_id,genre,weight
0,2005-02-15,54,chillout,0.015835
1,2005-02-15,54,female vocalist,0.015330
2,2005-02-15,54,downtempo,0.008128
3,2005-02-15,54,electronic,0.072162
4,2005-02-15,54,reggae,0.021465
...,...,...,...,...
2741930,2009-06-12,651,katy perry,0.004864
2741931,2009-06-12,724,bebop,0.041841
2741932,2009-06-12,724,jazz,0.958159
2741933,2009-06-12,802,Britney Spears,0.359504


In [36]:
df_node.user_id.nunique()

974

# Train Validation Test Split

In [4]:
np.quantile(range(2741935), [0.7,0.85]) #70-15-15 split of the rows. Would have been more sensible to split the timestamp
# But to follow the paper' result we split like this

array([1919353.8, 2330643.9])

In [18]:
print(df_node['ts'][1919354])
print(df_node['ts'][2330644])

2008-06-27 00:00:00
2008-12-07 00:00:00


In [24]:
train= df_node.loc[df_node['ts']<datetime(2008,6,28)]
val= df_node.loc[(datetime(2008,6,28)<=df_node['ts']) & (df_node['ts']<=datetime(2008,12,7))]
test= df_node.loc[datetime(2008,12,7)<df_node['ts']]

In [54]:
train

Unnamed: 0,ts,user_id,genre,weight
0,2005-02-15,54,chillout,0.015835
1,2005-02-15,54,female vocalist,0.015330
2,2005-02-15,54,downtempo,0.008128
3,2005-02-15,54,electronic,0.072162
4,2005-02-15,54,reggae,0.021465
...,...,...,...,...
1920443,2008-06-27,999,juno,0.007919
1920444,2008-06-27,999,Soundtrack,0.007285
1920445,2008-06-27,999,post rock,0.020162
1920446,2008-06-27,999,icelandic,0.018164


In [29]:
print("The number of unique users in training set is {}".format(train['user_id'].nunique()))
print("The number of unique users in validation set is {}".format(val['user_id'].nunique()))
print("The number of unique users in test set is {}".format(test['user_id'].nunique()))

The number of unique users in training set is 841
The number of unique users in validation set is 556
The number of unique users in test set is 579


In [30]:
print("The validation users are in the training set: {}".format(set(val['user_id']).issubset(set(train['user_id']))))
print("The test users are in the training set: {}".format(set(test['user_id']).issubset(set(train['user_id']))))
# this is a problem

The validation users are in the training set: False
The test users are in the training set: False


In [39]:
print("The number of users in validation set but not in the training set is {}".format(len(set(val['user_id'])-set(train['user_id']))))
print("The number of users in test set but not in the training set is {}".format(len(set(test['user_id'])-set(train['user_id']))))
print("The number of users in test set but not in the validation set is {}".format(len(set(test['user_id'])-set(val['user_id']))))

The number of users in validation set but not in the training set is 68
The number of users in test set but not in the training set is 127
The number of users in test set but not in the validation set is 90


# Some data cleaning

In [52]:
print(train_labels['genre'].nunique())
print(df_node['genre'].nunique())
# This is a problem

512
513


In [55]:
set(df_node['genre'])-set(train_labels['genre'])

{'jazmine sullivan'}

We will augment an auxillary row for processing

In [56]:
row = {'ts':datetime(2008,6,27), 'user_id': 999, 'genre':'jazmine sullivan','weight':0.0}
train = train._append(row, ignore_index = True)
train

Unnamed: 0,ts,user_id,genre,weight
0,2005-02-15,54,chillout,0.015835
1,2005-02-15,54,female vocalist,0.015330
2,2005-02-15,54,downtempo,0.008128
3,2005-02-15,54,electronic,0.072162
4,2005-02-15,54,reggae,0.021465
...,...,...,...,...
1920444,2008-06-27,999,Soundtrack,0.007285
1920445,2008-06-27,999,post rock,0.020162
1920446,2008-06-27,999,icelandic,0.018164
1920447,2008-06-27,999,ambient,0.007128


In [61]:
df_node['genre'].nunique()==train['genre'].nunique()

True

# Model 1:  Baseline prediction
We use the last seen user label vector in the training set to predict the validation and test labels

In [63]:
train_labels= train.groupby(['user_id','genre']).agg({'weight': 'last'}).reset_index() #last will give the latest weight
train_labels # we will use this to make prediction

Unnamed: 0,user_id,genre,weight
0,1,80s,0.400000
1,1,Coldplay,0.081038
2,1,Drum and bass,0.050248
3,1,Grunge,0.136364
4,1,Hip-Hop,0.112609
...,...,...,...
96157,1000,swing,0.005391
96158,1000,techno,0.007496
96159,1000,trip hop,0.023990
96160,1000,turntablism,0.011806


# Note that not all users have weight for all 513 genres. We need to fill the missing ones with 0.0.

In [65]:
train_labels= train_labels.groupby(["user_id",'genre'])["weight"].first().unstack(fill_value=0.0).stack().reset_index()
train_labels.columns = [*train_labels.columns[:-1], 'weight']
train_labels # note 841*512=431433

Unnamed: 0,user_id,genre,weight
0,1,00s,0.0
1,1,1970s,0.0
2,1,1980s,0.0
3,1,1990s,0.0
4,1,2000s,0.0
...,...,...,...
431428,1000,westlife,0.0
431429,1000,whitney houston,0.0
431430,1000,world,0.0
431431,1000,wu-tang,0.0


In [70]:
train_labels.rename(columns={'weight':'weight_train'}, inplace=True)
train_labels

Unnamed: 0,user_id,genre,weight_train
0,1,00s,0.0
1,1,1970s,0.0
2,1,1980s,0.0
3,1,1990s,0.0
4,1,2000s,0.0
...,...,...,...
431428,1000,westlife,0.0
431429,1000,whitney houston,0.0
431430,1000,world,0.0
431431,1000,wu-tang,0.0


#Model 1: Baseline
Using last seen label in the training set to make prediction on test set

In [66]:
test_labels= test.groupby(["ts", "user_id",'genre'])["weight"].first().unstack(fill_value=0).stack().reset_index()
test_labels.columns = [*test.columns[:-1], 'weight']
test_labels

Unnamed: 0,ts,user_id,genre,weight
0,2008-12-08,1,00s,0.0
1,2008-12-08,1,1970s,0.0
2,2008-12-08,1,1980s,0.0
3,2008-12-08,1,1990s,0.0
4,2008-12-08,1,2000s,0.0
...,...,...,...,...
18684994,2009-06-12,802,westlife,0.0
18684995,2009-06-12,802,whitney houston,0.0
18684996,2009-06-12,802,world,0.0
18684997,2009-06-12,802,wu-tang,0.0


In [71]:
train_labels_dask = from_pandas(train_labels, npartitions=10)
test_labels_dask = from_pandas(test_labels, npartitions=10)

In [72]:
df = dd.merge(test_labels_dask, train_labels_dask, how='left', on=['user_id','genre'])

In [73]:
df_pred = df.compute()
df_pred

Unnamed: 0,ts,user_id,genre,weight,weight_train
0,2008-12-25,681,amazing,0.0,0.032967
1,2008-12-25,681,atmospheric,0.0,0.000000
2,2008-12-25,681,australian,0.0,0.005625
3,2008-12-25,681,beyonce,0.0,0.000000
4,2008-12-25,681,bjm radio,0.0,0.002825
...,...,...,...,...,...
1868421,2009-06-12,802,prda,0.0,0.000000
1868422,2009-06-12,802,psychobilly,0.0,0.053121
1868423,2009-06-12,802,sexy,0.0,0.033983
1868424,2009-06-12,802,swedish,0.0,0.026089


In [75]:
df_pred.isnull()

Unnamed: 0,ts,user_id,genre,weight,weight_train
0,False,False,False,False,False
1,False,False,False,False,False
2,False,False,False,False,False
3,False,False,False,False,False
4,False,False,False,False,False
...,...,...,...,...,...
1868421,False,False,False,False,False
1868422,False,False,False,False,False
1868423,False,False,False,False,False
1868424,False,False,False,False,False


In [76]:
df_pred[df_pred.isnull().any(axis=1)]

Unnamed: 0,ts,user_id,genre,weight,weight_train
286,2008-12-25,708,2007,0.0,
287,2008-12-25,708,30 seconds to mars,0.0,
288,2008-12-25,708,Avant-Garde,0.0,
289,2008-12-25,708,Awesome,0.0,
290,2008-12-25,708,Celine Dion,0.0,
...,...,...,...,...,...
1868374,2009-06-12,724,rock,0.0,
1868375,2009-06-12,724,salsa,0.0,
1868376,2009-06-12,724,santana,0.0,
1868377,2009-06-12,724,sexy,0.0,


In [78]:
df_pred['weight_train'] = df_pred['weight_train'].fillna(0.0) # fill null values with 0.0

In [79]:
ndcg_score(np.array(df_pred['weight']).reshape(-1,513),np.array(df_pred['weight_train']).reshape(-1,513), k=10)

0.13143477238735699