In [54]:
!pip install tensorflow_addons



In [55]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from tensorflow.keras.models import Model
from tensorflow.keras.layers import  Dense, Dropout, Conv1D, Flatten, Input
from tensorflow_addons.layers import WeightNormalization
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.activations import swish
from keras.layers.merge import concatenate

In [56]:
# Mount Google Drive into the Colab filesystem so the files read below are reachable.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [57]:
# Make the notebook folder on Drive the working directory; the data files
# loaded afterwards are referenced by relative path.
import os
os.chdir('/content/drive/MyDrive/Colab Notebooks')

In [58]:
# `data`: event-level log (one row per event, keyed by session_id).
# `train` / `test`: session-level tables; `train` carries a `label` column that is used further down.
data = pd.read_parquet('full_dataset.parquet', engine='pyarrow')
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

In [59]:
# Quick look at the raw event rows before any transformation.
data.head()

Unnamed: 0,session_id,user_identifier,event_timestamp,event_category_idx_lv1,event_category_idx_lv2,event_category_idx_lv3,event_category_idx_lv4,tobi_timestamp
0,182576,0,2020-09-22 14:38:14,1_5,2,,,2020-09-24 02:03:53
1,182576,0,2020-09-22 14:38:14,1_4,2_9,3_5,,2020-09-24 02:03:53
2,182576,0,2020-09-22 11:46:56,1_4,2_9,3_6,,2020-09-24 02:03:53
3,182576,0,2020-09-22 11:23:50,1_4,2_9,3_12,,2020-09-24 02:03:53
4,182576,0,2020-09-22 11:21:29,1_4,2_9,3_6,,2020-09-24 02:03:53


In [60]:
# Order events chronologically inside each session; the per-session sequences
# built later rely on this row order.
data = data.sort_values(by=['session_id', 'event_timestamp'])

In [61]:
# Reference instant (2020-04-14 00:00) that event timestamps are measured from.
baseline_date = pd.Timestamp(2020, 4, 14)

In [62]:
# Convert event timestamps to float seconds elapsed since `baseline_date`.
# `.dt.total_seconds()` is vectorised and replaces the per-row
# `Timedelta.delta` attribute, which is deprecated and absent from pandas 2.x.
data['event_timestamp'] = (data['event_timestamp'] - baseline_date).dt.total_seconds()

In [63]:
# Convert `tobi_timestamp` to float seconds elapsed since its earliest value.
# `Series.min()` replaces the Python builtin `min` (which iterates element by
# element), and `.dt.total_seconds()` replaces the deprecated `Timedelta.delta`.
data['tobi_timestamp'] = (data['tobi_timestamp'] - data['tobi_timestamp'].min()).dt.total_seconds()

In [64]:
# Boolean mask of rows whose level-2 category is a purely numeric string
# (e.g. '2' rather than the usual '<level>_<index>' form such as '2_9').
# Non-string entries (missing values) are never flagged.
filt = data['event_category_idx_lv2'].apply(lambda s: isinstance(s, str) and s.isnumeric())
to_drop = data.loc[filt]

In [65]:
# Discard the rows flagged in `to_drop` (matched by index label).
data = data.drop(index=to_drop.index)

In [66]:
# Names of the four hierarchical event-category columns (levels 1 through 4).
event_categories = [f'event_category_idx_lv{level}' for level in range(1, 5)]
def filt(s):
    """Strip the two-character level prefix from a category code.

    Parameters
    ----------
    s : object
        A raw category value, e.g. ``'2_9'``, a bare numeric string, or a
        non-string placeholder for a missing value.

    Returns
    -------
    str or int
        * the string unchanged when it is purely numeric,
        * the string without its first two characters otherwise
          (``'2_9'`` -> ``'9'``),
        * the integer ``-1`` for any non-string input.

    Note that string inputs yield strings while missing values yield an int.
    """
    if not isinstance(s, str):
        return -1
    if s.isnumeric():
        return s
    return s[2:]
# Rewrite every category column in place: prefixed codes lose their
# '<level>_' prefix and non-string entries become -1 (see `filt`).
for col in event_categories:
    data[col] = data[col].apply(filt)

In [67]:
# Display the cleaned event log (timestamps now in seconds, categories stripped).
# NOTE(review): this renders the whole frame repr; `data.head()` would be lighter.
data

Unnamed: 0,session_id,user_identifier,event_timestamp,event_category_idx_lv1,event_category_idx_lv2,event_category_idx_lv3,event_category_idx_lv4,tobi_timestamp
4134595,6,237376,7980451.0,0,3,-1,-1,4445182.0
4134594,6,237376,8021332.0,0,2,-1,-1,4445182.0
4134593,6,237376,8021360.0,0,4,-1,-1,4445182.0
4134592,6,237376,8021371.0,0,4,-1,-1,4445182.0
4134591,6,237376,8021415.0,0,3,-1,-1,4445182.0
...,...,...,...,...,...,...,...,...
2883978,1606201,97349,14061100.0,4,9,5,-1,9890853.0
5532706,1606217,701497,5657100.0,3,6,-1,-1,1540260.0
5676991,1606224,871126,4147200.0,2,1,2,-1,123330.0
1431141,1606226,25526,4454640.0,1,0,0,1,323692.0


In [68]:
# Events that belong to a session listed in train.csv.
training_data = data.loc[data['session_id'].isin(train['session_id'])]

In [69]:
# Events that belong to a session listed in test.csv.
testing_data = data.loc[data['session_id'].isin(test['session_id'])]

In [70]:
# Split the labelled sessions: 70% for training, then the remaining 30% is
# halved into a held-out test part and a validation part. Both splits use a
# fixed random_state so the partition is repeatable.
y_train, y_test_and_val = train_test_split(train, train_size=0.7, random_state=42)
y_test, y_val = train_test_split(y_test_and_val, train_size=0.5, random_state=42)

In [71]:
# Order every label split by session_id so its rows line up with the
# per-session arrays built from the (session-sorted) event log.
# The splits are slices of `train`; sorting them in place raises pandas'
# SettingWithCopyWarning, so sorted copies are bound to the names instead.
y_train, y_test, y_val = (
    split.sort_values(by='session_id') for split in (y_train, y_test, y_val)
)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [72]:
# Event rows for each split, selected by the session ids present in the
# corresponding label frame.
train_df = data.loc[data['session_id'].isin(y_train['session_id'])]
test_df = data.loc[data['session_id'].isin(y_test['session_id'])]
val_df = data.loc[data['session_id'].isin(y_val['session_id'])]

In [73]:
# Number of distinct sessions in the full event log (labelled or not).
n_sequences = data['session_id'].nunique()
n_sequences

347837

In [74]:
def reshape_inputs(dataset, users_history, max_length=None):
    """Build padded per-session event sequences and per-session user features.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Event rows with a ``session_id`` column. Once ``session_id`` is removed,
        the first remaining column is read as the user identifier and all
        remaining columns become the per-event features (this equals the
        positional ``seq[0, 1]`` / ``seq[:, 1:]`` access when ``session_id`` is
        the first column).
    users_history : pandas.DataFrame
        Column ``0`` holds user ids and columns ``1``..``4`` the four user
        features (the layout returned by ``get_user_data``). If a user id is
        repeated, its first row is used.
    max_length : int, optional
        Number of time steps every session is padded to. Defaults to the
        longest session found in ``dataset``. Pass a fixed value to give
        different splits identical shapes.

    Returns
    -------
    sequences : numpy.ndarray, shape (n_sessions, max_length, n_features)
        Sessions in ascending ``session_id`` order, rows in their original
        order, padded at the end with ``-1``.
    user_data : numpy.ndarray, shape (n_sessions, 4)
        Features of each session's user; all zeros for users absent from
        ``users_history``.

    Raises
    ------
    ValueError
        If ``max_length`` is given and some session has more events than that.
    """
    session_sizes = dataset['session_id'].value_counts()
    n_sequences = len(session_sizes)
    longest = int(session_sizes.max()) if n_sequences else 0
    if max_length is None:
        max_length = longest
    elif longest > max_length:
        raise ValueError(
            f"longest session has {longest} events, which exceeds max_length={max_length}"
        )

    # Every column except session_id is a feature (7 for the notebook's frame).
    n_features = dataset.shape[1] - 1
    sequences = -np.ones((n_sequences, max_length, n_features))
    user_data = np.zeros((n_sequences, 4))

    # One dictionary lookup per session instead of scanning users_history for
    # every session.
    known_users = users_history.drop_duplicates(subset=[0], keep='first')
    history_by_user = dict(
        zip(known_users[0].tolist(), known_users[[1, 2, 3, 4]].to_numpy())
    )

    # Iterating the groupby always yields the complete group frame, so the
    # column layout does not depend on how groupby.apply treats the key column.
    for i, (_, events) in enumerate(dataset.groupby('session_id')):
        rows = events.drop(columns='session_id').to_numpy()
        sequences[i, :len(rows)] = rows
        features = history_by_user.get(rows[0, 0])
        if features is not None:
            user_data[i] = features
    return sequences, user_data

In [76]:
def get_user_data(dataset, series):
    """Count, for every user, how many of their sessions carry each label.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Event rows with ``session_id`` and ``user_identifier`` columns. The
        user of a session is taken from the session's first row.
    series : pandas.DataFrame
        Session-level table with ``session_id`` and ``label`` columns; labels
        are expected to be the integers 1..4.

    Returns
    -------
    pandas.DataFrame
        One float32 row per distinct user in ``dataset`` (ascending user id).
        Column ``0`` is the user id and columns ``1``..``4`` are the number of
        that user's sessions with label 1..4. Users with no labelled session
        get zero counts.
    """
    n_labels = 4
    unique_users = np.sort(dataset['user_identifier'].unique())

    user_sessions = (
        dataset.groupby('session_id')['user_identifier'].first().reset_index()
    )
    # Join on session_id rather than relying on both tables having exactly the
    # same sessions in the same order; a positional assignment silently
    # attaches the wrong labels as soon as one session is missing on one side.
    labelled = user_sessions.merge(
        series[['session_id', 'label']], on='session_id', how='inner'
    ).dropna(subset=['label'])

    # Size the table from the data instead of a hard-coded user count.
    labels_count_by_user = np.zeros((len(unique_users), n_labels + 1), dtype=np.float32)
    labels_count_by_user[:, 0] = unique_users

    # unique_users is sorted, so searchsorted gives each session's row index.
    rows = np.searchsorted(unique_users, labelled['user_identifier'].to_numpy())
    cols = labelled['label'].to_numpy().astype(int)
    # Labels outside 1..n_labels are ignored so they can never touch column 0.
    in_range = (cols >= 1) & (cols <= n_labels)
    # add.at accumulates correctly when the same (user, label) pair repeats.
    np.add.at(labels_count_by_user, (rows[in_range], cols[in_range]), 1)

    return pd.DataFrame(labels_count_by_user)

In [None]:
# Per-user label counts computed over all sessions listed in train.csv.
train_labels_by_user = get_user_data(training_data, train)

In [78]:
# For each split: X_*_1 holds the padded event sequences, X_*_2 the per-user
# label counts looked up in `train_labels_by_user`.
# NOTE(review): each call pads to the longest session of its own split, so the
# sequence length can differ between splits — confirm all match the model input.
X_train_1, X_train_2 = reshape_inputs(train_df, train_labels_by_user)
X_test_1, X_test_2 = reshape_inputs(test_df, train_labels_by_user)
X_val_1, X_val_2 = reshape_inputs(val_df, train_labels_by_user)

In [79]:
# Keep only the label column and shift it down by one so classes start at 0,
# as required by sparse categorical cross-entropy. From here on the y_* names
# are Series, no longer DataFrames.
y_train = y_train['label'] - 1 # REMEMBER TO DO +1 WHEN SUBMITTING
y_test = y_test['label'] - 1
y_val = y_val['label'] - 1

In [80]:
# For every training session, subtract one from the user-count column that
# matches the session's own (0-based) label, so a row's features no longer
# include its own target. Row i of X_train_2 is paired with the i-th label.
X_train_2[np.arange(len(y_train)), y_train.to_numpy()] -= 1

In [81]:
# Standardise the sequence features. Each 3-D array is flattened to
# (sessions * steps, features) so the scaler works per feature column, then
# reshaped back. Statistics are fitted on the training split only and reused
# for test and validation; the -1 padding rows are included in what is scaled.
scaler = StandardScaler()
X_train_1 = scaler.fit_transform(X_train_1.reshape(-1, X_train_1.shape[-1])).reshape(X_train_1.shape)
X_test_1 = scaler.transform(X_test_1.reshape(-1, X_test_1.shape[-1])).reshape(X_test_1.shape)
X_val_1 = scaler.transform(X_val_1.reshape(-1, X_val_1.shape[-1])).reshape(X_val_1.shape)

In [82]:
# Standardise the per-user features, fitting on the training split only.
# NOTE(review): this refits the same `scaler` object, which replaces the
# statistics fitted on the sequence features; a second scaler instance would be
# needed to transform new sequence data later on.
X_train_2 = scaler.fit_transform(X_train_2)
X_test_2 = scaler.transform(X_test_2)
X_val_2 = scaler.transform(X_val_2)

In [97]:
# Boolean Series per split: True where the value at [session, first step,
# first feature] (the scaled user identifier) also occurs among the training
# sessions, i.e. the session's user was seen during training.
returning_users_test, returning_users_val = (
    pd.Series(split[:, 0, 0]).isin(X_train_1[:, 0, 0])
    for split in (X_test_1, X_val_1)
)

In [92]:
def build_model(filters=128, kernel_size=3, seq_length=30, n_features=7,
                n_user_features=4, n_classes=4, dropout_rate=0.4):
    """Build and compile the two-input classifier.

    A stack of weight-normalised causal 1-D convolutions with doubling
    dilation rates (1, 2, 4, ...) processes the event sequence; its flattened
    output is concatenated with the user features and fed through two tanh
    dense layers and a softmax output.

    Author's tuning note: best configuration so far; 40/60 epochs with dropout
    0.4 or 0.5, 100/120 epochs with dropout 0.5.

    Parameters
    ----------
    filters : int
        Number of filters in every convolution.
    kernel_size : int
        Convolution kernel width; must be at least 2.
    seq_length : int
        Time steps per (padded) session.
    n_features : int
        Features per time step.
    n_user_features : int
        Width of the per-user feature vector.
    n_classes : int
        Number of output classes.
    dropout_rate : float
        Dropout applied after every convolution and dense layer.

    Returns
    -------
    tensorflow.keras.Model
        Compiled model taking ``[sequence_input, user_input]`` (in that order)
        and trained with sparse categorical cross-entropy.

    Raises
    ------
    ValueError
        If ``kernel_size`` is smaller than 2.
    """
    if kernel_size < 2:
        # kernel_size - 1 is the divisor in the depth formula below.
        raise ValueError("kernel_size must be at least 2")

    # shape must be a tuple; `(4)` is just the integer 4.
    user_input = Input(shape=(n_user_features,))
    sequence_input = Input(shape=(seq_length, n_features))

    # Enough layers for the dilated receptive field to span the sequence.
    n_layers = int(np.ceil(np.log2(seq_length / (kernel_size - 1))))
    dilation_schedule = [1] + [2**k for k in range(1, n_layers)]

    conv = sequence_input
    for rate in dilation_schedule:
        conv = WeightNormalization(Conv1D(filters, kernel_size, padding='causal', dilation_rate=rate,
                                          activation=swish))(conv)
        conv = Dropout(dropout_rate)(conv)
    flattened = Flatten()(conv)

    concatenated = concatenate([user_input, flattened])
    dense = Dense(2048, activation='tanh')(concatenated)
    dropout = Dropout(dropout_rate)(dense)
    dense = Dense(2048, activation='tanh')(dropout)
    dropout = Dropout(dropout_rate)(dense)
    out = Dense(n_classes, activation='softmax')(dropout)
    model = Model([sequence_input, user_input], out)

    opt = Adam(learning_rate=5e-4, amsgrad=True)
    model.compile(
        optimizer=opt, loss='sparse_categorical_crossentropy', metrics=['accuracy'])

    return model

In [93]:
# Instantiate the network with the default hyper-parameters.
model = build_model()

In [42]:
# Print the layer-by-layer architecture and parameter counts.
model.summary()

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_2 (InputLayer)            [(None, 30, 7)]      0                                            
__________________________________________________________________________________________________
weight_normalization (WeightNor (None, 30, 128)      5761        input_2[0][0]                    
__________________________________________________________________________________________________
dropout (Dropout)               (None, 30, 128)      0           weight_normalization[0][0]       
__________________________________________________________________________________________________
weight_normalization_1 (WeightN (None, 30, 128)      98689       dropout[0][0]                    
______________________________________________________________________________________________

In [None]:
# Train on [sequences, user features] for 80 epochs with batches of 64,
# monitoring loss and accuracy on the validation split after each epoch.
history = model.fit([X_train_1, X_train_2], y_train, validation_data=([X_val_1, X_val_2], y_val), epochs=80, batch_size=64)

In [101]:
# [loss, accuracy] on the whole held-out test split.
model.evaluate([X_test_1, X_test_2], y_test)



[1.105362057685852, 0.5228791832923889]

In [102]:
# [loss, accuracy] restricted to test sessions whose user also appears in the training split.
model.evaluate([X_test_1[returning_users_test,:,:], X_test_2[returning_users_test,:]], y_test[(returning_users_test).to_numpy()])



[0.9310709238052368, 0.6376418471336365]

In [103]:
# [loss, accuracy] restricted to test sessions whose user does not appear in the training split.
model.evaluate([X_test_1[~returning_users_test,:,:], X_test_2[~returning_users_test,:]], y_test[(~returning_users_test).to_numpy()])



[1.1663661003112793, 0.48271098732948303]