In [None]:
!pip install tensorflow_addons

Collecting tensorflow_addons
[?25l  Downloading https://files.pythonhosted.org/packages/2e/af/0ce633c373d2b0476ef8299673d22275fcc3c5ba283b2cec4aa06bc5b810/tensorflow_addons-0.12.1-cp36-cp36m-manylinux2010_x86_64.whl (703kB)
[K     |▌                               | 10kB 19.3MB/s eta 0:00:01[K     |█                               | 20kB 26.2MB/s eta 0:00:01[K     |█▍                              | 30kB 26.4MB/s eta 0:00:01[K     |█▉                              | 40kB 23.0MB/s eta 0:00:01[K     |██▎                             | 51kB 24.2MB/s eta 0:00:01[K     |██▉                             | 61kB 17.4MB/s eta 0:00:01[K     |███▎                            | 71kB 17.6MB/s eta 0:00:01[K     |███▊                            | 81kB 18.3MB/s eta 0:00:01[K     |████▏                           | 92kB 16.3MB/s eta 0:00:01[K     |████▋                           | 102kB 17.5MB/s eta 0:00:01[K     |█████▏                          | 112kB 17.5MB/s eta 0:00:01[K     |████

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, MinMaxScaler, LabelEncoder
from sklearn.metrics import confusion_matrix
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import  Dense, Dropout, Conv1D, Flatten, GRU, MaxPooling1D, InputLayer
from tensorflow_addons.layers import WeightNormalization
from tensorflow.keras.wrappers.scikit_learn import KerasClassifier
from tensorflow.keras.optimizers import Adam

In [None]:
# Colab-only: mount Google Drive at /content/drive so the data files are reachable.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import os
# Work from the Drive notebooks folder so the relative file names used below resolve.
# NOTE(review): hard-coded, Colab-specific absolute path.
os.chdir('/content/drive/MyDrive/Colab Notebooks')

In [None]:
# Load the event-level log and the train/test session tables.
# Both CSVs are later indexed by their 'session_id' column; train also provides 'label'.
data = pd.read_parquet('full_dataset.parquet', engine='pyarrow')
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

In [None]:
# Preview the first rows of the raw event log.
data.head()

Unnamed: 0,session_id,user_identifier,event_timestamp,event_category_idx_lv1,event_category_idx_lv2,event_category_idx_lv3,event_category_idx_lv4,tobi_timestamp
0,182576,0,2020-09-22 14:38:14,1_5,2,,,2020-09-24 02:03:53
1,182576,0,2020-09-22 14:38:14,1_4,2_9,3_5,,2020-09-24 02:03:53
2,182576,0,2020-09-22 11:46:56,1_4,2_9,3_6,,2020-09-24 02:03:53
3,182576,0,2020-09-22 11:23:50,1_4,2_9,3_12,,2020-09-24 02:03:53
4,182576,0,2020-09-22 11:21:29,1_4,2_9,3_6,,2020-09-24 02:03:53


In [None]:
# Order events by session and, within a session, by event time (mutates `data` in place).
data.sort_values(by=['session_id','event_timestamp'], inplace=True)

In [None]:
# Express tobi_timestamp as seconds elapsed since the earliest value in the data.
# The vectorised `.dt.total_seconds()` replaces the per-row `Timedelta.delta`
# attribute (deprecated in pandas 1.5, removed in 2.0), and `Series.min()`
# replaces the Python-level builtin `min` over the whole column.
# NOTE: this overwrites the column, so the cell expects a datetime column and
# cannot be run twice on the same frame.
data['tobi_timestamp'] = (data['tobi_timestamp'] - data['tobi_timestamp'].min()).dt.total_seconds()

In [None]:
# Per-user minimum of every numeric column. reshape_sequences() reads
# minimum_date.loc[user, 'tobi_timestamp'] from this table.
minimum_date = data.groupby('user_identifier', sort=False).min(numeric_only=True)

In [None]:
# Inspect the frame after sorting and the tobi_timestamp conversion.
data

Unnamed: 0,session_id,user_identifier,event_timestamp,event_category_idx_lv1,event_category_idx_lv2,event_category_idx_lv3,event_category_idx_lv4,tobi_timestamp
4134595,6,237376,2020-07-15 08:47:31,1_0,2_3,,,4445182.0
4134594,6,237376,2020-07-15 20:08:52,1_0,2_2,,,4445182.0
4134593,6,237376,2020-07-15 20:09:20,1_0,2_4,,,4445182.0
4134592,6,237376,2020-07-15 20:09:31,1_0,2_4,,,4445182.0
4134591,6,237376,2020-07-15 20:10:15,1_0,2_3,,,4445182.0
...,...,...,...,...,...,...,...,...
2883978,1606201,97349,2020-09-23 17:51:40,1_4,2_9,3_5,,9890853.0
5532706,1606217,701497,2020-06-18 11:25:00,1_3,2_6,,,1540260.0
5676991,1606224,871126,2020-06-01 00:00:00,1_2,2_1,3_2,,123330.0
1431141,1606226,25526,2020-06-04 13:24:00,1_1,2_0,3_0,,323692.0


In [None]:
# Flag rows whose level-2 code is a purely numeric string, and collect them for removal.
lv2_codes = data['event_category_idx_lv2']
filt = lv2_codes.apply(lambda code: isinstance(code, str) and code.isnumeric())
to_drop = data.loc[filt]

In [None]:
# Remove the rows collected in `to_drop` (matched by index label, in place).
data.drop(index=to_drop.index, inplace=True)

In [None]:
# For every (lv2, lv3) pair, collect the distinct lv4 values as an array.
# Groups whose *first* distinct value equals None are replaced by None and
# then removed by dropna(); all other groups keep their array of values.
# NOTE(review): only the first unique value is inspected, so the result depends
# on row order within a group — confirm this is intended.
grouped_by_idx = data.groupby(['event_category_idx_lv2', 'event_category_idx_lv3'])['event_category_idx_lv4'].unique().apply(lambda s: s[0] if s[0] == None else s).dropna()

In [None]:
# Count the distinct lv4 values per (lv2, lv3) pair, then keep the pairs that
# have exactly one distinct value as a 2-column array.
len_grouped_by_idx = grouped_by_idx.apply(len)
drop_4th = np.array([
    pair for pair in len_grouped_by_idx.index
    if len_grouped_by_idx[pair] == 1
])

In [None]:
# Among the pairs in drop_4th, keep the lv2 codes that have a single entry
# (one lv3 code) in len_grouped_by_idx.
drop_3rd_and_4th = [
    lv2_code
    for lv2_code, _lv3_code in drop_4th
    if len(len_grouped_by_idx[lv2_code]) == 1
]

In [None]:
# For lv2 codes listed in drop_3rd_and_4th, blank out both deeper levels.
filt = data['event_category_idx_lv2'].isin(drop_3rd_and_4th)
data.loc[filt, ['event_category_idx_lv3', 'event_category_idx_lv4']] = None

In [None]:
# Blank out lv4 only for the exact (lv2, lv3) pairs listed in drop_4th.
# Testing the two columns independently with isin() would also match every
# cross combination of a listed lv2 with a listed lv3. Matching on the pair
# also copes with an empty drop_4th, where 2-D indexing would raise.
lv2_lv3_pairs = pd.MultiIndex.from_frame(
    data[['event_category_idx_lv2', 'event_category_idx_lv3']]
)
filt = lv2_lv3_pairs.isin([tuple(pair) for pair in drop_4th])
data.loc[filt, 'event_category_idx_lv4'] = None

In [None]:
# Reference date; event timestamps are later expressed relative to it.
baseline_date = pd.Timestamp(year=2020, month=4, day=14)

In [None]:
# Express event_timestamp as seconds elapsed since baseline_date.
# The vectorised `.dt.total_seconds()` replaces the per-row `Timedelta.delta`
# attribute (deprecated in pandas 1.5, removed in 2.0).
# NOTE: this overwrites the column, so the cell expects a datetime column and
# cannot be run twice on the same frame.
data['event_timestamp'] = (data['event_timestamp'] - baseline_date).dt.total_seconds()

In [None]:
# The category-level columns whose codes are normalised by this cell.
event_categories = ['event_category_idx_lv1', 'event_category_idx_lv2', 'event_category_idx_lv3', 'event_category_idx_lv4']
def filt(s):
    """Normalise one category code.

    Non-string values (e.g. None/NaN) map to the integer -1. A purely numeric
    string is returned unchanged. Any other string is returned without its
    first two characters (the data shows codes such as '2_9', giving '9').
    """
    if not isinstance(s, str):
        return -1
    if s.isnumeric():
        return s
    return s[2:]
# Apply filt() element-wise to each category column, overwriting the column.
for col in event_categories:
    data[col] = data[col].apply(filt)

In [None]:
# Inspect the frame after the timestamp and category-code conversions.
data

Unnamed: 0,session_id,user_identifier,event_timestamp,event_category_idx_lv1,event_category_idx_lv2,event_category_idx_lv3,event_category_idx_lv4,tobi_timestamp
4134595,6,237376,7980451.0,0,3,-1,-1,4445182.0
4134594,6,237376,8021332.0,0,2,-1,-1,4445182.0
4134593,6,237376,8021360.0,0,4,-1,-1,4445182.0
4134592,6,237376,8021371.0,0,4,-1,-1,4445182.0
4134591,6,237376,8021415.0,0,3,-1,-1,4445182.0
...,...,...,...,...,...,...,...,...
2883978,1606201,97349,14061100.0,4,9,5,-1,9890853.0
5532706,1606217,701497,5657100.0,3,6,-1,-1,1540260.0
5676991,1606224,871126,4147200.0,2,1,2,-1,123330.0
1431141,1606226,25526,4454640.0,1,0,0,-1,323692.0


In [None]:
# Events belonging to sessions listed in train.csv.
training_data = data[data['session_id'].isin(train['session_id'])]

In [None]:
# Events belonging to sessions listed in test.csv.
testing_data = data[data['session_id'].isin(test['session_id'])]

In [None]:
# Number of distinct sessions left in the cleaned event log.
n_sequences = data['session_id'].nunique()
n_sequences

347837

In [None]:
def reshape_sequences(dataset, series):
    """Pack per-session event rows into a zero-padded 3-D array (7 features).

    NOTE: a later cell defines another ``reshape_sequences`` that replaces this
    one when the notebook is run top to bottom.

    Args:
        dataset: Event-level DataFrame with a 'session_id' column. The slicing
            below is positional: column 0 is dropped and the remaining columns
            must number exactly 7.
        series: DataFrame with a 'session_id' column listing the sessions to
            encode.

    Returns:
        Float array of shape (n_sessions, max_length, 7). Row ``k`` belongs to
        the k-th distinct session id in ``series``; max_length is the largest
        number of events of any session in ``dataset``. Short sessions, and
        sessions with no events in ``dataset``, are padded with zeros.
    """
    # Rows follow the order of `series` (not sorted ids) so that per-session
    # targets read from `series` line up with the returned array.
    session_ids = pd.unique(series['session_id'])
    row_of_session = {session_id: row for row, session_id in enumerate(session_ids)}

    session_sizes = dataset['session_id'].value_counts()
    max_length = int(session_sizes.max()) if len(session_sizes) else 0
    sequences = np.zeros((len(session_ids), max_length, 7))

    # Iterating the groupby (rather than groupby.apply) always yields every
    # column, so the positional slice does not depend on how a given pandas
    # version treats the grouping column inside apply().
    for session_id, events in dataset.groupby('session_id', sort=False):
        row = row_of_session.get(session_id)
        if row is None:
            continue  # session not requested in `series`
        seq = events.to_numpy()
        sequences[row, :len(seq)] = seq[:, 1:]  # drop column 0 (session_id)
    return sequences

In [None]:
def reshape_sequences(dataset, series, user_minimums=None):
    """Pack per-session event rows into a (-1)-padded 3-D array (5 features).

    Layout of each output row (one session), all slicing being positional:
      * step 0, feature 0: column 1 of the session's first event
        (``user_identifier`` in this notebook's frame);
      * step 1, feature 0: last column of the first event minus that user's
        minimum 'tobi_timestamp' taken from ``user_minimums``;
      * steps 2..: columns 2..-1 of every event (must number exactly 5).
    Every other cell keeps the padding value -1.

    Args:
        dataset: Event-level DataFrame with a 'session_id' column, laid out as
            described above.
        series: DataFrame with a 'session_id' column listing the sessions to
            encode.
        user_minimums: Table indexed by user id with a 'tobi_timestamp'
            column. Defaults to the notebook-level ``minimum_date``.

    Returns:
        Float array of shape (n_sessions, max_length + 2, 5). Row ``k`` belongs
        to the k-th distinct session id in ``series``; max_length is the
        largest number of events of any session in ``dataset``.
    """
    if user_minimums is None:
        user_minimums = minimum_date  # per-user minima computed earlier in the notebook

    # Rows follow the order of `series` (not sorted ids) so that per-session
    # targets read from `series` line up with the returned array.
    session_ids = pd.unique(series['session_id'])
    row_of_session = {session_id: row for row, session_id in enumerate(session_ids)}

    session_sizes = dataset['session_id'].value_counts()
    max_length = int(session_sizes.max()) if len(session_sizes) else 0
    sequences = -np.ones((len(session_ids), max_length + 2, 5))

    # Iterating the groupby (rather than groupby.apply) always yields every
    # column, so the positional slices do not depend on how a given pandas
    # version treats the grouping column inside apply().
    for session_id, events in dataset.groupby('session_id', sort=False):
        row = row_of_session.get(session_id)
        if row is None:
            continue  # session not requested in `series`
        user_id = events.iloc[0, 1]
        sequences[row, 0, 0] = user_id
        sequences[row, 1, 0] = events.iloc[0, -1] - user_minimums.loc[user_id, 'tobi_timestamp']
        sequences[row, 2:len(events) + 2] = events.iloc[:, 2:-1].to_numpy()
    return sequences

In [None]:
# Padded sequence array for the training sessions.
training = reshape_sequences(training_data, train)

In [None]:
# Padded sequence array for the test sessions.
testing = reshape_sequences(testing_data, test)

In [None]:
# Shift the labels down by one. The models below end in a 4-way softmax trained
# with sparse_categorical_crossentropy, which takes integer class ids
# (presumably the raw labels are 1..4 — verify against train.csv).
labels = (train['label'].to_numpy() - 1)  # REMEMBER TO DO +1 ON PREDICTION WHEN SUBMITTING!

In [None]:
# 70 % of the sessions for training; the remaining 30 % is halved into a test
# and a validation set (15 % each). Both splits use a fixed seed.
X_train, X_test_and_val, y_train, y_test_and_val = train_test_split(training, labels, train_size=0.7, random_state=42)
X_test, X_val, y_test, y_val = train_test_split(X_test_and_val, y_test_and_val, train_size=0.5, random_state=42)

In [None]:
# Standardise each feature. The arrays are flattened to (samples * steps, features)
# so the scaler sees one column per feature; it is fitted on the training split
# only and then applied to the other splits, after which the 3-D shape is restored.
# Padding steps are part of the flattened matrix, so they contribute to the
# fitted mean and scale.
scaler = StandardScaler()
X_train = scaler.fit_transform(
    X_train.reshape(-1, X_train.shape[-1])
).reshape(X_train.shape)
X_test, X_val = (
    scaler.transform(split.reshape(-1, split.shape[-1])).reshape(split.shape)
    for split in (X_test, X_val)
)

In [None]:
# Per-feature means learned by the scaler on the training split.
scaler.mean_

array([ 8.50009556e+04,  4.33559259e+06,  1.17139898e+00,  4.02938765e+00,
        7.64291033e-01, -2.28636426e-01,  3.23954514e+06])

In [None]:
def build_model(filters=128, kernel_size=3, sequence_length=30, n_features=7):
    """Build and compile a causal, dilated Conv1D classifier.

    NOTE: a later cell defines another ``build_model`` that replaces this one
    when the notebook is run top to bottom.

    The first convolution uses dilation 1; it is followed by weight-normalised
    convolutions with dilation 2, 4, ... whose number is derived from
    ceil(log2(sequence_length / (kernel_size - 1))) — presumably so that the
    stack's receptive field spans the whole sequence (TODO confirm).

    Args:
        filters: Number of filters in every Conv1D layer.
        kernel_size: Width of every convolution kernel; must be at least 2.
        sequence_length: Time steps per input sequence. Defaults to 30, the
            value that used to be hard-coded here.
        n_features: Features per time step. Defaults to 7, the value that used
            to be hard-coded here.

    Returns:
        A compiled Sequential model ending in a 4-way softmax, using sparse
        categorical cross-entropy and Adam (learning rate 1e-3, AMSGrad).

    Raises:
        ValueError: If kernel_size is smaller than 2 (the depth formula divides
            by kernel_size - 1).
    """
    if kernel_size < 2:
        raise ValueError(f'kernel_size must be at least 2, got {kernel_size}')

    n_layers = int(np.ceil(np.log2(sequence_length / (kernel_size - 1))))
    dilation_schedule = [2**k for k in range(1, n_layers)]

    model = Sequential()
    model.add(Conv1D(filters, kernel_size, padding='causal', dilation_rate=1, activation='relu',
                     input_shape=(sequence_length, n_features)))
    for rate in dilation_schedule:
        model.add(WeightNormalization(Conv1D(filters, kernel_size, padding='causal', dilation_rate=rate, activation='relu')))
        model.add(Dropout(0.3))
    model.add(Flatten())
    model.add(Dense(64, activation='relu'))
    model.add(Dropout(0.3))
    model.add(Dense(4, activation='softmax'))

    opt = Adam(learning_rate=1e-3, amsgrad=True)
    model.compile(
        optimizer=opt, loss='sparse_categorical_crossentropy', metrics=['accuracy'])

    return model

In [None]:
def build_model(filters=128, kernel_size=3, sequence_length=30):
    """Build and compile a GRU front-end followed by a causal, dilated Conv1D stack.

    NOTE: this redefines the ``build_model`` from the previous cell.

    A GRU (120 units, returning the full sequence) feeds a Conv1D with
    dilation 1, followed by weight-normalised convolutions with dilation
    2, 4, ... whose number is derived from
    ceil(log2(sequence_length / (kernel_size - 1))). No input shape is declared:
    the model is built from the first batch it sees. The original code passed
    ``input_shape`` to the Conv1D layer, but Sequential only honours that
    argument on the first layer, so it has been dropped as inert.

    Args:
        filters: Number of filters in every Conv1D layer.
        kernel_size: Width of every convolution kernel; must be at least 2.
        sequence_length: Sequence length used only to size the dilation
            schedule. Defaults to 30, the value that used to be hard-coded.

    Returns:
        A compiled Sequential model ending in a 4-way softmax, using sparse
        categorical cross-entropy and Adam (learning rate 1e-3, AMSGrad).

    Raises:
        ValueError: If kernel_size is smaller than 2 (the depth formula divides
            by kernel_size - 1).
    """
    if kernel_size < 2:
        raise ValueError(f'kernel_size must be at least 2, got {kernel_size}')

    n_layers = int(np.ceil(np.log2(sequence_length / (kernel_size - 1))))
    dilation_schedule = [2**k for k in range(1, n_layers)]

    model = Sequential()
    model.add(GRU(120, return_sequences=True, dropout=0.3))
    model.add(Conv1D(filters, kernel_size, padding='causal', dilation_rate=1, activation='relu'))
    for rate in dilation_schedule:
        model.add(WeightNormalization(Conv1D(filters, kernel_size, padding='causal', dilation_rate=rate, activation='relu')))
        model.add(Dropout(0.3))
    model.add(Flatten())
    model.add(Dense(64, activation='relu'))
    model.add(Dropout(0.3))
    model.add(Dense(4, activation='softmax'))

    opt = Adam(learning_rate=1e-3, amsgrad=True)
    model.compile(
        optimizer=opt, loss='sparse_categorical_crossentropy', metrics=['accuracy'])

    return model

In [None]:
# Instantiate the network with its default hyper-parameters.
model = build_model()

In [None]:
# Grid-search the batch size for the build_model network, with epochs fixed at 20.
# NOTE(review): `epochs`, `filters` and `kernel_sizes` are defined here but are
# not part of param_grid, and `model` is rebound to the scikit-learn wrapper.
model = KerasClassifier(build_model)
batch_sizes = [32, 64, 128, 256]
epochs = [20, 40, 60]
filters = [64, 128, 256, 512]
kernel_sizes = [3, 4, 5, 6, 7, 8]
param_grid = {'batch_size': batch_sizes, 'epochs': [20]} # candidates not searched yet: 'filters': filters, 'kernel_size': kernel_sizes

grid = GridSearchCV(model, param_grid=param_grid)
grid_result = grid.fit(X_train, y_train)

In [None]:
# Continue with the estimator selected by the grid search.
model = grid.best_estimator_

In [None]:
def build_recurrent():
    """Build and compile a single-layer GRU classifier.

    Architecture: GRU(240, last state only, dropout 0.1) -> Dense(64, relu)
    -> Dropout(0.1) -> Dense(4, softmax). No input shape is declared, so the
    model is built from the first batch it is given.

    Returns:
        A compiled Sequential model using sparse categorical cross-entropy,
        Adam (learning rate 1e-3, AMSGrad) and the accuracy metric.
    """
    network = Sequential([
        GRU(240, return_sequences=False, dropout=0.1),
        Dense(64, activation='relu'),
        Dropout(0.1),
        Dense(4, activation='softmax'),
    ])
    network.compile(
        loss='sparse_categorical_crossentropy',
        optimizer=Adam(learning_rate=1e-3, amsgrad=True),
        metrics=['accuracy'],
    )
    return network

In [None]:
# Replace `model` with a freshly built GRU classifier.
model = build_recurrent()

In [None]:
# Train for 20 epochs with batch size 64, monitoring the validation split.
history = model.fit(X_train, y_train, validation_data=(X_val, y_val), epochs=20, batch_size=64)

In [None]:
# Layer-by-layer summary of the current model.
# NOTE(review): the recorded output lists Conv1D / WeightNormalization layers,
# i.e. it was produced with a build_model() network, not build_recurrent().
model.summary()

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv1d_10 (Conv1D)           (None, 30, 128)           2816      
_________________________________________________________________
weight_normalization_8 (Weig (None, 30, 128)           98689     
_________________________________________________________________
dropout_10 (Dropout)         (None, 30, 128)           0         
_________________________________________________________________
weight_normalization_9 (Weig (None, 30, 128)           98689     
_________________________________________________________________
dropout_11 (Dropout)         (None, 30, 128)           0         
_________________________________________________________________
weight_normalization_10 (Wei (None, 30, 128)           98689     
_________________________________________________________________
dropout_12 (Dropout)         (None, 30, 128)          

In [None]:
# Loss and accuracy on the held-out test split.
model.evaluate(X_test, y_test)



[1.1418743133544922, 0.5082339644432068]

In [None]:
# Persist the trained weights.
# NOTE(review): the file name says "conv1" — confirm it matches the model currently in `model`.
model.save_weights('weights_conv1.h5')

In [None]:
# Restore previously saved weights into the current model (architectures must match).
# NOTE(review): the recorded output below is an array of class probabilities,
# which this call does not return — it looks stale.
model.load_weights('weights_conv1.h5')

array([[0.2554901 , 0.34854382, 0.13163275, 0.26433325],
       [0.20112702, 0.37006223, 0.18135682, 0.24745393],
       [0.2608669 , 0.36554268, 0.27034986, 0.1032406 ],
       ...,
       [0.09008574, 0.06119197, 0.8190936 , 0.02962869],
       [0.42001393, 0.17952359, 0.25247282, 0.14798968],
       [0.22441763, 0.27457747, 0.39391667, 0.10708832]], dtype=float32)

In [None]:
# Hard class predictions (arg-max over the per-class outputs) against the true
# test labels. The first argument of confusion_matrix is the true label.
test_scores = model.predict(X_test)
y_pred = np.argmax(test_scores, axis=1)
confusion_matrix(y_test, y_pred)

array([[2124, 4989, 1053,  216],
       [1100, 9416,  653,  309],
       [ 716, 1923, 3105,   78],
       [ 567, 3186,  388, 1450]])

In [None]:
# Class balance of the training labels, printed as real percentages.
# The previous version printed the raw fraction (e.g. 0.27) next to a '%' sign.
for i in range(4):
    print(f'samples from class {i}: {100 * np.mean(labels == i):.2f} %')

samples from class 0: 0.26656945790125 %
samples from class 1: 0.3656312654087085 %
samples from class 2: 0.1892597104841572 %
samples from class 3: 0.17853956620588432 %


In [None]:
# Base-model outputs for every split; they are the inputs of the stacked model below.
train_pred = model.predict(X_train)
val_pred = model.predict(X_val)
test_pred = model.predict(X_test)

In [None]:
def build_stacked(layers=2, neurons=32, dropout=0.2):
    """Build and compile a small fully connected classifier.

    Args:
        layers: Number of hidden Dense + Dropout blocks.
        neurons: Units in each hidden Dense layer. This used to be ignored in
            favour of a hard-coded 32, which is still the default.
        dropout: Dropout rate after each hidden layer. This used to be ignored
            in favour of a hard-coded 0.2, which is still the default.

    Returns:
        A compiled Sequential model ending in a 4-way softmax, using sparse
        categorical cross-entropy and Adam (learning rate 5e-4). No input
        shape is declared, so the model is built from the first batch it sees.
    """
    model = Sequential()

    for _ in range(layers):
        model.add(Dense(neurons, activation='relu'))
        model.add(Dropout(dropout))
    model.add(Dense(4, activation='softmax'))

    opt = Adam(learning_rate=5e-4)
    model.compile(
        optimizer=opt, loss='sparse_categorical_crossentropy', metrics=['accuracy'])

    return model

In [None]:
# Instantiate the stacked classifier with its default hyper-parameters.
stacked = build_stacked()

In [None]:
# Layer summary of the stacked classifier.
# NOTE(review): build_stacked() declares no input shape, so the model may not be
# built yet at this point — confirm that summary() succeeds before the first fit.
stacked.summary()

In [None]:
# Grid-search the depth of the stacked classifier, with epochs fixed at 10.
# The search is fitted on the base-model predictions (`train_pred`), the same
# inputs the stacked model is trained on further down. The raw 3-D sequence
# array `X_train` does not fit this Dense-only network with per-session labels.
# NOTE(review): `batch_sizes` is defined but not part of param_grid, and `model`
# is rebound to the scikit-learn wrapper of the stacked network.
model = KerasClassifier(build_stacked)
batch_sizes = [32, 64, 128, 256]
layers = [2,3,4,5]
param_grid = {'layers': layers, 'epochs': [10]}

grid = GridSearchCV(model, param_grid=param_grid)
grid_result = grid.fit(train_pred, y_train)

In [None]:
# Train the stacked classifier on the base-model predictions, validating on the
# validation-split predictions. The recorded run was interrupted during epoch 8.
stacked.fit(train_pred, y_train, validation_data=(val_pred, y_val), epochs=20, batch_size=64)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
 431/1851 [=====>........................] - ETA: 3s - loss: 1.0931 - accuracy: 0.5252

KeyboardInterrupt: ignored