In [1]:
!pip install iterative-stratification

Collecting iterative-stratification
  Downloading https://files.pythonhosted.org/packages/9d/79/9ba64c8c07b07b8b45d80725b2ebd7b7884701c1da34f70d4749f7b45f9a/iterative_stratification-0.1.6-py3-none-any.whl
Installing collected packages: iterative-stratification
Successfully installed iterative-stratification-0.1.6


In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
#경로 설정
import os
os.chdir('/content/drive/My Drive/Colab Notebooks/운동동작분류AI경진대회')

In [4]:
import warnings
warnings.filterwarnings('ignore')

import tensorflow as tf
tf.random.set_seed(42)
import tensorflow.keras.backend as K
import tensorflow.keras.layers as layers
from tensorflow.keras.callbacks import Callback, ReduceLROnPlateau, ModelCheckpoint, EarlyStopping

import os, gc, random, datetime
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.metrics import roc_auc_score, roc_curve
import matplotlib.pyplot as plt
from tqdm.notebook import tqdm
from joblib import dump, load
from time import time
import scipy as sp
import scipy.fftpack

from iterstrat.ml_stratifiers import MultilabelStratifiedKFold

print("Tensorflow version " + tf.__version__)
AUTO = tf.data.experimental.AUTOTUNE

Tensorflow version 2.4.1


# Preprocessing

In [5]:
# 데이터 불러오기

path = './data/'
train = pd.read_csv(path + 'train_features.csv')
train_label = pd.read_csv(path + 'train_labels.csv')
test = pd.read_csv(path + 'test_features.csv')
submission = pd.read_csv(path + 'sample_submission.csv')

In [11]:
# Pre-Processing Effect on the Accuracy of Event-Based Activity Segmentation and Classification through Inertial Sensors 
# https://www.researchgate.net/publication/281836367_Pre-Processing_Effect_on_the_Accuracy_of_Event-Based_Activity_Segmentation_and_Classification_through_Inertial_Sensors

train['acc_t']  = train.apply(lambda x : (x['acc_x']**2 + x['acc_y'] **2 +  x['acc_z'] **2)**(1/2), axis=1)
test['acc_t']  = test.apply(lambda x : (x['acc_x']**2 + x['acc_y'] **2 +  x['acc_z'] **2)**(1/2), axis=1)
train['gy_t']  = train.apply(lambda x : (x['gy_x']**2 + x['gy_y'] **2 +  x['gy_z'] **2)**(1/2), axis=1)
test['gy_t']  = test.apply(lambda x : (x['gy_x']**2 + x['gy_y'] **2 +  x['gy_z'] **2)**(1/2), axis=1)

# SVM selected features
train['mean'] = train[['acc_x','acc_y']].mean(axis=1)
train['median'] = train[['acc_y', 'gy_z', 'gy_t']].median(axis=1)
train['standard_deviation'] = train[['acc_x', 'acc_y']].std(axis=1)
train['interquartile'] = train.quantile(.75, axis=1) - train.quantile(.25, axis=1)
train['FFT_acc_x'] = sp.fftpack.fft(np.array(train['acc_x']))
train['FFT_acc_y'] = sp.fftpack.fft(np.array(train['acc_y']))
train['FFT_acc_t'] = sp.fftpack.fft(np.array(train['acc_t']))
train['FFT_gy_z'] = sp.fftpack.fft(np.array(train['gy_z']))

test['mean'] = test[['acc_x','acc_y']].mean(axis=1)
test['median'] = test[['acc_y', 'gy_z', 'gy_t']].median(axis=1)
test['standard_deviation'] = test[['acc_x', 'acc_y']].std(axis=1)
test['interquartile'] = test.quantile(.75, axis=1) - test.quantile(.25, axis=1)
test['FFT_acc_x'] = sp.fftpack.fft(np.array(test['acc_x']))
test['FFT_acc_y'] = sp.fftpack.fft(np.array(test['acc_y']))
test['FFT_acc_t'] = sp.fftpack.fft(np.array(test['acc_t']))
test['FFT_gy_z'] = sp.fftpack.fft(np.array(test['gy_z']))

In [15]:
x = np.array(train.iloc[:,2:]).reshape(-1, 600, 16)
y = tf.keras.utils.to_categorical(train_label['label'])
test = np.array(test.iloc[:,2:]).reshape(-1, 600, 16)

# Training

Base Transformer structure from https://www.tensorflow.org/tutorials/text/transformer, modified with Swish activation function.

In [16]:
def scaled_dot_product_attention(q, k, v, mask):
    """Calculate the attention weights.
    q, k, v must have matching leading dimensions.
    k, v must have matching penultimate dimension, i.e.: seq_len_k = seq_len_v.
    The mask has different shapes depending on its type(padding or look ahead) 
    but it must be broadcastable for addition.

    Args:
    q: query shape == (..., seq_len_q, depth)
    k: key shape == (..., seq_len_k, depth)
    v: value shape == (..., seq_len_v, depth_v)
    mask: Float tensor with shape broadcastable 
          to (..., seq_len_q, seq_len_k). Defaults to None.

    Returns:
    output, attention_weights
    """

    matmul_qk = tf.matmul(q, k, transpose_b = True)  # (..., seq_len_q, seq_len_k)

    # scale matmul_qk
    dk = tf.cast(tf.shape(k)[-1], tf.float32)
    scaled_attention_logits = matmul_qk / tf.math.sqrt(dk)

    # add the mask to the scaled tensor.
    if mask is not None:
        
        scaled_attention_logits += (mask * -1e9)  

    # softmax is normalized on the last axis (seq_len_k) so that the scores
    # add up to 1.
    attention_weights = tf.nn.softmax(scaled_attention_logits, axis = -1)  # (..., seq_len_q, seq_len_k)

    output = tf.matmul(attention_weights, v)  # (..., seq_len_q, depth_v)

    return output, attention_weights

class MultiHeadAttention(tf.keras.layers.Layer):
    
    def __init__(self, d_model, num_heads):
        
        super(MultiHeadAttention, self).__init__()
        self.num_heads = num_heads
        self.d_model = d_model

        assert d_model % self.num_heads == 0

        self.depth = d_model // self.num_heads

        self.wq = tf.keras.layers.Dense(d_model)
        self.wk = tf.keras.layers.Dense(d_model)
        self.wv = tf.keras.layers.Dense(d_model)

        self.dense = tf.keras.layers.Dense(d_model)
        
    def split_heads(self, x, batch_size):
        """Split the last dimension into (num_heads, depth).
        Transpose the result such that the shape is (batch_size, num_heads, seq_len, depth)
        """
        x = tf.reshape(x, (batch_size, -1, self.num_heads, self.depth))
        return tf.transpose(x, perm = [0, 2, 1, 3])
    
    def call(self, v, k, q, mask):
        
        batch_size = tf.shape(q)[0]

        q = self.wq(q)  # (batch_size, seq_len, d_model)
        k = self.wk(k)  # (batch_size, seq_len, d_model)
        v = self.wv(v)  # (batch_size, seq_len, d_model)

        q = self.split_heads(q, batch_size)  # (batch_size, num_heads, seq_len_q, depth)
        k = self.split_heads(k, batch_size)  # (batch_size, num_heads, seq_len_k, depth)
        v = self.split_heads(v, batch_size)  # (batch_size, num_heads, seq_len_v, depth)

        # scaled_attention.shape == (batch_size, num_heads, seq_len_q, depth)
        # attention_weights.shape == (batch_size, num_heads, seq_len_q, seq_len_k)
        scaled_attention, attention_weights = scaled_dot_product_attention(
            q, k, v, mask)

        scaled_attention = tf.transpose(scaled_attention, perm = [0, 2, 1, 3])  # (batch_size, seq_len_q, num_heads, depth)

        concat_attention = tf.reshape(scaled_attention, 
                                      (batch_size, -1, self.d_model))  # (batch_size, seq_len_q, d_model)

        output = self.dense(concat_attention)  # (batch_size, seq_len_q, d_model)
        
        return output, attention_weights

def point_wise_feed_forward_network(d_model, dff):
    
    return tf.keras.Sequential([
      tf.keras.layers.Dense(dff, activation = 'relu'),  # (batch_size, seq_len, dff)
      tf.keras.layers.Dense(d_model)  # (batch_size, seq_len, d_model)
    ])

class EncoderLayer(tf.keras.layers.Layer):
    
    def __init__(self, d_model, num_heads, dff, rate = 0.1):
        
        super(EncoderLayer, self).__init__()

        self.mha = MultiHeadAttention(d_model, num_heads)
        self.ffn = point_wise_feed_forward_network(d_model, dff)

        self.layernorm1 = tf.keras.layers.LayerNormalization(epsilon = 1e-6)
        self.layernorm2 = tf.keras.layers.LayerNormalization(epsilon = 1e-6)

        self.dropout1 = tf.keras.layers.Dropout(rate)
        self.dropout2 = tf.keras.layers.Dropout(rate)

    def call(self, x, training, mask):

        attn_output, _ = self.mha(x, x, x, mask)  # (batch_size, input_seq_len, d_model)
        attn_output = self.dropout1(attn_output, training = training)
        out1 = self.layernorm1(x + attn_output)  # (batch_size, input_seq_len, d_model)

        ffn_output = self.ffn(out1)  # (batch_size, input_seq_len, d_model)
        ffn_output = self.dropout2(ffn_output, training = training)
        out2 = self.layernorm2(out1 + ffn_output)  # (batch_size, input_seq_len, d_model)

        return out2

class TransformerEncoder(tf.keras.layers.Layer):
    
    def __init__(self, num_layers, d_model, num_heads, dff, 
                 maximum_position_encoding, rate = 0.1):
        
        super(TransformerEncoder, self).__init__()

        self.d_model = d_model
        self.num_layers = num_layers
        self.num_heads = num_heads
        self.dff = dff
        self.maximum_position_encoding = maximum_position_encoding
        self.rate = rate

#         self.pos_encoding = positional_encoding(self.maximum_position_encoding, 
#                                                 self.d_model)
#         self.embedding = tf.keras.layers.Dense(self.d_model)
        self.pos_emb = tf.keras.layers.Embedding(input_dim = self.maximum_position_encoding, 
                                                 output_dim = self.d_model)

        self.enc_layers = [EncoderLayer(self.d_model, self.num_heads, self.dff, self.rate) 
                           for _ in range(self.num_layers)]

        self.dropout = tf.keras.layers.Dropout(self.rate)
        
    def get_config(self):

        config = super().get_config().copy()
        config.update({
            'num_layers': self.num_layers,
            'd_model': self.d_model,
            'num_heads': self.num_heads,
            'dff': self.dff,
            'maximum_position_encoding': self.maximum_position_encoding,
            'dropout': self.dropout,
        })
        return config

    def call(self, x, training, mask = None):

        seq_len = tf.shape(x)[1]

        # adding embedding and position encoding.
#         x += self.pos_encoding[:, :seq_len, :]
#         x = self.embedding(x)
        positions = tf.range(start = 0, limit = seq_len, delta = 1)
        x += self.pos_emb(positions)

        x = self.dropout(x, training = training)

        for i in range(self.num_layers):

            x = self.enc_layers[i](x, training, mask)

        return x  # (batch_size, input_seq_len, d_model)

In [17]:
def create_transformer_model(num_columns, num_labels, num_layers, d_model, num_heads, dff, window_size, dropout_rate, weight_decay, label_smoothing, learning_rate):
    
    inp = tf.keras.layers.Input(shape = (window_size, num_columns))
    x = tf.keras.layers.BatchNormalization()(inp)
    x = tf.keras.layers.Dense(d_model)(x)
    x = tf.keras.layers.BatchNormalization()(x)
    x = tf.keras.layers.Activation('relu')(x)
    x = tf.keras.layers.SpatialDropout1D(dropout_rate)(x)
    x = TransformerEncoder(num_layers, d_model, num_heads, dff, window_size, dropout_rate)(x)
    out = tf.keras.layers.Dense(num_labels, activation = 'softmax')(x[:, -1, :])
    
    model = tf.keras.models.Model(inputs = inp, outputs = out)
    model.compile(optimizer = 'adam', loss = 'categorical_crossentropy', metrics = ['AUC'])
    
    return model

In [18]:
batch_size = 64
num_layers = 1
d_model = 128
num_heads = 1
dff = 128
window_size = 600
dropout_rate = 0.15
weight_decay = 0
label_smoothing = 1e-2
learning_rate = 1e-3
verbose = 1

# Train-Test-Split Training

Split the train set into three folds, i.e., training-1, training-2 and validation sets. First, train the more on training-1 set and validate it on the validation set. Then use the training-2 set to find the best number of finetuning epochs. Finally, finetune on both training-2 and validation sets and submit.

In [19]:
# 데이터 증강
def aug(data, shift):
    shift_data = np.roll(data, shift, axis=2)
    return shift_data

# 모델 1번: Transformer

def build_transformer(split_num, train, target, test, rnd):
    start_time_fold = time()
    # return train pred prob and test pred prob 
    test_pred = np.zeros((test.shape[0], 61))

    ckp_path = 'JSTransformer.hdf5'

    rlr = ReduceLROnPlateau(monitor = 'val_loss', factor = 0.1, patience = 3, verbose = verbose, min_delta = 1e-4, mode = 'min')
    ckp = ModelCheckpoint(ckp_path, monitor = 'val_loss', verbose = 0, save_best_only = True, save_weights_only = True, mode = 'min')
    es = EarlyStopping(monitor = 'val_loss', min_delta = 1e-4, patience = 4, mode = 'min', baseline = None, restore_best_weights = True, verbose = 0)

    mskf = MultilabelStratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    for train_idx, val_idx in mskf.split(train, target):

        # split train, validation set
        X = train[train_idx]
        y = target[train_idx]
        valid_x = train[val_idx]
        valid_y = target[val_idx]

        #가벼운 모델 생성
        model = create_transformer_model(train.shape[2], 61, num_layers, d_model, num_heads, dff, window_size, dropout_rate, weight_decay, label_smoothing, learning_rate)

        model.fit(X, y, epochs = 100,
                  validation_data = (valid_x, valid_y),
                  batch_size = batch_size,
                  callbacks = [rlr, ckp, es],
                  verbose = verbose)
        
        # save feat
        model.load_weights(ckp_path)
        test_pred += model.predict(test)/split_num
        
        # release
        del model
        gc.collect()
        print('  ==============================================================================================  ')

        
    return test_pred

transformer_test = build_transformer(5, x, y, test, 1)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100

Epoch 00028: ReduceLROnPlateau reducing learning rate to 0.00010000000474974513.
Epoch 29/100
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100

Epoch 00017: ReduceLROnPlateau reducing learning rate to 0.00010000000474974513.
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100

Epoch 00028: ReduceLROnPlateau reducing learning rate to 1.0000000474974514e-05.
Epoch 29/100
Epoch 1/100
Epoch 

# Submitting

In [20]:
sample_submssion = pd.read_csv(path + 'sample_submission.csv')
sample_submssion.iloc[:,1:] = transformer_test
sample_submssion.to_csv("transformer.csv", index = False)
sample_submssion

Unnamed: 0,id,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60
0,3125,0.000004,3.199803e-06,5.679599e-06,0.000540,0.000351,0.000002,1.067405e-04,0.000001,0.000171,0.029167,1.819316e-02,0.210785,5.442244e-02,1.000304e-01,1.162669e-01,0.000413,5.373705e-07,7.817568e-05,3.283367e-05,5.963559e-05,1.570774e-05,1.583519e-06,0.000165,0.000759,2.887614e-01,0.000029,0.002046,5.954775e-06,0.000395,6.126476e-04,0.001438,0.000657,0.003723,0.000020,0.000262,1.342923e-06,0.000111,5.264769e-05,0.000924,0.000389,0.000024,0.000002,0.002101,0.001833,7.270281e-05,1.954279e-04,3.956714e-05,0.000257,0.017404,0.000370,0.000094,0.010349,6.601973e-02,3.972859e-05,8.774088e-06,0.000007,9.427548e-07,2.515463e-04,3.636388e-02,1.086393e-05,0.033550
1,3126,0.000457,7.930650e-06,7.342406e-06,0.001670,0.000012,0.000050,4.953320e-07,0.003520,0.000012,0.000332,5.847937e-07,0.000067,1.670109e-06,1.897715e-05,6.311990e-06,0.000726,1.085518e-03,1.277019e-06,7.844548e-05,5.235327e-07,1.519000e-05,6.711196e-04,0.000838,0.000710,4.792686e-06,0.000011,0.975872,4.170228e-06,0.000003,6.544556e-06,0.000253,0.000049,0.000080,0.000233,0.001613,6.929004e-05,0.000260,6.151832e-07,0.000005,0.000006,0.000654,0.000026,0.000392,0.000052,3.987473e-06,7.886456e-05,4.411476e-07,0.000014,0.003554,0.000089,0.000125,0.000025,5.362591e-06,6.733638e-05,3.943343e-04,0.000036,9.141508e-05,6.366154e-05,7.181816e-07,5.131632e-06,0.005565
2,3127,0.000951,2.363748e-02,1.436562e-06,0.000013,0.000002,0.000056,9.751473e-04,0.014894,0.014993,0.000004,7.243217e-07,0.000011,9.250743e-05,7.579114e-07,4.507687e-05,0.000140,3.351221e-04,4.093389e-05,6.890607e-06,2.871860e-06,1.890440e-07,8.199413e-06,0.000002,0.000003,6.015507e-06,0.000007,0.001512,3.752296e-04,0.000541,9.923330e-05,0.000003,0.000008,0.000385,0.000661,0.000125,8.378948e-06,0.000041,1.218698e-03,0.000528,0.000181,0.001109,0.000014,0.000089,0.000208,3.926054e-02,8.652668e-01,1.212111e-06,0.001988,0.014284,0.003481,0.000107,0.000026,1.384518e-04,2.717580e-07,1.062233e-02,0.000001,3.516010e-04,1.940584e-07,1.358634e-05,2.245386e-04,0.000897
3,3128,0.001685,1.445622e-06,5.516265e-05,0.000043,0.000024,0.000018,8.604532e-08,0.000209,0.000047,0.000001,6.892806e-07,0.000002,1.653091e-07,1.670446e-08,2.031162e-07,0.000113,3.537409e-05,3.424891e-07,1.761731e-05,6.727868e-08,6.628845e-08,1.615631e-04,0.000408,0.000040,8.882908e-07,0.000011,0.988166,5.227134e-07,0.000008,9.381392e-06,0.000086,0.000014,0.000530,0.000152,0.000093,2.485258e-06,0.000006,1.258721e-07,0.000002,0.000002,0.000002,0.000043,0.000057,0.000028,3.009510e-07,4.714276e-07,4.157913e-06,0.000002,0.000883,0.001989,0.004250,0.000002,9.195410e-07,2.587433e-06,3.836748e-06,0.000005,5.851774e-07,1.239584e-04,1.794896e-07,9.008100e-07,0.000655
4,3129,0.001282,1.087125e-06,7.811357e-07,0.000006,0.001859,0.000459,3.822749e-06,0.000208,0.000070,0.000063,9.578569e-06,0.000007,2.531105e-07,8.582837e-07,8.237320e-06,0.000211,5.440887e-06,2.060923e-06,1.816982e-03,2.860328e-07,2.958379e-06,1.968860e-05,0.011647,0.000009,6.877446e-06,0.000235,0.961295,4.541154e-07,0.000033,4.615864e-06,0.000286,0.000022,0.002564,0.000011,0.000199,5.231528e-07,0.000017,1.496658e-06,0.000005,0.000023,0.000010,0.000030,0.000002,0.000002,2.158898e-06,1.778826e-06,1.156243e-05,0.000010,0.001129,0.001543,0.014131,0.000001,4.921151e-07,6.991006e-07,1.641918e-07,0.000005,1.858679e-06,9.214650e-06,6.478103e-07,5.906661e-05,0.000653
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
777,3902,0.004295,2.460296e-06,6.039345e-06,0.000006,0.000659,0.000039,8.003377e-07,0.000863,0.000103,0.000364,5.211375e-05,0.000037,1.021327e-06,5.597417e-07,1.083226e-05,0.002453,7.915429e-06,5.686478e-06,3.163069e-03,7.051581e-07,2.759230e-07,2.181601e-05,0.002383,0.000014,3.272359e-06,0.000143,0.974144,8.243088e-07,0.000080,1.253799e-05,0.000248,0.000036,0.004681,0.000012,0.001504,2.697787e-05,0.000042,5.827691e-07,0.000002,0.000027,0.000011,0.000011,0.000004,0.000003,2.359547e-06,1.236040e-06,3.265007e-05,0.000007,0.000237,0.001710,0.001272,0.000005,1.161548e-06,1.162374e-05,1.826842e-07,0.000006,5.595795e-07,1.481665e-05,7.298780e-07,4.678393e-05,0.001169
778,3903,0.022230,9.198171e-06,8.071564e-05,0.000288,0.002483,0.000127,7.835117e-06,0.001747,0.000094,0.000090,5.739951e-05,0.000014,2.365059e-06,1.823477e-06,8.734289e-06,0.007329,2.081825e-05,2.520397e-05,1.155536e-02,3.864913e-06,7.229333e-06,2.609942e-04,0.008220,0.000048,1.150680e-05,0.000127,0.939080,4.192576e-06,0.000039,8.053523e-07,0.000164,0.000062,0.001131,0.000020,0.000237,1.299454e-04,0.000074,7.802985e-06,0.000004,0.000041,0.000179,0.000011,0.000007,0.000002,1.245788e-06,6.632108e-07,5.206932e-05,0.000010,0.000329,0.000960,0.001821,0.000021,1.440705e-06,1.188457e-05,2.041688e-06,0.000028,1.784824e-05,7.061465e-05,3.157177e-06,3.162059e-04,0.000308
779,3904,0.000322,1.333728e-07,9.506515e-07,0.000013,0.000204,0.000043,1.011551e-06,0.000028,0.000028,0.000212,3.254090e-06,0.000023,3.983599e-07,9.722271e-07,2.104642e-06,0.000053,1.037531e-06,1.786107e-07,7.756500e-05,4.798668e-08,9.165767e-07,1.339309e-05,0.007310,0.000011,7.914059e-06,0.000018,0.984760,1.202762e-07,0.000017,5.162187e-06,0.000750,0.000023,0.001841,0.000005,0.000260,1.278571e-07,0.000011,7.008268e-07,0.000002,0.000013,0.000003,0.000002,0.000004,0.000003,3.066692e-07,1.009424e-06,4.535526e-06,0.000014,0.001172,0.000492,0.001342,0.000002,9.056953e-07,2.097317e-06,3.356830e-07,0.000005,1.960888e-07,7.883888e-06,2.226534e-07,5.031873e-06,0.000879
780,3905,0.000212,1.477733e-02,1.229445e-05,0.000007,0.000007,0.000037,3.718256e-02,0.000495,0.000134,0.000049,8.229397e-06,0.000003,1.187087e-05,1.331099e-05,7.726767e-07,0.000011,2.238695e-04,6.130533e-04,8.102581e-07,2.506042e-05,1.763265e-06,3.454903e-07,0.000005,0.000002,1.014279e-04,0.000050,0.002166,2.586949e-03,0.000007,3.180867e-04,0.000020,0.000005,0.000727,0.000148,0.000005,9.580469e-06,0.000001,8.555789e-01,0.000004,0.000001,0.001860,0.000661,0.000009,0.000006,4.602746e-04,2.687839e-03,1.804904e-05,0.065880,0.000422,0.001031,0.000004,0.000009,5.042593e-05,1.017475e-05,9.958289e-04,0.000010,1.770676e-03,1.285139e-06,1.494896e-05,8.475245e-03,0.000060


In [21]:
# https://www.kaggle.com/gogo827jz/jane-street-ffill-transformer-baseline
# https://wikidocs.net/31379
# https://www.tensorflow.org/tutorials/text/transformer