## Imports

In [1]:
# <hide-input>
%load_ext autoreload
%autoreload 2

In [2]:
# <hide-input>
from datetime import datetime
from functools import partial
import gc
import json
import math
import os
from pathlib import Path
import re
import subprocess
import sys
import time

import ipywidgets as widgets
from google.cloud import storage, bigquery
from google.cloud.bigquery import SchemaField
import numpy as np
import pandas as pd
import tensorflow as tf
from tqdm.notebook import tqdm


# Let tf.data pick parallelism and prefetch buffer sizes at runtime.
AUTO = tf.data.experimental.AUTOTUNE
# GCS bucket that holds config.json and the TFRecord shards used below.
BUCKET = 'caleb-riiid'
# BigQuery dataset name, combined with PROJECT to build the dataset reference.
DATASET = 'data'
# Location passed to the BigQuery client.
LOCATION = 'europe-west4'
# NOTE(review): not referenced in the visible part of the notebook.
KAGGLE_SUBMIT_DATASET = 'riiid-submission-private'
# Google Cloud project used for both the Storage and BigQuery clients.
PROJECT = 'fastai-caleb'
# NOTE(review): not referenced in the visible part of the notebook.
REPO = 'riiid_2020'
# True when the KAGGLE_URL_BASE environment variable is unset, which this
# notebook treats as "not running on Kaggle".
NOT_KAGGLE = os.getenv('KAGGLE_URL_BASE') is None

# if NOT_KAGGLE:
#     from google.colab import drive
#     DRIVE = Path('/content/drive/My Drive')
#     if not DRIVE.exists():
#         drive.mount(str(DRIVE.parent))
#     sys.path.append(str(DRIVE))
#     g_creds_path = 'credentials/riiid-caleb-faddd0c9d900.json'
#     os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = str(DRIVE/g_creds_path)

# Cloud clients shared by the rest of the notebook.
bucket = storage.Client(project=PROJECT).get_bucket(BUCKET)
dataset = bigquery.Dataset(f'{PROJECT}.{DATASET}')
bq_client = bigquery.Client(project=PROJECT, location=LOCATION)

if NOT_KAGGLE:
    # config.json in the bucket holds key/value settings that are exported as
    # environment variables.
    CONFIG = json.loads(bucket.get_blob('config.json').download_as_string())
    # Update os.environ in place instead of rebinding it to a plain dict:
    # only the real os.environ mapping propagates values to the process
    # environment (and therefore to subprocesses and native libraries).
    # Environment values must be strings, so everything is stringified.
    os.environ.update({str(k): str(v) for k, v in CONFIG.items()})
    # Make the project package importable; avoid piling up duplicates when
    # the cell is re-run.
    if '/home/jupyter' not in sys.path:
        sys.path.append('/home/jupyter')
    from riiid_2020.bqhelpers import BQHelper
    from riiid_2020.queries import Queries

#     from comet_ml import APIExperiment, Experiment
#     from kaggle.api.kaggle_api_extended import KaggleApi
#     kaggle_api = KaggleApi()
#     kaggle_api.authenticate()

# import plotly
# import plotly.express as px
# import plotly.graph_objects as go
# from plotly.subplots import make_subplots
# from sklearn.metrics import roc_auc_score
# from sklearn.preprocessing import MultiLabelBinarizer
# # pd.options.plotting.backend = 'plotly'
# tqdm.pandas()

import matplotlib.pyplot as plt

## Classes

In [3]:
class LRFinder(tf.keras.callbacks.Callback):
    """Learning-rate range test callback.

    Before every batch the optimizer's learning rate is set to a value that
    grows exponentially from ``start`` (batch 0) to ``end`` (batch ``steps``);
    after every batch the reported loss is appended to ``self.losses``.

    Args:
        start: learning rate used for batch 0.
        end: learning rate reached at batch index ``steps``.
        steps: number of batches over which the sweep runs.
    """

    def __init__(self, start=1e-7, end=5, steps=100):
        # Initialise the Keras Callback base class state before adding ours.
        super().__init__()
        self.losses = []
        self.start = start
        self.end = end
        self.steps = steps
        self.best_loss = np.inf

    def on_batch_begin(self, step, logs=None):
        # Exponential interpolation between start and end.
        # `step` is the batch index Keras passes in, which restarts at 0 on
        # every epoch.
        scheduled_lr = self.start * (self.end / self.start) ** (step / self.steps)
        tf.keras.backend.set_value(self.model.optimizer.lr, scheduled_lr)

    def on_batch_end(self, step, logs=None):
        # Keras may call the hook with logs=None; record None in that case
        # rather than failing on `.get`.
        loss = (logs or {}).get('loss')
        self.losses.append(loss)

In [4]:
class OneCycleScheduler(tf.keras.callbacks.Callback):
    """One-cycle learning-rate / momentum (Adam ``beta_1``) schedule.

    The schedule is evaluated once per batch and has four phases:

    1. cosine ramp of the learning rate from ``lr_max * lr_start_factor`` up
       to ``lr_max`` while momentum falls from ``mo_max`` to ``mo_min``;
    2. plateau at ``lr_max`` / ``mo_min``;
    3. cosine anneal down to ``lr_max * lr_end_factor`` while momentum rises
       back to ``mo_max``;
    4. exponential decay of the learning rate by ``decay`` per step.

    Args:
        total_steps: step count the phase percentages refer to.
        steps_up_pct, steps_across_pct, steps_down_pct: fraction of
            ``total_steps`` spent in phases 1, 2 and 3 (truncated to int).
        lr_max: peak learning rate.
        lr_start_factor, lr_end_factor: multipliers of ``lr_max`` giving the
            initial and post-anneal learning rates.
        decay: per-step multiplicative decay used in phase 4.
        mo_max, mo_min: momentum bounds.
        verbose: when truthy, print a summary line at the end of each epoch.

    ``self.schedule(step)`` returns ``(learning_rate, momentum)`` for a step.
    """

    def __init__(self, total_steps=1000, steps_up_pct=0.3, steps_across_pct=.01, steps_down_pct=0.6, lr_max=.001,
                 lr_start_factor=.00003, lr_end_factor=.00001, decay=0.93,
                 mo_max=0.95, mo_min=0.85, verbose=1):
        super().__init__()

        # Global batch counter; incremented before each batch, so the first
        # batch sees step 0.
        self.step = -1
        self.epoch = -1
        self.verbose = verbose

        # Phase lengths and end-point learning rates do not depend on the
        # step, so compute them once instead of on every schedule call.
        steps_up = int(total_steps * steps_up_pct)
        steps_across = int(total_steps * steps_across_pct)
        steps_down = int(total_steps * steps_down_pct)
        lr_start = lr_max * lr_start_factor
        lr_end = lr_max * lr_end_factor
        plateau_end = steps_up + steps_across
        anneal_end = plateau_end + steps_down

        def one_cycle(step):
            # `steps_up > 0` guards the division below: with a tiny
            # total_steps the warm-up length truncates to 0 and step 0 falls
            # straight through to the plateau phase.
            if steps_up > 0 and step <= steps_up:
                cos_up = math.cos((math.pi * step) / steps_up)
                new_lr = (lr_max - lr_start)/2 * (-cos_up + 1) + lr_start
                new_mo = (mo_max - mo_min)/2 * (cos_up + 1) + mo_min

            elif step <= plateau_end:
                new_lr = lr_max
                new_mo = mo_min

            elif step <= anneal_end:
                # Only reachable when steps_down > 0, so the division is safe.
                down_step = step - plateau_end
                cos_down = math.cos((math.pi * down_step) / steps_down)
                new_lr = (lr_max - lr_end)/2 * (cos_down + 1) + lr_end
                new_mo = (mo_max - mo_min)/2 * (-cos_down + 1) + mo_min

            else:
                new_lr = lr_end * decay**(step - anneal_end)
                new_mo = mo_max

            return new_lr, new_mo

        self.schedule = one_cycle

    def on_batch_begin(self, step, logs=None):
        # The Keras-provided `step` restarts every epoch, so keep our own
        # running counter for the schedule.
        self.step += 1
        scheduled_lr, scheduled_mo = self.schedule(self.step)
        tf.keras.backend.set_value(self.model.optimizer.lr, scheduled_lr)
        tf.keras.backend.set_value(self.model.optimizer.beta_1, scheduled_mo)

    def on_epoch_end(self, epoch, logs=None):
        if self.verbose:
            scheduled_lr, scheduled_mo = self.schedule(self.step)
            auc_roc = (logs or {}).get('val_auc_roc')
            auc_roc = auc_roc if auc_roc is not None else 0
            print(f'\nepoch {epoch+1:02d}: val_auc_roc={auc_roc:0.4f}, learning_rate={scheduled_lr:0.2e}, beta_1={scheduled_mo:0.3f}')
            
def plot_lr_sched(one_cycle, total_steps):
    """Plot the learning-rate and momentum curves of a schedule.

    Args:
        one_cycle: object exposing ``schedule(step) -> (lr, momentum)``.
        total_steps: number of steps (starting at 0) to evaluate and plot.

    The figure is shown with learning rate on the primary y-axis and
    momentum on the secondary y-axis.
    """
    # The notebook-level plotly imports are commented out, so import here to
    # keep this helper usable on its own instead of raising NameError.
    import plotly.graph_objects as go
    from plotly.subplots import make_subplots

    steps = list(range(total_steps))
    # Evaluate the schedule once per step and split into the two series.
    values = [one_cycle.schedule(e) for e in steps]

    fig = make_subplots(specs=[[{"secondary_y": True}]])

    fig.add_trace(
        go.Scatter(x=steps,
                   y=[lr for lr, _ in values],
                   name="lr"),
        secondary_y=False,
    )

    fig.add_trace(
        go.Scatter(x=steps,
                   y=[mo for _, mo in values],
                   name="mom"),
        secondary_y=True,
    )

    fig.update_layout(title_text="Learning Rate Schedule")
    fig.update_xaxes(title_text="steps")
    fig.update_yaxes(title_text="learning rate", secondary_y=False)
    fig.update_yaxes(title_text="momentum", secondary_y=True)

    fig.show()

## Strategy

In [5]:
def get_strategy(tpu_name='tpu-4'):
    """Return a TPU distribution strategy, or the default strategy as fallback.

    Args:
        tpu_name: name handed to ``TPUClusterResolver``.

    Returns:
        A ``TPUStrategy`` when the TPU can be resolved; otherwise whatever
        ``tf.distribute.get_strategy()`` returns (the visible physical
        devices are printed in that case).
    """
    try:
        tpu = tf.distribute.cluster_resolver.TPUClusterResolver(tpu_name)
        print('Running on TPU ', tpu.master())
    except Exception as err:
        # Best-effort fallback, but say why: a bare `except:` would also
        # swallow KeyboardInterrupt/SystemExit and hide the reason entirely.
        print(f'TPU {tpu_name!r} not available ({err!r}); using the default strategy')
        tpu = None

    if tpu:
        tf.config.experimental_connect_to_cluster(tpu)
        tf.tpu.experimental.initialize_tpu_system(tpu)
        strategy = tf.distribute.experimental.TPUStrategy(tpu)

    else:
        strategy = tf.distribute.get_strategy()
        for d in tf.config.list_physical_devices():
            print(d)

    return strategy

strategy = get_strategy()

Running on TPU  grpc://10.46.150.234:8470
INFO:tensorflow:Initializing the TPU system: tpu-4


INFO:tensorflow:Initializing the TPU system: tpu-4


INFO:tensorflow:Clearing out eager caches


INFO:tensorflow:Clearing out eager caches


INFO:tensorflow:Finished initializing TPU system.


INFO:tensorflow:Finished initializing TPU system.


INFO:tensorflow:Found TPU system:


INFO:tensorflow:Found TPU system:


INFO:tensorflow:*** Num TPU Cores: 8


INFO:tensorflow:*** Num TPU Cores: 8


INFO:tensorflow:*** Num TPU Workers: 1


INFO:tensorflow:*** Num TPU Workers: 1


INFO:tensorflow:*** Num TPU Cores Per Worker: 8


INFO:tensorflow:*** Num TPU Cores Per Worker: 8


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:localhost/replica:0/task:0/device:CPU:0, CPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:localhost/replica:0/task:0/device:CPU:0, CPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:localhost/replica:0/task:0/device:XLA_CPU:0, XLA_CPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:localhost/replica:0/task:0/device:XLA_CPU:0, XLA_CPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:CPU:0, CPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:CPU:0, CPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:0, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:0, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:1, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:1, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:2, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:2, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:3, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:3, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:4, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:4, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:5, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:5, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:6, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:6, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:7, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:7, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU_SYSTEM:0, TPU_SYSTEM, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU_SYSTEM:0, TPU_SYSTEM, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:XLA_CPU:0, XLA_CPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:XLA_CPU:0, XLA_CPU, 0, 0)


## Setup

In [6]:
# Column name -> dtype string, handed to the query helper when loading frames.
dtypes = dict(
    row_id='int64',
    timestamp='int64',
    user_id='int32',
    content_id='int16',
    content_type_id='int8',
    task_container_id='int16',
    user_answer='int8',
    answered_correctly='int8',
    prior_question_elapsed_time='float32',
    prior_question_had_explanation='bool',
    tid_orig='int16',
)

In [7]:
# Project helper wrapping the bucket, dataset name and BigQuery client.
# NOTE(review): BQHelper is imported only inside `if NOT_KAGGLE:`, so this
# line raises NameError when KAGGLE_URL_BASE is set.
bqh = BQHelper(bucket, DATASET, bq_client)

In [8]:
# Fold ids 0..4; used to filter the training query and to name TFRecord shards.
folds = range(5)

## Datasets

In [9]:
# Interaction table used to build the attention-model sequences.
# - Only rows with content_type_id = 0 are kept.
# - Only users whose maximum task_container_id exceeds 9 are kept.
# - content_id and answered_correctly are shifted by +1 in the query; the
#   downstream pipeline treats 0 as padding and answered_correctly == 2 as a
#   correct answer (see add_mask_to_batch).
# NOTE(review): presumably `from_bq=False` loads the cached 'df_trn_attn.pkl'
# instead of re-running the query -- confirm against BQHelper.
df_trn_attn = bqh.get_df_query_bqs(f"""
    WITH t AS (
        SELECT fold, timestamp, user_id, content_id + 1 content_id, answered_correctly + 1 answered_correctly,
        MAX(task_container_id) OVER(PARTITION BY user_id) tid_max
        FROM data.train
        WHERE content_type_id = 0
    )
    SELECT fold, timestamp, user_id, content_id, answered_correctly
    FROM t
    WHERE tid_max > 9
    AND fold in ({(',').join(map(str,folds))});
""",'df_trn_attn.pkl', from_bq=False, dtypes=dtypes)

In [10]:
def get_seq(exer, resp, max_seq_len=16):
    """Left-pad / truncate one user's exercise and response arrays.

    Each array is cut to its first ``max_seq_len`` entries and left-padded
    with zeros up to ``max_seq_len``; both results are int64 tensors.

    Returns:
        ``(exercise_tensor, response_tensor)``.
    """
    n_pad = tf.maximum(max_seq_len - tf.shape(exer)[0], 0)
    zeros = tf.zeros(n_pad, dtype=tf.int64)

    def _left_pad(values):
        # NOTE(review): this keeps the *first* max_seq_len items, whereas
        # seq_fix_len keeps the last ones -- confirm that is intended.
        head = tf.constant(values[:max_seq_len], dtype=tf.int64)
        return tf.concat([zeros, head], axis=0)

    return _left_pad(exer), _left_pad(resp)

In [11]:
@tf.function
def seq_fix_len(exer, resp, len_seq=100):
    """Force both sequences to length ``len_seq``.

    The last ``len_seq`` items are kept; shorter sequences are left-padded
    with zeros. The pad width is derived from the length of ``exer`` and
    applied to both tensors.
    """
    n_missing = tf.maximum(len_seq - tf.shape(exer)[0], 0)
    left_pad = [[n_missing, 0]]
    exer_fixed = tf.pad(exer[-len_seq:], left_pad)
    resp_fixed = tf.pad(resp[-len_seq:], left_pad)
    return exer_fixed, resp_fixed

In [12]:
@tf.function
def add_mask_to_batch(exer, resp):
    """Turn a batch of (exercise, response) sequences into model inputs.

    Returns:
        ``(exer, resp_prior, mask_current, mask_prior, label)`` where

        * ``resp_prior`` is ``resp`` without its last position,
        * ``mask_current`` combines the padding mask (``exer == 0``) with a
          look-ahead mask over the full sequence length (1.0 = masked),
        * ``mask_prior`` is the same combination with the last position
          dropped from both masks,
        * ``label`` is 1.0 where the last response equals 2, else 0.0.
    """
    # Target: is the final response in each row the "correct" code (2)?
    label = tf.cast(tf.math.equal(resp[:, -1], 2), tf.float32)
    resp_prior = tf.identity(resp)[:, :-1]

    # Padding mask, expanded so it broadcasts over heads and query positions.
    is_pad = tf.cast(tf.math.equal(exer, 0), tf.float32)
    pad_mask = is_pad[:, tf.newaxis, tf.newaxis, :]

    # Look-ahead mask: 1 strictly above the diagonal, 0 elsewhere.
    n = tf.shape(resp)[1]
    lower_tri = tf.linalg.band_part(tf.ones((n, n)), -1, 0)
    future_mask = 1 - lower_tri

    mask_current = tf.maximum(pad_mask, future_mask)
    mask_prior = tf.maximum(pad_mask[:, :, :, :-1], future_mask[:-1, :-1])

    return exer, resp_prior, mask_current, mask_prior, label

### TFRecords 

In [13]:
def _int64_feature(value):
    """Returns an int64_list Feature from a bool / enum / int / uint.

    A list or tuple is used as-is; any other value is treated as a single
    element and wrapped in a one-item list.
    """
    # isinstance (rather than comparing type objects) also accepts list
    # subclasses, and tuples no longer get wrapped into an invalid [tuple].
    if not isinstance(value, (list, tuple)):
        value = [value]

    return tf.train.Feature(int64_list=tf.train.Int64List(value=value))

In [14]:
def serialize_example(group):
    """Serialize one (content_ids, answers) pair to a tf.train.Example string.

    ``group[0]`` is stored under 'content_id' and ``group[1]`` under
    'answered_correctly', both as int64 lists.
    """
    content_ids, answers = group[0], group[1]
    features = tf.train.Features(feature={
        'content_id': _int64_feature(list(content_ids)),
        'answered_correctly': _int64_feature(list(answers)),
    })
    example = tf.train.Example(features=features)
    return example.SerializeToString()

In [15]:
def parse_example(example, len_seq_max=20):
    """Decode one serialized Example into (content_id, answered_correctly).

    Both features are parsed as variable-length int64 lists and returned as
    dense value vectors. ``len_seq_max`` is accepted but not used.
    """
    spec = {
        'content_id': tf.io.VarLenFeature(tf.int64),
        'answered_correctly': tf.io.VarLenFeature(tf.int64),
    }
    parsed = tf.io.parse_single_example(example, spec)
    content_ids = parsed['content_id'].values
    answers = parsed['answered_correctly'].values
    return (content_ids, answers)

In [16]:
# One-off export of per-user sequences to TFRecord shards on GCS.
# Disabled by default; flip the condition to regenerate the shards.
if False:

    # Target number of users per shard file.
    users_per_file = int(3e3)
    out_path = f'gs://{BUCKET}/tfrecords'

    for f in folds:
        # One (content_ids, answers) tuple per user in this fold.
        groups = (df_trn_attn[df_trn_attn.fold == f][['user_id', 'content_id', 'answered_correctly']]
                  .groupby('user_id').apply(lambda r: (r['content_id'].values,
                                                       r['answered_correctly'].values)))
        # At least one shard: np.array_split raises when asked for 0 sections,
        # which happened for folds with fewer users than users_per_file.
        n_files = max(1, len(groups) // users_per_file)

        for i, split in enumerate(np.array_split(groups, n_files)):
            # File name encodes fold, shard index and number of users in it.
            filename = f'sequence-{f}-{i:02d}-{len(split)}.tfrec'
            record_file = f'{out_path}/{filename}'

            with tf.io.TFRecordWriter(record_file) as writer:
                for g in tqdm(split.to_numpy()):
                    writer.write(serialize_example(g))

### Datasets from TFRecords

In [17]:
def get_ds_tfrec(folds, len_seq=20, batch_size=64, repeat=True, shuffle=1000):
    """Build a batched tf.data pipeline from the TFRecord shards of `folds`.

    Args:
        folds: iterable of fold ids whose shard files are read.
        len_seq: fixed sequence length passed to ``seq_fix_len``.
        batch_size: number of sequences per batch.
        repeat: repeat the dataset indefinitely when true.
        shuffle: shuffle buffer size, or None to skip shuffling.

    Returns:
        A prefetched dataset whose elements are the outputs of
        ``add_mask_to_batch``.
    """
    template = 'gs://{BUCKET}/tfrecords/sequence-{f}*.tfrec'
    patterns = [template.format(BUCKET=BUCKET, f=fold) for fold in folds]

    files = tf.data.Dataset.list_files(patterns, shuffle=True)
    files = files.with_options(tf.data.Options())

    records = files.interleave(tf.data.TFRecordDataset, num_parallel_calls=AUTO)
    sequences = records.map(parse_example, num_parallel_calls=AUTO)
    ds = sequences.map(partial(seq_fix_len, len_seq=len_seq), num_parallel_calls=AUTO)

    if shuffle is not None:
        ds = ds.shuffle(shuffle)

    if repeat:
        ds = ds.repeat()

    batches = ds.batch(batch_size).map(add_mask_to_batch, num_parallel_calls=AUTO)
    return batches.prefetch(AUTO)

### Dataset from DataFrames

In [18]:
def get_ds(df, batch_size=1024, repeat=True):
    """Build a shuffled, batched dataset of (features, label) from a frame.

    Args:
        df: DataFrame containing 'row_id', 'answered_correctly' and the
            feature columns. It is not modified.
        batch_size: number of rows per batch.
        repeat: repeat the dataset indefinitely when true.

    Returns:
        A prefetched dataset whose elements are
        ``({column_name: values}, answered_correctly)``.
    """
    y = df['answered_correctly']
    # Drop the id and the label without mutating the caller's frame.
    features = df.drop(columns=['row_id', 'answered_correctly'])
    # from_tensor_slices takes ONE (possibly nested) structure; features and
    # labels must be passed together as a tuple, not as two arguments.
    # A dict of columns keeps each column's own dtype.
    columns = {name: col.values for name, col in features.items()}
    ds = tf.data.Dataset.from_tensor_slices((columns, y.values))
    ds = ds.shuffle(int(5e6))
    ds = ds.repeat() if repeat else ds
    ds = ds.batch(batch_size)
    return ds.prefetch(AUTO)

In [19]:
def get_ds_df(df, len_seq, batch_size, repeat=True, shuffle=None):
    """Build the sequence dataset directly from an in-memory DataFrame.

    Args:
        df: DataFrame with 'user_id', 'content_id' and 'answered_correctly'.
        len_seq: fixed sequence length passed to ``get_seq``.
        batch_size: number of sequences per batch.
        repeat: repeat the dataset indefinitely when true.
        shuffle: shuffle buffer size, or None to skip shuffling.

    Returns:
        A prefetched dataset whose elements are the outputs of
        ``add_mask_to_batch``.
    """
    # One (content_ids, answers) tuple per user.
    group = (df[['user_id', 'content_id', 'answered_correctly']]
             .groupby('user_id').apply(lambda r: (r['content_id'].values,
                                                  r['answered_correctly'].values)))

    # Each element of `group` is a 2-tuple, so it has to be unpacked into
    # get_seq's two positional parameters; a plain map() would pass the whole
    # tuple as `exer` and fail with a missing-argument TypeError.
    padded = [get_seq(exer, resp, max_seq_len=len_seq) for exer, resp in group]
    ex_seq, res_seq = zip(*padded)
    t_slices = (tf.stack(ex_seq, axis=0), tf.stack(res_seq, axis=0))
    ds = tf.data.Dataset.from_tensor_slices(t_slices)

    if shuffle is not None:
        ds = ds.shuffle(shuffle)

    ds = ds.repeat() if repeat else ds
    ds = ds.batch(batch_size).map(add_mask_to_batch, num_parallel_calls=AUTO)

    return ds.prefetch(AUTO)

## Model

In [20]:
with strategy.scope():
    def get_angles(pos, i, d_model):
        """Return pos / 10000^(2*(i//2)/d_model), broadcast over pos and i."""
        exponent = (2 * (i//2)) / np.float32(d_model)
        return pos * (1 / np.power(10000, exponent))

    def positional_encoding(position, d_model):
        """Sinusoidal positional-encoding table.

        Returns a float32 tensor of shape (1, position, d_model): sine on the
        even feature indices and cosine on the odd ones.
        """
        positions = np.arange(position)[:, np.newaxis]
        dims = np.arange(d_model)[np.newaxis, :]
        table = get_angles(positions, dims, d_model)

        # even feature indices (2i) -> sin, odd feature indices (2i+1) -> cos
        table[:, 0::2] = np.sin(table[:, 0::2])
        table[:, 1::2] = np.cos(table[:, 1::2])

        # leading axis of size 1 so the table broadcasts over the batch
        return tf.cast(table[np.newaxis, ...], dtype=tf.float32)

In [21]:
with strategy.scope():
    def scaled_dot_product_attention(q, k, v, mask=None):
        """Calculate the attention weights.
        q, k, v must have matching leading dimensions.
        k, v must have matching penultimate dimension, i.e.: seq_len_k = seq_len_v.
        The mask has different shapes depending on its type (padding or look ahead)
        but it must be broadcastable for addition.

        Args:
        q: query shape == (..., seq_len_q, depth)
        k: key shape == (..., seq_len_k, depth)
        v: value shape == (..., seq_len_v, depth_v)
        mask: Float tensor with shape broadcastable
              to (..., seq_len_q, seq_len_k); entries equal to 1 are masked
              out. Defaults to None (no masking).

        Returns:
        output, attention_weights
        """

        matmul_qk = tf.matmul(q, k, transpose_b=True)  # (..., seq_len_q, seq_len_k)

        # scale matmul_qk by sqrt(depth)
        dk = tf.cast(tf.shape(k)[-1], tf.float32)
        scaled_attention_logits = matmul_qk / tf.math.sqrt(dk)

        # add the mask to the scaled tensor: masked positions get a large
        # negative logit so that softmax drives their weight to ~0.
        if mask is not None:
            scaled_attention_logits += (mask * -1e9)

        # softmax is normalized on the last axis (seq_len_k) so that the scores
        # add up to 1.
        attention_weights = tf.nn.softmax(scaled_attention_logits, axis=-1)  # (..., seq_len_q, seq_len_k)

        output = tf.matmul(attention_weights, v)  # (..., seq_len_q, depth_v)

        return output, attention_weights

In [22]:
# What gets returned are the weighted values for each word in the sequence, along
# with the weights that were applied to the values to produce them.
# The weights are determined by multiplying the embedding representation of each
# word by three trainable matrices (Q, K, V) to get q, k, v, and then multiplying q by k^T.
# embed x (Q, K, V) --> q, k, v --> weights = softmax(q x k^T / sqrt(depth)) --> weighted values = weights x v

In [23]:
def print_out(q, k, v):
    """Run unmasked attention on q, k, v and print the weights and output."""
    output, weights = scaled_dot_product_attention(q, k, v, None)
    for title, tensor in (('Attention weights are:', weights),
                          ('Output is:', output)):
        print(title)
        print(tensor)

In [24]:
# Small sanity check of scaled_dot_product_attention with hand-made tensors.
# Suppress scientific notation so the printed weights are easy to read.
np.set_printoptions(suppress=True)

temp_k = tf.constant([[10,0,0],
                      [0,10,0],
                      [0,0,10],
                      [0,0,10]], dtype=tf.float32)  # (4, 3)

temp_v = tf.constant([[   1,0],
                      [  10,0],
                      [ 100,5],
                      [1000,6]], dtype=tf.float32)  # (4, 2)

# This `query` aligns with the second `key`,
# so the second `value` is returned.
temp_q = tf.constant([[0, 10, 0]], dtype=tf.float32)  # (1, 3)
print_out(temp_q, temp_k, temp_v)

Attention weights are:
tf.Tensor([[0. 1. 0. 0.]], shape=(1, 4), dtype=float32)
Output is:
tf.Tensor([[10.  0.]], shape=(1, 2), dtype=float32)


In [25]:
with strategy.scope():
    class MultiHeadAttention(tf.keras.layers.Layer):
        """Multi-head scaled dot-product attention.

        q, k and v are each projected by their own Dense(d_model) layer,
        split into ``num_heads`` heads of size ``d_model // num_heads``,
        attended per head, concatenated again and passed through a final
        Dense(d_model) layer.
        """

        def __init__(self, d_model, num_heads):
            super().__init__()
            self.num_heads = num_heads
            self.d_model = d_model

            # every head gets an equal slice of the model dimension
            assert d_model % self.num_heads == 0

            self.depth = d_model // self.num_heads

            # input projections
            self.wq = tf.keras.layers.Dense(d_model)
            self.wk = tf.keras.layers.Dense(d_model)
            self.wv = tf.keras.layers.Dense(d_model)

            # output projection applied to the concatenated heads
            self.dense = tf.keras.layers.Dense(d_model)

        def split_heads(self, x, batch_size):
            """Reshape (batch, seq_len, d_model) -> (batch, num_heads, seq_len, depth)."""
            per_head = tf.reshape(x, (batch_size, -1, self.num_heads, self.depth))
            return tf.transpose(per_head, perm=[0, 2, 1, 3])

        def call(self, v, k, q, mask):
            """Return (output, attention_weights).

            output: (batch_size, seq_len_q, d_model)
            attention_weights: (batch_size, num_heads, seq_len_q, seq_len_k)
            """
            batch_size = tf.shape(q)[0]

            # project, then split: each becomes (batch_size, num_heads, seq_len, depth)
            q_heads = self.split_heads(self.wq(q), batch_size)
            k_heads = self.split_heads(self.wk(k), batch_size)
            v_heads = self.split_heads(self.wv(v), batch_size)

            # per-head attention: (batch_size, num_heads, seq_len_q, depth)
            attended, attention_weights = scaled_dot_product_attention(
                q_heads, k_heads, v_heads, mask)

            # back to (batch_size, seq_len_q, num_heads, depth), then merge heads
            attended = tf.transpose(attended, perm=[0, 2, 1, 3])
            merged = tf.reshape(attended, (batch_size, -1, self.d_model))

            # (batch_size, seq_len_q, d_model)
            return self.dense(merged), attention_weights

In [26]:
# Shape check for MultiHeadAttention using self-attention on random input.
temp_mha = MultiHeadAttention(d_model=512, num_heads=8)
y = tf.random.uniform((1, 60, 512))  # (batch_size, encoder_sequence, d_model)

# With d_model = 512 and num_heads = 8, each head has depth = 512 // 8 = 64.
# - The projected q/k/v are split into num_heads slices of size depth.
# - Output and attention weights are computed separately for each head.
# - Each head's weights hold one weight per (query position, key position)
#   pair, i.e. every position in the sequence attends to every position.
# - The per-head outputs are concatenated back to depth d_model, so there are
#   effectively 8 sets of output for each position in the sequence.
# - A final dense layer then decides how to weight each of the d_model
#   values of that concatenation.

out, attn = temp_mha(y, k=y, q=y, mask=None)
out.shape, attn.shape

(TensorShape([1, 60, 512]), TensorShape([1, 8, 60, 60]))

In [27]:
# dff is the depth (hidden width) of the feed-forward network
with strategy.scope():
    def point_wise_feed_forward_network(d_model, dff):
        """Two-layer position-wise MLP: Dense(dff, relu) followed by Dense(d_model)."""
        hidden = tf.keras.layers.Dense(dff, activation='relu')  # (batch_size, seq_len, dff)
        project = tf.keras.layers.Dense(d_model)  # (batch_size, seq_len, d_model)
        return tf.keras.Sequential([hidden, project])

In [28]:
with strategy.scope():
    class EncoderLayer(tf.keras.layers.Layer):
        """One encoder block: self-attention and feed-forward sub-layers.

        Each sub-layer output goes through dropout, is added to its input
        (residual connection) and is layer-normalized.
        """

        def __init__(self, d_model, num_heads, dff, rate=0.1):
            super().__init__()

            self.mha = MultiHeadAttention(d_model, num_heads)
            self.ffn = point_wise_feed_forward_network(d_model, dff)

            self.layernorm1 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
            self.layernorm2 = tf.keras.layers.LayerNormalization(epsilon=1e-6)

            self.dropout1 = tf.keras.layers.Dropout(rate)
            self.dropout2 = tf.keras.layers.Dropout(rate)

        def call(self, exer, mask, training):
            """Return the block output, shape (batch_size, input_seq_len, d_model)."""
            # self-attention sub-layer (attention weights are discarded)
            attended, _ = self.mha(exer, exer, exer, mask)
            attended = self.dropout1(attended, training=training)
            after_attn = self.layernorm1(exer + attended)

            # feed-forward sub-layer
            transformed = self.ffn(after_attn)
            transformed = self.dropout2(transformed, training=training)
            return self.layernorm2(after_attn + transformed)

In [29]:
with strategy.scope():
    class DecoderLayer(tf.keras.layers.Layer):
        """One decoder block: masked self-attention, cross-attention, feed-forward.

        Each sub-layer output goes through dropout, is added to its input
        (residual connection) and is layer-normalized.
        """

        def __init__(self, d_model, num_heads, dff, rate=0.1):
            super().__init__()

            self.mha1 = MultiHeadAttention(d_model, num_heads)
            self.mha2 = MultiHeadAttention(d_model, num_heads)

            self.ffn = point_wise_feed_forward_network(d_model, dff)

            self.layernorm1 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
            self.layernorm2 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
            self.layernorm3 = tf.keras.layers.LayerNormalization(epsilon=1e-6)

            self.dropout1 = tf.keras.layers.Dropout(rate)
            self.dropout2 = tf.keras.layers.Dropout(rate)
            self.dropout3 = tf.keras.layers.Dropout(rate)

        def call(self, resp_prior, enc_output, mask_prior, training):
            """Return (output, self_attention_weights, cross_attention_weights).

            enc_output has shape (batch_size, input_seq_len, d_model); the
            output has shape (batch_size, target_seq_len, d_model).
            """
            # 1) self-attention over the prior responses, using mask_prior
            self_attn, self_weights = self.mha1(resp_prior, resp_prior, resp_prior, mask_prior)
            self_attn = self.dropout1(self_attn, training=training)
            after_self = self.layernorm1(self_attn + resp_prior)

            # 2) cross-attention: queries come from the decoder stream, keys
            #    and values from the encoder output; no mask is passed here
            cross_attn, cross_weights = self.mha2(enc_output, enc_output, after_self, None)
            cross_attn = self.dropout2(cross_attn, training=training)
            after_cross = self.layernorm2(cross_attn + after_self)

            # 3) position-wise feed-forward
            transformed = self.ffn(after_cross)
            transformed = self.dropout3(transformed, training=training)
            output = self.layernorm3(transformed + after_cross)

            return output, self_weights, cross_weights

In [30]:
with strategy.scope():
    class Encoder(tf.keras.layers.Layer):
        """Exercise encoder: embedding + positional encoding + EncoderLayer stack."""

        def __init__(self, num_layers, d_model, num_heads, dff, exer_size,
                     maximum_position_encoding, rate=0.1):
            super().__init__()

            self.d_model = d_model
            self.num_layers = num_layers

            self.embedding = tf.keras.layers.Embedding(exer_size, d_model)
            self.pos_encoding = positional_encoding(maximum_position_encoding,
                                                    self.d_model)

            self.enc_layers = [EncoderLayer(d_model, num_heads, dff, rate)
                               for _ in range(num_layers)]

            self.dropout = tf.keras.layers.Dropout(rate)

        def call(self, exer, mask, training):
            """Encode exercise ids into (batch_size, input_seq_len, d_model)."""
            n_positions = tf.shape(exer)[1]

            # embed, scale by sqrt(d_model), then add the positional table
            # truncated to the actual sequence length
            scale = tf.math.sqrt(tf.cast(self.d_model, tf.float32))
            hidden = self.embedding(exer) * scale
            hidden = hidden + self.pos_encoding[:, :n_positions, :]

            hidden = self.dropout(hidden, training=training)

            for layer in self.enc_layers:
                hidden = layer(hidden, mask, training)

            return hidden

In [31]:
with strategy.scope():
    class Decoder(tf.keras.layers.Layer):
        """Response decoder: embedding + positional encoding + DecoderLayer stack."""

        def __init__(self, num_layers, d_model, num_heads, dff, resp_size,
                     maximum_position_encoding, rate=0.1):
            super().__init__()

            self.d_model = d_model
            self.num_layers = num_layers

            self.embedding = tf.keras.layers.Embedding(resp_size, d_model)
            self.pos_encoding = positional_encoding(
                maximum_position_encoding, d_model)

            self.dec_layers = [DecoderLayer(d_model, num_heads, dff, rate)
                               for _ in range(num_layers)]
            self.dropout = tf.keras.layers.Dropout(rate)

        def call(self, resp_prior, enc_output, mask_prior, training):
            """Return (output, attention_weights).

            output has shape (batch_size, target_seq_len, d_model);
            attention_weights maps 'decoder_layer{n}_block1' and
            'decoder_layer{n}_block2' (n starting at 1) to each layer's
            self- and cross-attention weights.
            """
            n_positions = tf.shape(resp_prior)[1]
            attention_weights = {}

            # embed, scale by sqrt(d_model), then add the positional table
            # truncated to the actual sequence length
            scale = tf.math.sqrt(tf.cast(self.d_model, tf.float32))
            hidden = self.embedding(resp_prior) * scale
            hidden = hidden + self.pos_encoding[:, :n_positions, :]

            hidden = self.dropout(hidden, training=training)

            for layer_no, layer in enumerate(self.dec_layers, start=1):
                hidden, self_w, cross_w = layer(hidden, enc_output, mask_prior, training)
                attention_weights['decoder_layer{}_block1'.format(layer_no)] = self_w
                attention_weights['decoder_layer{}_block2'.format(layer_no)] = cross_w

            return hidden, attention_weights

In [32]:
with strategy.scope():
    class Transformer(tf.keras.Model):
        """Encoder-decoder model emitting a single logit per sequence.

        The encoder consumes exercise ids, the decoder consumes the prior
        responses together with the encoder output; a Dense(1) head is applied
        at every decoder position and the results are summed over positions.
        """

        def __init__(self, num_layers, d_model, num_heads, dff, exer_size,
                     resp_size, pe_input, pe_target, rate=0.1):
            super().__init__()

            self.encoder = Encoder(num_layers, d_model, num_heads, dff,
                                   exer_size, pe_input, rate)

            self.decoder = Decoder(num_layers, d_model, num_heads, dff,
                                   resp_size, pe_target, rate)

            self.final_layer = tf.keras.layers.Dense(1)

        def call(self, exer, resp_prior, mask_current, mask_prior, training):
            """Return (logits, attention_weights); logits is (batch_size, 1)."""
            # (batch_size, inp_seq_len, d_model)
            encoded = self.encoder(exer, mask_current, training)

            # (batch_size, tar_seq_len, d_model)
            decoded, attention_weights = self.decoder(resp_prior, encoded, mask_prior, training)

            # (batch_size, tar_seq_len, 1)
            per_position = self.final_layer(decoded)

            # collapse the sequence axis
            logits = tf.reduce_sum(per_position, -2)
            return logits, attention_weights

In [33]:
with strategy.scope():
    class CustomSchedule(tf.keras.optimizers.schedules.LearningRateSchedule):
        """Warm-up then inverse-square-root learning-rate schedule.

        lr(step) = d_model^-0.5 * min(step^-0.5, step * warmup_steps^-1.5)

        Args:
            d_model: model width; scales the whole schedule.
            warmup_steps: number of steps over which the rate rises linearly
                before the inverse-square-root decay takes over.
        """

        def __init__(self, d_model, warmup_steps=4000):
            super().__init__()

            # Keep the raw value for get_config; self.d_model stays a float
            # tensor as before.
            self._d_model_arg = d_model
            self.d_model = tf.cast(d_model, tf.float32)

            self.warmup_steps = warmup_steps

        def __call__(self, step):
            # The optimizer may pass its integer iteration counter;
            # tf.math.rsqrt only accepts floating-point input.
            step = tf.cast(step, tf.float32)
            arg1 = tf.math.rsqrt(step)
            arg2 = step * (self.warmup_steps ** -1.5)

            return tf.math.rsqrt(self.d_model) * tf.math.minimum(arg1, arg2)

        def get_config(self):
            """Return the constructor arguments so the schedule can be serialized."""
            return {'d_model': self._d_model_arg, 'warmup_steps': self.warmup_steps}

In [34]:
def loss_function(resp, pred):
    """Masked per-position loss over response positions 1..end.

    Targets are 1.0 where the response equals 2; positions whose response is
    0 are excluded, and the summed loss is divided by the number of kept
    positions.

    NOTE(review): `loss_object` is not defined in the visible part of the
    notebook, and this function is replaced by the definition in the next
    cell -- confirm whether it is still needed.
    """
    targets = resp[:, 1:]
    is_correct = tf.cast(tf.math.equal(targets, 2), tf.float32)

    keep = tf.cast(tf.math.not_equal(targets, 0), tf.float32)
    per_position = loss_object(tf.expand_dims(is_correct, -1), tf.expand_dims(pred, -1))
    per_position = per_position * keep

    return tf.reduce_sum(per_position)/tf.reduce_sum(keep)

In [35]:
def loss_function(resp, pred):
    """Summed loss on the last response of each sequence.

    The target is 1.0 where the final response equals 2, else 0.0.

    NOTE(review): `loss_object` is not defined in the visible part of the
    notebook, and the name `loss_function` is later rebound to a
    BinaryCrossentropy instance in the TPU training setup cell.
    """
    last_is_correct = tf.cast(tf.math.equal(resp[:, -1], 2), tf.float32)
    return tf.reduce_sum(loss_object(last_is_correct, pred))

In [37]:
# Scratch arithmetic; evaluates to 24.0 (see the cell output below).
1536/8/8

24.0

## Hyperparameters

In [38]:
# Embedding table sizes: ids were shifted by +1 in the query, so max + 1
# covers every id plus the 0 used for padding.
EXER_SIZE = df_trn_attn.content_id.max() + 1
RESP_SIZE = df_trn_attn.answered_correctly.max() + 1
# Fixed sequence length fed to the model.
LEN_SEQ = 128

# Transformer architecture.
num_layers = 8
d_model = 512
dff = 2048
num_heads = 8
dropout_rate = 0.5

# Global batch size: 64 examples per replica.
BATCH_SIZE = 64 * strategy.num_replicas_in_sync
EPOCHS = 2
# Fold 0 is held out for validation; the remaining folds are for training.
folds_trn, folds_val = folds[1:], folds[:1]

# These counts are numbers of ROWS (interactions) in the selected folds.
# NOTE(review): the TFRecord export writes one example per user, so
# row-based step counts presumably overestimate the steps needed for one
# pass over the data -- confirm. The training loop below also uses a fixed
# 100 steps rather than steps_per_epoch.
num_ex_trn = df_trn_attn.fold.isin(folds_trn).sum()
num_ex_val = df_trn_attn.fold.isin(folds_val).sum()
steps_per_epoch = num_ex_trn // BATCH_SIZE
steps_val = num_ex_val // BATCH_SIZE
steps_per_epoch

9684

## TPU Training

In [39]:
# Create the model, optimizer, loss and metrics under the distribution
# strategy so their variables live on the strategy's devices.
with strategy.scope():
    transformer = Transformer(
        num_layers, d_model, num_heads, dff,
        EXER_SIZE, RESP_SIZE,
        pe_input=10000,
        pe_target=10000,
        rate=dropout_rate,
    )

    # warm-up / inverse-sqrt schedule driving Adam
    learning_rate = CustomSchedule(d_model)
    optimizer = tf.keras.optimizers.Adam(
        learning_rate, beta_1=0.9, beta_2=0.98, epsilon=1e-9)

    # the model outputs logits; the loss is summed rather than averaged
    loss_function = tf.keras.losses.BinaryCrossentropy(
        from_logits=True, reduction=tf.keras.losses.Reduction.SUM)

    # one independent set of metric objects per split, named '<split>_<metric>'
    splits = ['train', 'val']
    metric_fns = {
        'loss': tf.keras.metrics.Mean,
        'accuracy': tf.keras.metrics.BinaryAccuracy,
        'auc': tf.keras.metrics.AUC
    }
    metrics = {}
    for s in splits:
        metrics[s] = {}
        for m, make_metric in metric_fns.items():
            metrics[s][m] = make_metric(name=f'{s}_{m}')

In [40]:
@tf.function
def train_step(transformer, optimizer, loss_function, metrics, ds_iter, steps_per_epoch):
    """Run ``steps_per_epoch`` optimisation steps through ``strategy.run``.

    Args:
        transformer: Model called as
            ``transformer(exer, resp_prior, mask_current, mask_prior, True)``;
            the first element of its return value is used as the prediction.
        optimizer: Optimizer whose ``apply_gradients`` updates
            ``transformer.trainable_variables``.
        loss_function: Callable invoked as ``loss_function(label, pred)``.
        metrics: Mapping of metric name to metric object. The entry keyed
            ``'loss'`` is updated with the loss; every other entry is updated
            with ``(label, sigmoid(pred))``.
        ds_iter: Iterator whose ``next()`` yields the five positional inputs
            ``(exer, resp_prior, mask_current, mask_prior, label)``.
        steps_per_epoch: Number of batches to pull from ``ds_iter``.

    Returns:
        None. Results are accumulated in ``metrics``.
    """
    def replica_step(exer, resp_prior, mask_current, mask_prior, label):
        # Forward pass under the tape (the trailing True is presumably the
        # training flag -- TODO confirm against Transformer.call).
        with tf.GradientTape() as tape:
            pred, _ = transformer(exer, resp_prior, mask_current, mask_prior, True)
            loss = loss_function(label, pred)
            # NOTE(review): the loss is used exactly as loss_function returns
            # it; a commented-out call to
            # tf.nn.compute_average_loss(loss, global_batch_size=BATCH_SIZE)
            # previously sat here.

        trainable = transformer.trainable_variables
        optimizer.apply_gradients(zip(tape.gradient(loss, trainable), trainable))

        # Metrics other than 'loss' receive probabilities, not raw outputs.
        pred_prob = tf.math.sigmoid(pred)
        for name, metric in metrics.items():
            metric_args = (loss,) if name == 'loss' else (label, pred_prob)
            metric(*metric_args)

    for _ in tf.range(steps_per_epoch):
        strategy.run(replica_step, next(ds_iter))
        
@tf.function
def val_step(transformer, loss_function, metrics, ds_iter, steps_per_epoch):
    """Evaluate ``steps_per_epoch`` batches through ``strategy.run``.

    No gradients are taken and no variables are updated by this function;
    it only feeds the metric objects.

    Args:
        transformer: Model called as
            ``transformer(exer, resp_prior, mask_current, mask_prior, False)``;
            the first element of its return value is used as the prediction.
        loss_function: Callable invoked as ``loss_function(label, pred)``.
        metrics: Mapping of metric name to metric object. The entry keyed
            ``'loss'`` is updated with the loss; every other entry is updated
            with ``(label, sigmoid(pred))``.
        ds_iter: Iterator whose ``next()`` yields the five positional inputs
            ``(exer, resp_prior, mask_current, mask_prior, label)``.
        steps_per_epoch: Number of batches to pull from ``ds_iter``.

    Returns:
        None. Results are accumulated in ``metrics``.
    """
    def replica_step(exer, resp_prior, mask_current, mask_prior, label):
        # Trailing False is presumably the training flag -- TODO confirm.
        pred, _ = transformer(exer, resp_prior, mask_current, mask_prior, False)
        loss = loss_function(label, pred)

        # Metrics other than 'loss' receive probabilities, not raw outputs.
        pred_prob = tf.math.sigmoid(pred)
        for name, metric in metrics.items():
            metric_args = (loss,) if name == 'loss' else (label, pred_prob)
            metric(*metric_args)

    for _ in tf.range(steps_per_epoch):
        strategy.run(replica_step, next(ds_iter))

In [41]:
# Training input: dataset built from [0] (presumably a list of fold ids --
# TODO confirm against get_ds_tfrec), distributed across the strategy and
# wrapped in an iterator for train_step.
ds_trn = get_ds_tfrec([0], LEN_SEQ, BATCH_SIZE)
ds_trn_iter = iter(strategy.experimental_distribute_dataset(ds_trn))

# Validation input, built the same way from [1].
# NOTE(review): [0] and [1] are literals here rather than folds_trn /
# folds_val from the hyperparameters cell -- confirm this is intended.
ds_val = get_ds_tfrec([1], LEN_SEQ, BATCH_SIZE)
ds_val_iter = iter(strategy.experimental_distribute_dataset(ds_val))

# Per-split history: one list per metric name, appended to once per epoch.
results = {s: {m: [] for m in metrics[s]} for s in splits}

In [42]:
# Alternate training and validation, then report and reset every metric.
# NOTE(review): the epoch count (10) and the steps per call (100) are
# literals; EPOCHS, steps_per_epoch and steps_val are not used here.
for epoch in range(10):
    start = time.perf_counter()
    train_step(transformer, optimizer, loss_function, metrics['train'], ds_trn_iter, 100)
    val_step(transformer, loss_function, metrics['val'], ds_val_iter, 100)

    # First field of the report line is the 1-based epoch number.
    epoch_results = [f'{epoch + 1}']
    for split, split_metrics in metrics.items():
        for name, metric in split_metrics.items():
            value = metric.result()
            epoch_results.append(f'{split}_{name}: {value:0.4f}')
            results[split][name].append(value)
            # Clear the accumulator so the next epoch starts from zero.
            metric.reset_states()

    # The reported time covers both steps plus the metric readout above.
    epoch_results.append(f'time: {time.perf_counter() - start:0.1f} sec.')
    print(', '.join(epoch_results))

1, train_loss: 16.5610, train_accuracy: 0.5113, train_auc: 0.5108, val_loss: 39.4261, val_accuracy: 0.5366, val_auc: 0.5000, time: 2316.2 sec.
2, train_loss: 6.9257, train_accuracy: 0.5145, train_auc: 0.5131, val_loss: 14.4494, val_accuracy: 0.5371, val_auc: 0.5000, time: 1899.6 sec.
3, train_loss: 4.7807, train_accuracy: 0.5201, train_auc: 0.5198, val_loss: 23.0202, val_accuracy: 0.5374, val_auc: 0.5000, time: 1899.5 sec.


KeyboardInterrupt: 

## CPU Training

In [30]:
# Switch for the CPU-training cells below: each one is wrapped in
# `if cpu_training:` and does nothing while this is False.
cpu_training = False

In [31]:
# CPU path: create the schedule, optimizer, loss and metrics, then plot the
# schedule. This rebinds learning_rate, optimizer and loss_function, which
# the TPU section also assigns.
if cpu_training:
    learning_rate = CustomSchedule(d_model)
    optimizer = tf.keras.optimizers.Adam(learning_rate, beta_1=0.9, beta_2=0.98, 
                                     epsilon=1e-9)
    # Loss takes logits and sums over its inputs.
    loss_function = tf.keras.losses.BinaryCrossentropy(
        from_logits=True, reduction=tf.keras.losses.Reduction.SUM)
    # Running metrics read and reset by the CPU training loop.
    train_loss = tf.keras.metrics.Mean(name='train_loss')
    train_accuracy = tf.keras.metrics.BinaryAccuracy(name='train_accuracy')
    train_auc = tf.keras.metrics.AUC(name='train_auc')

    # Plot the learning-rate schedule evaluated at steps 0..39999.
    plt.plot(learning_rate(tf.range(40000, dtype=tf.float32)))
    plt.ylabel("Learning Rate")
    plt.xlabel("Train Step")

In [32]:
# CPU path: checkpoint the model and optimizer together under
# ./checkpoints/train, keeping at most the five most recent checkpoints.
# NOTE(review): `transformer` is re-created in a later cell; the checkpoint
# built here tracks whichever object the name is bound to at this point.
if cpu_training:
    checkpoint_path = "./checkpoints/train"

    ckpt = tf.train.Checkpoint(transformer=transformer,
                               optimizer=optimizer)

    ckpt_manager = tf.train.CheckpointManager(ckpt, checkpoint_path, max_to_keep=5)

    # if a checkpoint exists, restore the latest checkpoint.
    if ckpt_manager.latest_checkpoint:
        ckpt.restore(ckpt_manager.latest_checkpoint)
        print ('Latest checkpoint restored!!')

In [33]:
# The @tf.function trace-compiles train_step into a TF graph for faster
# execution. The function specializes to the precise shape of the argument
# tensors. To avoid re-tracing due to the variable sequence lengths or variable
# batch sizes (the last batch is smaller), use input_signature to specify
# more generic shapes.

if cpu_training:
    # Two rank-2 int64 inputs followed by two rank-4 float32 masks; every
    # dimension is left as None so a single trace covers all batch shapes.
    train_step_signature = [
        tf.TensorSpec(shape=(None,) * rank, dtype=dtype)
        for rank, dtype in (
            (2, tf.int64),
            (2, tf.int64),
            (4, tf.float32),
            (4, tf.float32),
        )
    ]

    @tf.function(input_signature=train_step_signature)
    def train_step(exer, resp, mask_pad, mask_future):
        """Run one optimisation step and update the CPU training metrics.

        The target is 1.0 where the last column of ``resp`` equals 2 and 0.0
        elsewhere (presumably 2 encodes a correct answer -- TODO confirm).
        Reads the module-level ``transformer``, ``optimizer``,
        ``loss_function``, ``train_loss``, ``train_accuracy`` and
        ``train_auc``.

        Args:
            exer: int64 tensor of rank 2, first model input.
            resp: int64 tensor of rank 2, second model input and source of
                the target.
            mask_pad: float32 tensor of rank 4, third model input.
            mask_future: float32 tensor of rank 4, fourth model input.
        """
        target = tf.cast(tf.math.equal(resp[:, -1], 2), tf.float32)

        with tf.GradientTape() as tape:
            pred, _ = transformer(exer, resp, mask_pad, mask_future, True)
            loss = loss_function(target, pred)

        trainable = transformer.trainable_variables
        optimizer.apply_gradients(zip(tape.gradient(loss, trainable), trainable))

        # Accuracy and AUC receive probabilities, not raw outputs.
        pred_prob = tf.math.sigmoid(pred)
        train_loss(loss)
        for metric in (train_accuracy, train_auc):
            metric(target, pred_prob)

In [34]:
# CPU path: build the fold-0 dataset from the in-memory frame and pull one
# batch `b`, which a later cell unpacks into the model call.
if cpu_training:
    # NOTE(review): this history dict is not appended to in the CPU cells
    # visible here -- confirm whether it is still needed.
    results = {m: [] for m in ['loss', 'accuracy', 'duration']}
    ds_trn = get_ds(df_trn_attn[df_trn_attn.fold == 0], len_seq=LEN_SEQ, batch_size=BATCH_SIZE)
    b = next(iter(ds_trn))

In [35]:
# CPU path: create a fresh model, run it once on the sample batch and show
# its layer summary.
if cpu_training:
    transformer = Transformer(num_layers, d_model, num_heads, dff,
                              EXER_SIZE, RESP_SIZE, 
                              pe_input=10000, 
                              pe_target=10000,
                              rate=dropout_rate)
    
    # One forward pass on the sample batch `b` (presumably needed so the
    # model's weights exist before summary() is called -- TODO confirm).
    pred, _ = transformer(*b, True)

    # summary() prints the table itself and returns None, so it is called
    # directly; wrapping it in print() emitted a stray "None" line.
    transformer.summary()

In [36]:
# CPU path: train for EPOCHS epochs, printing running metrics as it goes.
if cpu_training:
    for epoch in range(EPOCHS):
        start = time.time()

        # Start every epoch with empty running metrics.
        for metric in (train_loss, train_accuracy, train_auc):
            metric.reset_states()

        for batch, (exer, resp, mask_pad, mask_future) in enumerate(ds_trn):
            train_step(exer, resp, mask_pad, mask_future)

            # Progress line on batch 0 and every 50th batch after it.
            if not batch % 50:
                print('Batch {} Loss {:.4f} Accuracy {:.4f} AUC {:.4f}'.format(
                    batch,
                    train_loss.result(),
                    train_accuracy.result(),
                    train_auc.result(),
                ))

            # Stop the epoch after batch index 1000.
            if batch == 1000:
                break

        # Checkpoint saving is currently disabled:
#         if (epoch + 1) % 5 == 0:
#             ckpt_save_path = ckpt_manager.save()
#             print('Saving checkpoint for epoch {} at {}'.format(epoch+1,
#                                                                 ckpt_save_path))

        print('Epoch {} Loss {:.4f} Accuracy {:.4f}'.format(
            epoch + 1, train_loss.result(), train_accuracy.result()))

        print('Time taken for 1 epoch: {} secs\n'.format(time.time() - start))