In [None]:
# <hide-input>
%load_ext autoreload
%autoreload 2

In [None]:
# <hide-input>
from datetime import datetime
import gc
import json
import math
import os
from pathlib import Path
import re
import subprocess
import sys
import time

import ipywidgets as widgets
from google.cloud import storage, bigquery
from google.cloud.bigquery import SchemaField
import numpy as np
import pandas as pd
import tensorflow as tf
from tqdm.notebook import tqdm


# Notebook-wide configuration.
AUTO = tf.data.experimental.AUTOTUNE  # passed to tf.data as num_parallel_calls / prefetch size
BUCKET = 'caleb-riiid'  # GCS bucket used by the storage client and in gs:// paths
DATASET = 'data'  # BigQuery dataset name interpolated into the SQL below
LOCATION = 'europe-west4'  # location handed to the BigQuery client
KAGGLE_SUBMIT_DATASET = 'riiid-submission-private'
PROJECT = 'fastai-caleb'  # GCP project used for both the storage and BigQuery clients
REPO = 'riiid_2020'
NOT_KAGGLE = os.getenv('KAGGLE_URL_BASE') is None  # True when KAGGLE_URL_BASE is not set

# if NOT_KAGGLE:
#     from google.colab import drive
#     DRIVE = Path('/content/drive/My Drive')
#     if not DRIVE.exists():
#         drive.mount(str(DRIVE.parent))
#     sys.path.append(str(DRIVE))
#     g_creds_path = 'credentials/riiid-caleb-faddd0c9d900.json'
#     os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = str(DRIVE/g_creds_path)

# Cloud clients shared by the rest of the notebook.
bucket = storage.Client(project=PROJECT).get_bucket(BUCKET)
dataset = bigquery.Dataset(f'{PROJECT}.{DATASET}')
bq_client = bigquery.Client(project=PROJECT, location=LOCATION)

if NOT_KAGGLE:
    # Outside Kaggle: read config.json from the bucket and export its entries as env vars.
    CONFIG = json.loads(bucket.get_blob('config.json').download_as_string())
    # Update the real environment mapping in place. Rebinding `os.environ` to a plain dict
    # never calls putenv(), so native libraries and child processes would not see the values.
    # Environment values must be strings, hence the str() conversion.
    os.environ.update({str(key): str(value) for key, value in CONFIG.items()})
    sys.path.append('/home/jupyter')
    from riiid_2020.bqhelpers import BQHelper
    from riiid_2020.queries import Queries

    from comet_ml import APIExperiment, Experiment
    from kaggle.api.kaggle_api_extended import KaggleApi
    kaggle_api = KaggleApi()
    kaggle_api.authenticate()

import plotly
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import MultiLabelBinarizer
# Route DataFrame.plot() through plotly and register progress_apply on pandas objects.
pd.options.plotting.backend = 'plotly'
tqdm.pandas()

In [None]:
class LRFinder(tf.keras.callbacks.Callback):
    """Learning-rate range test.

    Sets the optimizer learning rate before every batch, growing it exponentially from
    `start` (batch 0) to `end` (batch `steps`), and records the loss reported after each
    batch in `self.losses`.

    Args:
        start: learning rate used for batch 0.
        end: learning rate reached at batch index `steps`.
        steps: number of batches over which the rate grows from `start` to `end`.
    """

    def __init__(self, start=1e-7, end=5, steps=100):
        # Initialise the Keras base class so its own attributes exist before training starts.
        super(LRFinder, self).__init__()
        self.losses = []
        self.start = start
        self.end = end
        self.steps = steps
        self.best_loss = np.inf  # kept for compatibility; not updated by this callback

    def on_batch_begin(self, step, logs=None):
        # `step` is the batch index Keras passes in; exponential interpolation start -> end.
        scheduled_lr = self.start * (self.end / self.start) ** (step / self.steps)
        tf.keras.backend.set_value(self.model.optimizer.lr, scheduled_lr)

    def on_batch_end(self, step, logs=None):
        # Tolerate a missing logs dict; a missing loss is recorded as None.
        self.losses.append((logs or {}).get('loss'))

In [None]:
class OneCycleScheduler(tf.keras.callbacks.Callback):
    """One-cycle learning-rate / momentum schedule applied before every batch.

    The schedule has four phases, sized as fractions of `total_steps`:
      1. cosine ramp of the LR from `lr_max * lr_start_factor` up to `lr_max`
         (momentum goes from `mo_max` down to `mo_min`),
      2. hold at `lr_max` / `mo_min`,
      3. cosine ramp of the LR down to `lr_max * lr_end_factor`
         (momentum goes back up to `mo_max`),
      4. afterwards, the end LR multiplied by `decay` for every extra step.

    `self.schedule(step)` returns `(learning_rate, momentum)` for a global step index.
    The momentum value is written to the optimizer's `beta_1`.

    Args:
        total_steps: number of batches the phase fractions refer to.
        steps_up_pct / steps_across_pct / steps_down_pct: fraction of `total_steps`
            spent in phases 1, 2 and 3.
        lr_max: peak learning rate.
        lr_start_factor / lr_end_factor: start / end LR as a fraction of `lr_max`.
        decay: per-step multiplier applied after phase 3.
        mo_max / mo_min: momentum bounds.
        verbose: when truthy, print a summary line at the end of every epoch.
    """

    def __init__(self, total_steps=1000, steps_up_pct=0.3, steps_across_pct=.01, steps_down_pct=0.6, lr_max=.001,
              lr_start_factor=.00003, lr_end_factor=.00001, decay=0.93,
              mo_max=0.95, mo_min=0.85, verbose=1):

        super(OneCycleScheduler, self).__init__()

        self.step = -1   # global batch counter; incremented before each batch
        self.epoch = -1

        # Phase lengths and LR endpoints depend only on the constructor arguments,
        # so compute them once instead of on every schedule() call.
        steps_up = int(total_steps * steps_up_pct)
        steps_across = int(total_steps * steps_across_pct)
        steps_down = int(total_steps * steps_down_pct)
        lr_start = lr_max * lr_start_factor
        lr_end = lr_max * lr_end_factor
        hold_end = steps_up + steps_across
        down_end = hold_end + steps_down

        def one_cycle(step):
            if step <= steps_up:
                # A zero-length warm-up (tiny total_steps) jumps straight to the peak
                # instead of dividing by zero.
                angle = (math.pi * step) / steps_up if steps_up else math.pi
                new_lr = (lr_max - lr_start)/2 * (-math.cos(angle) + 1) + lr_start
                new_mo = (mo_max - mo_min)/2 * (math.cos(angle) + 1) + mo_min

            elif step <= hold_end:
                new_lr = lr_max
                new_mo = mo_min

            elif step <= down_end:
                # Only reachable when steps_down > 0, so the division is safe.
                down_step = step - steps_across - steps_up
                angle = (math.pi * down_step) / steps_down
                new_lr = (lr_max - lr_end)/2 * (math.cos(angle) + 1) + lr_end
                new_mo = (mo_max - mo_min)/2 * (-math.cos(angle) + 1) + mo_min

            else:
                new_lr = lr_end * decay**(step - steps_up - steps_across - steps_down)
                new_mo = mo_max

            return new_lr, new_mo

        self.schedule = one_cycle
        self.verbose = verbose

    def on_batch_begin(self, step, logs=None):
        # Keras restarts `step` every epoch, so keep our own running counter.
        self.step += 1
        scheduled_lr, scheduled_mo = self.schedule(self.step)
        tf.keras.backend.set_value(self.model.optimizer.lr, scheduled_lr)
        tf.keras.backend.set_value(self.model.optimizer.beta_1, scheduled_mo)

    def on_epoch_end(self, epoch, logs=None):
        if self.verbose:
            logs = logs or {}
            scheduled_lr, scheduled_mo = self.schedule(self.step)
            # Prefer an explicitly named 'auc_roc' metric. The compile cell in this notebook
            # passes metrics=['AUC'], which Keras logs as 'val_auc', so fall back to that
            # key instead of always printing 0.
            auc_roc = logs.get('val_auc_roc')
            if auc_roc is None:
                auc_roc = logs.get('val_auc')
            auc_roc = auc_roc if auc_roc is not None else 0
            print(f'\nepoch {epoch+1:02d}: val_auc_roc={auc_roc:0.4f}, learning_rate={scheduled_lr:0.2e}, beta_1={scheduled_mo:0.3f}')
            
def plot_lr_sched(one_cycle, total_steps):
    """Plot the learning rate and momentum produced by a scheduler.

    Args:
        one_cycle: object exposing `schedule(step) -> (learning_rate, momentum)`.
        total_steps: number of steps (0 .. total_steps - 1) to plot.

    Shows a plotly figure with the learning rate on the primary y-axis and the
    momentum on the secondary y-axis; returns nothing.
    """
    steps = list(range(total_steps))
    # Evaluate the schedule once per step and split the pairs, rather than calling
    # it separately for each trace.
    schedule = [one_cycle.schedule(step) for step in steps]
    learning_rates = [lr for lr, _ in schedule]
    momenta = [mo for _, mo in schedule]

    fig = make_subplots(specs=[[{"secondary_y": True}]])
    fig.add_trace(go.Scatter(x=steps, y=learning_rates, name="lr"), secondary_y=False)
    fig.add_trace(go.Scatter(x=steps, y=momenta, name="mom"), secondary_y=True)

    fig.update_layout(title_text="Learning Rate Schedule")
    fig.update_xaxes(title_text="steps")
    fig.update_yaxes(title_text="learning rate", secondary_y=False)
    fig.update_yaxes(title_text="momentum", secondary_y=True)

    fig.show()

In [None]:
def get_strategy(tpu_name='tpu-1'):
    """Return a TPU distribution strategy when available, else the default strategy.

    Args:
        tpu_name: name handed to `TPUClusterResolver`.

    Returns:
        `tf.distribute.TPUStrategy` when the TPU resolves; otherwise the result of
        `tf.distribute.get_strategy()` (the visible physical devices are printed).
    """
    try:
        tpu = tf.distribute.cluster_resolver.TPUClusterResolver(tpu_name)
        print('Running on TPU ', tpu.master())
    except Exception as err:
        # Still best-effort, but no longer swallows KeyboardInterrupt / SystemExit,
        # and the reason for the fallback is visible.
        print(f'TPU {tpu_name!r} unavailable ({type(err).__name__}: {err})')
        tpu = None

    if tpu:
        tf.config.experimental_connect_to_cluster(tpu)
        tf.tpu.experimental.initialize_tpu_system(tpu)
        strategy = tf.distribute.TPUStrategy(tpu)

    else:
        strategy = tf.distribute.get_strategy()
        for d in tf.config.list_physical_devices():
            print(d)

    return strategy

strategy = get_strategy()

In [None]:
# Folds whose rows are pulled for the training frame.
folds = range(40)

# Training rows: question interactions (content_type_id = 0) that come before each
# fold's task_container_id_min, joined to their tags, keeping only the most recent row
# per (user_id, content_id) via ROW_NUMBER() ... row_num = 1.
# NOTE(review): read_gbq receives a (sql, '_q_') tuple rather than a bare SQL string —
# confirm that this is intended.
if True:
    df_data = pd.read_gbq((f"""
        WITH data AS (
        SELECT row_id, user_id, content_id, part, tags_array, answered_correctly,
        ROW_NUMBER() OVER(PARTITION BY user_id, content_id ORDER BY timestamp DESC) row_num
        FROM {DATASET}.train t
        JOIN {DATASET}.folds f
        ON t.user_id = f.user_id_s
        JOIN {DATASET}.content_tags c
        ON t.ql_id = c.ql_id
        WHERE content_type_id = 0
        AND t.task_container_id < f.task_container_id_min
        AND f.fold in ({(', '.join(list(map(str, folds))))}) 
        )
        SELECT row_id, user_id, content_id, part, tags_array, answered_correctly
        FROM data
        WHERE row_num = 1
        """, '_q_'), use_bqstorage_api=True, progress_bar_type='tqdm_notebook')        

In [None]:
# Disabled alternative for df_data: same "latest row per (user_id, content_id)" selection,
# but without the tag columns and without the fold / task_container_id filters.
# Flip to True to overwrite df_data with this variant.
if False:
    df_data = pd.read_gbq((f"""
        WITH data AS (
        SELECT row_id, user_id, content_id, answered_correctly,
        ROW_NUMBER() OVER(PARTITION BY user_id, content_id ORDER BY timestamp DESC) row_num
        FROM {DATASET}.train t
        JOIN {DATASET}.folds f
        ON t.user_id = f.user_id_s
        WHERE content_type_id = 0
        )
        SELECT row_id, user_id, content_id, answered_correctly
        FROM data
        WHERE row_num = 1
        """, '_q_'), use_bqstorage_api=True, progress_bar_type='tqdm_notebook')

In [None]:
# Rebinds `folds`: only folds 0 and 1 are pulled for the test frame.
folds = [0,1]

# Test rows: every question interaction (content_type_id = 0) whose `fold` is in `folds`,
# joined to its tags and ordered by user then content.
# NOTE(review): `fold` is unqualified here — presumably a column of train; verify.
if True:
    df_test = pd.read_gbq((f"""
        SELECT row_id, user_id, content_id, part, tags_array, answered_correctly,
        FROM {DATASET}.train t
        JOIN {DATASET}.content_tags c
        ON t.ql_id = c.ql_id
        WHERE content_type_id = 0
        AND fold in ({(', '.join(list(map(str, folds))))}) 
        ORDER BY user_id, content_id
        """, '_q_'), use_bqstorage_api=True, progress_bar_type='tqdm_notebook')

In [None]:
# Restrict the test rows to users and questions that also occur in df_data
# (unseen users / questions are left out to start with).
seen_user = df_test['user_id'].isin(df_data['user_id'])
seen_content = df_test['content_id'].isin(df_data['content_id'])
df_test = df_test.loc[seen_user & seen_content]

In [None]:
# Content/tag lookup keyed by question_id.
# Uses the DATASET constant like the other queries in this notebook, and sorts by
# question_id so the row order is deterministic (BigQuery gives no ordering guarantee
# without ORDER BY, and df_tags is later built from these rows in order).
df_ct = (
    bq_client
    .query(f'select * from {DATASET}.content_tags where question_id is not null order by question_id')
    .to_dataframe()
    .astype({'question_id': 'category'})
    .set_index('question_id')
)

In [None]:
def fix_len(r, length=6, pad_value=188):
    """Pad a sequence of tag ids to a fixed length.

    Args:
        r: sequence (list / array) of at most `length` integer tag ids; may be empty.
        length: size of the returned array.
        pad_value: value written to the slots not covered by `r`.

    Returns:
        Integer numpy array of shape `(length,)` holding `r` followed by `pad_value`.

    Raises:
        IndexError: if `r` has more than `length` items.
    """
    n_items = len(r)
    if n_items > length:
        raise IndexError(f'sequence of length {n_items} does not fit into {length} slots')

    padded = np.full(length, pad_value)
    padded[:n_items] = r
    return padded

In [None]:
# Add a `tags` column: each tags_array run through fix_len (with a progress bar).
df_ct['tags'] = df_ct.tags_array.progress_apply(fix_len)

In [None]:
# Sorted unique tag ids across all raw (unpadded) tags_array values.
all_tags = np.unique(np.concatenate(df_ct.tags_array))

In [None]:
# Column names tag_0 .. tag_5, one per tag slot.
tag_cols = [f'tag_{t}' for t in range(6)]

In [None]:
# One column per tag slot. Keep df_ct's question_id index: categorify() merges these rows
# onto content_id with right_index=True, so a positional 0..n-1 index would only line up
# if the rows happened to arrive sorted and gap-free.
df_tags = pd.DataFrame(df_ct.tags.to_list(), columns=tag_cols, index=df_ct.index)

In [None]:
# Columns that categorify() converts to integer category codes.
cat_cols = ['user_id', 'content_id', 'part']

In [None]:
def categorify(df_data, df_test, df_tags, cat_cols, tag_cols):
    """Attach tag columns and replace categorical values by integer codes.

    Args:
        df_data: training frame with `row_id`, `content_id`, `answered_correctly`
            and every column in `cat_cols`. Not modified.
        df_test: test frame with the same columns. Not modified.
        df_tags: frame indexed by content id with one column per entry of `tag_cols`.
        cat_cols: columns encoded with the categories found in `df_data`; test values
            that do not occur in `df_data` get code -1.
        tag_cols: tag columns, encoded with one vocabulary shared by all slots.

    Returns:
        `(train, test)` frames with columns
        `['row_id'] + cat_cols + tag_cols + ['answered_correctly']`; the train frame is
        shuffled with a fixed seed.
    """
    df_data = df_data.merge(df_tags, how='left', left_on='content_id', right_index=True)
    df_test = df_test.merge(df_tags, how='left', left_on='content_id', right_index=True)

    for col in cat_cols:
        train_cat = df_data[col].astype('category')
        # Encode the test column against the training categories before the training
        # column itself is replaced by its codes.
        df_test[col] = pd.Categorical(df_test[col], train_cat.cat.categories).codes
        df_data[col] = train_cat.cat.codes

    # Build the tag vocabulary from df_tags itself rather than from a notebook global.
    # This covers every value that can appear in the tag columns, including a padding
    # value that never occurs as a real tag, which would otherwise be encoded as -1.
    tag_categories = np.unique(df_tags[tag_cols].to_numpy())
    for col in tag_cols:
        df_data[col] = pd.Categorical(df_data[col], tag_categories).codes
        df_test[col] = pd.Categorical(df_test[col], tag_categories).codes

    train_cols = ['row_id'] + cat_cols + tag_cols + ['answered_correctly']

    return df_data[train_cols].sample(frac=1, random_state=42), df_test[train_cols]

In [None]:
# Encoded frames: shuffled training rows and the (unshuffled) validation rows.
df_train, df_valid = categorify(df_data, df_test, df_tags, cat_cols, tag_cols)

## TensorFlow Collaborative Filtering

### Dataset from DataFrames

In [None]:
def get_ds(df, batch_size=1024, repeat=True):
    """Build a shuffled, batched tf.data pipeline from an encoded DataFrame.

    Args:
        df: frame containing `row_id`, `answered_correctly` and the feature columns.
            The frame is not modified.
        batch_size: number of rows per batch.
        repeat: when True the dataset repeats indefinitely.

    Returns:
        A prefetched dataset yielding `(features, target)` batches; `row_id` is dropped.
    """
    # Select columns instead of pop() so the caller's frame keeps them and the cell
    # can be re-run.
    y = df['answered_correctly']
    features = df.drop(columns=['row_id', 'answered_correctly'])
    # from_tensor_slices takes ONE (possibly nested) structure, so features and targets
    # must be passed together as a tuple.
    ds = tf.data.Dataset.from_tensor_slices((features.to_numpy(), y.to_numpy()))
    ds = ds.shuffle(int(5e6))
    ds = ds.repeat() if repeat else ds
    ds = ds.batch(batch_size)
    return ds.prefetch(AUTO)

### Dataset from TFRecords

In [None]:
def _int64_feature(value):
    """Returns an int64_list Feature from a bool / enum / int / uint or a sequence of them.

    A scalar is wrapped in a one-element list; lists, tuples and numpy arrays are
    passed through as the list of values.
    """
    # isinstance() instead of comparing type objects, so list subclasses, tuples and
    # arrays are treated as sequences rather than being wrapped a second time.
    if not isinstance(value, (list, tuple, np.ndarray)):
        value = [value]

    return tf.train.Feature(int64_list=tf.train.Int64List(value=value))

In [None]:
def serialize_example(rec):
    """Serialize one record to a tf.train.Example byte string.

    `rec` is an indexable row: the first item is stored as `row_id`, the last item as
    `target`, and everything in between as the `features` list.
    """
    row_id_feature = _int64_feature([rec[0]])
    middle_feature = _int64_feature(list(rec[1:-1]))
    target_feature = _int64_feature([rec[-1]])

    example = tf.train.Example(
        features=tf.train.Features(
            feature={
                'row_id': row_id_feature,
                'features': middle_feature,
                'target': target_feature,
            }
        )
    )
    return example.SerializeToString()

In [None]:
def parse_example(example, test=False):
    """Parse one serialized Example into a `(features, target)` pair.

    The record is expected to hold a scalar `row_id`, nine int64 `features` and a
    scalar `target`; `row_id` is parsed but not returned. The `test` flag is accepted
    but not used.
    """
    scalar_int = tf.io.FixedLenFeature([], tf.int64)
    spec = {
        'row_id': scalar_int,
        'features': tf.io.FixedLenFeature([9], tf.int64),
        'target': scalar_int,
    }

    parsed = tf.io.parse_single_example(example, spec)
    return parsed['features'], parsed['target']

In [None]:
def get_ds_tfrec(split, batch_size=4096, repeat=True):
    """Build the input pipeline for one split from the TFRecord shards in the bucket.

    Files matching `deep_collab-{split}*.tfrec` are listed in shuffled order,
    interleaved, parsed with `parse_example`, shuffled with a 1e6-element buffer,
    optionally repeated, batched and prefetched.
    """
    shard_files = tf.data.Dataset.list_files(
        f'gs://{BUCKET}/tfrecords/deep_collab-{split}*.tfrec', shuffle=True
    )
    shard_files = shard_files.with_options(tf.data.Options())

    records = shard_files.interleave(tf.data.TFRecordDataset, num_parallel_calls=AUTO)
    ds = records.map(parse_example, num_parallel_calls=AUTO).shuffle(int(1e6))

    if repeat:
        ds = ds.repeat()

    return ds.batch(batch_size).prefetch(AUTO)

In [None]:
# Round-trip check: serialize the first validation row and parse it back.
parse_example(serialize_example(df_valid.head().to_numpy()[0]))

In [None]:
# One-off export of the encoded frames to TFRecord shards in the bucket.
# Flip to True to (re)write the files.
if False:

    split_dict = {
        'train': df_train,
        'valid': df_valid
    }

    out_path = f'gs://{BUCKET}/tfrecords'

    for s, df in split_dict.items():
        # Roughly one shard per million rows, but always at least one: a frame with
        # fewer than 1e6 rows would otherwise ask np.array_split for 0 sections.
        n_files = max(1, len(df) // int(1e6))

        for i, split in enumerate(np.array_split(df, n_files)):
            # The row count is part of the file name; the training cell reads it back
            # from there to derive steps per epoch.
            filename = f'deep_collab-{s}-{i:02d}-{len(split)}.tfrec'
            record_file = f'{out_path}/{filename}'

            with tf.io.TFRecordWriter(record_file) as writer:
                for rec in tqdm(split.to_numpy()):
                    writer.write(serialize_example(rec))

### Model

In [None]:
# Per-feature embedding spec: name -> (input_dim, output_dim, column index or slice).
# The first two values are passed to the Embedding layer; the last selects the
# feature's column(s) from the model input.
embed_spec = {
    'user_id': (343810, 600, 0),
    'content_id': (13513, 300, 1),
    'part': (7, 5, 2),
    'tags': (189, 30, slice(3,9))
    }

class KnowledgeNet(tf.keras.Model):
    """Embedding + two-layer dense network over integer-encoded features.

    Args:
        embed_spec: dict mapping a feature name to
            `(input_dim, output_dim, column index or slice into the input)`.
            A `'tags'` entry is required; its slice selects the tag columns.
        n_act: number of units in the hidden dense layer.
        l2_reg: L2 factor applied to the embeddings and to both dense kernels.
        **kwargs: forwarded to `tf.keras.Model`.

    `call` expects a 2-D integer tensor whose columns are addressed by the positions in
    `embed_spec`, and returns the sigmoid output of the final one-unit dense layer.
    """

    def __init__(self, embed_spec, n_act, l2_reg=1e-6, **kwargs):
        super(KnowledgeNet, self).__init__(**kwargs)
        self.embed_spec = embed_spec
        self.n_act = n_act
        self.l2_reg = l2_reg
        self.embed_dict = {}

        for k, v in self.embed_spec.items():
            input_dim, output_dim, _ = v
            self.embed_dict[k] = tf.keras.layers.Embedding(
                input_dim, output_dim,
                embeddings_initializer="he_normal",
                embeddings_regularizer=tf.keras.regularizers.l2(self.l2_reg),
                name=f'{k}_embedding'
            )

        self.dense_1 = tf.keras.layers.Dense(
            # The layer is fed the concatenated embeddings, so its input width is the
            # sum of the embedding widths (v[1]), not of the vocabulary sizes (v[0]).
            input_dim=sum([v[1] for v in self.embed_spec.values()]), units=n_act, activation='relu', use_bias=True,
            kernel_initializer='glorot_uniform',
            bias_initializer='zeros', kernel_regularizer=tf.keras.regularizers.l2(self.l2_reg),
            bias_regularizer=None, activity_regularizer=None, kernel_constraint=None,
            bias_constraint=None, name='dense_1'
        )

        self.dense_2 = tf.keras.layers.Dense(
            input_dim=n_act, units=1, activation='sigmoid', use_bias=True,
            kernel_initializer='glorot_uniform',
            bias_initializer='zeros', kernel_regularizer=tf.keras.regularizers.l2(self.l2_reg),
            bias_regularizer=None, activity_regularizer=None, kernel_constraint=None,
            # Was 'dense_1' (copy-paste); layer names must be unique within a model.
            bias_constraint=None, name='dense_2'
        )

    def call(self, inputs):
        # One embedding lookup per single-column feature.
        embeds = [self.embed_dict[k](inputs[:,v[-1]]) for k, v in self.embed_spec.items() if k != 'tags']
        # The tag slice yields one embedding per tag slot; they are collapsed across the
        # slot axis by an element-wise product.
        # NOTE(review): product aggregation (rather than sum/mean) — confirm intended.
        embed_tags = self.embed_dict['tags'](inputs[:,self.embed_spec['tags'][-1]])
        embed_tags_agg = tf.reduce_prod(embed_tags, axis=1)
        x = self.dense_1(tf.concat(embeds + [embed_tags_agg], axis=1))
        return self.dense_2(x)

In [None]:
# Create and compile the model inside the strategy scope so its variables are placed
# on the strategy's devices.
with strategy.scope():
    model = KnowledgeNet(embed_spec, 1024)
    # `learning_rate` is the supported keyword; `lr` is a deprecated alias.
    opt = tf.keras.optimizers.Adam(learning_rate=.0001)
    loss_fn = tf.keras.losses.BinaryCrossentropy()
    metrics = ['binary_accuracy', 'AUC']
    model.compile(loss=loss_fn, optimizer=opt, metrics=metrics)

In [None]:
epochs = 2
batch_size = strategy.num_replicas_in_sync * 2048
split_counts = dict.fromkeys(('train', 'valid'), 0)

# Shard names end in "-<row count>.tfrec"; add the counts up per split.
for s in split_counts:
    shard_blobs = bucket.list_blobs(prefix=f'tfrecords/deep_collab-{s}')
    split_counts[s] = sum(
        int(b.name.split('-')[3].split('.')[0]) for b in shard_blobs
    )

# Whole batches per epoch for each split, and the total number of training steps.
steps_per_epoch = split_counts['train'] // batch_size
val_steps = split_counts['valid'] // batch_size
total_steps = epochs * steps_per_epoch

split_counts, steps_per_epoch, val_steps, total_steps

In [None]:
# Optional learning-rate range test; flip to True to run it.
if False:
    lr_finder = LRFinder(start=1e-7, end=0.1, steps=100)

    with strategy.scope():
        # Save the current weights, run the sweep, then restore them so the sweep
        # does not affect the real training run.
        model.save_weights(f'gs://{BUCKET}/temp/temp-weights.ckpt')
        model.fit(get_ds_tfrec('train', batch_size), steps_per_epoch=lr_finder.steps, callbacks=[lr_finder])
        model.load_weights(f'gs://{BUCKET}/temp/temp-weights.ckpt')

    # Learning rate used at each step, recomputed with the same formula as the callback.
    x = [lr_finder.start * (lr_finder.end / lr_finder.start) ** (s/lr_finder.steps) for s in range(lr_finder.steps)]

In [None]:
# Loss vs. learning rate on a log x-axis.
# NOTE(review): `x` and `lr_finder` are only assigned inside the `if False:` block of the
# previous cell, so this raises NameError on a fresh Run All unless that block is enabled.
px.line(x=x, y=lr_finder.losses, log_x=True)

In [None]:
# Peak learning rate for the one-cycle schedule.
reference_lr = .0001
print(f'max_lr: {reference_lr:1.2e}')

# Scheduler settings: 30% warm-up, 1% hold, 60% cool-down, then slow decay.
one_cycle_kwargs = {
    'total_steps': total_steps,
    'steps_up_pct': 0.3,
    'steps_across_pct': .01,
    'steps_down_pct': 0.6,
    'lr_max': reference_lr,
    'lr_start_factor': 1/25,
    'lr_end_factor': 1/10,
    'decay': 0.9999,
    'mo_max': 0.95,
    'mo_min': 0.85,
    'verbose': 1,
}

one_cycle = OneCycleScheduler(**one_cycle_kwargs)
plot_lr_sched(one_cycle, total_steps)

In [None]:
# Train with the one-cycle schedule, validating after every epoch.
train_ds = get_ds_tfrec('train', batch_size)
valid_ds = get_ds_tfrec('valid', batch_size)

fit_kwargs = {
    'steps_per_epoch': steps_per_epoch,
    'epochs': epochs,
    'validation_data': valid_ds,
    'validation_steps': val_steps,
    'callbacks': [one_cycle],
    'verbose': 1,
}
history = model.fit(train_ds, **fit_kwargs)

In [None]:
# Layer / parameter overview of the trained model.
model.summary()

In [None]:
# Persist the model, including optimizer state, to the bucket.
model.save(f'gs://{BUCKET}/models/deep_collab_model', include_optimizer=True)

In [None]:
# Rebinds `model` to a SavedModel loaded from the bucket.
# NOTE(review): this loads 'collab_model_all', not the 'deep_collab_model' path saved in
# the previous cell — confirm which model the cells below are meant to use.
model = tf.saved_model.load(f'gs://{BUCKET}/models/collab_model_all')

In [None]:
# Category values per categorical column. astype('category') is a no-op for columns that
# are already categorical and makes `.cat` valid for raw id columns.
weights = {c: df_data[c].astype('category').cat.categories for c in cat_cols}

# Export every model variable, keyed by the second component of its name, with the
# column-wise mean appended as an extra last ROW. np.append without `axis` would flatten
# the matrix to 1-D, so that indexing by code would return scalars instead of rows.
# Presumably the extra row is there so that index -1 (the pandas code for an unseen
# category) resolves to the average — verify against the caller.
for v in model.variables:
    name = v.name.split('/')[1]
    values = v.numpy()
    weights[name] = np.append(values, np.mean(values, axis=0, keepdims=True), axis=0)

In [None]:
def get_pred(u_code, c_code, logits=False):
    """Score one (user, question) pair from the exported `weights`.

    The logit is the dot product of the user and question embedding rows plus both biases.

    Args:
        u_code: row index into the user embedding / bias arrays.
        c_code: row index into the question embedding / bias arrays.
        logits: when True return the raw logit, otherwise its sigmoid.

    Returns:
        The logit (numpy value) or the probability (Python float).
    """
    user_vector = weights['user_embedding'][u_code]
    user_bias = weights['user_bias'][u_code]
    question_vector = weights['question_embedding'][c_code]
    question_bias = weights['question_bias'][c_code]

    logit = np.squeeze((user_vector * question_vector).sum() + user_bias + question_bias)

    if logits:
        return logit

    # `np.math` was only an alias of the stdlib module and no longer exists in NumPy 2,
    # so call math.exp directly. Branching on the sign keeps the exp() argument
    # non-positive, so a large negative logit cannot raise OverflowError.
    if logit >= 0:
        return 1 / (1 + math.exp(-logit))
    exp_logit = math.exp(logit)
    return exp_logit / (1 + exp_logit)

In [None]:
# Write the weights dict to a local .npy file (pickled, since it is a dict).
np.save('weights_all.npy', weights, allow_pickle=True)

In [None]:
# Reload the dict written above; .item() unwraps the 0-d object array.
# allow_pickle executes pickled content, so only load files you produced yourself.
weights = np.load('weights_all.npy', allow_pickle=True).item()

In [None]:
# Upload the local weights file to the bucket under the same name.
bucket.blob('weights_all.npy').upload_from_filename('weights_all.npy')

In [None]:
def get_code_cols(df):
    """Return a frame holding the category codes of every column in `cat_cols`.

    Each listed column of `df` must be categorical (the `.cat` accessor is used).
    """
    code_series = []
    for col in cat_cols:
        code_series.append(df[col].cat.codes)
    return pd.concat(code_series, axis=1)

In [None]:
# Category codes for the training frame.
# NOTE(review): categorify() already replaced these columns of df_train by integer codes,
# so the `.cat` accessor used by get_code_cols looks like it would fail here — confirm.
df_cat_cols = get_code_cols(df_train)

In [None]:
# Registers progress_apply on pandas objects (also done in the setup cell).
tqdm.pandas()

In [None]:
# Spot check: logit for one hard-coded (user code, question code) pair.
get_pred(43124, 9276, True)

In [None]:
# Row-wise logits; each row of df_cat_cols is unpacked positionally into get_pred,
# which takes two codes — NOTE(review): cat_cols has three entries, confirm the arity.
df_preds = df_cat_cols.progress_apply(lambda r: get_pred(*r, logits=True), axis=1)

In [None]:
# ROC AUC of the logits against the validation targets.
# NOTE(review): df_preds is computed from df_train codes above, while the targets come
# from df_valid — confirm that both refer to the same rows.
roc_auc_score(df_valid.answered_correctly, df_preds)

In [None]:
# Rebinds `folds`: folds 0-4 are pulled for the full test frame.
folds = range(5)

# All question interactions (content_type_id = 0) of the selected folds, ordered by user
# and content. Unlike the earlier queries, the dataset name is hard-coded as `data`.
# NOTE(review): read_gbq receives a (sql, '_q_') tuple rather than a bare SQL string —
# confirm that this is intended.
df_test_all = pd.read_gbq((f"""
    SELECT row_id, user_id, content_id, answered_correctly,
    FROM data.train t
    WHERE content_type_id = 0
    AND fold in ({(', '.join(list(map(str, folds))))}) 
    ORDER BY user_id, content_id
    """, '_q_'), use_bqstorage_api=True, progress_bar_type='tqdm_notebook')

In [None]:
# Cache the query result locally.
df_test_all.to_pickle('df_test_all.pkl')

In [None]:
# Re-encode each categorical column in place against the categories stored in `weights`;
# values not present in those categories become missing (code -1).
for c in cat_cols:
    df_test_all[c] = pd.Categorical(df_test_all[c], categories=weights[c])

In [None]:
# Category codes for the full test frame.
df_code_cols = get_code_cols(df_test_all)

In [None]:
# Row-wise logits for the full test frame (each row unpacked into get_pred).
preds_all = df_code_cols.progress_apply(lambda r: get_pred(*r, logits=True), axis=1)

In [None]:
df_test_all['pred_collab_logit'] = preds_all
# The export cell selects a 'pred_collab' column that was never created, which raises
# KeyError. Presumably it is the probability matching the logit (what get_pred returns
# with logits=False) — verify.
df_test_all['pred_collab'] = 1 / (1 + np.exp(-preds_all))

In [None]:
# Save row ids with both prediction columns.
# NOTE(review): 'pred_collab' is not assigned anywhere above in this notebook — confirm
# that it exists before running this cell.
df_test_all[['row_id', 'pred_collab', 'pred_collab_logit']].to_pickle('df_pred_collab.pkl')

In [None]:
# Upload the prediction file to the bucket under the same name.
bucket.blob('df_pred_collab.pkl').upload_from_filename('df_pred_collab.pkl')

In [None]:
# ROC AUC of the logits on the full test frame.
roc_auc_score(df_test_all.answered_correctly, preds_all)

## Custom Training Loop

In [None]:
# Global batch = per-replica batch x number of replicas in the strategy.
BATCH_SIZE_PER_REPLICA = 64
BATCH_SIZE_GLOBAL = BATCH_SIZE_PER_REPLICA * strategy.num_replicas_in_sync

# Distributed iterators over the train / valid pipelines. Both use get_ds_tfrec's default
# repeat=True, so they can be advanced across epochs without being recreated.
ds_trn_fit_iter = iter(strategy.experimental_distribute_dataset(get_ds_tfrec('train', batch_size=BATCH_SIZE_GLOBAL)))
ds_val_fit_iter = iter(strategy.experimental_distribute_dataset(get_ds_tfrec('valid', batch_size=BATCH_SIZE_GLOBAL)))

In [None]:
with strategy.scope():
    # KnowledgeNet's constructor is (embed_spec, n_act, ...). The previous call passed
    # three names that are not defined in this notebook. Same configuration as the
    # Keras .fit() run earlier in the notebook.
    model = KnowledgeNet(embed_spec, 1024)

    # No reduction: the per-example losses are averaged over the GLOBAL batch below so
    # that gradients summed across replicas are scaled correctly.
    loss_fn = tf.keras.losses.BinaryCrossentropy(reduction=tf.keras.losses.Reduction.NONE)

    def compute_loss(targets, outputs):
        """Per-replica loss: sum of example losses divided by the global batch size."""
        per_example_loss = loss_fn(targets, outputs)
        return tf.nn.compute_average_loss(per_example_loss, global_batch_size=BATCH_SIZE_GLOBAL)

    # `learning_rate` is the supported keyword; `lr` is a deprecated alias.
    optimizer = tf.keras.optimizers.Adam(learning_rate=0.0001)
    # metrics_trn = [tf.keras.metrics.Mean(name='loss'), tf.keras.metrics.BinaryAccuracy('accuracy'), tf.keras.metrics.AUC(name='roc_auc')]
    # metrics_val = [tf.keras.metrics.Mean(name='val_loss'), tf.keras.metrics.BinaryAccuracy('val_accuracy'), tf.keras.metrics.AUC(name='val_roc_auc')]
    # The first metric of each list receives the loss value; see train_step / val_step.
    metrics_trn = [tf.keras.metrics.Mean(name='loss')]
    metrics_val = [tf.keras.metrics.Mean(name='val_loss')]

In [None]:
@tf.function
def train_step(model, optimizer, loss_fn, metrics, ds_iter, steps_per_epoch):
    """Run `steps_per_epoch` training steps on the distributed iterator `ds_iter`.

    `metrics[0]` is updated with the loss value; every other metric is updated with
    `(targets, outputs)`. Uses the notebook-level `strategy` to run each step.
    """
    def train_step_fn(inputs, targets):
        # Per-replica step: forward pass and loss under the tape, then one optimizer update.
        with tf.GradientTape() as tape:
            outputs = model(inputs, training=True)
            # NOTE(review): only loss_fn's value is used; model.losses (the layers'
            # L2 regularizers) are not added here — confirm intended.
            loss = loss_fn(targets, outputs)

        grads = tape.gradient(loss, model.trainable_variables)
        optimizer.apply_gradients(zip(grads, model.trainable_variables))

        for i, metric in enumerate(metrics):
            if i == 0:
                metric.update_state(loss)
            else:
                metric.update_state(targets, outputs)

    # Each element of ds_iter is passed as the positional args of train_step_fn.
    for _ in tf.range(steps_per_epoch):
        strategy.run(train_step_fn, next(ds_iter))
        
@tf.function
def val_step(model, loss_fn, metrics, ds_iter, steps_per_epoch):
    """Run `steps_per_epoch` evaluation steps on the distributed iterator `ds_iter`.

    No gradients are computed. `metrics[0]` is updated with the loss value; every other
    metric is updated with `(targets, outputs)`. Uses the notebook-level `strategy`.
    """
    def val_step_fn(inputs, targets):
        # Per-replica step: forward pass in inference mode and metric updates only.
        outputs = model(inputs, training=False)
        loss = loss_fn(targets, outputs)

        for i, metric in enumerate(metrics):
            if i == 0:
                metric.update_state(loss)
            else:
                metric.update_state(targets, outputs)

    # Each element of ds_iter is passed as the positional args of val_step_fn.
    for _ in tf.range(steps_per_epoch):
        strategy.run(val_step_fn, next(ds_iter))

In [None]:
epochs = 5
steps_per_epoch = 10000
val_steps = 100

# One list per metric name (plus the epoch duration in seconds); one entry per epoch.
results = {metric.name: [] for metric in metrics_trn + metrics_val}
results['duration'] = []

start = time.perf_counter()
for epoch in range(epochs):
    epoch_start = time.perf_counter()
    train_step(model, optimizer, compute_loss, metrics_trn, ds_trn_fit_iter, steps_per_epoch)
    val_step(model, compute_loss, metrics_val, ds_val_fit_iter, val_steps)

    # Record this epoch's values, then reset so the next epoch starts from zero.
    for metric in metrics_trn + metrics_val:
        results[metric.name].append(metric.result().numpy())
        metric.reset_states()

    results['duration'].append((time.perf_counter() - epoch_start))

    print(f'epoch {epoch:02d} - ', (', ').join([f'{k}: {v[-1]:0.4f}' for k,v in results.items()]))

end = time.perf_counter()
seconds = end - start
# seconds / 60 is already fractional minutes; the old extra "(seconds % 60) / 60" term
# counted the sub-minute remainder twice.
minutes = seconds / 60
print(f'Total duration: {minutes:0.1f} minutes - {seconds / epochs:0.1f} seconds per epoch')