In [None]:
""" Work bench for tuning NN model."""
%load_ext tensorboard

import os
import json
from pprint import pprint
from datetime import datetime
from collections import defaultdict

from dotenv import load_dotenv


import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

from transformers import (
    BertTokenizer,
    AutoTokenizer,
    AutoConfig,
    TFDistilBertModel,
    TFBertModel, 
    TFTrainingArguments
)
import tensorflow as tf

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

from tc_data import TopCoder
from imbalanced_regression_metrics import PrecisionRecallFscoreForRegression, TFPrecisionRecallFscoreForRegression
from boosting_learn import EnsembleTrainer

load_dotenv()
pd.set_option('display.max_rows', 800)

In [None]:
def build_seq_reg_model(num_hidden_layer=1, layer_dim=512, name='a_model', input_shape=(36,)):
    """Assemble a feed-forward Keras model for single-value regression.

    The stack is, in order: an ``InputLayer`` of ``input_shape``,
    ``num_hidden_layer`` ReLU ``Dense`` layers of ``layer_dim`` units each
    (named ``layer_0``, ``layer_1``, ...), and a one-unit ``Dense`` layer
    named ``reg_unit`` with no activation specified.

    Returns the ``tf.keras.Sequential`` model called ``name``; it is not
    compiled here.
    """
    stack = [tf.keras.layers.InputLayer(input_shape=input_shape, name='input_layer')]
    for idx in range(num_hidden_layer):
        stack.append(tf.keras.layers.Dense(layer_dim, activation='relu', name=f'layer_{idx}'))
    stack.append(tf.keras.layers.Dense(1, name='reg_unit'))
    return tf.keras.Sequential(stack, name=name)

In [None]:
# tc = TopCoder()

```python
meta_inputs = tf.keras.Input(shape=(36,), name='meta_data_input')
bert_inputs = tf.keras.Input(shape=(48,), name='pool_bert_input')

concat = tf.keras.layers.concatenate([bert_inputs, meta_inputs], name='concat')
dense = tf.keras.layers.Dense(512, activation='relu', name='pre_regression')(concat)

score_output = tf.keras.layers.Dense(4, name='score')(dense)
registration_output = tf.keras.layers.Dense(1, name='registration')(dense)
sub_reg_output = tf.keras.layers.Dense(1, name='sub_reg_ratio')(dense)

mtl_model = tf.keras.Model(inputs=[bert_inputs, meta_inputs], outputs=[score_output, registration_output, sub_reg_output])

tf.keras.utils.plot_model(mtl_model, to_file='model_arch/tmp.png', show_shapes=True)
```

Both the number of hidden layers and the hidden-layer dimension can impact the training result, so I tried several combinations of `(num_of_hidden_layer, dimension_of_a_hidden_layer)` to find the best score, using an `EarlyStopping` monitor to prevent overfitting.

In [None]:
# (number of hidden layers, units per hidden layer) pairs tried for every dataset.
layer_dimension = [(1, 1024), (2, 512), (4, 256), (8, 128)]

# Per-target `min_delta` passed to the EarlyStopping callback during the sweep.
monitor_threshold = dict(
    avg_score=1,
    number_of_registration=3,
    sub_reg_ratio=0.05,
)

# Per-target precision/recall/F-score measurer applied to the test predictions.
prf_measurer = dict(
    avg_score=PrecisionRecallFscoreForRegression(tE=0.6, tL=6, c=90, extreme='low', decay=0.1),
    number_of_registration=PrecisionRecallFscoreForRegression(tE=0.6, tL=10, c=30, extreme='high'),
    sub_reg_ratio=PrecisionRecallFscoreForRegression(tE=0.6, tL=0.1, c=0.25, extreme='high'),
)

# prf_score = TFPrecisionRecallFscoreForRegression(tE=0.6, tL=6, c=90, extreme='low', decay=0.1)
# prf_reg = TFPrecisionRecallFscoreForRegression(tE=0.6, tL=10, c=30, extreme='high')
# prf_sub = TFPrecisionRecallFscoreForRegression(tE=0.6, tL=0.1, c=0.25, extreme='high')

# Sweep every (target, dv) dataset over all `layer_dimension` configurations and
# keep, per dataset, the configuration with the lowest MAE.
# `dv` is passed straight to `read_dataset`; later cells label it as the
# "docvec" flag (presumably: document vectors included or not).
# NOTE(review): the "best" model is picked on the *test* set metrics, so the
# reported test scores are optimistically biased; consider selecting on val_mae.
training_result = {}
for target in ('avg_score', 'number_of_registration', 'sub_reg_ratio'):
    for dv in (0, 1):
        X_train, y_train = EnsembleTrainer.read_dataset(target, 'train_resample', dv)
        X_test, y_test = EnsembleTrainer.read_dataset(target, 'test', dv)

        # One fresh model per (hidden layer count, layer width) pair.
        model_lst = [
            build_seq_reg_model(
                num_layers,
                layer_dim,
                name=f'{target}_dv{dv}_ld{num_layers}{layer_dim}',
                input_shape=(X_test.shape[1],),
            )
            for num_layers, layer_dim in layer_dimension
        ]
        model_res = []
        for model in model_lst:
            print(f'Training model {model.name}')

            model.compile(
                optimizer=tf.keras.optimizers.Adam(2e-5),
                loss='mse',
                metrics=['mse', 'mae']
            )
            # Stop once val_mae stops improving by at least the per-target threshold.
            earlystop_cb = tf.keras.callbacks.EarlyStopping(
                monitor='val_mae',
                min_delta=monitor_threshold[target],
                patience=8,
                verbose=1,
            )

            history = model.fit(X_train, y_train, validation_split=0.2, epochs=80, callbacks=[earlystop_cb])
            result = model.evaluate(X_test, y_test, return_dict=True)

            # Add the regression precision/recall/F-score to the Keras metrics.
            prf = prf_measurer[target]
            y_pred = model.predict(X_test).reshape(-1)
            result.update(
                precision=prf.precision(y_test, y_pred),
                recall=prf.recall(y_test, y_pred),
                # Fixed: this entry previously stored recall under the 'fscore' key.
                fscore=prf.fscore(y_test, y_pred),
            )

            model_res.append((model.name, result, history))

        # Lowest test MAE wins; ties resolve to the earliest configuration.
        best_model_name, best_model_result, best_model_hist = min(model_res, key=lambda mres: mres[1]['mae'])
        training_result[(target, dv)] = (best_model_name, best_model_result, pd.DataFrame(best_model_hist.history))
        
        

In [None]:
# One column per (target, dv) run, one row per test metric of its best model.
test_score_df = pd.DataFrame(
    {run_key: run[1] for run_key, run in training_result.items()}
).rename_axis(columns=['target', 'dv'])
test_score_df

In [None]:
# Test MAE of the best model per target (rows), split by the dv flag (columns).
df_dct = defaultdict(dict)
for target in ('avg_score', 'number_of_registration', 'sub_reg_ratio'):
    df_dct[target].update(
        (f'contain_dv_{bool(dv)}', training_result[(target, dv)][1]['mae'])
        for dv in (0, 1)
    )

mae_by_targetdv = pd.DataFrame.from_dict(df_dct, orient='index')

In [None]:
# Display the per-target MAE table (rows: target, columns: contain_dv_False / contain_dv_True).
mae_by_targetdv

In [None]:
# Train vs. validation loss curves of the best model for every (target, dv)
# pair: one row per target, one column per dv value. The short red line marks
# the loss measured on the test set.
with sns.axes_style('darkgrid'):
    fig, axes = plt.subplots(3, 2, figsize=(8, 9), dpi=200)
    attr = 'loss'
    for row, target in enumerate(('avg_score', 'number_of_registration', 'sub_reg_ratio')):
        for col, dv in enumerate((0, 1)):
            ax = axes[row, col]
            best_run = training_result[(target, dv)]
            res, df = best_run[1], best_run[2]

            for column, legend in ((attr, f'train_{attr}'), (f'val_{attr}', f'val_{attr}')):
                sns.lineplot(x=df.index, y=df[column], label=legend, ax=ax)

            test_value = res[attr]
            ax.axhline(y=test_value, xmax=0.3, color='red')
            ax.text(0, test_value, round(test_value, 3))

            ax.set_title(f'{target}|docvec: {bool(dv)}')

    fig.tight_layout()

In [None]:
# Reload the avg_score train/test splits with dv=1 for the single-model experiment below.
(X_train, y_train), (X_test, y_test) = (
    EnsembleTrainer.read_dataset('avg_score', split, 1)
    for split in ('train_resample', 'test')
)

tuple(arr.shape for arr in (X_train, y_train, X_test, y_test))

In [None]:
# TF-prefixed measurers built with the same per-target parameters as `prf_measurer`.
prf_score = TFPrecisionRecallFscoreForRegression(
    tE=0.6, tL=6, c=90, extreme='low', decay=0.1
)
prf_reg = TFPrecisionRecallFscoreForRegression(
    tE=0.6, tL=10, c=30, extreme='high'
)
prf_sub = TFPrecisionRecallFscoreForRegression(
    tE=0.6, tL=0.1, c=0.25, extreme='high'
)

In [None]:
# Single hand-picked architecture: 5 hidden layers of 768 units, sized to the test features.
model = build_seq_reg_model(
    num_hidden_layer=5,
    layer_dim=768,
    name='some_model',
    input_shape=(X_test.shape[1],),
)
model.summary()

In [None]:
# Same optimizer, loss and metrics as the sweep above.
model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=2e-5),
    loss='mse',
    metrics=['mse', 'mae'],
)

In [None]:
# Early stopping watches val_loss (MSE) with min_delta=3 and a patience of 10 epochs.
earlystop_cb = tf.keras.callbacks.EarlyStopping(
    monitor='val_loss',
    min_delta=3,
    patience=10,
    verbose=1,
)

history = model.fit(X_train, y_train, validation_split=0.2, epochs=150, callbacks=[earlystop_cb])

In [None]:
# Loss and compiled metrics on the held-out test split, as a dict.
result = model.evaluate(x=X_test, y=y_test, return_dict=True)
result

In [None]:
# Flatten the model output to a 1-D vector of predictions.
y_pred = model.predict(X_test).ravel()

In [None]:
# (precision, recall, fscore) of the test predictions under the avg_score settings.
tuple(
    metric(y_test, y_pred)
    for metric in (prf_score.precision, prf_score.recall, prf_score.fscore)
)

In [None]:
# Predicted vs. true values on the test split.
with sns.axes_style('white'):
    fig = plt.figure(figsize=(4, 4), dpi=200)
    ax = fig.add_axes([0.1, 0.1, 0.8, 0.8])
    sns.scatterplot(
        x=y_test,
        y=y_pred,
        s=8,
        alpha=0.75,
        linewidth=0.5,
        ax=ax
    )
    # Identity line (perfect prediction). Drawn on `ax` explicitly instead of
    # relying on pyplot's implicit "current axes", like every other call here.
    sns.lineplot(x=[60, 100], y=[60, 100], color='red', ax=ax)
    # Grey guides at 90, the same value as `c=90` in the avg_score measurer.
    ax.axhline(90, color='#AFAFAF', linewidth=0.75)
    ax.axvline(90, color='#AFAFAF', linewidth=0.75)
    ax.set_xlabel('Y_true')
    ax.set_ylabel('Y_pred')

In [None]:
# Per-epoch training history as a frame (one row per epoch, one column per metric).
train_hist = pd.DataFrame.from_dict(history.history)

In [None]:
# Train vs. validation MAE per epoch for the single-model run.
with sns.axes_style('whitegrid'):
    fig = plt.figure(figsize=(8, 6), dpi=200)
    ax = fig.add_axes([0.1, 0.1, 0.8, 0.8])
    for column, legend in (('mae', 'train_mae'), ('val_mae', 'val_mae')):
        sns.lineplot(
            x=train_hist.index,
            y=train_hist[column],
            ax=ax,
            label=legend
        )

    # Derive the x ticks from the number of epochs actually run: the fit allows
    # up to 150 epochs, so a fixed range(20) would leave later epochs unlabelled.
    # At most ~20 ticks are drawn to keep the axis readable.
    n_epochs = len(train_hist)
    tick_step = max(1, n_epochs // 20)
    ax.set_xticks(list(range(0, n_epochs, tick_step)))

    # NOTE(review): y ticks stop at 45 while the y limit is 100 — confirm which
    # range is intended.
    ax.set_yticks(list(range(0, 50, 5)))
    ax.set_ylim(0, 100)

    ax.set_xlabel('epoch')
    ax.set_ylabel('MAE')

In [None]:
# `y_test` / `y_pred` hold avg_score data at this point, so score them with the
# avg_score measurer explicitly. The bare name `prf` used here before was
# leftover state from the sweep loop, where it was last bound to the
# sub_reg_ratio measurer.
avg_score_prf = prf_measurer['avg_score']
(
    avg_score_prf.precision(y_test, y_pred),
    avg_score_prf.recall(y_true=y_test, y_pred=y_pred),
    avg_score_prf.fscore(y_test, y_pred)
)

In [None]:
# Scratch check: shows the default model name ('a_model') when `name` is omitted.
# NOTE(review): leftover experiment; consider removing from the final notebook.
build_seq_reg_model(1, 1024, input_shape=(136,)).name

In [None]:
# Scratch check of descending sort order.
# NOTE(review): unrelated to the analysis; consider removing from the final notebook.
sorted((2,1,3), reverse=True)