In [None]:
""" Notebook for final model building evaluation"""

import os
import json
from pprint import pprint
from datetime import datetime
from collections import defaultdict

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

import tensorflow as tf
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.preprocessing import StandardScaler, Normalizer, scale, normalize
from sklearn.model_selection import (
    cross_val_predict,
    cross_val_score,
    cross_validate,
    KFold
)
from sklearn.metrics import (
    mean_squared_error,
    mean_absolute_error,
    r2_score,
    make_scorer,
)
from sklearn.pipeline import make_pipeline

from dotenv import load_dotenv

from tc_data import TopCoder
from final_model_selection import (
    kfold_predict_validate_gradient_boosting,
    kfold_predict_validate_neural_network
)

# Raise the row display limit so long metric tables are shown without truncation.
pd.set_option('display.max_rows', 800)
# Load variables from a local .env file into the process environment
# (presumably connection settings consumed by TopCoder -- confirm).
load_dotenv()

## Get the data object

The `TopCoder` class contains the pre-processed data retrieved from the database.

In [None]:
# Instantiate the project data accessor; every dataset in this notebook is built from it.
tc = TopCoder()

## Average Score

In [None]:
# Build the modelling dataset for the 'avg_score' target
# (presumably X holds the features and y the target values -- confirm in tc_data).
X, y = tc.build_final_dataset('avg_score')
# Show both shapes as a quick sanity check.
X.shape, y.shape

In [None]:
# Run both k-fold helpers on the current (X, y) with their default settings.
# Each returns a 3-tuple which the following cells treat as predictions,
# a CV evaluation frame, and a mapping of manually computed scores.
gb_y_pred, gb_cv_eval_df, gb_manual_score = (
    kfold_predict_validate_gradient_boosting(X, y)
)
nn_y_pred, nn_cv_eval_df, nn_manual_score = (
    kfold_predict_validate_neural_network(X, y)
)

In [None]:
# Reduce each evaluation frame to its per-column mean (presumably one row per
# CV fold -- confirm) and place the two models side by side.
gb_cv_mean = gb_cv_eval_df.mean().rename('Gradient Boosting CV Mean')
nn_cv_mean = nn_cv_eval_df.mean().rename('Neural Network CV Mean')
cv_mean_df = pd.concat([gb_cv_mean, nn_cv_mean], axis=1)
cv_mean_df

In [None]:
# Turn each model's manual-score mapping into a Series rounded to six decimals,
# then line the two models up column-wise.
gb_manual_series = pd.Series(gb_manual_score).round(6).rename('Gradient Boosting Manual Score')
nn_manual_series = pd.Series(nn_manual_score).round(6).rename('Neural Network Manual Score')
manual_pred_df = pd.concat([gb_manual_series, nn_manual_series], axis=1)
manual_pred_df

In [None]:
# Scatter each model's predictions against the ground truth, one panel per model.
# Points lying on the red y = x line are perfect predictions.
with sns.axes_style('darkgrid'):
    fig, axes = plt.subplots(1, 2, figsize=(8, 4), dpi=200)
    panels = (
        ('Gradient Boosting', gb_y_pred),
        ('Neural Network', nn_y_pred),
    )
    for ax, (title, y_pred) in zip(axes, panels):
        sns.scatterplot(
            x=y_pred,
            y=y,
            s=20,
            alpha=0.5,
            linewidth=0.2,
            ax=ax,
        )
        # x/y are passed by keyword: seaborn >= 0.12 rejects positional data vectors.
        sns.lineplot(x=[0, 100], y=[0, 100], linewidth=0.8, alpha=0.5, color='red', ax=ax)
        ax.set_xlabel('Prediction')
        ax.set_ylabel('Ground Truth')
        ax.set_title(title)

    fig.tight_layout()

## Number of Registrations

In [None]:
# Rebuild X and y for the 'number_of_registration' target; this overwrites the
# X/y left over from the previous section.
X, y = tc.build_final_dataset('number_of_registration')
# Show both shapes as a quick sanity check.
X.shape, y.shape

In [None]:
# Run both k-fold helpers for the registration-count target.
# tol / es_min_delta are presumably early-stopping tolerances expressed in the
# target's own units -- confirm against final_model_selection.
gb_y_pred, gb_cv_eval_df, gb_manual_score = kfold_predict_validate_gradient_boosting(
    X,
    y,
    tol=4,
)
nn_y_pred, nn_cv_eval_df, nn_manual_score = kfold_predict_validate_neural_network(
    X,
    y,
    es_min_delta=4,
)

In [None]:
# Reduce each evaluation frame to its per-column mean (presumably one row per
# CV fold -- confirm) and place the two models side by side.
gb_cv_mean = gb_cv_eval_df.mean().rename('Gradient Boosting CV Mean')
nn_cv_mean = nn_cv_eval_df.mean().rename('Neural Network CV Mean')
cv_mean_df = pd.concat([gb_cv_mean, nn_cv_mean], axis=1)
cv_mean_df

In [None]:
# Turn each model's manual-score mapping into a Series rounded to six decimals,
# then line the two models up column-wise.
gb_manual_series = pd.Series(gb_manual_score).round(6).rename('Gradient Boosting Manual Score')
nn_manual_series = pd.Series(nn_manual_score).round(6).rename('Neural Network Manual Score')
manual_pred_df = pd.concat([gb_manual_series, nn_manual_series], axis=1)
manual_pred_df

In [None]:
# Scatter each model's predictions against the ground truth, one panel per model.
# Points lying on the red y = x line are perfect predictions.
with sns.axes_style('darkgrid'):
    fig, axes = plt.subplots(1, 2, figsize=(8, 4), dpi=200)
    panels = (
        ('Gradient Boosting', gb_y_pred),
        ('Neural Network', nn_y_pred),
    )
    for ax, (title, y_pred) in zip(axes, panels):
        sns.scatterplot(
            x=y_pred,
            y=y,
            s=20,
            alpha=0.5,
            linewidth=0.2,
            ax=ax,
        )
        # x/y are passed by keyword: seaborn >= 0.12 rejects positional data vectors.
        sns.lineplot(x=[0, 300], y=[0, 300], linewidth=0.8, alpha=0.5, color='red', ax=ax)
        ax.set_xlabel('Prediction')
        ax.set_ylabel('Ground Truth')
        ax.set_title(title)

    fig.tight_layout()

## Sub Reg Ratio

In [None]:
# Rebuild X and y for the 'sub_reg_ratio' target; this overwrites the X/y left
# over from the previous section.
X, y = tc.build_final_dataset('sub_reg_ratio')
# Show both shapes as a quick sanity check.
X.shape, y.shape

In [None]:
# Run both k-fold helpers for the ratio target, using tolerances two orders of
# magnitude tighter than the defaults used elsewhere in this notebook.
# NOTE(review): if `loss` is forwarded to scikit-learn's GradientBoostingRegressor,
# 'lad' is the pre-1.0 alias of 'absolute_error' and no longer exists in 1.2+ --
# confirm against the pinned scikit-learn version.
gb_y_pred, gb_cv_eval_df, gb_manual_score = kfold_predict_validate_gradient_boosting(
    X,
    y,
    loss='lad',
    tol=0.001,
    n_iter_no_change=10,
)
nn_y_pred, nn_cv_eval_df, nn_manual_score = kfold_predict_validate_neural_network(
    X,
    y,
    es_min_delta=0.001,
)

In [None]:
# Reduce each evaluation frame to its per-column mean (presumably one row per
# CV fold -- confirm) and place the two models side by side.
gb_cv_mean = gb_cv_eval_df.mean().rename('Gradient Boosting CV Mean')
nn_cv_mean = nn_cv_eval_df.mean().rename('Neural Network CV Mean')
cv_mean_df = pd.concat([gb_cv_mean, nn_cv_mean], axis=1)
cv_mean_df

In [None]:
# Turn each model's manual-score mapping into a Series rounded to six decimals,
# then line the two models up column-wise.
gb_manual_series = pd.Series(gb_manual_score).round(6).rename('Gradient Boosting Manual Score')
nn_manual_series = pd.Series(nn_manual_score).round(6).rename('Neural Network Manual Score')
manual_pred_df = pd.concat([gb_manual_series, nn_manual_series], axis=1)
manual_pred_df

In [None]:
# Scatter each model's predictions against the ground truth, one panel per model.
# Points lying on the red y = x line are perfect predictions.
with sns.axes_style('darkgrid'):
    fig, axes = plt.subplots(1, 2, figsize=(8, 4), dpi=200)
    panels = (
        ('Gradient Boosting', gb_y_pred),
        ('Neural Network', nn_y_pred),
    )
    for ax, (title, y_pred) in zip(axes, panels):
        sns.scatterplot(
            x=y_pred,
            y=y,
            s=20,
            alpha=0.5,
            linewidth=0.2,
            ax=ax,
        )
        # x/y are passed by keyword: seaborn >= 0.12 rejects positional data vectors.
        sns.lineplot(x=[0, 1], y=[0, 1], linewidth=0.8, alpha=0.5, color='red', ax=ax)
        ax.set_xlabel('Prediction')
        ax.set_ylabel('Ground Truth')
        ax.set_title(title)

    fig.tight_layout()

## Total Prize

In [None]:
# Rebuild X and y for the 'total_prize' target; this overwrites the X/y left
# over from the previous section.
X, y = tc.build_final_dataset('total_prize')
# Show both shapes as a quick sanity check.
X.shape, y.shape

In [None]:
# Run both k-fold helpers for the prize target.
# tol / es_min_delta are presumably early-stopping tolerances expressed in the
# target's own units -- confirm against final_model_selection.
# NOTE(review): if `loss` is forwarded to scikit-learn's GradientBoostingRegressor,
# 'ls' is the pre-1.0 alias of 'squared_error' and no longer exists in 1.2+ --
# confirm against the pinned scikit-learn version.
gb_y_pred, gb_cv_eval_df, gb_manual_score = kfold_predict_validate_gradient_boosting(
    X,
    y,
    loss='ls',
    tol=10,
    n_iter_no_change=5,
)
nn_y_pred, nn_cv_eval_df, nn_manual_score = kfold_predict_validate_neural_network(
    X,
    y,
    es_min_delta=10,
)

In [None]:
# Reduce each evaluation frame to its per-column mean (presumably one row per
# CV fold -- confirm) and place the two models side by side.
gb_cv_mean = gb_cv_eval_df.mean().rename('Gradient Boosting CV Mean')
nn_cv_mean = nn_cv_eval_df.mean().rename('Neural Network CV Mean')
cv_mean_df = pd.concat([gb_cv_mean, nn_cv_mean], axis=1)
cv_mean_df

In [None]:
# Turn each model's manual-score mapping into a Series rounded to six decimals,
# then line the two models up column-wise.
gb_manual_series = pd.Series(gb_manual_score).round(6).rename('Gradient Boosting Manual Score')
nn_manual_series = pd.Series(nn_manual_score).round(6).rename('Neural Network Manual Score')
manual_pred_df = pd.concat([gb_manual_series, nn_manual_series], axis=1)
manual_pred_df

In [None]:
# Combine CV means and manual scores into one frame; the tuple keys become
# two-level (model, result kind) column labels.
summary_columns = {
    ('Gradient Boosting', 'CV Mean'): gb_cv_eval_df.mean(),
    ('Gradient Boosting', 'Full Set'): pd.Series(gb_manual_score),
    ('Neural Network', 'CV Mean'): nn_cv_eval_df.mean(),
    ('Neural Network', 'Full Set'): pd.Series(nn_manual_score),
}
pd.DataFrame.from_dict(summary_columns)

In [None]:
# Scatter each model's predictions against the ground truth, one panel per model.
# Points lying on the red y = x line are perfect predictions.
with sns.axes_style('darkgrid'):
    fig, axes = plt.subplots(1, 2, figsize=(8, 4), dpi=200)
    panels = (
        ('Gradient Boosting', gb_y_pred),
        ('Neural Network', nn_y_pred),
    )
    for ax, (title, y_pred) in zip(axes, panels):
        sns.scatterplot(
            x=y_pred,
            y=y,
            s=20,
            alpha=0.5,
            linewidth=0.2,
            ax=ax,
        )
        # x/y are passed by keyword: seaborn >= 0.12 rejects positional data vectors.
        sns.lineplot(x=[0, 2500], y=[0, 2500], linewidth=0.8, alpha=0.5, color='red', ax=ax)
        ax.set_xlabel('Prediction')
        ax.set_ylabel('Ground Truth')
        ax.set_title(title)

    fig.tight_layout()

## Putting it all together

In [None]:
# Keyword arguments passed to each k-fold helper, per prediction target.
# 'gb' entries go to the gradient-boosting helper, 'nn' entries to the
# neural-network helper; an empty dict means the helper's defaults are used.
# NOTE(review): if `loss` is forwarded to scikit-learn's GradientBoostingRegressor,
# 'lad' / 'ls' are pre-1.0 aliases that no longer exist in 1.2+ -- confirm
# against the pinned scikit-learn version.
hyper_param_dct = {
    'avg_score': {
        'gb': {},
        'nn': {},
    },
    'number_of_registration': {
        'gb': {'tol': 4},
        'nn': {'es_min_delta': 4},
    },
    'sub_reg_ratio': {
        'gb': {'loss': 'lad', 'tol': 0.001, 'n_iter_no_change': 10},
        'nn': {'es_min_delta': 0.001},
    },
    'total_prize': {
        'gb': {'loss': 'ls', 'tol': 10, 'n_iter_no_change': 5},
        'nn': {'es_min_delta': 10},
    },
}

# For every target: rebuild the dataset, run both helpers, and keep the ground
# truth, the predictions, and both score summaries under the target's name.
res = {}
for target, param_dct in hyper_param_dct.items():
    X, y = tc.build_final_dataset(target)
    gb_y_pred, gb_cv_eval_df, gb_manual_score = kfold_predict_validate_gradient_boosting(
        X, y, **param_dct['gb']
    )
    nn_y_pred, nn_cv_eval_df, nn_manual_score = kfold_predict_validate_neural_network(
        X, y, **param_dct['nn']
    )

    res[target] = {
        'y': y,
        'gb_y_pred': gb_y_pred,
        'nn_y_pred': nn_y_pred,
        'gb_cv_mean': gb_cv_eval_df.mean(),
        'nn_cv_mean': nn_cv_eval_df.mean(),
        'gb_manual': pd.Series(gb_manual_score),
        'nn_manual': pd.Series(nn_manual_score),
    }

In [None]:
# Flatten `res` into a two-level mapping:
#   outer key -> (metric, result kind), inner key -> (target, algorithm).
multi_idx_dct = defaultdict(dict)

# Short keys used inside `res` mapped to the labels shown in tables and plots.
algo_name = {
    'gb': 'Gradient Boosting',
    'nn': 'Neural Network',
}
result_name = {
    'cv_mean': 'CV Mean',
    'manual': 'Full Set',
}

for target, target_res in res.items():
    for algo, algo_label in algo_name.items():
        for result, result_label in result_name.items():
            # Series.items() replaces Series.iteritems(), which pandas 2.0 removed.
            for metric, score in target_res[f'{algo}_{result}'].items():
                multi_idx_dct[(metric, result_label)][(target, algo_label)] = score

In [None]:
# Outer keys of the nested mapping become the row index, inner keys the columns;
# the rows are then sorted by index.
metrics_unsorted = pd.DataFrame.from_dict(multi_idx_dct, orient='index')
prediction_metrics = metrics_unsorted.sort_index()
prediction_metrics

In [None]:
# Upper end of the red y = x reference line for each target. Keyed by target
# name so the lookup cannot drift out of sync with the iteration order of `res`.
endpoint = {
    'avg_score': 100,
    'number_of_registration': 300,
    'sub_reg_ratio': 1,
    'total_prize': 2500,
}

with sns.axes_style('darkgrid'):
    # One row per target and one column per algorithm, 4x4 inches per panel.
    # squeeze=False keeps `axes` two-dimensional even with a single target.
    n_rows, n_cols = len(res), len(algo_name)
    fig, axes = plt.subplots(
        n_rows,
        n_cols,
        figsize=(4 * n_cols, 4 * n_rows),
        dpi=200,
        squeeze=False,
    )
    for i, (target, predict_res) in enumerate(res.items()):
        diagonal = [0, endpoint[target]]
        for j, (algo, title) in enumerate(algo_name.items()):
            ax = axes[i, j]
            sns.scatterplot(
                x=predict_res[f'{algo}_y_pred'],
                y=predict_res['y'],
                s=20,
                alpha=0.5,
                linewidth=0.1,
                ax=ax,
            )
            sns.lineplot(
                x=diagonal,
                y=diagonal,
                color='red',
                alpha=0.75,
                ax=ax,
            )
            ax.set_title(f'{target.capitalize()} - {title}')
            ax.set_xlabel('Prediction')
            ax.set_ylabel('Ground Truth')
    fig.tight_layout()
    
#     fig.savefig('./result/img/pred_against_truth.png', dpi='figure')