In [None]:
""" Notebook for final model building evaluation"""

import os
import json
from pprint import pprint
from datetime import datetime
from collections import defaultdict

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

import tensorflow as tf
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.preprocessing import StandardScaler, Normalizer, scale, normalize
from sklearn.model_selection import (
    cross_val_predict,
    cross_val_score,
    cross_validate,
    KFold
)
from sklearn.metrics import (
    mean_squared_error,
    mean_absolute_error,
    r2_score,
    make_scorer,
)
from sklearn.pipeline import make_pipeline

from dotenv import load_dotenv

from tc_data import TopCoder
from final_model_selection import (
    kfold_predict_validate_gradient_boosting,
    kfold_predict_validate_neural_network
)

# Load variables from a local .env file into the process environment.
load_dotenv()
# Raise the pandas row display limit so long result tables are shown in full.
pd.options.display.max_rows = 800

## Get the data object

The `TopCoder` class contains the pre-processed data retrieved from the database.

In [None]:
# Project data object; the cells below only call tc.build_final_dataset(target) on it.
tc = TopCoder()

## Compare Gradient Boosting against NN

In [None]:
# Per-target keyword overrides. The 'gb' entry is unpacked into
# kfold_predict_validate_gradient_boosting and the 'nn' entry into
# kfold_predict_validate_neural_network; an empty dict means "use that function's defaults".
# NOTE(review): if 'loss' is forwarded to scikit-learn's GradientBoostingRegressor, 'lad' and
# 'ls' are the legacy spellings (renamed 'absolute_error' / 'squared_error' in scikit-learn 1.0
# and removed in 1.2) -- confirm against the pinned scikit-learn version.
hyper_param_dct = {
    'avg_score': {
        'gb': {},
        'nn': {},
    },
    'number_of_registration': {
        'gb': {'tol': 4},
        'nn': {'es_min_delta': 4},
    },
    'sub_reg_ratio': {
        'gb': {'loss': 'lad', 'tol': 0.001, 'n_iter_no_change': 10},
        'nn': {'es_min_delta': 0.001},
    },
    'total_prize': {
        'gb': {'loss': 'ls', 'tol': 10, 'n_iter_no_change': 5},
        'nn': {'es_min_delta': 10},
    },
}

In [None]:
# Run both model families on every target and collect, per target, the ground truth,
# the predictions, the mean of the CV evaluation frame and the "manual" scores.
res = {}
for target, param_dct in hyper_param_dct.items():
    X, y = tc.build_final_dataset(target)

    # NOTE(review): gb_cv_fi (fourth value returned for gradient boosting) is not used in this cell.
    gb_y_pred, gb_cv_eval_df, gb_manual_score, gb_cv_fi = kfold_predict_validate_gradient_boosting(
        X, y, **param_dct['gb']
    )
    nn_y_pred, nn_cv_eval_df, nn_manual_score = kfold_predict_validate_neural_network(
        X, y, **param_dct['nn']
    )

    res[target] = {
        'y': y,
        'gb_y_pred': gb_y_pred,
        'nn_y_pred': nn_y_pred,
        'gb_cv_mean': gb_cv_eval_df.mean(),
        'nn_cv_mean': nn_cv_eval_df.mean(),
        'gb_manual': pd.Series(gb_manual_score),
        'nn_manual': pd.Series(nn_manual_score),
    }

In [None]:
# Reshape `res` into a nested dict for a table view:
#   outer key = (metric, result label), inner key = (target, algorithm label), value = score.
multi_idx_dct = defaultdict(dict)

# Display labels for the short keys used inside `res`.
algo_name = {
    'gb': 'Gradient Boosting',
    'nn': 'Neural Network',
}
result_name = {
    'cv_mean': 'CV Mean',
    'manual': 'Full Set',
}

for target in res:
    for algo in algo_name:
        for result in result_name:
            # Series.items() instead of Series.iteritems(): the latter was deprecated in
            # pandas 1.5 and removed in pandas 2.0, while items() exists in every version.
            for metric, score in res[target][f'{algo}_{result}'].items():
                multi_idx_dct[(metric, result_name[result])][(target, algo_name[algo])] = score

In [None]:
# Outer keys of multi_idx_dct become the rows, inner keys the columns; rows are then sorted.
metrics_unsorted = pd.DataFrame.from_dict(multi_idx_dct, orient='index')
prediction_metrics = metrics_unsorted.sort_index()
prediction_metrics

In [None]:
# Upper end of the red y = x reference line for each target. Keyed by target name so the
# pairing no longer depends on the iteration order (or number) of entries in `res`.
endpoint = {
    'avg_score': 100,
    'number_of_registration': 300,
    'sub_reg_ratio': 1,
    'total_prize': 2500,
}

# One row per target, one column per algorithm; 4 x 4 inches per panel
# (8 x 16 for the four targets and two algorithms defined above).
n_rows, n_cols = len(res), len(algo_name)

with sns.axes_style('darkgrid'):
    # squeeze=False keeps `axes` two-dimensional even with a single row or column.
    fig, axes = plt.subplots(
        n_rows, n_cols, figsize=(4 * n_cols, 4 * n_rows), dpi=200, squeeze=False
    )
    for i, (target, predict_res) in enumerate(res.items()):
        for j, (algo, title) in enumerate(algo_name.items()):
            ax = axes[i, j]
            # Predictions on the x axis against the ground truth on the y axis.
            sns.scatterplot(
                x=predict_res[f'{algo}_y_pred'],
                y=predict_res['y'],
                s=20,
                alpha=0.5,
                linewidth=0.1,
                ax=ax,
            )
            # Diagonal reference: points on this line are perfect predictions.
            sns.lineplot(
                x=[0, endpoint[target]],
                y=[0, endpoint[target]],
                color='red',
                alpha=0.75,
                ax=ax,
            )
            ax.set_title(f'{target.capitalize()} - {title}')
            ax.set_xlabel('Prediction')
            ax.set_ylabel('Ground Truth')
    fig.tight_layout()

#     fig.savefig('./result/img/pred_against_truth.png', dpi='figure')

## Inspect the feature importance of Gradient Boosting

In [None]:
# Keep only the gradient boosting settings of each target, then drop `total_prize`
# so it is excluded from the feature-importance runs below.
gb_param_dct = {target: params['gb'] for target, params in hyper_param_dct.items()}
gb_param_dct.pop('total_prize')
gb_param_dct

In [None]:
# Run gradient boosting again for the remaining targets, this time keeping the mean of the
# returned feature-importance frame (presumably one row per CV fold -- confirm).
gb_res = {}
for target, param in gb_param_dct.items():
    X, y = tc.build_final_dataset(target)
    y_pred, cv_eval_df, manual_score, cv_feature_importance = kfold_predict_validate_gradient_boosting(
        X, y, **param
    )

    gb_res[target] = {
        'y': y,
        'y_pred': y_pred,
        'cv_mean': cv_eval_df.mean(),
        'manual': pd.Series(manual_score),
        'fea_importance': cv_feature_importance.mean(),
    }
 

In [None]:
# Build {target: {(metric, result): score}} so each target becomes one column of res_df.
midx_dct = defaultdict(dict)
for target, res_dct in gb_res.items():
    target_scores = midx_dct[target]
    for result in ('cv_mean', 'manual'):
        target_scores.update(
            ((metric, result), score) for metric, score in res_dct[result].items()
        )

res_df = pd.DataFrame.from_dict(midx_dct)
res_df = res_df.sort_index()
res_df

In [None]:
# Persist the gradient boosting scores. The output folder is created first so a fresh
# checkout does not fail here after the long training cells have already run.
# NOTE: the "extened" spelling in the file name is kept on purpose; it is a runtime path
# that other code may read.
result_path = 'result/final_models/extened_features_gradient_boosting_result.json'
os.makedirs(os.path.dirname(result_path), exist_ok=True)
res_df.to_json(result_path)

In [None]:
# Target name -> the 'fea_importance' entry stored for that target in gb_res.
fea_imp_dct = {name: outcome['fea_importance'] for name, outcome in gb_res.items()}

In [None]:
# Ten hex colours, written without the leading '#' and separated by '-'.
palette_hex = '641220-6e1423-85182a-a11d33-a71e34-b21e35-bd1f36-c71f37-da1e37-e01e37'
palette = ['#' + hex_code for hex_code in palette_hex.split('-')]

In [None]:
# One horizontal bar chart per target showing its ten largest feature importances.
# The grid is sized from fea_imp_dct (8 x 8 inches for the three targets used above).
n_targets = len(fea_imp_dct)

with sns.axes_style('darkgrid'):
    # squeeze=False keeps `axes` two-dimensional even when only one target is plotted.
    fig, axes = plt.subplots(
        n_targets, 1, figsize=(8, 8 * n_targets / 3), dpi=200, sharex=True, squeeze=False
    )
    for i, (target, feature_imp) in enumerate(fea_imp_dct.items()):
        ax = axes[i, 0]

        # Sort once and reuse the result for both the bar lengths and the bar labels.
        top_features = feature_imp.sort_values(ascending=False).head(10)

        sns.barplot(
            x=top_features,
            y=top_features.index,
            ax=ax,
            palette=palette[::-1]
        )
        ax.set_title('{} - Feature Importance'.format(' '.join(w.capitalize() for w in target.split('_'))))
        ax.set_xlim(0, 1)

        # Write each bar's value just right of the bar. Dedicated names are used so the
        # notebook-level `x` / `y` variables are not overwritten with label coordinates.
        for bar in ax.patches:
            label_x = bar.get_width() + 0.01
            label_y = bar.get_y() + 0.5 * bar.get_height() + 0.002
            ax.text(label_x, label_y, round(bar.get_width(), 3), va='center', fontdict={'fontsize': 8})

    fig.tight_layout()
#     fig.savefig('result/img/gradient_boosting_feature_importance.png', dpi='figure')