In [None]:
""" This notebook is for all the experiments of machine learning"""
import os
import json
from datetime import datetime

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.datasets import make_classification
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet, BayesianRidge
from sklearn.svm import SVR
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.isotonic import IsotonicRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import cross_val_predict, cross_val_score, KFold


from imblearn.under_sampling import ClusterCentroids, RandomUnderSampler, TomekLinks, NearMiss
from imblearn.over_sampling import SMOTE, ADASYN, RandomOverSampler

from pm_builder_6 import FILT_CHA_INFO, extract_abs_metadata, get_Xy

In [None]:
# Lookup from regressor class name to the model family it is grouped under.
# Built from a family -> members listing; insertion order matches the order
# in which the regressors are benchmarked.
regression_fam = {
    regressor_name: family
    for family, regressor_names in (
        ('linear', ('LinearRegression', 'Ridge', 'Lasso', 'ElasticNet', 'BayesianRidge')),
        ('SVM', ('SVR',)),
        ('gaussian_process', ('GaussianProcessRegressor',)),
        ('tree', ('DecisionTreeRegressor',)),
        ('ensemble', ('RandomForestRegressor', 'AdaBoostRegressor')),
        ('neighbors', ('KNeighborsRegressor',)),
    )
    for regressor_name in regressor_names
}
# The same mapping as a Series, indexed by regressor name.
regression_fam_sr = pd.Series(regression_fam)

def train_lst_of_model(X, y, model_lst=None, cv=10):
    """Cross-validate a list of regressors on (X, y) and report each one's MMRE.

    Every regressor class is instantiated with its default arguments and
    evaluated through ``cross_val_predict``. The score is the mean magnitude
    of relative error, ``mean(|actual - estimate| / actual)``. Each score is
    printed as soon as it is available.

    Parameters
    ----------
    X : pandas object
        Model inputs; converted with ``to_numpy()`` before being passed on.
    y : pandas object
        Targets, expected to be a pandas Series; its index is reused to align
        the cross-validated predictions with the actual values.
    model_lst : iterable of regressor classes, optional
        Classes to benchmark, in evaluation order. When omitted, the eleven
        scikit-learn regressors listed in the body are used.
    cv : optional
        Forwarded unchanged to ``cross_val_predict``; defaults to 10.

    Returns
    -------
    pandas.Series
        MMRE per regressor, indexed by class name, in evaluation order.
    """
    if model_lst is None:
        model_lst = [
            LinearRegression,
            Ridge,
            Lasso,
            ElasticNet,
            BayesianRidge,
            SVR,
            GaussianProcessRegressor,
            DecisionTreeRegressor,
            RandomForestRegressor,
            AdaBoostRegressor,
            KNeighborsRegressor
        ]

    # The array conversions do not depend on the regressor, so do them once.
    X_arr = X.to_numpy()
    y_arr = y.to_numpy()

    pred_dct = {}
    for regressor in model_lst:
        name = regressor.__name__
        y_pred = cross_val_predict(regressor(), X_arr, y_arr, cv=cv)

        # Re-attach y's index so actual and estimated values line up row by row.
        result_df = pd.concat([y, pd.Series(y_pred, index=y.index)], axis=1)
        result_df.columns = ['actual', 'estimate']

        pred_dct[name] = ((result_df['actual'] - result_df['estimate']).abs() / result_df['actual']).mean()
        print(f'{name}: {pred_dct[name]}')

    # Explicit dtype keeps the result a float Series even when model_lst is empty.
    return pd.Series(pred_dct, dtype=float)

In [None]:
# Fetch the modelling inputs from pm_builder_6.get_Xy(); X and y are later
# passed to train_lst_of_model as its X and y arguments.
X, y = get_Xy()

In [None]:
# Keep only the challenges whose subtrack is FIRST_2_FINISH or CODE ...
f2fcode_cha = FILT_CHA_INFO[FILT_CHA_INFO['subtrack'].isin(['FIRST_2_FINISH', 'CODE'])]
# ... and restrict X and y to the rows whose index appears in that subset.
X_fc = X[X.index.isin(f2fcode_cha.index)]
y_fc = y[y.index.isin(f2fcode_cha.index)]

In [None]:
# Benchmark every regressor on the complete data set ...
reg_res_all = train_lst_of_model(X, y)
# ... and again on the subset restricted to FIRST_2_FINISH and CODE challenges.
reg_res_fc = train_lst_of_model(X_fc, y_fc)

In [None]:
# Stack both benchmark runs into one long table (one row per regressor and
# data subset), then turn the regressor-name index into a 'regressor' column.
result_df = (
    pd.concat([
        pd.DataFrame({'MMRE': scores, 'reg_fam': regression_fam_sr, 'data': [subset] * len(scores)})
        for subset, scores in (('all', reg_res_all), ('fc', reg_res_fc))
    ])
    .reset_index()
    .rename(columns={'index': 'regressor'})
)


In [None]:
# Scatter plot of the MMRE of every regressor, for both data subsets,
# annotated with the rounded score and saved to img/.

# Make sure the output folder exists so savefig does not fail on a fresh checkout.
os.makedirs('img', exist_ok=True)

with sns.axes_style('ticks'):
    fig = plt.figure(figsize=(8, 4), dpi=200)
    ax = fig.add_axes([0.1, 0.25, 0.8, 0.5])

    sns.despine(ax=ax)
    sns.scatterplot(
        data=result_df,
        x='regressor',
        y='MMRE',
        hue='data',
        style='data',
        ax=ax
    )

    ax.set_title('Prediction Accuracy of Different Regression Model')
    ax.set_xlabel('Regressor Name')

    xticklabels = [
        'Linear\nRegression',
        'Ridge',
        'Lasso',
        'Elastic\nNet',
        'Bayesian\nRidge',
        'SVR',
        'GaussianProcess\nRegressor',
        'DecisionTree\nRegressor',
        'RandomForest\nRegressor',
        'AdaBoost\nRegressor',
        'KNeighbors\nRegressor'
    ]
    # Pin the tick positions before relabelling them; fixed labels without
    # fixed positions trigger a matplotlib warning and can drift out of sync.
    ax.set_xticks(list(range(len(xticklabels))))
    # 'fontsize' must be lower case: matplotlib no longer accepts
    # case-insensitive property names such as 'fontSize'.
    ax.set_xticklabels(labels=xticklabels, rotation=330, ha='left', rotation_mode='anchor', fontdict={'fontsize': 8})
    ax.set_yticks(list(range(1, 9)))
    ax.set_yticklabels(labels=list(range(1, 9)), fontdict={'fontsize': 10})

    # Position of each regressor on the x axis = its rank among the 'all' rows.
    # Built once instead of rebuilding and scanning a list for every annotation.
    regressor_pos = {
        reg: pos
        for pos, reg in enumerate(result_df.loc[result_df.data == 'all', 'regressor'])
    }
    for reg, mmre in result_df[['regressor', 'MMRE']].itertuples(index=False):
        ax.text(
            x=regressor_pos[reg],
            y=mmre + 0.2,
            s=round(mmre, 2),
            ha='left',
            va='baseline',
            fontdict={'size': 8}
        )

    # Rename the legend entries by their current label rather than by position:
    # depending on the seaborn version the handles may or may not start with a
    # title entry for the 'data' column, and a fixed-length list would then
    # attach the wrong text to the markers.
    handles, labels = ax.get_legend_handles_labels()
    legend_text = {'data': 'Training Data', 'all': 'All subtrack', 'fc': 'F2F & CODE'}
    ax.legend(handles, [legend_text.get(label, label) for label in labels], prop={'size': 6})

fig.savefig('img/regression_models_accuracy.png', dpi='figure')

In [None]:
# PM6: load the saved prize estimations of the two round-1 runs stored under
# pricing_model_6/round1_res (the "brf" and "rus" sub-folders).
# The following cells read the 'actual' and 'median' columns of these frames.
brf_estimation = pd.read_json('pricing_model_6/round1_res/brf/prz_estimation.json')
rus_estimation = pd.read_json('pricing_model_6/round1_res/rus/prz_estimation.json')

In [None]:
# MMRE of the brf estimations: mean of |actual - median| / actual.
brf_estimation['actual'].sub(brf_estimation['median']).abs().div(brf_estimation['actual']).mean()

In [None]:
# MMRE of the rus estimations: mean of |actual - median| / actual.
rus_estimation['actual'].sub(rus_estimation['median']).abs().div(rus_estimation['actual']).mean()

In [None]:
# Two-point comparison plot of the MMRE of the two imbalanced-learning models.
with sns.axes_style('darkgrid'):
    fig = plt.figure(figsize=(4, 3), dpi=200)
    ax = fig.add_axes([0.15, 0.15, 0.7, 0.7])

    # NOTE(review): these two scores look like hand-copied outputs of the
    # brf/rus MMRE cells of this notebook -- confirm, and consider deriving
    # them from brf_estimation / rus_estimation instead.
    x_positions = [1, 3]
    mmre_values = [1.1893620144762707, 1.0917988368303686]
    model_names = ['Balanced\nRandom Forest', 'Random\nUnder Sampler']

    sns.scatterplot(x=x_positions, y=mmre_values, ax=ax)
    ax.set_ylim(bottom=0.9, top=1.3)
    ax.set_xlim(left=0, right=4)
    ax.set_xticks(x_positions)
    ax.set_xticklabels(labels=model_names)
    ax.set_title('Pairing Challenges Imbalanced Learning')
    ax.set_ylabel('MMRE')

    # Annotate each marker with its score rounded to two decimals,
    # placed slightly above the point.
    for x_pos, mmre in zip(x_positions, mmre_values):
        ax.text(
            x=x_pos,
            y=mmre + 0.01,
            s=round(mmre, 2),
            ha='left',
            va='baseline',
            fontdict={'size': 8}
        )
    fig.savefig('img/pairing_cha_model_mmre.png', dpi='figure')

In [None]:
# Read the ten feature_importance_<i>.json files of the brf run as Series and
# place them side by side, one column per file ...
brf_feature_importance_df = pd.concat(
    [
        pd.read_json(f'pricing_model_6/round1_res/brf/feature_importance_{i}.json', typ='series')
        for i in range(10)
    ],
    axis='columns',
)
# ... then average across the ten files to get one importance per feature.
brf_feature_importance = brf_feature_importance_df.mean(axis='columns')

In [None]:
# Labels of the five features with the largest averaged brf importance
# (rounded to three decimals before sorting, ascending order).
(
    brf_feature_importance
    .map(lambda importance: round(importance, 3))
    .sort_values()
    .index[-5:]
)

In [None]:
# Read the ten feature_importance_<i>.json files of the rus run as Series and
# place them side by side, one column per file ...
rus_feature_importance_df = pd.concat(
    [
        pd.read_json(f'pricing_model_6/round1_res/rus/feature_importance_{i}.json', typ='series')
        for i in range(10)
    ],
    axis='columns',
)
# ... then average across the ten files to get one importance per feature.
rus_feature_importance = rus_feature_importance_df.mean(axis='columns')

In [None]:
# Bar plot of the averaged rus feature importances; only a handful of
# features get a visible tick label, the rest are left blank.
with sns.axes_style('darkgrid'):
    fig = plt.figure(figsize=(8, 4), dpi=200)
    ax = fig.add_axes([0.05, 0.15, 0.9, 0.7])

    sns.barplot(x=rus_feature_importance.index, y=rus_feature_importance, ax=ax, linewidth=0)

    # Ten most important features, computed once instead of re-sorting the
    # series for every tick.
    # NOTE(review): the bars show the rus importances but the labelled features
    # are the top ten of brf_feature_importance -- confirm this is intended.
    top_features = brf_feature_importance.sort_values().tail(10).index

    # One tick per plotted feature, derived from the data rather than a
    # hard-coded count. As before, this assumes the features are labelled
    # 0..n-1 in plotting order -- TODO confirm.
    n_features = len(rus_feature_importance)
    ax.set_xticks(list(range(n_features)))
    # 'fontsize' must be lower case: matplotlib no longer accepts
    # case-insensitive property names such as 'fontSize'.
    ax.set_xticklabels(
        labels=[i if i in top_features else '' for i in range(n_features)],
        rotation=330,
        ha='right',
        rotation_mode='anchor',
        fontdict={'fontsize': 8}
    )
    
    
    