In [None]:
%matplotlib inline
import itertools
from datetime import datetime
from pprint import pprint

import pymongo
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib.dates as mdates
import matplotlib.ticker as mticker
import matplotlib.colors as mcolors

from sklearn import set_config
from sklearn.model_selection import (
    train_test_split,
    ShuffleSplit,
    StratifiedShuffleSplit,
    GroupShuffleSplit,
    cross_val_predict,
    cross_val_score,
)
from sklearn.metrics import make_scorer
from sklearn.linear_model import LinearRegression, BayesianRidge
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR

from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import StandardScaler, FunctionTransformer

from gensim.sklearn_api import D2VTransformer
from gensim.utils import simple_preprocess

import topcoder_mongo as DB
import topcoder_ml as TML
import static_var as S
import util as U

# Dark presentation theme applied to every figure in this notebook:
# near-black canvas with white text, labels and ticks.
_canvas_color = '#121212'
_ink_color = 'white'
_dark_rc = {
    'axes.facecolor': _canvas_color,
    'figure.facecolor': _canvas_color,
    'text.color': _ink_color,
    'axes.titlecolor': _ink_color,
    'axes.labelcolor': _ink_color,
    'xtick.color': _ink_color,
    'ytick.color': _ink_color,
    'figure.autolayout': True,
}
sns.set(rc=_dark_rc)

# Allow long result tables to be displayed without truncation (up to 500 rows).
pd.set_option('display.max_rows', 500)

# Render scikit-learn estimators/pipelines as diagrams in cell output.
set_config(display='diagram')

## Cross Validation

### Get processed training data

The feature columns are composed as follows:

1. $d_0,\ d_1,\ d_2 \ldots d_5$: Numeric features `duration`, `num_of_competing_challenges`, `softmax_c1` to `softmax_c4`;
2. $d_6,\ d_7$: Categorical features `project_id` and `sub_track`;
3. $d_8 \ldots d_{107}$: One Hot Encoded tag and tag combination;
4. $d_{108} \ldots d_{207}$: Document vector for challenge description text representation.

In [None]:
# Load the processed feature matrix and the prediction target through the project helper.
# NOTE(review): later cells call `target.to_numpy()` / `target.reset_index()` and read a
# `top2_prize` column, so `target` is presumably a single-column DataFrame — confirm in `topcoder_ml`.
feature, target = TML.get_training_data()

### Effect of different `random_state` values in `train_test_split`

Due to the imbalanced distribution of the training target (`top2_prize`), randomly splitting the data into training and testing sets with different random seeds results in fluctuating training and testing scores.

Below is a demonstration of how different `random_state` values lead to different performance.

In [None]:
%%script false --no-raise-error
# The `%%script false --no-raise-error` magic hands this cell to the shell command `false`,
# so the Python below is NOT executed on Run All; remove the magic line to re-run the experiment.
#
# For each seed: hold out 30% of the data, cross-validate on the remaining 70%
# (10 shuffled 70/30 splits, fixed seed 42), then refit on the 70% and score on the hold-out.
# `None` lets the split pick an unseeded, non-reproducible shuffle.
random_states = [0, 21, 42, None]
result = []
for random_state in random_states:
    (
        feature_train,
        feature_test,
        target_train,
        target_test,
    ) = train_test_split(feature, target, test_size=0.3, random_state=random_state)
    
    # `greater_is_better=False` makes the scorer return the negated error;
    # `np.abs` turns the averaged value back into a positive MMRE.
    train_score = np.abs(np.mean(cross_val_score(
        TML.construct_training_pipeline(),
        feature_train,
        target_train.to_numpy().reshape(-1),
        scoring=make_scorer(TML.mean_magnitude_of_relative_error, greater_is_better=False),
        cv=ShuffleSplit(n_splits=10, test_size=0.3, random_state=42),
    )))
    
    # Fresh pipeline fitted on the full training portion, evaluated on the hold-out.
    test_est = TML.construct_training_pipeline()
    test_est.fit(feature_train, target_train.to_numpy().reshape(-1))
    test_pred = test_est.predict(feature_test)
    
    test_score = TML.mean_magnitude_of_relative_error(target_test.to_numpy().reshape(-1), test_pred)
    
    # `str(random_state)` keeps the `None` seed readable in the result table.
    result.append({'random_state': str(random_state), 'train_score': train_score, 'test_score': test_score})

# Note: the split variables from the final iteration (random_state=None) remain in the
# namespace after the loop.
pd.DataFrame.from_records(result)

In [None]:
%%script false --no-raise-error
# Disabled cell (`%%script false` skips execution); remove the magic line to run it.
# Cross-validated MMRE over the whole dataset: 10 shuffled 70/30 splits with a fixed seed.
score = cross_val_score(
    TML.construct_training_pipeline(),
    feature,
    target.to_numpy().reshape(-1),
    # The scorer negates the error (greater_is_better=False); np.abs below undoes the sign.
    scoring=make_scorer(TML.mean_magnitude_of_relative_error, greater_is_better=False),
    cv=ShuffleSplit(n_splits=10, test_size=0.3, random_state=42)
)
np.abs(np.mean(score))

In [None]:
%%script false --no-raise-error
# Disabled cell (`%%script false` skips execution); remove the magic line to run it.
#
# Fit the default training pipeline on a 70% split and report MMRE on the 30% hold-out.
# The split is created here with a fixed seed so the cell is reproducible and does not
# depend on `feature_train` / `target_train` / ... left over from a loop in another cell
# (whose last iteration used an unseeded split).
(
    feature_train,
    feature_test,
    target_train,
    target_test,
) = train_test_split(feature, target, test_size=0.3, random_state=42)

est = TML.construct_training_pipeline()
est.fit(feature_train, target_train.to_numpy().reshape(-1))
pred = est.predict(feature_test)

TML.mean_magnitude_of_relative_error(target_test.to_numpy().reshape(-1), pred)

In [None]:
# impt = pd.DataFrame(est['gbr'].feature_importances_).rename(columns={0: 'importance'}, index=dict(enumerate(feature.columns.tolist())))
# impt.sort_values('importance', ascending=False).head(25)

### Use `cross_val_predict` to get predictions for the whole dataset

In [None]:
%%script false --no-raise-error
# Disabled cell (`%%script false` skips execution); remove the magic line to run it.
# Out-of-fold predictions for every sample using 10-fold cross validation.
est_for_all = TML.construct_training_pipeline()
pred = cross_val_predict(est_for_all, feature, target.to_numpy().reshape(-1), cv=10)

# Put targets and predictions side by side; the prediction column (default name 0) becomes 'pred'.
all_data_result = pd.concat([target.reset_index(), pd.DataFrame(pred)], axis=1).rename(columns={0: 'pred'})
# NOTE: despite its name, 'mae' holds the signed residual (actual - predicted); the absolute
# value is only taken on the next line.
all_data_result['mae'] = all_data_result['top2_prize'] - all_data_result['pred']
# Relative error per sample. NOTE(review): assumes `top2_prize` is never 0 — confirm upstream.
all_data_result['mre'] = all_data_result['mae'].abs() / all_data_result['top2_prize']
# Mean relative error over all samples.
all_data_result['mre'].mean()

### Iterate over multiple learning algorithms for training score and testing score



In [None]:
%%script false --no-raise-error
# Disabled cell (`%%script false` skips execution); remove the magic line to run it.
#
# Compare several regressors inside the same training pipeline: 10-fold cross-validated
# MMRE on the training portion versus MMRE on the hold-out portion.
# Requires `feature_train`, `feature_test`, `target_train`, `target_test` from an earlier
# hold-out split cell.
estimator_lst = [
    (GradientBoostingRegressor, dict(random_state=42)),
    (RandomForestRegressor, dict(random_state=42)),
    (LinearRegression, {}),
    (BayesianRidge, {}),
    (SVR, {}),
]

result = []
# Dedicated loop names so the fitted pipeline `est` from earlier cells is not overwritten
# by an estimator class.
for estimator_cls, estimator_params in estimator_lst:
    print('Training', estimator_cls.__name__)
    # The scorer negates the error (greater_is_better=False); np.abs restores a positive MMRE.
    train_score = np.abs(np.mean(cross_val_score(
        TML.construct_training_pipeline(estimator=estimator_cls, est_param=estimator_params),
        feature_train,
        target_train.to_numpy().reshape(-1),
        scoring=make_scorer(TML.mean_magnitude_of_relative_error, greater_is_better=False),
        cv=10, # ShuffleSplit(n_splits=10, test_size=0.3, random_state=42)
    )))

    # Fresh pipeline fitted on the whole training portion, scored on the hold-out.
    estimator = TML.construct_training_pipeline(estimator=estimator_cls, est_param=estimator_params)
    estimator.fit(feature_train, target_train.to_numpy().reshape(-1))
    test_pred = estimator.predict(feature_test)

    test_score = TML.mean_magnitude_of_relative_error(target_test.to_numpy().reshape(-1), test_pred)

    result.append({'estimator': estimator_cls.__name__, 'train_score': train_score, 'test_score': test_score})

# Show the comparison table (previously the scores were collected but never displayed).
pd.DataFrame.from_records(result)

## Evaluate model using only part of the feature matrix

In [None]:
# Pipeline factory functions used by the partial-feature evaluations below
def construct_metadata_pipeline():
    """Build the estimator used when training on metadata features only.

    The ``duration`` column is standardized, every other column is passed
    through unchanged, and the result feeds a gradient boosting regressor
    with a fixed seed (42).
    """
    scale_duration = ColumnTransformer(
        transformers=[('standardization', StandardScaler(), ['duration'])],
        remainder='passthrough',
    )
    regressor = GradientBoostingRegressor(random_state=42)
    return Pipeline(steps=[('col', scale_duration), ('gbr', regressor)])
    
def construct_global_context_pipeline():
    """Build the estimator used when training on global context features only.

    All input columns are standardized before being handed to a gradient
    boosting regressor with a fixed seed (42).
    """
    steps = [
        ('std', StandardScaler()),
        ('gbr', GradientBoostingRegressor(random_state=42)),
    ]
    return Pipeline(steps=steps)

def construct_tag_feature_pipeline():
    """Build the estimator used when training on tag features only.

    The tag softmax columns (``S.TAG_SOFTMAX_COLUMNS``) are standardized,
    the remaining columns are passed through unchanged, and the result
    feeds a gradient boosting regressor with a fixed seed (42).
    """
    scale_softmax = ColumnTransformer(
        transformers=[('standardization', StandardScaler(), S.TAG_SOFTMAX_COLUMNS)],
        remainder='passthrough',
    )
    regressor = GradientBoostingRegressor(random_state=42)
    return Pipeline(steps=[('col', scale_softmax), ('gbr', regressor)])

def construct_doc2vec_pipeline():
    """Build the estimator used when training on description text only.

    Steps: tokenize the ``processed_paragraph`` column with gensim's
    ``simple_preprocess``, embed the token lists with a Doc2Vec transformer,
    then fit a gradient boosting regressor with a fixed seed (42).
    """
    def preprocess_text(frame):
        # One token list per row, returned as a plain Python list.
        tokenized = frame['processed_paragraph'].apply(simple_preprocess)
        return tokenized.to_list()

    steps = [
        ('preprocess_text', FunctionTransformer(preprocess_text)),
        ('doc2vec', D2VTransformer(size=100, min_count=5, iter=10)),
        ('gbr', GradientBoostingRegressor(random_state=42)),
    ]
    return Pipeline(steps)

### Get `top2_prize` target within the limited scope

In [None]:
# Target values: one `top2_prize` per challenge, indexed by challenge id.
prize_records = DB.TopcoderMongo.run_feature_aggregation([
    {'$project': {'_id': False, 'id': True, 'top2_prize': True}}
])
challenge_prize = pd.DataFrame.from_records(prize_records).set_index('id')

# Challenge ids listed under the '>=10' project-scale tag.
project_scale = pd.DataFrame.from_records(DB.TopcoderMongo.get_project_scale([0, 10])).set_index('tag')
challenge_by_project_scale = project_scale.loc['>=10', 'challenge_lst']

# 5th and 95th percentile of the prize, used to trim both tails.
prize_column = challenge_prize['top2_prize']
low = prize_column.quantile(0.05)
high = prize_column.quantile(0.95)

# Keep challenges whose prize lies inside [low, high] and that belong to the '>=10' group.
within_prize_range = (prize_column >= low) & (prize_column <= high)
in_selected_projects = challenge_prize.index.isin(challenge_by_project_scale)
challenge_prize = challenge_prize.loc[within_prize_range & in_selected_projects]

### Use only metadata

In [None]:
# Fetch only the challenge id and its `metadata` field from the feature collection.
query = [
    {'$project': {'_id': False, 'id': True, 'metadata': True}}
]

# Align the metadata with the in-scope challenges (left join on the challenge id index),
# then expand the per-challenge `metadata` entry into one column per S.META_DATA_COLUMNS name.
# NOTE(review): presumably each `metadata` value is a fixed-length vector — confirm against the DB schema.
metadata = (pd.DataFrame
            .from_records(challenge_prize
                          .join(pd.DataFrame
                                .from_records(DB.TopcoderMongo.run_feature_aggregation(query))
                                .set_index('id'))
                          .pop('metadata'),
                          index=challenge_prize.index,
                          columns=S.META_DATA_COLUMNS))

# MMRE over 10 shuffled 70/30 splits. The scorer negates the error
# (greater_is_better=False), so np.abs restores a positive value.
metadata_score = np.abs(np.mean(cross_val_score(
    construct_metadata_pipeline(), metadata, challenge_prize.to_numpy().reshape(-1),
    scoring=make_scorer(TML.mean_magnitude_of_relative_error, greater_is_better=False),
    cv=ShuffleSplit(n_splits=10, test_size=0.3, random_state=42),
)))
metadata_score

### Use only Global Context

In [None]:
# Project the challenge id plus every global context column.
projection = {'_id': False, 'id': True}
projection.update({col: True for col in S.GLOBAL_CONTEXT_COLUMNS})
query = [{'$project': projection}]

# Align the context features with the in-scope challenges and keep only the context columns.
context_records = DB.TopcoderMongo.run_feature_aggregation(query)
context_frame = pd.DataFrame.from_records(context_records).set_index('id')
global_context = challenge_prize.join(context_frame).reindex(columns=S.GLOBAL_CONTEXT_COLUMNS)

# MMRE over 10 shuffled 70/30 splits; the scorer returns the negated error,
# so the absolute value of the mean is reported.
mmre_scorer = make_scorer(TML.mean_magnitude_of_relative_error, greater_is_better=False)
shuffle_cv = ShuffleSplit(n_splits=10, test_size=0.3, random_state=42)
fold_scores = cross_val_score(
    construct_global_context_pipeline(),
    global_context,
    challenge_prize.to_numpy().reshape(-1),
    scoring=mmre_scorer,
    cv=shuffle_cv,
)
global_context_score = np.abs(np.mean(fold_scores))
global_context_score

### Use only Tags

In [None]:
# Fetch, per challenge, one `vector` built by concatenating the tag softmax array and the
# tag one-hot array stored for the configured dimension (S.CHALLENGE_TAG_OHE_DIM).
query = [
    {'$project': {
        '_id': False,
        'id': True,
        'vector': {
            '$concatArrays': [
                f'$softmax_dim{S.CHALLENGE_TAG_OHE_DIM}',
                f'$one_hot_dim{S.CHALLENGE_TAG_OHE_DIM}',
            ]
        },
    }}
]

# Align the vectors with the in-scope challenges (left join on the challenge id index),
# then expand each vector into named columns: softmax columns first, one-hot columns after,
# matching the concatenation order in the query above.
tags = pd.DataFrame.from_records(
    challenge_prize
    .join(pd.DataFrame
          .from_records(DB.TopcoderMongo.run_feature_aggregation(query))
          .set_index('id'))
    .pop('vector'),
    index=challenge_prize.index,
    columns=S.TAG_SOFTMAX_COLUMNS + S.TAG_OHE_COLUMNS,
)

# MMRE over 10 shuffled 70/30 splits. The scorer negates the error
# (greater_is_better=False), so np.abs restores a positive value.
tags_score = np.abs(np.mean(cross_val_score(
    construct_tag_feature_pipeline(), tags, challenge_prize.to_numpy().reshape(-1),
    scoring=make_scorer(TML.mean_magnitude_of_relative_error, greater_is_better=False),
    cv=ShuffleSplit(n_splits=10, test_size=0.3, random_state=42),
)))
tags_score

### Use only Doc2Vec

In [None]:
# Description text for the in-scope challenges, kept as a one-column DataFrame
# (`processed_paragraph`) aligned with `challenge_prize`'s index.
description_lookup = (
    pd.DataFrame
    .from_records(DB.TopcoderMongo.get_challenge_description())
    .set_index('id')
)
challenge_description = challenge_prize.join(description_lookup)[['processed_paragraph']]

challenge_description

In [None]:
# Sanity check: displays True when the description rows are in exactly the same order as the
# target rows, which the positional pairing of features and target in the next cell relies on.
(challenge_description.index == challenge_prize.index).all()

In [None]:
# MMRE of the Doc2Vec-only pipeline over 10 shuffled 70/30 splits; the scorer returns
# the negated error, so the absolute value of the mean is reported.
d2v_scorer = make_scorer(TML.mean_magnitude_of_relative_error, greater_is_better=False)
d2v_cv = ShuffleSplit(n_splits=10, test_size=0.3, random_state=42)
d2v_fold_scores = cross_val_score(
    construct_doc2vec_pipeline(),
    challenge_description,
    challenge_prize.to_numpy().reshape(-1),
    scoring=d2v_scorer,
    cv=d2v_cv,
)
d2v_score = np.abs(np.mean(d2v_fold_scores))
d2v_score

## Difference of dimensions for Tags OHE and Doc2Vec

In [None]:
# Load the stored cross-validation results and collapse each "excluded ..." entry
# (an iterable of strings) into a single string; an empty entry becomes ''.
exclusion_df = pd.read_json('./cross_validation_result.json')
for excluded_col in ('excluded_metadata', 'excluded_global_context'):
    exclusion_df[excluded_col] = exclusion_df[excluded_col].apply(''.join)

In [None]:
# Rows where nothing was excluded (both "excluded ..." strings are empty), reduced to the
# two dimension settings and the resulting MMRE, rounded to 3 decimals.
no_context_excluded = ~exclusion_df['excluded_global_context'].astype(bool)
no_metadata_excluded = ~exclusion_df['excluded_metadata'].astype(bool)
dimension_change = (
    exclusion_df
    .loc[no_context_excluded & no_metadata_excluded]
    .reindex(columns=['tag_ohe_dimension', 'doc2vec_dimension', 'mmre'])
    .round(3)
)

# Grid of MMRE values: rows = tag OHE dimension, columns = Doc2Vec dimension.
ohe_by_d2v = dimension_change.pivot(
    index='tag_ohe_dimension',
    columns='doc2vec_dimension',
    values='mmre',
)

In [None]:
# Overall (max, min) MMRE across the whole dimension grid.
ohe_by_d2v.max().max(), ohe_by_d2v.min().min()

In [None]:
# Annotated heatmap of MMRE for every (tag OHE dimension, Doc2Vec dimension) pair.
fig, ax = plt.subplots(figsize=(10, 10), dpi=200)

sns.heatmap(
    # Colour scale is pinned to [0.35, 0.358].
    # NOTE(review): presumably chosen from the observed min/max of the grid — re-check
    # these bounds whenever the results file changes.
    data=ohe_by_d2v, vmin=0.35, vmax=0.358,
    square=True,
    cmap=sns.diverging_palette(125, 25, as_cmap=True),
    cbar=False,
    # Print each cell's MMRE with three decimals instead of showing a colour bar.
    annot=ohe_by_d2v, fmt='.3f',
    ax=ax
)

ax.set_xlabel('Doc2Vec Dimension')
ax.set_ylabel('Challenge Tags OHE Dimension')
# Put the x axis ticks and label on top and hide the tick marks.
ax.xaxis.tick_top()
ax.tick_params(length=0)
ax.xaxis.set_label_position('top')

# Saved with a transparent background at the figure's own dpi.
fig.savefig('../../presentation/presentation8/dimension_change.png', dpi='figure', transparent=True)

In [None]:
# Global-context leave-one-out: every run that excluded a global context feature, plus the
# baseline run (nothing excluded, tag OHE dimension 100, Doc2Vec dimension 100).
# The (excluded_global_context, mmre) table is copied to the system clipboard as plain text;
# this cell displays nothing and needs a clipboard backend to be available.
(exclusion_df
 .loc[
     exclusion_df['excluded_global_context'].astype(bool) |
     (
         (exclusion_df['excluded_metadata'] == '') &
         (exclusion_df['excluded_global_context'] == '') &
         (exclusion_df['tag_ohe_dimension'] == 100) &
         (exclusion_df['doc2vec_dimension'] == 100)
     )
 ]
 .reindex(columns=['excluded_global_context', 'mmre'])
 .reset_index(drop=True)
 .round(3)
 .to_clipboard(excel=False))


## Evaluate model using Timeseries split

For every month of challenges, predict the prize using a model trained on the preceding months' data.
The training windows (in months) are `[1, 3, 6, 9, 12]`.

In [None]:
# One time-window cross-validation run per training span, keyed by the span.
time_result_dct = {
    train_time_span: TML.cross_validation_with_time_window(feature, target, train_time_span)
    for train_time_span in [1, 3, 6, 9, 12]
}

In [None]:
# One column of scores per time window, aligned on the timestamp index.
# Each raw result becomes a two-column frame: column 0 -> 'ts' (index), column 1 -> the window.
ts_result = pd.concat(
    [
        pd.DataFrame(window_result)
        .rename(columns={0: 'ts', 1: window})
        .set_index('ts')
        for window, window_result in time_result_dct.items()
    ],
    axis=1,
).round(3)

In [None]:
# Line chart: one line per training time window, MMRE over time.
fig, ax = plt.subplots(figsize=(16, 6.67), dpi=200)

sns.lineplot(
    data=ts_result,
    lw=2.5,
    ax=ax
)
sns.despine(ax=ax, left=True, bottom=True)
# Fixed y range so the lines and the baseline share a 0-1 scale.
ax.set_ylim(0, 1)
# Horizontal grid lines only.
ax.xaxis.grid(False)
ax.yaxis.grid(True, color='w', alpha=0.5)

ax.set_title('Monthly MMRE with different time window')
ax.set_xlabel('Month')
ax.set_ylabel('MMRE')

# Reference line. NOTE(review): 0.354 is hard-coded; presumably the MMRE of the
# shuffled cross-validation baseline — confirm and update if that result changes.
ax.axhline(0.354, color='red', alpha=0.85)
# Label placed in data coordinates relative to the second x tick; the -85 offset looks
# hand-tuned for this figure size.
ax.text(ax.get_xticks()[1] - 85, 0.36, 'baseline', color='red', alpha=0.85)
fig.savefig('../../presentation/presentation8/monthly_mmre_line.png', dpi='figure', transparent=True)

In [None]:
# Box plot: distribution of MMRE values for each training time window.
fig, ax = plt.subplots(figsize=(8, 8), dpi=200)

# White outlines and outlier markers so the boxes stay visible on the dark background.
flierprops = {'marker': 'o', 'markerfacecolor': 'white', 'markeredgewidth': 0.5, 'markersize': 2.5}
sns.boxplot(
    data=ts_result,
    linewidth=0.8,
    flierprops=flierprops,
    boxprops=dict(edgecolor='white'),
    medianprops=dict(color='white'),
    whiskerprops=dict(color='white'),
    capprops=dict(color='white'),
    width=0.618,
    ax=ax
)
sns.despine(ax=ax, left=True)
ax.yaxis.grid(True, alpha=0.5)
ax.set_ylabel('MMRE')
ax.set_xlabel('Time Window')

# Saved with a transparent background at the figure's own dpi.
fig.savefig('../../presentation/presentation8/monthly_mmre_box.png', dpi='figure', transparent=True)