In [None]:
%matplotlib inline
import itertools
from pprint import pprint

import pymongo
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib.dates as mdates
import matplotlib.ticker as mticker
import matplotlib.colors as mcolors

from sklearn.model_selection import (
    train_test_split,
    ShuffleSplit,
    StratifiedShuffleSplit,
    GroupShuffleSplit,
    cross_val_predict,
    cross_val_score
)
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.ensemble import GradientBoostingRegressor

import topcoder_mongo as DB
import topcoder_ml as TML
import static_var as S
import util as U

# Dark plotting theme: near-black canvas with white text, ticks and labels.
_background = '#121212'
_foreground = 'white'

_dark_rc = {
    rc_key: _foreground
    for rc_key in (
        'text.color',
        'axes.titlecolor',
        'axes.labelcolor',
        'xtick.color',
        'ytick.color',
    )
}
_dark_rc['axes.facecolor'] = _background
_dark_rc['figure.facecolor'] = _background
_dark_rc['figure.autolayout'] = True

sns.set(rc=_dark_rc)

# Let pandas render up to 500 rows before truncating a displayed frame.
pd.options.display.max_rows = 500

A practical problem that I can never figure out:

**When should I standardize my dataset, and should I standardize all features?**

## Retrieve training data

In [None]:
# Load the feature/target frames from the project helper and keep plain
# NumPy views of both alongside the DataFrames.
feature, target = TML.get_training_data()
X = feature.to_numpy()
y = target.to_numpy()

In [None]:
# Display the target frame returned by TML.get_training_data().
target

Let's visualize the distribution of `top2_prize`. I plot the frequency of the prizes in \$50 intervals.

In [None]:
# Smallest and largest observed `top2_prize`, shown as a (min, max) tuple.
target['top2_prize'].min(), target['top2_prize'].max()

In [None]:
# Histogram of `top2_prize` with one bar per $50 step of the 300-2700 axis range.
# NOTE(review): `bins` is a bin *count*, so seaborn spreads the edges over the
# observed data range; bars are exactly $50 wide only if the data span
# 300-2700 -- confirm against the min/max cell above.
prize_lo, prize_hi, bin_width = 300, 2700, 50
bins = int((prize_hi - prize_lo) / bin_width)
fig, ax = plt.subplots(figsize=(16, 6.67), dpi=200)

sns.histplot(x=target.top2_prize, bins=bins, lw=0.5, ax=ax)
sns.despine(ax=ax, left=True)
ax.set_xlim(prize_lo, prize_hi)
ax.xaxis.grid(False)
ax.yaxis.grid(True, color='white', alpha=0.5)
ax.set_title('Top2 Prize Distribution')
ax.set_xlabel('Top2 Prize')
ax.xaxis.set_major_locator(mticker.MultipleLocator(100))

# Write each bar's count just above it. The loop variables are deliberately
# not called `x`/`y`, so the notebook-level `y` (target array) is not overwritten.
for bar in ax.patches:
    bar_count = bar.get_height()
    bar_center = bar.get_x() + bar.get_width() / 2
    ax.annotate(
        int(bar_count),
        xy=(bar_center, bar_count),
        xytext=(bar_center, bar_count + 5),
        color='white', alpha=0.85, ha='center',
    )

I decided to run a Mongo query to get the challenge ids for each bin, because achieving that with `pandas` would take more tweaking and twisting than just running a (still relatively complicated) query.

In [None]:
# Fetch the per-challenge prize-range data through the project helper.
# NOTE(review): presumably one row per challenge with a string `prize_range`
# column such as '[2450.0, 2500.0)' (the next cell relies on that) -- confirm against TML.
challenge_prize_with_tag = TML.get_challenge_prize_range()

In [None]:
def _range_lower_bound(prize_range):
    """Return the numeric lower bound of each '[lo, hi)' label in a string Series."""
    return (prize_range
            .str.strip('[]()')
            .str.split(', ')
            .str[0]
            .astype(float))


def _sorted_by_range(frame):
    """Return a copy of `frame` with a `sort_key` column, ordered by that key."""
    return (frame
            .assign(sort_key=_range_lower_bound(frame['prize_range']))
            .sort_values('sort_key')
            .reset_index(drop=True))


def _count_per_range(frame):
    """Return one row per `prize_range` with its row `count`, ordered by lower bound."""
    return _sorted_by_range(
        frame.groupby('prize_range').size().reset_index(name='count')
    )


# Number of challenges in every prize range of the full dataset.
full_data_count = _count_per_range(challenge_prize_with_tag)

# Stratified 15% sample: the same fraction is drawn from every prize range,
# with a fixed seed so the sample is reproducible.
test_data = _sorted_by_range(
    challenge_prize_with_tag
    .groupby('prize_range')
    .sample(frac=0.15, random_state=42)
    .reset_index(drop=True)
)
test_data_count = _count_per_range(test_data)

# Ranges missing from the test sample come out of the outer merge as NaN -> 0.
merged_count = (full_data_count
                .merge(test_data_count,
                       how='outer', on='prize_range', suffixes=('_full', '_test'))
                .fillna(0))

# The [2450, 2500) range has no challenges, so it is added as an explicit
# zero-height bin to keep the bar chart's x axis contiguous. `DataFrame.append`
# was removed in pandas 2.0, hence `pd.concat`; the guard avoids a duplicated
# category should that range ever contain data.
empty_range = '[2450.0, 2500.0)'
if not merged_count['prize_range'].eq(empty_range).any():
    filler_row = pd.DataFrame([{
        'prize_range': empty_range,
        'count_full': 0,
        'sort_key_full': 2450.0,
        'count_test': 0.0,
        'sort_key_test': 2450.0,
    }])
    merged_count = pd.concat([merged_count, filler_row], ignore_index=True)

plotting_data = (merged_count
                 .sort_values('sort_key_full')
                 .reindex(columns=['prize_range', 'count_full', 'count_test'])
                 .astype({'count_test': int}))


In [None]:
# Overlay the per-range counts of the test sample (orange) on those of the
# full dataset (blue).
fig, ax = plt.subplots(figsize=(16, 6.67), dpi=200)

sns.barplot(
    data=plotting_data, x='prize_range', y='count_full',
    color='#0096c7', alpha=0.85, lw=0.5,
    ax=ax,
)
sns.barplot(
    data=plotting_data, x='prize_range', y='count_test',
    color='#f48c06', lw=0,
    ax=ax
)
sns.despine(ax=ax, left=True)
ax.yaxis.grid(True, color='white', alpha=0.5)

# The second barplot call would otherwise leave 'count_test' as the y label.
ax.set_title('Top2 Prize Distribution: Full Dataset vs. Test Sample')
ax.set_xlabel('Top2 Prize')
ax.set_ylabel('Number of challenges')

# Put a tick on every other bar boundary and label it with the prize value.
ax.set_xticks([i - 0.5 for i in np.linspace(0, 48, 25)])
ax.set_xticklabels([int(i) for i in np.linspace(300, 2700, 25)], ha='center')

target_width = 1
n_ranges = len(plotting_data)
# Loop variables are deliberately not called `x`/`y`, so the notebook-level
# `y` (target array) is not overwritten.
for idx, bar in enumerate(ax.patches):
    # Widen every bar to a full category slot while keeping it centred.
    orig_width = bar.get_width()
    bar.set_width(target_width)
    bar.set_x(bar.get_x() - (target_width - orig_width) / 2)

    bar_height = bar.get_height()
    bar_center = bar.get_x() + bar.get_width() / 2
    # NOTE(review): assumes patches come in draw order -- full-dataset bars
    # first, then test-sample bars -- TODO confirm for the seaborn version in use.
    is_test_bar = idx >= n_ranges
    # Short full-dataset bars get a larger offset than the other labels.
    label_offset = 5 if is_test_bar else (20 if bar_height < 100 else 5)
    ax.annotate(
        int(bar_height),
        xy=(bar_center, bar_height),
        xytext=(bar_center, bar_height + label_offset),
        ha='center', color='white', alpha=0.85,
    )
    


## 10-Fold Cross Validation Predict

### Cross Validation Strategy

The "Independent and Identically Distributed" assumption that 

> _all samples stem from the same generative process and that the generative process is assumed to have no memory of past generated samples_

may not hold for the Topcoder dataset. So the following cross validation strategy will be used to split the training and testing sets.

1. Split the dataset by `top2_prize` as if it were a classification problem, i.e. make sure the different prizes are represented in the validation set.
2. Split by `project_id` (assuming that challenges are dependent within each project)
3. Split by `sub_track` (assuming that challenges are dependent within each sub-track)

In [None]:
def _rows_with_ids(frame, ids):
    """Return the rows of `frame` whose index label appears in `ids`."""
    return frame.loc[frame.index.isin(ids)]


# Reload the data and the train/test challenge ids from the project helpers,
# then slice both frames by index membership.
feature, target = TML.get_training_data()
train_ids, test_ids = TML.get_train_test_index()

feature_train = _rows_with_ids(feature, train_ids)
feature_test = _rows_with_ids(feature, test_ids)
target_train = _rows_with_ids(target, train_ids)
target_test = _rows_with_ids(target, test_ids)

The feature columns are composed as follows:
1. $d_0, d_1, d_2 \sim d_5$: Numeric features `duration`, `num_of_competing_challenges`, `softmax_c1` to `softmax_c4`;
2. $d_6, d_7$: Categorical features `project_id` and `sub_track`;
3. $d_8 \sim d_{107}$: One-hot encoded tags and tag combinations;
4. $d_{108} \sim d_{207}$: Document vector representing the challenge description text.

In [None]:
# Display the feature frame returned by TML.get_training_data().
feature

### Cross Validation with Training Dataset

In [None]:
# Out-of-fold predictions for the training split (10 folds).
target_pred = cross_val_predict(
    estimator=TML.construct_training_pipeline(),
    X=feature_train,
    y=target_train.to_numpy().ravel(),
    cv=10,
)

# Put actual and predicted prizes side by side, then derive the error columns.
# NOTE(review): 'mae' holds the *signed* error (actual - predicted), not an
# absolute value -- confirm the column name is intended.
cv_result = (
    pd.concat(
        [target_train.reset_index(), pd.Series(target_pred, name='pred')],
        axis=1,
    )
    .assign(mae=lambda df: df['top2_prize'] - df['pred'])
    .assign(mre=lambda df: df['mae'].abs() / df['top2_prize'])
)

In [None]:
# MMRE: mean of the per-challenge relative errors |actual - pred| / actual
# over the out-of-fold predictions of the training split.
cv_result['mre'].mean()

In [None]:
# Summary statistics of the two error columns of the cross-validation result.
cv_result.reindex(columns=['mae', 'mre']).describe()

### Test the model using testing set

In [None]:
# Fit a fresh pipeline on the whole training split and predict the held-out test split.
est = TML.construct_training_pipeline()
est.fit(feature_train, target_train.to_numpy().ravel())
pred = est.predict(feature_test)

# Put actual and predicted prizes side by side, then derive the error columns.
# NOTE(review): 'mae' holds the *signed* error (actual - predicted), not an
# absolute value -- confirm the column name is intended.
test_result = (
    pd.concat(
        [target_test.reset_index(), pd.Series(pred, name='pred')],
        axis=1,
    )
    .assign(mae=lambda df: df['top2_prize'] - df['pred'])
    .assign(mre=lambda df: df['mae'].abs() / df['top2_prize'])
)

In [None]:
# MMRE: mean of the per-challenge relative errors |actual - pred| / actual
# over the held-out test split.
test_result['mre'].mean()

In [None]:
# Summary statistics of the two error columns of the test-set result.
test_result.reindex(columns=['mae', 'mre']).describe()

In [None]:
# Feature importances of the fitted 'gbr' pipeline step, with positional row
# labels replaced by the feature column names.
# NOTE(review): assumes the 'gbr' step receives the columns in the same order
# as `feature.columns` -- confirm against the pipeline's preprocessing steps.
impt = (
    pd.DataFrame(est['gbr'].feature_importances_, columns=['importance'])
    .rename(index=dict(enumerate(feature.columns.tolist())))
)

# Show the 25 most important features.
impt.sort_values('importance', ascending=False).head(25)

### Use `cross_val_predict` to get the predictions for the whole dataset

In [None]:
# Out-of-fold predictions (10 folds) for every challenge in the dataset.
est_for_all = TML.construct_training_pipeline()
pred = cross_val_predict(
    estimator=est_for_all,
    X=feature,
    y=target.to_numpy().ravel(),
    cv=10,
)

# Put actual and predicted prizes side by side, then derive the error columns.
# NOTE(review): 'mae' holds the *signed* error (actual - predicted), not an
# absolute value -- confirm the column name is intended.
all_data_result = (
    pd.concat(
        [target.reset_index(), pd.Series(pred, name='pred')],
        axis=1,
    )
    .assign(mae=lambda df: df['top2_prize'] - df['pred'])
    .assign(mre=lambda df: df['mae'].abs() / df['top2_prize'])
)

In [None]:
# MMRE: mean of the per-challenge relative errors |actual - pred| / actual
# over the out-of-fold predictions of the whole dataset.
all_data_result['mre'].mean()

In [None]:
# Summary statistics of the two error columns of the whole-dataset result.
all_data_result.reindex(columns=['mae', 'mre']).describe()