# Concise Repeatable Notebook

# Content
1. [H2O AutoML](#H2O-AutoML)
1. [Stacking](#Stacking)
1. [Blending](#Blending)

In [None]:
!pip install --upgrade scikit-learn

import numpy as np
import pandas as pd
import os
from IPython.core.display import display, HTML
from plotly import tools
import plotly.offline as py
py.init_notebook_mode(connected=True)
import plotly.graph_objs as go
from plotly.offline import init_notebook_mode, iplot
import matplotlib
import matplotlib.pyplot as plt
from IPython.display import Markdown
import scipy.stats as ss
import itertools
import seaborn as sns
import category_encoders as ce
from sklearn.model_selection import StratifiedKFold
from sklearn import metrics, preprocessing
from sklearn.metrics import auc,plot_roc_curve
from catboost import CatBoostClassifier
from sklearn.experimental import enable_hist_gradient_boosting
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import StackingClassifier
from xgboost import XGBClassifier

import warnings
warnings.filterwarnings("ignore")

import h2o
from h2o.automl import H2OAutoML
h2o.init()

## NOTE

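Our best submission is a blend of two models: H2O AutoML, and a stack of CatBoost and HistGradientBoostingClassifier. Each model is built in its own section below, and their predictions are combined in the final Blending section.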

# H2O AutoML

## Preprocessing

In [None]:
# Raw challenge data for the H2O AutoML pipeline.
train = pd.read_csv('../tdt05-2021-challenge-2/challenge2_train.csv')
test = pd.read_csv('../tdt05-2021-challenge-2/challenge2_test.csv')

# Feature groups, named after how each column is treated during preprocessing.
bin_features = ['f0', 'f4', 'f6', 'f25', 'f26']
nom_features = ['f8', 'f9', 'f12', 'f14', 'f15', 'f22', 'f23']  # hexes
cyc_features = ['f16', 'f21']
duplicate = ['f20']

ord_features = ['f1', 'f2', 'f3', 'f5', 'f7', 'f10', 'f13', 'f18', 'f19', 'f27']
num_ords = ['f3', 'f5', 'f7', 'f19', 'f27']
alpha_ords = ['f1', 'f2', 'f10', 'f18']

numeric = ['f11', 'f17', 'f24', 'f28']
bell_curve = ['f11', 'f28']
long_tail = ['f17', 'f24']  # F24 has -1 as null values. Remove before scaling

# Every feature name, with the cyclical columns given by their sin/cos encodings.
all_feat = [
    *bin_features,
    *nom_features,
    *ord_features,
    'f16_sin', 'f16_cos', 'f21_sin', 'f21_cos',
    *numeric,
]

# Columns treated as categorical (this list is rebuilt once preprocessing is done).
categorical = [
    *bin_features,
    *nom_features,
    *ord_features,
    'f1_0', 'f1_1',
    'f16_sin', 'f16_cos', 'f21_sin', 'f21_cos',
]

# Split f1 into its first and second characters (one new column each);
# non-string entries (e.g. NaN) are carried over unchanged.
for frame in (train, test):
    f1_raw = frame['f1']
    frame['f1_0'] = f1_raw.apply(lambda code: code[0] if type(code) is str else code)
    frame['f1_1'] = f1_raw.apply(lambda code: code[1] if type(code) is str else code)
    frame.drop(['f1'], axis=1, inplace=True)

# Keep the ordinal feature list in sync with the new columns.
ord_features.remove('f1')
ord_features += ['f1_0', 'f1_1']

# Treat -1 in f5 as a missing value.
# The result is assigned back to the column: `train[['f5']]` builds a new
# DataFrame, so an in-place replace on it would leave `train` untouched.
train['f5'] = train['f5'].replace(-1.0, np.nan)
test['f5'] = test['f5'].replace(-1.0, np.nan)

# Convert those alphas which are very indicative into numbers:
# each letter becomes its offset from the first letter, divided by a fixed span
# (f13: offset from 'a' over 14, f10: offset from 'A' over 25). Missing values stay missing.
for col, first_letter, span in (('f13', 'a', 14), ('f10', 'A', 25)):
    offset = ord(first_letter)
    for frame in (train, test):
        frame[col] = frame[col].apply(
            lambda ch, offset=offset, span=span: ch if pd.isnull(ch) else (ord(ch) - offset) / span
        )
    # The column is numeric from here on, so it no longer counts as ordinal/categorical.
    ord_features.remove(col)


## Numeric features
# f11 / f28: standardise using statistics fitted on the training set only.
scl = preprocessing.StandardScaler()
train[['f11', 'f28']] = scl.fit_transform(train[['f11', 'f28']])
test[['f11', 'f28']] = scl.transform(test[['f11', 'f28']])

# f24 uses -1 as its null marker: turn it into NaN *before* taking the log.
# The result is assigned back because `train[['f24']]` is a new DataFrame and
# an in-place replace on it would not reach `train`.
for frame in (train, test):
    frame['f24'] = frame['f24'].replace(-1.0, np.nan)
    # Log-transform f17 and f24 before they are standardised below.
    frame[['f17', 'f24']] = np.log(frame[['f17', 'f24']])

scl = preprocessing.StandardScaler()
train[['f17', 'f24']] = scl.fit_transform(train[['f17', 'f24']])
test[['f17', 'f24']] = scl.transform(test[['f17', 'f24']])

# Cyclical features: map each value onto the unit circle as a (sin, cos) pair,
# using a period of 12 for f16 and 7 for f21, then drop the raw column.
for frame in (train, test):
    for col, period in (('f16', 12), ('f21', 7)):
        angle = (frame[col] - 1) * (2. * np.pi / period)
        # Missing values are filled with 0 in both coordinates. The filled Series
        # is assigned directly instead of calling `frame[name].fillna(..., inplace=True)`,
        # which is chained assignment and is not guaranteed to update the frame.
        frame[col + '_sin'] = np.sin(angle).fillna(0)
        frame[col + '_cos'] = np.cos(angle).fillna(0)
        frame.drop([col], axis=1, inplace=True)

# Multiply f19 by 10 and truncate to an integer (missing values stay missing);
# presumably so the column is integer-valued when converted to a factor below.
for frame in (train, test):
    frame['f19'] = frame['f19'].apply(lambda v: v if pd.isnull(v) else int(v * 10))

# Keep handles on the label and the test ids before the id column is dropped.
target = train['target']
test_id = test['id']

# 'target' deliberately stays in `train`: it is used as the response column
# of the H2O training frame built below.
for frame in (train, test):
    frame.drop(['id', 'f20'], axis=1, inplace=True)

categorical = bin_features + nom_features + ord_features
h_train = h2o.H2OFrame(train)
h_test = h2o.H2OFrame(test)

# Mark the categorical columns as factors in both H2O frames.
for col in categorical:
    h_train[col] = h_train[col].asfactor()
    h_test[col] = h_test[col].asfactor()

# The response is converted to a factor as well.
h_train['target'] = h_train['target'].asfactor()

# Response name and predictor names (every column except the response).
y = "target"
x = [name for name in h_train.columns if name != y]

## Model

In [None]:
# Run AutoML with at most 20 models and a fixed seed.
automl_settings = dict(max_models=20, seed=42)
aml = H2OAutoML(**automl_settings)
aml.train(x=x, y=y, training_frame=h_train)

# Show the top of the leaderboard.
lb = aml.leaderboard
lb.head()

## Predict & save

In [None]:
# Take the 'p1' column of the AutoML predictions on the test frame.
preds = h2o.as_list(aml.predict(h_test))['p1']

path='../tdt05-2021-challenge-2/'
submission=pd.read_csv(path+'sample_submission.csv')
submission['target'] = preds

# Create the output directory first so the save also works on a fresh checkout.
os.makedirs('results', exist_ok=True)
submission.to_csv('results/automl.csv', index=False)
submission.head()

# Stacking

## Preprocessing

In [None]:
# Reload the raw data: the stacking pipeline does its own preprocessing.
path = '../tdt05-2021-challenge-2/'
train = pd.read_csv(path + 'challenge2_train.csv')
test = pd.read_csv(path + 'challenge2_test.csv')
submission = pd.read_csv(path + 'sample_submission.csv')

# Feature groups, named after how each column is treated during preprocessing.
bin_features = ['f0', 'f4', 'f6', 'f25', 'f26']
nom_features = ['f8', 'f9', 'f12', 'f14', 'f15', 'f22', 'f23']  # hexes
cyc_features = ['f16', 'f21']
duplicate = ['f20']

ord_features = ['f1', 'f2', 'f3', 'f5', 'f7', 'f10', 'f13', 'f18', 'f19', 'f27']
num_ords = ['f3', 'f5', 'f7', 'f19', 'f27']
alpha_ords = ['f1', 'f2', 'f10', 'f18']

numeric = ['f11', 'f17', 'f24', 'f28']
bell_curve = ['f11', 'f28']
long_tail = ['f17', 'f24']  # F24 has -1 as null values. Remove before scaling

# Every feature name, with the cyclical columns given by their sin/cos encodings.
all_feat = [
    *bin_features,
    *nom_features,
    *ord_features,
    'f16_sin', 'f16_cos', 'f21_sin', 'f21_cos',
    *numeric,
]

# CREDITS : https://www.kaggle.com/caesarlupum/2020-20-lines-target-encoding

def encoding(train, test, smooth):
    """Preprocess both frames and target-encode the categorical features.

    Both ``train`` and ``test`` are modified in place: the id / f20 / target
    columns are dropped, f1 is split into ``f1_0`` / ``f1_1``, the numeric
    columns are scaled and f16 / f21 are replaced by sin / cos columns.

    The training rows are encoded out-of-fold (5 stratified folds), i.e. each
    row is transformed by an encoder fitted on the other folds. The test rows
    are transformed by an encoder fitted on the full training set.

    Reads the module-level lists ``bin_features``, ``nom_features`` and
    ``ord_features`` without modifying them.

    Parameters
    ----------
    train : pandas.DataFrame
        Raw training data, including the 'target', 'id' and 'f20' columns.
    test : pandas.DataFrame
        Raw test data, including the 'id' and 'f20' columns.
    smooth : float
        Value passed as ``smoothing`` to ``ce.TargetEncoder``.

    Returns
    -------
    tuple
        ``(train_encoded, test_encoded, test_id, feature_names, target)`` where
        ``feature_names`` is the list of columns of ``train_encoded``.
    """
    print('Target encoding...')
    train.sort_index(inplace=True)
    target = train['target']
    test_id = test['id']
    train.drop(['target', 'id', 'f20'], axis=1, inplace=True)
    test.drop(['id', 'f20'], axis=1, inplace=True)

    # Ordinal columns that are target-encoded: f1 is replaced by its two halves,
    # f10 / f13 become plain numbers below. Built as a local list so the
    # module-level `ord_features` is left alone and the function can be re-run.
    not_encoded_as_ordinal = {'f1', 'f10', 'f13', 'f1_0', 'f1_1'}
    ordinal = [c for c in ord_features if c not in not_encoded_as_ordinal]
    ordinal += ['f1_0', 'f1_1']

    for frame in (train, test):
        # Split f1 into its first and second characters; non-strings pass through.
        frame['f1_0'] = frame['f1'].apply(lambda x: x[0] if type(x) is str else x)
        frame['f1_1'] = frame['f1'].apply(lambda x: x[1] if type(x) is str else x)
        frame.drop(['f1'], axis=1, inplace=True)

        # Treat -1 in f5 as missing. Assigned back because `frame[['f5']]` is a
        # new DataFrame and an in-place replace on it would not reach `frame`.
        frame['f5'] = frame['f5'].replace(-1.0, np.nan)

        # Convert the alphas which are VERY indicative to their letter offset.
        frame['f13'] = frame['f13'].apply(lambda x: (ord(x) - ord('a')) if pd.notnull(x) else x)
        frame['f10'] = frame['f10'].apply(lambda x: (ord(x) - ord('A')) if pd.notnull(x) else x)

    # f11 / f28: standardise using statistics fitted on train only.
    scl = preprocessing.StandardScaler()
    train[['f11', 'f28']] = scl.fit_transform(train[['f11', 'f28']])
    test[['f11', 'f28']] = scl.transform(test[['f11', 'f28']])

    # f17 / f24: null out the -1 marker in f24, log-transform, then standardise.
    for frame in (train, test):
        frame['f24'] = frame['f24'].replace(-1.0, np.nan)
        frame[['f17', 'f24']] = np.log(frame[['f17', 'f24']])

    scl = preprocessing.StandardScaler()
    train[['f17', 'f24']] = scl.fit_transform(train[['f17', 'f24']])
    test[['f17', 'f24']] = scl.transform(test[['f17', 'f24']])

    # Push cyclical features onto a unit-circle (period 12 for f16, 7 for f21).
    # Missing values are filled with 0 in both coordinates.
    for frame in (train, test):
        for col, period in (('f16', 12), ('f21', 7)):
            angle = (frame[col] - 1) * (2. * np.pi / period)
            frame[col + '_sin'] = np.sin(angle).fillna(0)
            frame[col + '_cos'] = np.cos(angle).fillna(0)
            frame.drop([col], axis=1, inplace=True)

    cyc_encoded = ['f16_sin', 'f16_cos', 'f21_sin', 'f21_cos']
    cat_feat = bin_features + nom_features + ordinal + cyc_encoded

    # Out-of-fold encoding of the training rows. The per-fold pieces are
    # collected and concatenated once instead of growing a frame in the loop.
    folds = StratifiedKFold(n_splits=5, random_state=42, shuffle=True)
    oof_parts = []
    for tr_idx, oof_idx in folds.split(train, target):
        fold_encoder = ce.TargetEncoder(cols=cat_feat, smoothing=smooth)
        fold_encoder.fit(train.iloc[tr_idx, :], target.iloc[tr_idx])
        oof_parts.append(fold_encoder.transform(train.iloc[oof_idx, :]))
    # Restore the original row order.
    train_encoded = pd.concat(oof_parts).sort_index()

    # The test set is encoded with statistics from the full training set.
    full_encoder = ce.TargetEncoder(cols=cat_feat, smoothing=smooth)
    full_encoder.fit(train, target)
    test_encoded = full_encoder.transform(test)
    print('Done!')
    return train_encoded, test_encoded, test_id, list(train_encoded), target

# f1 is replaced by its two halves during encoding; mirror that in the feature list.
all_feat = [name for name in all_feat if name != 'f1'] + ['f1_0', 'f1_1']

train_encode, test_encode, test_id, features, target = encoding(train, test, 0.3)

# Attach the label to the encoded features. Column labels are kept by the
# concat itself (the Series is named 'target'), so they do not have to be
# restored afterwards from another frame.
train_encode = pd.concat([train_encode, target], axis=1)

X, y = train_encode[all_feat], train_encode['target']

## Model

In [None]:
def make_kitty():
    """Return a new, unfitted CatBoostClassifier with the fixed settings below."""
    catboost_settings = dict(
        loss_function='CrossEntropy',
        eval_metric="AUC",
        task_type="CPU",
        learning_rate=0.015,
        n_estimators=2000,
        early_stopping_rounds=100,
        random_seed=42,
        silent=True,
    )
    return CatBoostClassifier(**catboost_settings)

## Back To Model

In [None]:
scoring = "roc_auc"

HistGBM_param = {
    'l2_regularization': 0.0,
    'loss': 'auto',
    'max_bins': 255,
    'max_depth': 15,
    'max_leaf_nodes': 31,
    'min_samples_leaf': 20,
    'n_iter_no_change': 50,
    'scoring': scoring,
    'tol': 1e-07,
    'validation_fraction': 0.15,
    'verbose': 0,
    'warm_start': False   
}


# Only used by the commented-out xgboost estimator below.
xgBoost_params = {
    'verbose': 0,
    'scoring':  scoring,
    'eval_metric': ['auc'],
    'objective': 'binary:logistic',
    'learning_rate': 0.15, 
    'max_depth': 2, 
    'subsample': 0.7,
    'min_child_weight': 500, 
    'colsample_bytree': 0.2, 
    'reg_lambda': 3.5, 
    'reg_alpha': 1.5,
    'num_parallel_tree': 5,
    'n_estimators': 200,
    'early_stopping_rounds': 100
}


folds = StratifiedKFold(n_splits=7, shuffle=True, random_state=1)

estimators = [
        ('histgbm', HistGradientBoostingClassifier(**HistGBM_param)),
        ('catboost', make_kitty())
        # ('xgboost', XGBClassifier(**xgBoost_params))
    ]

# Give the test set exactly the columns, in exactly the order, the models are
# fitted on. `test_encode` keeps the raw column order, which differs from X.
X_test = test_encode[X.columns]

# Column 0 holds the stack's predictions, columns 1.. hold one base estimator each.
n_outputs = 1 + len(estimators)
fold_preds = np.zeros([X_test.shape[0], n_outputs])
oof_preds = np.zeros([X.shape[0], n_outputs])
results = {}

# Fit Folds
for i, (trn_idx, val_idx) in enumerate(folds.split(X, y)):
    print(f"Fold {i} stacking....")
    # The splitter yields positional indices, hence .iloc rather than .loc.
    X_trn, y_trn = X.iloc[trn_idx], y.iloc[trn_idx]
    X_val, y_val = X.iloc[val_idx], y.iloc[val_idx]

    clf = StackingClassifier(
            estimators=estimators,
            final_estimator=LogisticRegression(),
            )
    clf.fit(X_trn, y_trn)
    stack_val_pred = clf.predict_proba(X_val)[:, 1]

    oof_preds[val_idx, 0] = stack_val_pred
    # Test predictions are averaged over the folds.
    fold_preds[:, 0] += clf.predict_proba(X_test)[:, 1] / folds.n_splits

    estimator_performance = {}
    estimator_performance['stack_score'] = metrics.roc_auc_score(y_val, stack_val_pred)

    # Score each fitted base estimator on its own for comparison with the stack.
    for ii, (est_name, _) in enumerate(estimators):
        model = clf.named_estimators_[est_name]
        pred = model.predict_proba(X_val)[:, 1]
        oof_preds[val_idx, ii + 1] = pred
        fold_preds[:, ii + 1] += model.predict_proba(X_test)[:, 1] / folds.n_splits
        estimator_performance[est_name + "_score"] = metrics.roc_auc_score(y_val, pred)

    # Coefficients the final logistic regression assigns to each base estimator.
    stack_coefficients = {
        est_name + "_coefficient": coef
        for (est_name, _), coef in zip(estimators, clf.final_estimator_.coef_[0])
    }
    stack_coefficients['intercept'] = clf.final_estimator_.intercept_[0]

    results[f"Fold {i + 1}"] = [
            estimator_performance,
            stack_coefficients
        ]
print('Done!')

In [None]:
# Show the entry stored under the key 'Fold 3' of the per-fold results dict.
results['Fold 3']

In [None]:
# Save the stacked model's fold-averaged test predictions (column 0 of fold_preds).
# This file is an input of the Blending section, so the cell has to run to
# completion when the notebook is executed top to bottom.
os.makedirs('results', exist_ok=True)
submission['target'] = fold_preds[:, 0]
submission.to_csv('results/all_feats_13_and_11_asnum_hyp_cat_cyclicals_nans_fixed_7_folds.csv', index=False)
submission.head()

# Blending

In [None]:
# Start from the sample submission and load the two saved prediction files.
submission = pd.read_csv(path + 'sample_submission.csv')
own = pd.read_csv('results/all_feats_13_and_11_asnum_hyp_cat_cyclicals_nans_fixed_7_folds.csv')
aml = pd.read_csv('results/automl.csv')

# Weighted average of the two prediction columns. The weights are presumably
# each model's score (TODO confirm); dividing by their sum normalises them.
weighted_sum = aml.target*0.76772 + own.target*.76911
submission['target'] = weighted_sum/(.76772+.76911)

submission.to_csv('results/aml_own_equeal_weighted_blend.csv', index=None)
submission.head()