In [1]:
import numpy as np
import pandas as pd
from math import sqrt
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor, LGBMClassifier
from xgboost import plot_importance
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import FunctionTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import VotingRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import ParameterGrid
from sklearn.model_selection import cross_val_score
from sklearn.feature_selection import VarianceThreshold
from optuna.integration import OptunaSearchCV
from optuna.distributions import *
import operator
from functools import reduce

In [2]:
import sys
sys.path.append('../../../src')
from pipeline_utils import LGBMClassifierEarlyStopping, auc

In [3]:
data_dir = '../data'
RANDOM_STATE = 2021

In [4]:
df = pd.read_csv(f"{data_dir}/interim/train_for_David.csv", index_col='id')
# df = pd.read_parquet(f"{data_dir}/interim/train.parq", engine='pyarrow').convert_dtypes()
display(df.shape)
df.head(2)

(300000, 31)

Unnamed: 0_level_0,cat0,cat1,cat2,cat3,cat4,cat5,cat6,cat7,cat8,cat9,...,cont2,cont3,cont4,cont5,cont6,cont7,cont8,cont9,cont10,target
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,1,9,1,2,2,113,1,19,17,1,...,0.883814,1.282606,0.450056,0.332458,0.570031,1.106408,1.581648,2.364865,0.908369,0
1,1,9,1,1,5,113,11,23,56,6,...,-0.356673,0.039954,-0.137271,-0.464434,0.623672,-0.098985,-0.074612,1.923983,0.249945,0


In [5]:
numeric_features = [col for col in df.columns if col.startswith('cont')]
categorical_features = [col for col in df.columns if col.startswith('cat')]

In [None]:
def feature_engineering(df):
    numeric_features = [col for col in df.columns if col.startswith('cont')]
    categorical_features = [col for col in df.columns if col.startswith('cat')]
    df[numeric_features] = df[numeric_features].astype('float')
    df[categorical_features] = df[categorical_features].apply(lambda i: [sum(map(ord, x)) for x in i], axis='rows').astype('int')
    df['gt_0.1'] = df[numeric_features].apply(lambda x: len([i for i in x if i > 0.1]), axis=1)
    df['gt_0.5'] = df[numeric_features].apply(lambda x: len([i for i in x if i > 0.5]), axis=1)
    df['mul_gt_o.1'] = 0
    df.loc[(df['gt_0.1'] >=1), 'mul_gt_o.1'] = df[numeric_features].apply(lambda x: reduce(operator.mul, x), axis=1)
    return df

In [6]:
df['target'] = df['target'].astype('int')
# df[categorical_features] = df[categorical_features].apply(lambda x: x.cat.codes).astype('int').astype('category')

In [7]:
X = df.copy()
y = X.pop('target')

In [None]:
X = feature_engineering(X)

In [8]:
X_train, X_valid, y_train, y_valid = train_test_split(
    X, y, train_size=0.8, test_size=0.2, random_state=RANDOM_STATE,
)

Numeric Pipeline

In [None]:
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('log', FunctionTransformer(np.log1p)),
    ('scaler', StandardScaler()),
])

Categorical Pipeline

In [None]:
class MultiColumnLabelEncoder:
    def __init__(self):
        pass

    def fit(self,X,y=None):
        return self

    def transform(self, X):
        output = []
        for x in X:
            output.append(LabelEncoder().fit_transform(x))
        return np.array(output)

    def fit_transform(self,X,y=None):
        return self.fit(X, y).transform(X)


In [None]:
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(handle_unknown='ignore', sparse=False)),
#     ('encoder', MultiColumnLabelEncoder())
])

Preprocess Pipeline   
- merge cateogrical & numeric into one pipeline 

In [None]:
preprocessor = ColumnTransformer(
    transformers=[
          ('num', numeric_transformer, X.columns),
#         ('cat', categorical_transformer, categorical_features),
#         ('num', numeric_transformer, numeric_features),
    ]
)

Pipeline   
- merge preprocess & model into one pipeline

In [9]:
pipeline = Pipeline([
#     ('preprocessor', preprocessor),
    ('model', LGBMClassifier(
#         categorical_feature=list(range(len(categorical_features))),
#         early_stopping_rounds=300,
#         test_size=0.2,
#         eval_metric='auc',
#         objective='binary',
        random_state=RANDOM_STATE,
    )),
])

Parameters for `OptunaSearchCV`

In [10]:
parameters = {
#     "model__is_unbalance": CategoricalDistribution([True, False]),
    "model__objective": CategoricalDistribution(["binary"]),
    "model__metric": CategoricalDistribution(["auc"]),
    "model__learning_rate": LogUniformDistribution(1e-3, 1.0),
    'model__n_estimators': CategoricalDistribution(range(2000, 5001, 500)),
    'model__reg_alpha': LogUniformDistribution(1e-3, 10.0),
    'model__reg_lambda': LogUniformDistribution(1e-3, 10.0),
    'model__colsample_bytree': CategoricalDistribution(np.arange(0.1, 1.01, 0.1)),
    'model__subsample': CategoricalDistribution(np.arange(0.1, 1.01, 0.1)),
    'model__subsample_freq': IntUniformDistribution(1, 10),
    'model__max_depth': IntUniformDistribution(1, 32),
    'model__num_leaves' :  IntUniformDistribution(2, 256),
    'model__min_child_samples': IntUniformDistribution(1, 256),
    'model__cat_smooth' : IntUniformDistribution(1, 128),
    'model__max_bin' : IntUniformDistribution(512, 2048),
    'model__cat_l2': IntUniformDistribution(1, 32),
}

In [11]:
# parameters = {'model__is_unbalance': [False], 'model__objective': ['binary'], 'model__metric': ['auc'], 'model__learning_rate': [0.08], 'model__n_estimators': [4000], 'model__reg_alpha': [6.25], 'model__reg_lambda': [0.025], 'model__colsample_bytree': [0.2], 'model__subsample': [0.8], 'model__subsample_freq': [10], 'model__max_depth': [16], 'model__num_leaves': [128], 'model__min_child_samples': [100], 'model__cat_smooth': [88], 'model__max_bin': [666], 'model__cat_l2': [20]}

In [12]:
grid_search = OptunaSearchCV(
    pipeline,
    param_distributions=parameters,
    cv=5,
    random_state=RANDOM_STATE,
    n_jobs=-1,
    scoring=auc,
)

  import sys


In [13]:
grid_search.fit(X_train, y_train, model__categorical_feature=list(range(len(categorical_features))))

[32m[I 2021-03-11 22:21:36,979][0m A new study created in memory with name: no-name-7dfbbf57-b113-4292-a545-7362e9a7804e[0m
[32m[I 2021-03-11 22:58:27,968][0m Trial 1 finished with value: 0.8931240338097627 and parameters: {'model__objective': 'binary', 'model__metric': 'auc', 'model__learning_rate': 0.0034807501074447154, 'model__n_estimators': 2000, 'model__reg_alpha': 0.11451044872325485, 'model__reg_lambda': 0.005873518790253586, 'model__colsample_bytree': 0.8, 'model__subsample': 0.30000000000000004, 'model__subsample_freq': 10, 'model__max_depth': 12, 'model__num_leaves': 106, 'model__min_child_samples': 242, 'model__cat_smooth': 54, 'model__max_bin': 525, 'model__cat_l2': 3}. Best is trial 1 with value: 0.8931240338097627.[0m
[32m[I 2021-03-11 23:06:10,679][0m Trial 0 finished with value: 0.8911587508293739 and parameters: {'model__objective': 'binary', 'model__metric': 'auc', 'model__learning_rate': 0.0025587654298097093, 'model__n_estimators': 2000, 'model__reg_alpha':

OptunaSearchCV(estimator=Pipeline(steps=[('model',
                                          LGBMClassifier(random_state=2021))]),
               n_jobs=-1,
               param_distributions={'model__cat_l2': IntUniformDistribution(high=32, low=1, step=1),
                                    'model__cat_smooth': IntUniformDistribution(high=128, low=1, step=1),
                                    'model__colsample_bytree': CategoricalDistribution(choices=(0.1, 0.2, 0.30000000000000004, 0.4, 0.5, 0.6, 0.7000...
                                    'model__reg_alpha': LogUniformDistribution(high=10.0, low=0.001),
                                    'model__reg_lambda': LogUniformDistribution(high=10.0, low=0.001),
                                    'model__subsample': CategoricalDistribution(choices=(0.1, 0.2, 0.30000000000000004, 0.4, 0.5, 0.6, 0.7000000000000001, 0.8, 0.9, 1.0)),
                                    'model__subsample_freq': IntUniformDistribution(high=10, low=1, step=1)

In [14]:
# preds = grid_search.best_estimator_.predict(X_valid)
preds = grid_search.best_estimator_.predict_proba(X_valid)[:, 1]
roc_auc_score(y_valid, preds)

0.8956510394086716

In [15]:
# onehot + variance_drop 0.8915464060319749
# onehot 0.8970355121797735
# codes 0.8826400343415913
# with label encoder + categorical_feature 0.8843085347397801
# with label codes + categorical_feature 0.8895766476594142

In [16]:
abs(grid_search.best_score_)

0.8954349581329943

In [17]:
grid_search.best_params_

{'model__objective': 'binary',
 'model__metric': 'auc',
 'model__learning_rate': 0.009542742688564705,
 'model__n_estimators': 5000,
 'model__reg_alpha': 0.03710762531461036,
 'model__reg_lambda': 0.0526062108001846,
 'model__colsample_bytree': 0.2,
 'model__subsample': 0.8,
 'model__subsample_freq': 8,
 'model__max_depth': 8,
 'model__num_leaves': 223,
 'model__min_child_samples': 249,
 'model__cat_smooth': 73,
 'model__max_bin': 1008,
 'model__cat_l2': 12}

### Submission

In [18]:
X_test = pd.read_csv(f"{data_dir}/interim/test_for_David.csv", index_col='id')
# X_test =pd.read_parquet(f"{data_dir}/interim/test.parq", engine='pyarrow').convert_dtypes()
display(X_test.shape)
X_test.head(2)

(200000, 30)

Unnamed: 0_level_0,cat0,cat1,cat2,cat3,cat4,cat5,cat6,cat7,cat8,cat9,...,cont1,cont2,cont3,cont4,cont5,cont6,cont7,cont8,cont9,cont10
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
5,1,6,1,1,6,113,1,60,76,1,...,0.947223,0.26511,1.034602,-1.278325,-0.512837,0.250766,0.690269,0.619427,-0.174331,1.589866
6,1,8,3,1,5,54,9,6,14,1,...,-0.715314,1.919785,0.025815,0.15478,1.471985,-0.853928,0.688298,-0.897307,0.582567,-0.500923


In [None]:
X_test = feature_engineering(X_test)
# X_test[categorical_features] = X_test[categorical_features].apply(lambda x: x.cat.codes).astype('int').astype('category')

In [19]:
preds_test = grid_search.best_estimator_.predict_proba(X_test)[:, 1]

In [20]:
output = pd.DataFrame(
    {'Id': X_test.index, 'target': preds_test})
output.to_csv(f"{data_dir}/processed/submission.csv", index=False)