In [13]:
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import MissingIndicator, SimpleImputer, IterativeImputer, KNNImputer

# from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import LabelEncoder
from bayes_opt import BayesianOptimization
import lightgbm as lgb

import pandas as pd
import numpy as np
import re
import warnings
warnings.simplefilter('ignore')

In [14]:
# Run-wide settings.
TARGET = 'target'                # name of the label column

# Bayesian search budget: random warm-up evaluations, then guided evaluations.
init_round, opt_round = 15, 15

n_folds = 5                      # number of cross-validation folds
random_state = 0                 # shared seed

# Not referenced by the cells visible in this notebook section.
n_estimators = 10000
learning_rate = 0.01

In [15]:
# Read the labelled and unlabelled CSVs from the working directory.
train, test = map(pd.read_csv, ('train.csv', 'test.csv'))

In [16]:
# Sentinel label: marks test rows so they can be told apart once train and
# test share one frame.
test["target"] = -1

In [17]:
# Stack train on top of test under a fresh 0..n-1 index so both get the same preprocessing.
data = pd.concat([train, test], ignore_index=True)

In [18]:
# Preview the first rows of the combined train + test frame.
data.head()

Unnamed: 0,id,bin_0,bin_1,bin_2,bin_3,bin_4,nom_0,nom_1,nom_2,nom_3,...,nom_9,ord_0,ord_1,ord_2,ord_3,ord_4,ord_5,day,month,target
0,0,0.0,0.0,0.0,F,N,Red,Trapezoid,Hamster,Russia,...,02e7c8990,3.0,Contributor,Hot,c,U,Pw,6.0,3.0,0
1,1,1.0,1.0,0.0,F,Y,Red,Star,Axolotl,,...,f37df64af,3.0,Grandmaster,Warm,e,X,pE,7.0,7.0,0
2,2,0.0,1.0,0.0,F,N,Red,,Hamster,Canada,...,,3.0,,Freezing,n,P,eN,5.0,9.0,0
3,3,,0.0,0.0,F,N,Red,Circle,Hamster,Finland,...,f9d456e57,1.0,Novice,Lava Hot,a,C,,3.0,3.0,0
4,4,0.0,,0.0,T,N,Red,Triangle,Hamster,Costa Rica,...,c5361037c,3.0,Grandmaster,Cold,h,C,OZ,5.0,12.0,0


In [19]:
# Row-level feature: how many cells of the row are missing at this point.
data['null'] = data.isnull().sum(axis='columns')

In [20]:
# Feature list: every column of the raw training frame except the row id and the label.
sparse_features = train.columns.drop(['id', 'target'], errors='ignore').tolist()

In [21]:
# Values that occur in only one of train/test are replaced with NaN in the
# combined frame, and each affected column is reported.
for col in sparse_features:
    seen_in_train = set(train[col].dropna().unique())
    seen_in_test = set(test[col].dropna().unique())

    one_sided = seen_in_train ^ seen_in_test
    if not one_sided:
        continue

    print(f'{len(one_sided)} values in {col}, {one_sided} Replaced with nan')
    data.loc[data[col].isin(one_sided), col] = np.nan

1 values in nom_5, {'b3ad70fcb'} Replaced with nan
4 values in nom_6, {'ee6983c6d', 'a885aacec', '3a121fefb', 'f0732a795'} Replaced with nan
2 values in nom_9, {'3d19cd31d', '1065f10dd'} Replaced with nan


In [22]:
# 0/1 matrix with one column per entry of sparse_features (1 = value missing).
# features='all' is needed for that guarantee: the default 'missing-only'
# omits columns for features that contain no NaN, so the array would no
# longer line up with sparse_features.
missing_indicators = (
    MissingIndicator(features='all', sparse=False)
    .fit_transform(data[sparse_features])
    .astype(np.int8)
)

In [23]:
# Add one '<feature>_ind' column per feature, set to 1 where the feature is missing.
missing_indicator_cols = [feat + '_ind' for feat in sparse_features]

# features='all' keeps exactly one output column per input feature. The default
# 'missing-only' omits features without NaN, which would make the array
# narrower than missing_indicator_cols and break or misalign the assignment.
indicator_values = (
    MissingIndicator(features='all', sparse=False)
    .fit_transform(data[sparse_features])
    .astype(np.int8)
)

# Column-by-column assignment creates the new columns directly, so no
# placeholder initialisation is needed beforehand.
for position, col in enumerate(missing_indicator_cols):
    data[col] = indicator_values[:, position]

In [24]:
# Integer-encode every feature column. Missing values become the string '-1'
# and all values are encoded via their string form.
for feat in sparse_features:
    lbe = LabelEncoder()
    text_values = data[feat].fillna('-1').astype(str).values
    data[feat] = lbe.fit(text_values).transform(text_values)

In [25]:
# Undo the concat: rows whose target is the -1 sentinel go to test, the rest to train.
train = data.loc[data.target.ne(-1)].reset_index(drop=True)
test = data.loc[data.target.eq(-1)].reset_index(drop=True)

In [26]:
# Separate the label from the model inputs.
y = train.target
# `axis` is passed by keyword: the positional form drop(labels, 1) was
# deprecated in pandas 1.3 and raises TypeError from pandas 2.0 onwards.
X_train = train.drop(['id', 'target'], axis=1)

In [27]:
# LightGBM dataset over all labelled rows; every input column is declared categorical.
train_data = lgb.Dataset(
    X_train,
    label=y,
    categorical_feature=list(X_train.columns),
    free_raw_data=False,
)

In [28]:
def lgb_eval(num_leaves, learning_rate, feature_fraction, bagging_fraction, max_depth, lambda_l1, lambda_l2, min_split_gain, min_child_weight):
    """Score one hyper-parameter draw by cross-validated AUC.

    num_leaves and max_depth are truncated to integers with int(), the two
    fractions are clamped to [0, 1], and lambda_l1 / lambda_l2 are floored
    at 0. The remaining arguments are passed to LightGBM unchanged.

    Reads the module-level train_data, n_folds and random_state.

    Returns:
        The largest value of the 'auc-mean' series reported by lgb.cv.
    """
    def clamp_unit(value):
        # Restrict a fraction to the closed interval [0, 1].
        return max(min(value, 1), 0)

    params = {
        'objective': 'binary',
        'num_iterations': 15000,
        'early_stopping_round': 100,
        'metric': 'auc',
        # int() truncates toward zero, e.g. 8.99 -> 8.
        'num_leaves': int(num_leaves),
        'learning_rate': learning_rate,
        'feature_fraction': clamp_unit(feature_fraction),
        'bagging_fraction': clamp_unit(bagging_fraction),
        'max_depth': int(max_depth),
        'lambda_l1': max(lambda_l1, 0),
        'lambda_l2': max(lambda_l2, 0),
        'min_split_gain': min_split_gain,
        'min_child_weight': min_child_weight,
    }
    cv_result = lgb.cv(
        params,
        train_data,
        nfold=n_folds,
        seed=random_state,
        stratified=True,
        verbose_eval=200,
    )
    return max(cv_result['auc-mean'])

In [29]:
# Optimiser over lgb_eval: one (low, high) search interval per argument.
lgbBO = BayesianOptimization(
    f=lgb_eval,
    pbounds={
        'num_leaves': (24, 300),
        'learning_rate': (0.01, 0.05),
        'feature_fraction': (0.1, 0.9),
        'bagging_fraction': (0.8, 1),
        'max_depth': (5, 8.99),
        'lambda_l1': (0, 5),
        'lambda_l2': (0, 3),
        'min_split_gain': (0.001, 0.1),
        'min_child_weight': (5, 50),
    },
    random_state=random_state,
)

In [30]:
# maximize() runs the search for its side effects and returns None, so
# assigning its result would leave opt_params empty. The best point found is
# exposed afterwards as lgbBO.max, a dict with 'target' and 'params' entries.
lgbBO.maximize(init_points=init_round, n_iter=opt_round)
opt_params = lgbBO.max['params']

|   iter    |  target   | baggin... | featur... | lambda_l1 | lambda_l2 | learni... | max_depth | min_ch... | min_sp... | num_le... |
-------------------------------------------------------------------------------------------------------------------------------------
[200]	cv_agg's auc: 0.773421 + 0.00225517
[400]	cv_agg's auc: 0.776971 + 0.00225604
| [0m 1       [0m | [0m 0.7771  [0m | [0m 0.9098  [0m | [0m 0.6722  [0m | [0m 3.014   [0m | [0m 1.635   [0m | [0m 0.02695 [0m | [0m 7.577   [0m | [0m 24.69   [0m | [0m 0.08929 [0m | [0m 290.0   [0m |
[200]	cv_agg's auc: 0.776771 + 0.00187082
[400]	cv_agg's auc: 0.778337 + 0.00193757
| [95m 2       [0m | [95m 0.7786  [0m | [95m 0.8767  [0m | [95m 0.7334  [0m | [95m 2.644   [0m | [95m 1.704   [0m | [95m 0.04702 [0m | [95m 5.283   [0m | [95m 8.921   [0m | [95m 0.003002[0m | [95m 253.8   [0m |
[200]	cv_agg's auc: 0.773623 + 0.00219276
[400]	cv_agg's auc: 0.775382 + 0.00220995
| [0m 3       [0m | [0

In [35]:
#|   iter    |  target   | baggin... | featur... | lambda_l1 | lambda_l2 | learni... | max_depth | min_ch... | min_sp... | num_le... |
#|  29       |  0.7855   |  0.8264   |  0.102    |  4.948    |  0.06873  |  0.02407  |  5.608    |  47.94    |  0.08549  |  24.23    |

In [36]:
# Hold-out split of the labelled rows.
from sklearn.model_selection import train_test_split

# 80% of the rows for fitting, 20% for validation
train_df, val_df = train_test_split(train, test_size=0.2, random_state=random_state)

# Labels
train_y, val_y = train_df.target, val_df.target

# Inputs: everything except the row id and the label
train_x = train_df.drop(columns=['id', 'target'])
val_x = val_df.drop(columns=['id', 'target'])

# Cast every input column to the pandas 'category' dtype, separately for each frame.
for frame in (train_x, val_x):
    for f in frame.columns.tolist():
        frame[f] = frame[f].astype('category')

In [37]:
# LightGBM datasets for the hold-out fit; both declare every column of train_x
# as categorical. Note that this rebinds the module-level name train_data.
train_data = lgb.Dataset(
    train_x,
    label=train_y,
    categorical_feature=list(train_x.columns),
    free_raw_data=False,
)
val_data = lgb.Dataset(
    val_x,
    label=val_y,
    categorical_feature=list(train_x.columns),
    free_raw_data=False,
)

In [38]:
# Settings for the final fit, entered by hand. Fractional values for the
# integer-valued settings are truncated with int().
params = dict(
    objective='binary',
    num_iterations=15000,
    early_stopping_round=100,
    metric='auc',
    num_leaves=int(24.23),
    learning_rate=0.02407,
    feature_fraction=0.102,
    bagging_fraction=0.8264,
    max_depth=int(5.608),
    lambda_l1=4.948,
    lambda_l2=0.06873,
    min_split_gain=0.08549,
    min_child_weight=47.9,
    seed=random_state,
)

In [39]:
# Fit on the training split, reporting the validation metric every 200 rounds
# and stopping once it has not improved for 100 rounds.
# NOTE(review): params also carries 'num_iterations' and 'early_stopping_round';
# which of the two sources LightGBM honours should be confirmed for the
# installed version.
lgbm = lgb.train(
    params=params,
    train_set=train_data,
    num_boost_round=40000,
    valid_sets=[val_data],
    early_stopping_rounds=100,
    verbose_eval=200,
)

Training until validation scores don't improve for 100 rounds.
[200]	valid_0's auc: 0.775504
[400]	valid_0's auc: 0.781278
[600]	valid_0's auc: 0.782625
[800]	valid_0's auc: 0.783519
[1000]	valid_0's auc: 0.783979
[1200]	valid_0's auc: 0.78445
[1400]	valid_0's auc: 0.784699
[1600]	valid_0's auc: 0.784865
[1800]	valid_0's auc: 0.784954
[2000]	valid_0's auc: 0.785075
[2200]	valid_0's auc: 0.785207
Early stopping, best iteration is:
[2263]	valid_0's auc: 0.785276
