In [39]:
import numpy as np 
import pandas as pd

from lightgbm import LGBMClassifier,cv, Dataset
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
import string
from category_encoders import TargetEncoder


In [40]:
df_train = pd.read_csv("train.csv",index_col=0)
df_test = pd.read_csv("test.csv",index_col=0)

In [41]:
ord_1_map  = dict(Novice=0,Contributor=1,Expert=2,Master=3,Grandmaster=4)
ord_2_map = dict(Freezing=0,Cold=1,Warm=2,Hot=3)
ord_2_map.update({'Boiling Hot':4,'Lava Hot':5})
ord_3_lookup = {j:i for (i,j) in zip(range(len(string.ascii_lowercase)),string.ascii_lowercase)}
ord_4_lookup = {j:i for (i,j) in zip(range(len(string.ascii_lowercase)),string.ascii_uppercase)}

df_train.ord_1 = df_train.ord_1.map(ord_1_map)
df_train.ord_2 = df_train.ord_2.map(ord_2_map)
df_train.ord_3 = df_train.ord_3.map(ord_3_lookup)
df_train.ord_4 = df_train.ord_4.map(ord_4_lookup)

df_test.ord_1 = df_test.ord_1.map(ord_1_map)
df_test.ord_2 = df_test.ord_2.map(ord_2_map)
df_test.ord_3 = df_test.ord_3.map(ord_3_lookup)
df_test.ord_4 = df_test.ord_4.map(ord_4_lookup)
ords = ['ord_1','ord_2','ord_3','ord_4']
df_train[ords] = df_train[ords].astype("O")
df_test[ords] = df_test[ords].astype("O")

In [52]:
len(df_train.columns)

24

In [51]:
len(df_test.columns)

23

In [54]:
X = df_train.drop(["target"],axis=1)
y = df_train.target
for col in X.select_dtypes("O"):
    X[col] = X[col].astype('category')

nom_cols = [f'nom_{i}' for i in range(4,10)]   
enc = TargetEncoder(cols=nom_cols).fit(X, y)
df_train = enc.transform(X)
df_test = enc.transform(df_test)

X_train, X_test, y_train, y_test = train_test_split(X,y,train_size=0.7)

In [55]:
params = {'learning_rate': 0.05,
          'metric': 'auc',
          'n_jobs': -1,
          'n_estimators' : 1000,
          'feature_fraction_seed': 0,
          'bagging_seed': 0,
          'boosting_type': 'gbdt',
          'verbose': 1,
          'is_unbalance': True}
estimated_params = {'colsample_bytree': 0.6377497065362651, 'min_child_samples': 323, 'min_child_weight': 0.1, 'num_leaves': 10, 'reg_alpha': 100, 'reg_lambda': 0.1, 'subsample': 0.9071147321063562}
model = LGBMClassifier(**params,**estimated_params)

In [56]:
%%time
model.fit(X_train,y_train,eval_metric='AUC',eval_set = [(X_test, y_test)],
                  verbose = 1000,
                  early_stopping_rounds = 1000)

Training until validation scores don't improve for 1000 rounds
[1000]	valid_0's auc: 0.793794
Did not meet early stopping. Best iteration is:
[600]	valid_0's auc: 0.794302
CPU times: user 3min 28s, sys: 431 ms, total: 3min 29s
Wall time: 29.9 s


LGBMClassifier(bagging_seed=0, boosting_type='gbdt', class_weight=None,
               colsample_bytree=0.6377497065362651, feature_fraction_seed=0,
               importance_type='split', is_unbalance=True, learning_rate=0.05,
               max_depth=-1, metric='auc', min_child_samples=323,
               min_child_weight=0.1, min_split_gain=0.0, n_estimators=1000,
               n_jobs=-1, num_leaves=10, objective=None, random_state=None,
               reg_alpha=100, reg_lambda=0.1, silent=True,
               subsample=0.9071147321063562, subsample_for_bin=200000,
               subsample_freq=0, verbose=1)

In [None]:
# X = Dataset(X_train,y_train)
# X_eval = Dataset(X_test, y_test)

In [None]:
# cv(params,train_set=X,metrics='AUC',early_stopping_rounds = 1000)

In [57]:
preds = model.predict_proba(X_test)
roc_auc_score(y_test, preds[:, 1])

0.7943018217067366

# Define Random Grid Search

In [61]:
# https://www.kaggle.com/mlisovyi/lightgbm-hyperparameter-optimisation-lb-0-761

In [51]:
# from scipy.stats import randint as sp_randint
# from scipy.stats import uniform as sp_uniform
# param_test ={'num_leaves': sp_randint(6, 50), 
#              'min_child_samples': sp_randint(100, 500), 
#              'min_child_weight': [1e-5, 1e-3, 1e-2, 1e-1, 1, 1e1, 1e2, 1e3, 1e4],
#              'subsample': sp_uniform(loc=0.2, scale=0.8), 
#              'colsample_bytree': sp_uniform(loc=0.4, scale=0.6),
#              'reg_alpha': [0, 1e-1, 1, 2, 5, 7, 10, 50, 100],
#              'reg_lambda': [0, 1e-1, 1, 5, 10, 20, 50, 100]}

In [56]:
# fit_params={"early_stopping_rounds":30, 
#             "eval_metric" : 'auc', 
#             "eval_set" : [(X_test,y_test)],
#             'eval_names': ['valid'],
#             #'callbacks': [lgb.reset_parameter(learning_rate=learning_rate_010_decay_power_099)],
#             'verbose': 100,
#             'categorical_feature': 'auto',
#            'is_unbalance':True}

In [57]:
# n_HP_points_to_test = 100

# import lightgbm as lgb
# from sklearn.model_selection import RandomizedSearchCV, GridSearchCV

# #n_estimators is set to a "large value". The actual number of trees build will depend on early stopping and 5000 define only the absolute maximum
# clf = lgb.LGBMClassifier(max_depth=-1, random_state=0, silent=True, metric='None', n_jobs=4, n_estimators=5000)
# gs = RandomizedSearchCV(
#     estimator=clf, param_distributions=param_test, 
#     n_iter=n_HP_points_to_test,
#     scoring='roc_auc',
#     cv=3,
#     refit=True,
#     random_state=314,
#     verbose=True)

In [59]:
# %%time
# gs.fit(X_train,y_train,**fit_params)

Fitting 3 folds for each of 100 candidates, totalling 300 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


Training until validation scores don't improve for 30 rounds
[100]	valid's auc: 0.767017
Early stopping, best iteration is:
[88]	valid's auc: 0.767147
Training until validation scores don't improve for 30 rounds
[100]	valid's auc: 0.76791
Early stopping, best iteration is:
[101]	valid's auc: 0.767951
Training until validation scores don't improve for 30 rounds
[100]	valid's auc: 0.767614
Early stopping, best iteration is:
[105]	valid's auc: 0.767696
Training until validation scores don't improve for 30 rounds
[100]	valid's auc: 0.765044
Early stopping, best iteration is:
[83]	valid's auc: 0.765439
Training until validation scores don't improve for 30 rounds
[100]	valid's auc: 0.767362
Early stopping, best iteration is:
[105]	valid's auc: 0.767514
Training until validation scores don't improve for 30 rounds
[100]	valid's auc: 0.766355
Early stopping, best iteration is:
[91]	valid's auc: 0.766469
Training until validation scores don't improve for 30 rounds
[100]	valid's auc: 0.726104
[20

[100]	valid's auc: 0.770642
[200]	valid's auc: 0.772759
Early stopping, best iteration is:
[212]	valid's auc: 0.772848
Training until validation scores don't improve for 30 rounds
[100]	valid's auc: 0.767487
Early stopping, best iteration is:
[99]	valid's auc: 0.767546
Training until validation scores don't improve for 30 rounds
[100]	valid's auc: 0.768808
Early stopping, best iteration is:
[99]	valid's auc: 0.768823
Training until validation scores don't improve for 30 rounds
[100]	valid's auc: 0.767404
Early stopping, best iteration is:
[96]	valid's auc: 0.767428
Training until validation scores don't improve for 30 rounds
[100]	valid's auc: 0.771201
[200]	valid's auc: 0.773243
Early stopping, best iteration is:
[171]	valid's auc: 0.773397
Training until validation scores don't improve for 30 rounds
[100]	valid's auc: 0.771659
[200]	valid's auc: 0.77385
Early stopping, best iteration is:
[171]	valid's auc: 0.774034
Training until validation scores don't improve for 30 rounds
[100]	va

[100]	valid's auc: 0.772058
Early stopping, best iteration is:
[150]	valid's auc: 0.773579
Training until validation scores don't improve for 30 rounds
[100]	valid's auc: 0.772636
[200]	valid's auc: 0.77444
Early stopping, best iteration is:
[178]	valid's auc: 0.774717
Training until validation scores don't improve for 30 rounds
[100]	valid's auc: 0.772231
Early stopping, best iteration is:
[162]	valid's auc: 0.773913
Training until validation scores don't improve for 30 rounds
[100]	valid's auc: 0.763386
Early stopping, best iteration is:
[96]	valid's auc: 0.763811
Training until validation scores don't improve for 30 rounds
[100]	valid's auc: 0.764264
Early stopping, best iteration is:
[98]	valid's auc: 0.764502
Training until validation scores don't improve for 30 rounds
[100]	valid's auc: 0.76358
Early stopping, best iteration is:
[98]	valid's auc: 0.763732
Training until validation scores don't improve for 30 rounds
[100]	valid's auc: 0.764072
Early stopping, best iteration is:
[8

Training until validation scores don't improve for 30 rounds
[100]	valid's auc: 0.772085
[200]	valid's auc: 0.775465
Early stopping, best iteration is:
[228]	valid's auc: 0.77561
Training until validation scores don't improve for 30 rounds
Early stopping, best iteration is:
[66]	valid's auc: 0.758485
Training until validation scores don't improve for 30 rounds
Early stopping, best iteration is:
[68]	valid's auc: 0.760071
Training until validation scores don't improve for 30 rounds
[100]	valid's auc: 0.758107
Early stopping, best iteration is:
[76]	valid's auc: 0.759034
Training until validation scores don't improve for 30 rounds
[100]	valid's auc: 0.768852
Early stopping, best iteration is:
[129]	valid's auc: 0.769787
Training until validation scores don't improve for 30 rounds
[100]	valid's auc: 0.769868
Early stopping, best iteration is:
[129]	valid's auc: 0.770615
Training until validation scores don't improve for 30 rounds
[100]	valid's auc: 0.769268
Early stopping, best iteration 

Training until validation scores don't improve for 30 rounds
[100]	valid's auc: 0.768938
Early stopping, best iteration is:
[111]	valid's auc: 0.769138
Training until validation scores don't improve for 30 rounds
[100]	valid's auc: 0.770841
Early stopping, best iteration is:
[169]	valid's auc: 0.773437
Training until validation scores don't improve for 30 rounds
[100]	valid's auc: 0.770912
[200]	valid's auc: 0.773837
Early stopping, best iteration is:
[232]	valid's auc: 0.774233
Training until validation scores don't improve for 30 rounds
[100]	valid's auc: 0.770801
[200]	valid's auc: 0.773861
Early stopping, best iteration is:
[230]	valid's auc: 0.77392
Training until validation scores don't improve for 30 rounds
[100]	valid's auc: 0.767593
Early stopping, best iteration is:
[104]	valid's auc: 0.767737
Training until validation scores don't improve for 30 rounds
[100]	valid's auc: 0.767679
Early stopping, best iteration is:
[100]	valid's auc: 0.767679
Training until validation scores 

[100]	valid's auc: 0.727053
[200]	valid's auc: 0.732677
[300]	valid's auc: 0.734212
[400]	valid's auc: 0.734887
[500]	valid's auc: 0.7352
[600]	valid's auc: 0.735377
[700]	valid's auc: 0.735493
[800]	valid's auc: 0.73556
Early stopping, best iteration is:
[782]	valid's auc: 0.735571
Training until validation scores don't improve for 30 rounds
[100]	valid's auc: 0.769713
Early stopping, best iteration is:
[147]	valid's auc: 0.770202
Training until validation scores don't improve for 30 rounds
[100]	valid's auc: 0.770006
Early stopping, best iteration is:
[132]	valid's auc: 0.770604
Training until validation scores don't improve for 30 rounds
[100]	valid's auc: 0.769945
Early stopping, best iteration is:
[143]	valid's auc: 0.770362
Training until validation scores don't improve for 30 rounds
[100]	valid's auc: 0.770714
Early stopping, best iteration is:
[111]	valid's auc: 0.771082
Training until validation scores don't improve for 30 rounds
[100]	valid's auc: 0.771923
Early stopping, bes

Early stopping, best iteration is:
[97]	valid's auc: 0.767432
Training until validation scores don't improve for 30 rounds
[100]	valid's auc: 0.766128
Early stopping, best iteration is:
[96]	valid's auc: 0.766282
Training until validation scores don't improve for 30 rounds
[100]	valid's auc: 0.767979
Early stopping, best iteration is:
[98]	valid's auc: 0.768074
Training until validation scores don't improve for 30 rounds
[100]	valid's auc: 0.768596
Early stopping, best iteration is:
[125]	valid's auc: 0.769057
Training until validation scores don't improve for 30 rounds
[100]	valid's auc: 0.768602
Early stopping, best iteration is:
[106]	valid's auc: 0.768663
Training until validation scores don't improve for 30 rounds
[100]	valid's auc: 0.770307
Early stopping, best iteration is:
[131]	valid's auc: 0.770805
Training until validation scores don't improve for 30 rounds
[100]	valid's auc: 0.77007
Early stopping, best iteration is:
[131]	valid's auc: 0.77123
Training until validation scor

[Parallel(n_jobs=1)]: Done 300 out of 300 | elapsed: 43.4min finished


Training until validation scores don't improve for 30 rounds
[100]	valid's auc: 0.77272
[200]	valid's auc: 0.778829
[300]	valid's auc: 0.780034
Early stopping, best iteration is:
[316]	valid's auc: 0.780164
CPU times: user 2h 40min 24s, sys: 36.1 s, total: 2h 41min
Wall time: 43min 38s


RandomizedSearchCV(cv=3, error_score=nan,
                   estimator=LGBMClassifier(boosting_type='gbdt',
                                            class_weight=None,
                                            colsample_bytree=1.0,
                                            importance_type='split',
                                            learning_rate=0.1, max_depth=-1,
                                            metric='None', min_child_samples=20,
                                            min_child_weight=0.001,
                                            min_split_gain=0.0,
                                            n_estimators=5000, n_jobs=4,
                                            num_leaves=31, objective=None,
                                            random_state=0, reg_alpha=0.0,
                                            reg_lamb...
                                                             10000.0],
                                        'num_leaves': 

In [66]:
# print('Best score reached: {} with params: {} '.format(gs.best_score_, gs.best_params_))
# with open("lightgbm.txt","w") as f:
#     f.write(f"{gs.best_params_}")

Best score reached: 0.7798876583558224 with params: {'colsample_bytree': 0.6377497065362651, 'min_child_samples': 323, 'min_child_weight': 0.1, 'num_leaves': 10, 'reg_alpha': 100, 'reg_lambda': 0.1, 'subsample': 0.9071147321063562} 


In [58]:
for col in df_test.select_dtypes("O"):
    df_test[col] = df_test[col].astype("category")
sub = pd.read_csv("sample_submission.csv")
preds= model.predict_proba(df_test)
sub["target"] = preds[:, 1]
sub.head()

Unnamed: 0,id,target
0,600000,0.771439
1,600001,0.529288
2,600002,0.448376
3,600003,0.326631
4,600004,0.516396


In [44]:
sub.to_csv("preds/light.csv",index=False)