In [3]:
import numpy as np
import pandas as pd

import os

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV, StratifiedKFold
from sklearn.metrics import roc_auc_score, accuracy_score

from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier


import lightgbm as lgb

In [4]:
DATA_PATH = 'data'

In [5]:
train = pd.read_csv(os.path.join(DATA_PATH, 'train.csv'))
test = pd.read_csv(os.path.join(DATA_PATH, 'test.csv'))
sub = pd.read_csv(os.path.join(DATA_PATH, 'sample_submission.csv'))

In [4]:
train.head()

Unnamed: 0,ID_code,target,var_0,var_1,var_2,var_3,var_4,var_5,var_6,var_7,...,var_190,var_191,var_192,var_193,var_194,var_195,var_196,var_197,var_198,var_199
0,train_0,0,8.9255,-6.7863,11.9081,5.093,11.4607,-9.2834,5.1187,18.6266,...,4.4354,3.9642,3.1364,1.691,18.5227,-2.3978,7.8784,8.5635,12.7803,-1.0914
1,train_1,0,11.5006,-4.1473,13.8588,5.389,12.3622,7.0433,5.6208,16.5338,...,7.6421,7.7214,2.5837,10.9516,15.4305,2.0339,8.1267,8.7889,18.356,1.9518
2,train_2,0,8.6093,-2.7457,12.0805,7.8928,10.5825,-9.0837,6.9427,14.6155,...,2.9057,9.7905,1.6704,1.6858,21.6042,3.1417,-6.5213,8.2675,14.7222,0.3965
3,train_3,0,11.0604,-2.1518,8.9522,7.1957,12.5846,-1.8361,5.8428,14.925,...,4.4666,4.7433,0.7178,1.4214,23.0347,-1.2706,-2.9275,10.2922,17.9697,-8.9996
4,train_4,0,9.8369,-1.4834,12.8746,6.6375,12.2772,2.4486,5.9405,19.2514,...,-1.4905,9.5214,-0.1508,9.1942,13.2876,-1.5121,3.9267,9.5031,17.9974,-8.8104


In [5]:
X = train.drop(['ID_code', 'target'], axis=1)
y = train['target']

x_train, x_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

In [9]:
xgb1 = XGBClassifier(
                    learning_rate=0.1,
                    max_depth=5,
                    min_child_weight=5,
                    gamma=0,
                    reg_alpha=1e-5,
                    subsample=0.8,
                    colsample_bytree=0.8,
                    objective= 'binary:logistic',
                    nthread=4,
                    scale_pos_weight=1,
                    seed=42)

xgb1.fit(x_train, y_train)
predictions = xgb1.predict(x_valid)

acc = accuracy_score(y_valid, predictions)
auc = roc_auc_score(y_valid, predictions)

print(f'Accuracy: {acc}')
print(f'ROC AUC: {auc}')

Accuracy: 0.9042
ROC AUC: 0.5279128260863554


In [13]:
param = {
    'bagging_freq': 5,
    'bagging_fraction': 0.4,
    'boost_from_average':'false',
    'boost': 'gbdt',
    'feature_fraction': 0.05,
    'learning_rate': 0.01,
    'max_depth': -1,  
    'metric':'auc',
    'min_data_in_leaf': 80,
    'min_sum_hessian_in_leaf': 10.0,
    'num_leaves': 13,
    'num_threads': 8,
    'tree_learner': 'serial',
    'objective': 'binary', 
    'verbosity': 1
}

features = train.columns[2:202]

In [14]:
target = train['target']
train = train.drop(['ID_code', 'target'], axis=1)

KeyError: 'target'

In [11]:
test = test.drop(['ID_code'], axis=1)

In [15]:
folds = StratifiedKFold(n_splits=10, shuffle=False, random_state=42)
oof = np.zeros(len(train))
predictions = np.zeros(len(test))
feature_importance_df = pd.DataFrame()

for fold_, (trn_idx, val_idx) in enumerate(folds.split(train.values, target.values)):
    print("Fold {}".format(fold_))
    
    trn_data = lgb.Dataset(train.iloc[trn_idx][features], label=target.iloc[trn_idx])
    val_data = lgb.Dataset(train.iloc[val_idx][features], label=target.iloc[val_idx])

    num_round = 1000000
    
    clf = lgb.train(param,
                    trn_data,
                    num_round,
                    valid_sets = [trn_data, val_data],
                    verbose_eval=1000,
                    early_stopping_rounds = 3000)
    
    oof[val_idx] = clf.predict(train.iloc[val_idx][features], num_iteration=clf.best_iteration)
    
    predictions += clf.predict(test[features], num_iteration=clf.best_iteration) / folds.n_splits

print("CV score: {:<8.5f}".format(roc_auc_score(target, oof)))



Fold 0
[LightGBM] [Info] Number of positive: 18089, number of negative: 161911
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 50490
[LightGBM] [Info] Number of data points in the train set: 180000, number of used features: 198
Training until validation scores don't improve for 3000 rounds
[1000]	training's auc: 0.884909	valid_1's auc: 0.864857
[2000]	training's auc: 0.902188	valid_1's auc: 0.878752
[3000]	training's auc: 0.913093	valid_1's auc: 0.886563
[4000]	training's auc: 0.920734	valid_1's auc: 0.890503
[5000]	training's auc: 0.926516	valid_1's auc: 0.892856
[6000]	training's auc: 0.93145	valid_1's auc: 0.894217
[7000]	training's auc: 0.935714	valid_1's auc: 0.895184
[8000]	training's auc: 0.939662	valid_1's auc: 0.895705
[9000]	training's auc: 0.943437	valid_1's auc: 0.89586
[10000]	training's auc: 0.946945	valid_1's auc: 0.896194
[11000]	training's auc: 0.950287	valid_1's auc: 0.896344
[12000]	training's auc: 0.953444	valid_1's auc: 0.8964

Fold 6
[LightGBM] [Info] Number of positive: 18088, number of negative: 161912
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 50490
[LightGBM] [Info] Number of data points in the train set: 180000, number of used features: 198
Training until validation scores don't improve for 3000 rounds
[1000]	training's auc: 0.885076	valid_1's auc: 0.8696
[2000]	training's auc: 0.902166	valid_1's auc: 0.883144
[3000]	training's auc: 0.913056	valid_1's auc: 0.889594
[4000]	training's auc: 0.92059	valid_1's auc: 0.893218
[5000]	training's auc: 0.926396	valid_1's auc: 0.894879
[6000]	training's auc: 0.931258	valid_1's auc: 0.896244
[7000]	training's auc: 0.935556	valid_1's auc: 0.896785
[8000]	training's auc: 0.939486	valid_1's auc: 0.89719
[9000]	training's auc: 0.943244	valid_1's auc: 0.897479
[10000]	training's auc: 0.946789	valid_1's auc: 0.897412
[11000]	training's auc: 0.950085	valid_1's auc: 0.897405
[12000]	training's auc: 0.953343	valid_1's auc: 0.897642

In [18]:
save_to = 'data/lgb.txt'
clf.save_model(save_to)

<lightgbm.basic.Booster at 0x22317b3ccc8>

In [19]:
sub_df = pd.DataFrame({"ID_code":sub["ID_code"].values})
sub_df["target"] = predictions
sub_df.to_csv("data/submission.csv", index=False)