In [1]:
import lightgbm as lgb
import pandas as pd
import numpy as np

from datetime import datetime
from pathlib import Path
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import RepeatedStratifiedKFold

In [2]:
# Read the raw train/test CSV files; the ID_code column is removed in the next cell.
train_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in ('../data/train.csv', '../data/test.csv')
)

In [3]:
# Remove the row identifier; what remains is the feature columns
# (plus `target` in the training frame).
train = train_df.drop(columns="ID_code")
test = test_df.drop(columns="ID_code")

In [4]:
# Display (rows, columns) of the training frame after ID_code was dropped.
train.shape

(200000, 201)

In [5]:
# Data augmentation helper.


def augment(train, num_n=1, num_p=2, seed=None):
    """Grow ``train`` with synthetic rows made by shuffling columns within a class.

    For every extra copy, each column of the class subset (``target == 0`` or
    ``target > 0``) is permuted independently of the other columns. The values
    found in each column of the class are therefore kept, while the pairing of
    values across columns is broken up.

    Parameters
    ----------
    train : pandas.DataFrame
        Frame containing a ``target`` column. It is not modified.
    num_n : int, default 1
        Number of shuffled copies of the ``target == 0`` rows to append.
    num_p : int, default 2
        Number of shuffled copies of the ``target > 0`` rows to append.
    seed : int or None, default None
        When given, a private ``numpy.random.RandomState(seed)`` drives the
        shuffling so the result is reproducible. When ``None`` the global
        ``numpy.random`` state is used.

    Returns
    -------
    pandas.DataFrame
        The original rows followed by the shuffled copies. Each copy keeps the
        index labels of the rows it was derived from, so index labels repeat.
    """
    # Both the numpy.random module and a RandomState expose ``permutation``.
    rng = np.random if seed is None else np.random.RandomState(seed)

    def _shuffled_copy(frame):
        # One independent permutation per column; the index is kept as-is.
        return pd.DataFrame(
            {
                col: frame[col].values.take(rng.permutation(len(frame)))
                for col in frame.columns
            },
            index=frame.index,
        )

    pieces = [train]
    negatives = train[train["target"] == 0]
    positives = train[train["target"] > 0]  # computed once, not once per copy

    for subset, copies in ((negatives, num_n), (positives, num_p)):
        if len(subset) == 0:
            # Nothing to shuffle for a class that is absent from `train`.
            continue
        for _ in range(copies):
            pieces.append(_shuffled_copy(subset))

    return pd.concat(pieces)

In [6]:
# LightGBM configuration passed unchanged to lgb.train for every fold.
# Keys, values and their order are exactly those of the original literal;
# note that boost_from_average is given as the string 'false', not a bool.
param = dict(
    bagging_freq=5,
    bagging_fraction=0.335,
    boost_from_average='false',
    boost='gbdt',
    feature_fraction=0.041,
    learning_rate=0.0083,
    max_depth=-1,
    metric='auc',
    min_data_in_leaf=80,
    min_sum_hessian_in_leaf=10.0,
    num_leaves=13,
    num_threads=8,
    tree_learner='serial',
    objective='binary',
    verbosity=-1,
)

In [None]:
# Training settings that were previously inline magic numbers.
AUGMENT_SEED = 10              # seeds numpy's global RNG used by augment()
NUM_BOOST_ROUND = 1000000      # upper bound only; early stopping is expected to end training
EARLY_STOPPING_ROUNDS = 4000
LOG_EVERY = 500

# augment() shuffles through numpy's global RNG. Seed it once so that the
# augmented folds -- and therefore the whole run -- can be reproduced.
np.random.seed(AUGMENT_SEED)

# Running sum of test-set predictions over all folds; the submission cell
# divides it by `counter` to obtain the average.
result = np.zeros(test.shape[0])

# Split with StratifiedKFold (5 folds) and repeat that split 5 times.
rskf = RepeatedStratifiedKFold(n_splits=5, n_repeats=5, random_state=10)


for counter, (train_index, valid_index) in enumerate(rskf.split(train, train.target), 1):
    print(counter)

    # Only the training part of the fold is augmented.
    train_fold = augment(train.iloc[train_index])
    trn_data = lgb.Dataset(train_fold.drop("target", axis=1), label=train_fold.target)

    # Validation data is used as-is.
    valid_fold = train.iloc[valid_index]
    val_data = lgb.Dataset(valid_fold.drop("target", axis=1), label=valid_fold.target)

    # Training
    model = lgb.train(
        param,
        trn_data,
        NUM_BOOST_ROUND,
        valid_sets=[trn_data, val_data],
        verbose_eval=LOG_EVERY,
        early_stopping_rounds=EARLY_STOPPING_ROUNDS,
    )

    # Score the test set explicitly with the iteration chosen by early stopping.
    result += model.predict(test, num_iteration=model.best_iteration)

1
Training until validation scores don't improve for 4000 rounds.
[500]	training's auc: 0.887224	valid_1's auc: 0.875572
[1000]	training's auc: 0.893982	valid_1's auc: 0.882067
[1500]	training's auc: 0.898379	valid_1's auc: 0.885747
[2000]	training's auc: 0.901865	valid_1's auc: 0.888522
[2500]	training's auc: 0.904753	valid_1's auc: 0.890978
[3000]	training's auc: 0.90695	valid_1's auc: 0.89243
[3500]	training's auc: 0.908953	valid_1's auc: 0.893811
[4000]	training's auc: 0.910685	valid_1's auc: 0.89503
[4500]	training's auc: 0.912222	valid_1's auc: 0.895989
[5000]	training's auc: 0.913683	valid_1's auc: 0.89686
[5500]	training's auc: 0.914966	valid_1's auc: 0.897481
[6000]	training's auc: 0.916197	valid_1's auc: 0.898015
[6500]	training's auc: 0.917361	valid_1's auc: 0.898416
[7000]	training's auc: 0.918464	valid_1's auc: 0.898789
[7500]	training's auc: 0.919502	valid_1's auc: 0.899035
[8000]	training's auc: 0.920508	valid_1's auc: 0.899422
[8500]	training's auc: 0.921481	valid_1's a

[17000]	training's auc: 0.935907	valid_1's auc: 0.896881
[17500]	training's auc: 0.936661	valid_1's auc: 0.896895


In [None]:
# Build the submission: average the summed fold predictions and save them
# under a timestamped file name.
submission = pd.read_csv('../submission/sample_submission.csv')

# `counter` is the 1-based index of the last fold reached by the training loop.
mean_prediction = result / counter
submission['target'] = mean_prediction

filename = "../submission/{:%Y-%m-%d_%H_%M}_sub.csv".format(datetime.now())
submission.to_csv(filename, index=False)