Big thanks to:
* Jiwei Liu for [Augment insight](https://www.kaggle.com/jiweiliu/lgb-2-leaves-augment)
* [Ole Morten Grodås, Lightgbm with data augmentation](https://www.kaggle.com/omgrodas/lightgbm-with-data-augmentation)


In [1]:
import lightgbm as lgb
import pandas as pd
import numpy as np
from datetime import datetime

In [2]:
# Read the competition files; the "ID_code" column is removed from both
# frames immediately after loading.
train = pd.read_csv("../input/train.csv").drop(columns="ID_code")
test = pd.read_csv("../input/test.csv").drop(columns="ID_code")

In [3]:
# Optional debug switch: set `sample` to 1 to replace the full training set
# with a tiny subset (4 rows with target 0 and 3 rows with target 1, drawn
# with a fixed random_state). With the default of 0 nothing below runs.
sample = 0
if sample == 1:
    sample_train = train[train.target == 0].sample(n=4, random_state=1573456)
    sample_train_2 = train[train.target == 1].sample(n=3, random_state=1573456)
    # DataFrame.append was removed in pandas 2.0; pd.concat stacks the two
    # frames in the same order and keeps their original index labels.
    sample_train = pd.concat([sample_train, sample_train_2])
    sample_y = sample_train['target']
    sample_X = sample_train.drop(['target'], axis=1)

    # Rebind `train` so the later cells work on the subset.
    train = sample_train
    y = sample_y
    X = sample_X

In [4]:
# Augmentation. Inspiration from [2]
def augment(train, num_n=1, num_p=2, random_state=None):
    """Return *train* extended with column-shuffled copies of each class.

    For every extra copy, the rows of one class (``target == 0`` or
    ``target == 1``) are taken and each column is permuted independently of
    the others, so a synthetic row combines values that come from different
    real rows of the same class.

    Args:
        train: DataFrame with a ``target`` column holding 0/1 labels.
        num_n: Number of shuffled copies of the ``target == 0`` rows to add.
        num_p: Number of shuffled copies of the ``target == 1`` rows to add.
        random_state: ``None`` (default) draws from NumPy's global random
            state, as the function always did. An int seed, or any object
            exposing a ``permutation`` method, makes the result reproducible.

    Returns:
        A DataFrame holding the original rows followed by the shuffled
        copies. Index labels are not made unique.
    """
    if random_state is None:
        # The module-level functions share NumPy's global state, so a prior
        # np.random.seed(...) call keeps working as before.
        rng = np.random
    elif hasattr(random_state, "permutation"):
        rng = random_state
    else:
        rng = np.random.RandomState(random_state)

    def shuffled_copy(rows):
        """Permute every column of *rows* with its own random ordering."""
        size = len(rows)
        return rows.apply(lambda col: col.values.take(rng.permutation(size)))

    pieces = [train]
    for label, copies in ((0, num_n), (1, num_p)):
        rows = train[train.target == label]
        if len(rows) == 0:
            # DataFrame.apply handles zero-row frames on a separate code path
            # that does not yield a frame shaped like its input, so a class
            # with no rows simply contributes no synthetic data.
            continue
        pieces.extend(shuffled_copy(rows) for _ in range(copies))

    return pd.concat(pieces)

In [5]:
# LightGBM training parameters handed to lgb.train.
# Two bagging options are intentionally left out of the dict:
#   'bagging_freq': 5, 'bagging_fraction': 0.335
param = {
    # Objective, booster and evaluation metric.
    "boost_from_average": "false",  # given as the string 'false', not a bool
    "boost": "gbdt",
    # Sampling and step size.
    "feature_fraction": 0.041,
    "learning_rate": 0.0083,
    # Tree shape and leaf constraints.
    "max_depth": -1,
    "metric": "auc",
    "min_data_in_leaf": 80,
    "min_sum_hessian_in_leaf": 10.0,
    "num_leaves": 13,
    # Run-time settings.
    "num_threads": 8,
    "tree_learner": "serial",
    "objective": "binary",
    "verbosity": -1,
}

In [6]:
# Augment the training frame, then wrap features and labels for LightGBM.
train_augmented = augment(train)
train_data = lgb.Dataset(
    data=train_augmented.drop(columns='target'),
    label=train_augmented['target'],
)
# Notes on the number of boosting rounds and the score recorded for each
# (which score this is, is not stated here):
#   11500 -> ? (just run); 12985 -> 0.900; 14500 -> 0.900; 16000 -> 0.900;
#   17500 -> 0.900; 19000 -> 0.900; 23000 -> 0.899
model = lgb.train(params=param, train_set=train_data, num_boost_round=11500)
model.save_model('lgbm_model_11500')
# Score the test frame with the trained booster.
prediction = model.predict(test)

In [7]:
# Put the predictions into the sample submission's 'target' column and
# write the result to a CSV named after the current date and time.
submission = pd.read_csv('../input/sample_submission.csv').assign(target=prediction)
written_at = datetime.now()
filename = "sub_{:%Y-%m-%d_%H_%M}.csv".format(written_at)
submission.to_csv(filename, index=False)

# Inspired by:

[1] [Ole Morten Grodås, Lightgbm with data augmentation](https://www.kaggle.com/omgrodas/lightgbm-with-data-augmentation)

[2] [Jiwei Liu, LGB 2 leaves + augment](https://www.kaggle.com/jiweiliu/lgb-2-leaves-augment)