In [35]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import MaxAbsScaler,MinMaxScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import StratifiedKFold,train_test_split
from sklearn.metrics import auc,roc_curve
from sklearn.decomposition import TruncatedSVD

from sklearn.externals import joblib
import lightgbm as lgb

from imblearn.under_sampling import RandomUnderSampler,CondensedNearestNeighbour
from imblearn.over_sampling import RandomOverSampler,SMOTE

from random import random

import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

import warnings
warnings.filterwarnings("ignore")

nrow = None
number_of_time = 10

In [36]:
train = pd.read_csv('../data/train.csv',nrows=nrow)
train.drop('ID_code',inplace= True,axis=1)

In [42]:
reverse_list = [0,1,2,3,4,5,6,7,8,11,15,16,18,19,
            22,24,25,26,27,41,29,
            32,35,37,40,48,49,47,
            55,51,52,53,60,61,62,103,65,66,67,69,
            70,71,74,78,79,
            82,84,89,90,91,94,95,96,97,99,
            105,106,110,111,112,118,119,125,128,
            130,133,134,135,137,138,
            140,144,145,147,151,155,157,159,
            161,162,163,164,167,168,
            170,171,173,175,176,179,
            180,181,184,185,187,189,
            190,191,195,196,199]
reverse_list = ['var_%d'%i for i in reverse_list]
def features_engineer(x,isTrain=True,magic_count=None):#just combine with the value count
    res_x = x.copy()
    if isTrain:
        magic_count={}
        for col in x.columns:
            print(col)
            magic_count[col] = x[col].value_counts()
            magic_var = x[col].apply(lambda x_val:magic_count[col].get(x_val,1))
            res_x.insert(0,col+'_magic',magic_var)
    return(res_x,magic_count)

In [43]:
def data_augmentation(x,y,low_class_ratio = 0.2,method = 'none'):
    if method == 'random_under_sample':
        sm = RandomUnderSampler(sampling_strategy=1-low_class_ratio)
        aug_x,aug_y = sm.fit_resample(x,y)
    elif method == 'random_over_sample':
        sm = RandomOverSampler(sampling_strategy=1-low_class_ratio)
        aug_x,aug_y = sm.fit_resample(x,y)
    elif method == 'smote':
        sm = SMOTE(sampling_strategy=1-low_class_ratio,m_neighbors=10,kind='svm')
        aug_x,aug_y = sm.fit_resample(x,y)
#    elif method == 'shuffle':
        
    else:
        aug_x = x
        aug_y = y
    
    return(pd.DataFrame(aug_x,columns=x.columns),pd.DataFrame(aug_y,columns=['target']))

In [44]:
#test = pd.read_csv('../data/test.csv')
#sub = pd.DataFrame()
#sub['ID_code'] = test['ID_code']
#test.drop('ID_code',axis=1,inplace=True)
train,test = train_test_split(train,test_size = 0.3)

In [45]:
train_x = train.drop('target',axis=1).reset_index(drop=True)
train_y = pd.DataFrame(train['target']).reset_index(drop=True)
test_x = test.drop('target',axis=1).reset_index(drop=True)

In [46]:
fe_train_x,magic = features_engineer(train_x)
fe_test_x = features_engineer(test_x,False,magic)[0]

var_0
var_1
var_2
var_3
var_4
var_5
var_6
var_7
var_8
var_9
var_10
var_11


KeyboardInterrupt: 

In [None]:
#predict and combine
def models_list_predict(models,test_x):
    res = np.zeros((test.shape[0]))
    for model in models:
        res = res + model.predict(test_x)
    res = res/len(models)
    return(res)

def combine_pred(x,y,test_x):
    pred = np.zeros((test.shape[0]))
    for i in range(number_of_time):
        model = k_fold_train(x,y,'none',random()/2)
        pred = pred + model.predict(test_x)

    pred = pred/number_of_time
    return(pred)

In [None]:
param = {
    'bagging_freq': 5,
    'bagging_fraction': 0.2,
    'boost_from_average':'false',
    'boost': 'gbdt',
    'feature_fraction': 0.04,
    'learning_rate': 0.003,
    'max_depth': -1,  
    'metric':'auc',
    'min_data_in_leaf': 80,
    'min_sum_hessian_in_leaf': 10.0,
    'num_leaves': 13,
    'objective': 'binary',
    'tree_learner': 'serial',
    "verbosity" : 1,
    "bagging_seed" : 15,
    "seed": 15
}

def k_fold_train(x,y,model_param = param,sample_method = 'none'):
    kf = StratifiedKFold(n_splits=3,shuffle=True)
    models = []
    for train_index,val_index in kf.split(x,y):
        train_model_x,val_model_x = x.iloc[train_index,:],x.iloc[val_index,:]
        train_model_y,val_model_y = y.iloc[train_index],y.iloc[val_index]
        n_x,n_y = data_augmentation(train_model_x,train_model_y,method=sample_method,low_class_ratio=random()/2+0.1)
        train_data = lgb.Dataset(n_x,n_y)
        val_data = lgb.Dataset(val_model_x,val_model_y)
        
        model = lgb.train(model_param,train_set=train_data,num_boost_round=100000,verbose_eval = 100,
                          early_stopping_rounds=1000,valid_sets = [train_data,val_data])
        models.append(model)
    break
    return(models)

In [None]:
base_line = k_fold_train(train_x,train_y)

In [None]:
fe_test = k_fold_train(fe_train_x,train_y)
#random_under = k_fold_train(train_x,train_y,'random_under_sample')

In [None]:
#random_over = k_fold_train(train_x,train_y,'random_over_sample')

In [None]:
#smote = k_fold_train(train_x,train_y,'smote')

In [None]:
def plotGraph(df):
    predicts_list = df.columns[1:]
    aucs = pd.DataFrame(columns=['models','auc_score'])
    y = df['label']
    for res in predicts_list:
        fpr,tpr,thres = roc_curve(y,df[res])
        auc_score = auc(fpr,tpr)
        aucs = aucs.append({'models':res,'auc_score':auc_score},ignore_index=True)
        
        plt.plot(fpr,tpr,lw=1, alpha=0.3,label = res+' : '+str(round(auc_score,3)))
    
    plt.legend(loc='lower right')
    plt.show()
    return(aucs)

In [None]:
pred = models_list_predict(base_line,test_x)

result_df = pd.DataFrame()
result_df['label'] = test['target']
result_df['base'] = pred

In [None]:
#pred = random_under.predict(test.drop('target',axis=1))
#result_df['random_under'] = pred

pred = models_list_predict(fe_test,fe_test_x)
result_df['feature_test'] = pred

#pred = models_list_predict(smote,test.drop('target',axis=1))
#result_df['smote'] = pred

In [None]:
plotGraph(result_df)

In [11]:
#final_pred = combine_pred(x,y,test)
final_pred = models_list_predict(base_line,test)

In [12]:
sub['target'] = final_pred
sub.to_csv("../data/models_list_pred_base.csv",index=False)