In [41]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn
import sklearn
import lightgbm as lgb
from sklearn.metrics import accuracy_score,log_loss
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split

In [83]:
def dataset():
    train = pd.read_csv("./train/train.csv")
    test = pd.read_csv("./test/test.csv")
    features = [f for f in test.columns if "x_" in f]
    x = train[features]
    y = train["targets"]
    x_test = test[features]
    return x,y,x_test,test

def sub(a):
    return a-1

def maipulate_Dataset(y):
    y_n = y.apply(sub)
    return y_n
    
def light_GBM_D(x,y):
    d_train = lgb.Dataset(x,label=y)
    return d_train
    
def filesub(v,y_pred,test):
    a = pd.DataFrame(y_pred,columns=['proba_1', 'proba_2', 'proba_3', 'proba_4', 'proba_5', 'proba_6', 'proba_7', 'proba_8', 'proba_9'])
    a["unique_id"] = test["unique_id"]
    columns=["unique_id",'proba_1', 'proba_2', 'proba_3', 'proba_4', 'proba_5', 'proba_6', 'proba_7', 'proba_8', 'proba_9']
    a = a[columns]
    a.to_csv(v+"_sub.csv",index=False)

def val_data_split(x,y):
    x_train,x_val,y_train,y_val = train_test_split(x,y,test_size=0.10,random_state=42)
    return x_train,x_val,y_train,y_val

In [55]:
#feature scaling and manipulation

In [84]:
x,y,x_test,test = dataset()

In [57]:
x_train,x_val,y_train,y_val = val_data_split(x,y)

In [58]:
y_n = maipulate_Dataset(y_train)
y_v = maipulate_Dataset(y_val)

d_train = light_GBM_D(x_train,y_n)
d_val = light_GBM_D(x_val,y_v)

In [62]:
# Create parameters to search
gridParams = {
    'learning_rate': [0.005,0.01],
    'num_leaves': [6,8,12,16,32,64,128],
    'boosting_type' : ['gbdt'],
    'objective' : ['multiclass'],
    'max_bin':[256,512],
    'n_estimators':[10,40,100],
    'max_depth':[7,8,9]
    }

In [63]:
mdl = lgb.LGBMClassifier(objective = params['objective'],silent = False)

In [64]:
grid = GridSearchCV(mdl, gridParams,
                    verbose=1,
                    cv=3,
                    n_jobs=-1)

In [65]:
grid.fit(x, y)

Fitting 3 folds for each of 252 candidates, totalling 756 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   38.4s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:  7.0min
[Parallel(n_jobs=-1)]: Done 434 tasks      | elapsed: 17.8min
[Parallel(n_jobs=-1)]: Done 756 out of 756 | elapsed: 32.0min finished


GridSearchCV(cv=3, error_score='raise-deprecating',
       estimator=LGBMClassifier(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
        importance_type='split', learning_rate=0.1, max_depth=-1,
        min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
        n_estimators=100, n_jobs=-1, num_leaves=31, objective='multiclass',
        random_state=None, reg_alpha=0.0, reg_lambda=0.0, silent=False,
        subsample=1.0, subsample_for_bin=200000, subsample_freq=0),
       fit_params=None, iid='warn', n_jobs=-1,
       param_grid={'learning_rate': [0.005, 0.01], 'num_leaves': [6, 8, 12, 16, 32, 64, 128], 'boosting_type': ['gbdt'], 'objective': ['multiclass'], 'max_bin': [256, 512], 'n_estimators': [10, 40, 100], 'max_depth': [7, 8, 9]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=1)

In [66]:
print(grid.best_params_)
print(grid.best_score_)

{'boosting_type': 'gbdt', 'learning_rate': 0.01, 'max_bin': 256, 'max_depth': 9, 'n_estimators': 100, 'num_leaves': 128, 'objective': 'multiclass'}
0.7417802503477051


In [97]:
v = grid.best_params_

In [106]:
#additional params
v['metric'] = "multi_logloss"
v['num_classes'] = 9
v['n_estimators'] = 10000
v['num_leaves'] = 80
v['learning_rate'] = 0.01
v['boosting_type'] = 'dart'
params['sub_feature'] = 0.5

In [107]:
v

{'boosting_type': 'dart',
 'learning_rate': 0.01,
 'max_depth': 9,
 'n_estimators': 10000,
 'num_leaves': 80,
 'objective': 'multiclass',
 'metric': 'multi_logloss',
 'num_classes': 9}

In [108]:
gbm = lgb.train(v,d_train,early_stopping_rounds=30,verbose_eval=10,valid_sets = [d_val,d_train],valid_names = ['eval', 'train'])

[10]	train's multi_logloss: 1.7797	eval's multi_logloss: 1.77989
[20]	train's multi_logloss: 1.64686	eval's multi_logloss: 1.66173
[30]	train's multi_logloss: 1.55369	eval's multi_logloss: 1.57808
[40]	train's multi_logloss: 1.49966	eval's multi_logloss: 1.52918
[50]	train's multi_logloss: 1.49739	eval's multi_logloss: 1.52917
[60]	train's multi_logloss: 1.47512	eval's multi_logloss: 1.51148
[70]	train's multi_logloss: 1.46609	eval's multi_logloss: 1.50575
[80]	train's multi_logloss: 1.43707	eval's multi_logloss: 1.47881
[90]	train's multi_logloss: 1.44714	eval's multi_logloss: 1.48737
[100]	train's multi_logloss: 1.43804	eval's multi_logloss: 1.48143
[110]	train's multi_logloss: 1.47546	eval's multi_logloss: 1.51749
[120]	train's multi_logloss: 1.42217	eval's multi_logloss: 1.46842
[130]	train's multi_logloss: 1.4	eval's multi_logloss: 1.44769
[140]	train's multi_logloss: 1.39628	eval's multi_logloss: 1.44647
[150]	train's multi_logloss: 1.38894	eval's multi_logloss: 1.43938
[160]	tra

[1220]	train's multi_logloss: 0.555399	eval's multi_logloss: 0.715677
[1230]	train's multi_logloss: 0.552333	eval's multi_logloss: 0.713289
[1240]	train's multi_logloss: 0.554463	eval's multi_logloss: 0.715191
[1250]	train's multi_logloss: 0.551469	eval's multi_logloss: 0.713142
[1260]	train's multi_logloss: 0.548579	eval's multi_logloss: 0.711126
[1270]	train's multi_logloss: 0.547466	eval's multi_logloss: 0.710381
[1280]	train's multi_logloss: 0.547615	eval's multi_logloss: 0.710684
[1290]	train's multi_logloss: 0.549876	eval's multi_logloss: 0.712554
[1300]	train's multi_logloss: 0.543364	eval's multi_logloss: 0.707426
[1310]	train's multi_logloss: 0.537154	eval's multi_logloss: 0.702875
[1320]	train's multi_logloss: 0.537287	eval's multi_logloss: 0.703116
[1330]	train's multi_logloss: 0.529792	eval's multi_logloss: 0.697447
[1340]	train's multi_logloss: 0.530642	eval's multi_logloss: 0.698225
[1350]	train's multi_logloss: 0.531157	eval's multi_logloss: 0.698784
[1360]	train's multi

[2400]	train's multi_logloss: 0.335238	eval's multi_logloss: 0.581109
[2410]	train's multi_logloss: 0.334195	eval's multi_logloss: 0.580618
[2420]	train's multi_logloss: 0.333091	eval's multi_logloss: 0.580182
[2430]	train's multi_logloss: 0.33302	eval's multi_logloss: 0.580213
[2440]	train's multi_logloss: 0.331944	eval's multi_logloss: 0.579788
[2450]	train's multi_logloss: 0.33093	eval's multi_logloss: 0.579302
[2460]	train's multi_logloss: 0.328956	eval's multi_logloss: 0.578404
[2470]	train's multi_logloss: 0.327846	eval's multi_logloss: 0.577924
[2480]	train's multi_logloss: 0.326442	eval's multi_logloss: 0.577382
[2490]	train's multi_logloss: 0.326725	eval's multi_logloss: 0.577597
[2500]	train's multi_logloss: 0.325072	eval's multi_logloss: 0.576958
[2510]	train's multi_logloss: 0.32314	eval's multi_logloss: 0.576107
[2520]	train's multi_logloss: 0.322316	eval's multi_logloss: 0.575794
[2530]	train's multi_logloss: 0.321234	eval's multi_logloss: 0.575329
[2540]	train's multi_lo

KeyboardInterrupt: 

In [109]:
y_pred = gbm.predict(x_test)

In [110]:
filesub("sixth",y_pred,test)