In [13]:
from sklearn.datasets import load_iris
import numpy as np
data = load_iris()
np.unique(data.target )

print(dir(data))
print(data.feature_names)


['DESCR', 'data', 'feature_names', 'target', 'target_names']
['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)', 'petal width (cm)']


# LGB

In [7]:
from sklearn.datasets import load_wine
data = load_wine()
np.unique(data.target )


array([0, 1, 2])

# KFold

In [5]:
#%%time
import numpy as np
import lightgbm as lgb
import pandas as pd
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.metrics import roc_auc_score,accuracy_score
import logging
format_str = '%(asctime)s %(filename)s[%(lineno)d] %(levelname)s %(message)s'
format = logging.Formatter(format_str)
logging.basicConfig(level=logging.DEBUG, format=format_str)
logger = logging.getLogger()

iris=datasets.load_iris()
folds = StratifiedKFold(n_splits=3, shuffle=True, random_state = 5)



def train(X_data,  y_data,  X_test ):
    
    num_fold = 5
    num_class = 3
    folds = KFold(n_splits=num_fold, shuffle=True, random_state=15)
    oof = np.zeros((len(y_data),num_class))
    predictions = np.zeros((len(X_test),num_class))
    #start = time.time()
    feature_importance_df = pd.DataFrame()

    for fold_, (trn_idx, val_idx) in enumerate(folds.split(X_data.values, y_data.values)):
        logger.info("fold n°{}".format(fold_))
        trn_data = lgb.Dataset(X_data.iloc[trn_idx], y_data.iloc[trn_idx])
        val_data = lgb.Dataset(X_data.iloc[val_idx], y_data.iloc[val_idx], reference=trn_data)

        #np.random.seed(666)
        params={
            #'verbose':2,
            'learning_rate':0.1,
            'lambda_l1':0.1,
            'lambda_l2':0.2,
            'max_depth':4,
            'objective':'multiclass',
            'num_class':num_class,  #lightgbm.basic.LightGBMError: b'Number of classes should be specified and greater than 1 for multiclass training'  
            #'device':'gpu',
            #'gpu_platform_id': 1, 'gpu_device_id': 0
        }
        num_round = 30000
        clf = lgb.train(params,
                        trn_data,
                        num_round,
                        valid_sets=[trn_data, val_data],
                        verbose_eval=2000,
                        early_stopping_rounds=200)
        

        oof[val_idx] = clf.predict(X_data.iloc[val_idx], num_iteration=clf.best_iteration)
        logger.info(f'fold n{fold_}, best_iter:{clf.best_iteration}, val shape:{X_data.iloc[val_idx].shape}')

        fold_importance_df = pd.DataFrame()
        fold_importance_df["feature"] =  X_data.columns
        fold_importance_df["importance"] = clf.feature_importance()
        fold_importance_df["fold"] = fold_ + 1
        feature_importance_df = pd.concat([feature_importance_df, fold_importance_df], axis=0)

        predictions += clf.predict(X_test, num_iteration=clf.best_iteration)
    predictions = predictions/folds.n_splits
    oof = oof.argmax(axis=1)
    score = accuracy_score(oof,y_data.values)
    return predictions, score

train_data = pd.DataFrame(iris.data, columns=iris.feature_names)
train_target = pd.DataFrame(iris.target, columns=['label'])

X_train,X_test,y_train,y_test=train_test_split(train_data,train_target,test_size=0.3)

predictions, score = train(X_train,y_train,X_test)

print(predictions.shape, score)
 

2019-04-17 02:18:45,797 <ipython-input-5-38efeb09053d>[31] INFO fold n°0


Training until validation scores don't improve for 200 rounds.


2019-04-17 02:18:46,141 <ipython-input-5-38efeb09053d>[57] INFO fold n0, best_iter:44, val shape:(21, 4)
2019-04-17 02:18:46,150 <ipython-input-5-38efeb09053d>[31] INFO fold n°1


Early stopping, best iteration is:
[44]	training's multi_logloss: 0.0575746	valid_1's multi_logloss: 0.110612
Training until validation scores don't improve for 200 rounds.


2019-04-17 02:18:46,544 <ipython-input-5-38efeb09053d>[57] INFO fold n1, best_iter:94, val shape:(21, 4)
2019-04-17 02:18:46,550 <ipython-input-5-38efeb09053d>[31] INFO fold n°2


Early stopping, best iteration is:
[94]	training's multi_logloss: 0.0444158	valid_1's multi_logloss: 0.0912174
Training until validation scores don't improve for 200 rounds.
[2000]	training's multi_logloss: 0.014799	valid_1's multi_logloss: 0.0057197
Early stopping, best iteration is:
[2401]	training's multi_logloss: 0.0147935	valid_1's multi_logloss: 0.00571965


2019-04-17 02:18:49,548 <ipython-input-5-38efeb09053d>[57] INFO fold n2, best_iter:2401, val shape:(21, 4)
2019-04-17 02:18:49,593 <ipython-input-5-38efeb09053d>[31] INFO fold n°3


Training until validation scores don't improve for 200 rounds.


2019-04-17 02:18:49,937 <ipython-input-5-38efeb09053d>[57] INFO fold n3, best_iter:32, val shape:(21, 4)
2019-04-17 02:18:49,954 <ipython-input-5-38efeb09053d>[31] INFO fold n°4


Early stopping, best iteration is:
[32]	training's multi_logloss: 0.0807604	valid_1's multi_logloss: 0.354552
Training until validation scores don't improve for 200 rounds.


2019-04-17 02:18:50,316 <ipython-input-5-38efeb09053d>[57] INFO fold n4, best_iter:63, val shape:(21, 4)


Early stopping, best iteration is:
[63]	training's multi_logloss: 0.056934	valid_1's multi_logloss: 0.0380282
(45, 3) 0.9523809523809523


# KFold is an option

In [26]:
#%%time
import numpy as np
import lightgbm as lgb
import pandas as pd
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.metrics import roc_auc_score,accuracy_score
import logging
format_str = '%(asctime)s %(filename)s[%(lineno)d] %(levelname)s %(message)s'
format = logging.Formatter(format_str)
logging.basicConfig(level=logging.DEBUG, format=format_str)
logger = logging.getLogger()

iris=datasets.load_iris()
folds = StratifiedKFold(n_splits=3, shuffle=True, random_state = 5)



def train(X_data,  y_data,  X_test, cv=False ):
    
    num_fold = 5
    num_class = 3
    folds = KFold(n_splits=num_fold, shuffle=True, random_state=15)
    oof = np.zeros((len(y_data),num_class))
    predictions = np.zeros((len(X_test),num_class))
    #start = time.time()
    feature_importance_df = pd.DataFrame()

    for fold_, (trn_idx, val_idx) in enumerate(folds.split(X_data.values, y_data.values)):
        logger.info("fold n°{}".format(fold_))
        trn_data = lgb.Dataset(X_data.iloc[trn_idx], y_data.iloc[trn_idx])
        val_data = lgb.Dataset(X_data.iloc[val_idx], y_data.iloc[val_idx], reference=trn_data)

        #np.random.seed(666)
        params={
            #'verbose':2,
            'learning_rate':0.1,
            'lambda_l1':0.1,
            'lambda_l2':0.2,
            'max_depth':4,
            'objective':'multiclass',
            'num_class':num_class,  #lightgbm.basic.LightGBMError: b'Number of classes should be specified and greater than 1 for multiclass training'  
            #'device':'gpu',
            #'gpu_platform_id': 1, 'gpu_device_id': 0
        }
        num_round = 30000
        clf = lgb.train(params,
                        trn_data,
                        num_round,
                        valid_sets=[trn_data, val_data],
                        verbose_eval=2000,
                        early_stopping_rounds=200)
        

        oof[val_idx] = clf.predict(X_data.iloc[val_idx], num_iteration=clf.best_iteration)
        
        score = accuracy_score(y_data.values[val_idx], oof.argmax(axis=1)[val_idx],)
        logger.info(f'fold n{fold_}, best_iter:{clf.best_iteration}, score:{score:6.4f} val shape:{X_data.iloc[val_idx].shape}')

        fold_importance_df = pd.DataFrame()
        fold_importance_df["feature"] =  X_data.columns
        fold_importance_df["importance"] = clf.feature_importance()
        fold_importance_df["fold"] = fold_ + 1
        feature_importance_df = pd.concat([feature_importance_df, fold_importance_df], axis=0)
        if cv:
            predictions += clf.predict(X_test, num_iteration=clf.best_iteration)
        else:
            logger.info('Cv is disable, will train with full train data')
            all_train = lgb.Dataset(X_data, y_data)
            clf = lgb.train(params,
                all_train,
                #num_round,
                num_boost_round=clf.best_iteration,
                valid_sets=[all_train],
                verbose_eval=2000,
                )
            predictions += clf.predict(X_test, num_iteration=clf.best_iteration)
            break
        
    predictions = predictions/(fold_ + 1)
    if cv:
        score = accuracy_score(y_data.values, oof.argmax(axis=1),)
    return predictions, score

train_data = pd.DataFrame(iris.data, columns=iris.feature_names)
train_target = pd.DataFrame(iris.target, columns=['label'])

X_train,X_test,y_train,y_test=train_test_split(train_data,train_target,test_size=0.3)

predictions, score = train(X_train,y_train,X_test,cv=False)

print(predictions.shape, score)
 

2019-04-17 02:33:08,368 <ipython-input-26-0e8683dfc7e4>[31] INFO fold n°0


Training until validation scores don't improve for 200 rounds.


2019-04-17 02:33:08,716 <ipython-input-26-0e8683dfc7e4>[59] INFO fold n0, best_iter:62, score:0.9524 val shape:(21, 4)
2019-04-17 02:33:08,719 <ipython-input-26-0e8683dfc7e4>[69] INFO Cv is disable, will train with full train data


Early stopping, best iteration is:
[62]	training's multi_logloss: 0.0557193	valid_1's multi_logloss: 0.152055
(45, 3) 0.9523809523809523


In [17]:
predictions

array([[3.21976062e-03, 2.60045835e-03, 9.94179781e-01],
       [9.97063939e-01, 2.10587025e-03, 8.30191040e-04],
       [3.20236048e-03, 7.99057548e-03, 9.88807064e-01],
       [5.12637579e-03, 1.94761178e-01, 8.00112446e-01],
       [1.12414238e-03, 8.00723498e-03, 9.90868623e-01],
       [5.78657694e-03, 9.74239953e-01, 1.99734698e-02],
       [8.91810047e-03, 6.28298381e-01, 3.62783519e-01],
       [3.31297382e-03, 9.83142453e-01, 1.35445733e-02],
       [3.20236048e-03, 7.99057548e-03, 9.88807064e-01],
       [5.85551783e-02, 9.22782251e-01, 1.86625707e-02],
       [9.92663557e-01, 6.46664121e-03, 8.69801599e-04],
       [9.92706516e-01, 6.46692106e-03, 8.26562895e-04],
       [2.82062578e-03, 1.07161165e-01, 8.90018209e-01],
       [9.97063939e-01, 2.10587025e-03, 8.30191040e-04],
       [9.96522731e-01, 2.49404791e-03, 9.83221174e-04],
       [4.70619337e-03, 9.80835112e-01, 1.44586944e-02],
       [7.70795620e-03, 9.66559297e-01, 2.57327469e-02],
       [9.91625419e-01, 6.62168

# Normal

In [57]:
%%time

import lightgbm as lgb
from sklearn import datasets
from sklearn.model_selection import train_test_split
iris=datasets.load_iris()
X_train,X_test,y_train,y_test=train_test_split(iris.data,iris.target,test_size=0.3)
import numpy as np
train_data=lgb.Dataset(X_train,label=y_train)
validation_data=lgb.Dataset(X_test,label=y_test)
params={
    #'verbose':2,
    'learning_rate':0.1,
    'lambda_l1':0.1,
    'lambda_l2':0.2,
    'max_depth':4,
    'objective':'multiclass',
    'num_class':3,  #lightgbm.basic.LightGBMError: b'Number of classes should be specified and greater than 1 for multiclass training'  
    #'device':'gpu',
    'gpu_platform_id': 1, 'gpu_device_id': 0
}
clf=lgb.train(params,train_data,valid_sets=[validation_data], verbose_eval=50,)
from sklearn.metrics import roc_auc_score,accuracy_score
y_pred=clf.predict(X_test)
y_pred=[list(x).index(max(x)) for x in y_pred]
print(y_pred)
print(accuracy_score(y_test,y_pred))

[50]	valid_0's multi_logloss: 0.254046
[100]	valid_0's multi_logloss: 0.330064
[1, 1, 2, 2, 1, 0, 2, 2, 0, 2, 0, 1, 1, 2, 1, 0, 2, 0, 0, 0, 1, 2, 2, 1, 1, 1, 1, 1, 1, 0, 1, 2, 0, 0, 2, 2, 0, 2, 2, 2, 1, 2, 0, 2, 2]
0.9111111111111111
CPU times: user 46 s, sys: 11.9 ms, total: 46 s
Wall time: 651 ms
