In [1]:
#%%time
import numpy as np
import lightgbm as lgb
import pandas as pd
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.metrics import roc_auc_score,accuracy_score
import logging
# Log line layout: timestamp, source file[line number], level, message.
format_str = '%(asctime)s %(filename)s[%(lineno)d] %(levelname)s %(message)s'
# Formatter built from the same layout. Named ``log_formatter`` so that the
# builtin ``format`` is not shadowed; it is not attached to any handler here.
log_formatter = logging.Formatter(format_str)
logging.basicConfig(level=logging.DEBUG, format=format_str)
# Root logger (no name passed to getLogger).
logger = logging.getLogger()

# Iris dataset bunch; ``.data``, ``.target`` and ``.feature_names`` are read later.
iris = datasets.load_iris()
# Module-level stratified splitter (3 shuffled folds, fixed seed).
folds = StratifiedKFold(n_splits=3, shuffle=True, random_state=5)

from sklearn.metrics import f1_score

def lgb_f1_score(y_hat, data):
    """Custom LightGBM evaluation function: weighted F1 for multiclass models.

    Args:
        y_hat: Per-class scores supplied by LightGBM. Two layouts are accepted:
            a 2-D array with one row per sample and one column per class, or
            a flat 1-D array that is interpreted class-major (all samples for
            the first class, then all samples for the second class, ...).
        data: The dataset being evaluated; only ``get_label()`` is called.

    Returns:
        A ``(name, value, is_higher_better)`` tuple: ``'f1'``, the weighted
        F1 score rounded to 4 decimals, and ``True``.
    """
    y_true = data.get_label()
    num_sample = len(y_true)
    y_hat = np.asarray(y_hat)

    if y_hat.ndim == 2 and y_hat.shape[0] == num_sample:
        # Already one row per sample: pick the best class per row directly.
        # Reshaping this layout as if it were flat class-major would mix
        # scores from different samples.
        y_pred = y_hat.argmax(axis=1)
    else:
        # Flat class-major vector: fold into (num_class, num_sample), then
        # transpose so each row holds the class scores of one sample.
        y_pred = y_hat.reshape(-1, num_sample).T.argmax(axis=1)

    # f1_score needs hard class labels, not scores/probabilities.
    score = f1_score(y_true, y_pred, average='weighted')
    return 'f1', round(score, 4), True

# evals_result = {}

# clf = lgb.train(param, train_data, valid_sets=[val_data, train_data], valid_names=['val', 'train'], feval=lgb_f1_score, evals_result=evals_result)

# lgb.plot_metric(evals_result, metric='f1')


def train(X_data, y_data, X_test, cv=False, num_fold=5, num_class=3):
    """Fit LightGBM multiclass model(s) with K-fold validation and predict ``X_test``.

    Each fold model is trained with early stopping (50 rounds) driven by the
    custom ``lgb_f1_score`` metric.

    * ``cv=True``: one model is trained per fold and the test predictions of
      all folds are averaged.
    * ``cv=False``: only the first fold is trained, to obtain an early-stopped
      iteration count. A final model is then refitted on all of ``X_data`` for
      that many rounds and it alone produces the test predictions.

    Args:
        X_data: Training features as a pandas object (``.values``, ``.iloc``
            are used).
        y_data: Training labels as a pandas object, row-aligned with
            ``X_data`` (``.values`` and ``.iloc`` are used).
        X_test: Features to predict.
        cv: Whether to average predictions over all folds (see above).
        num_fold: Number of ``KFold`` splits. Defaults to 5.
        num_class: Number of target classes passed to LightGBM and used to
            size the prediction arrays. Defaults to 3.

    Returns:
        Tuple ``(predictions, score)``. ``predictions`` has shape
        ``(len(X_test), num_class)``. ``score`` is the accuracy over all
        out-of-fold predictions when ``cv`` is true, otherwise the accuracy
        on the first fold's validation split.
    """
    folds = KFold(n_splits=num_fold, shuffle=True, random_state=15)
    oof = np.zeros((len(y_data), num_class))
    predictions = np.zeros((len(X_test), num_class))

    # Hyper-parameters do not depend on the fold, so they are built once.
    params = {
        'learning_rate': 0.1,
        'lambda_l1': 0.1,
        'lambda_l2': 0.2,
        'max_depth': 4,
        'objective': 'multiclass',
        # Built-in metrics are switched off so that early stopping is driven
        # only by the custom feval (F1).
        'metric': 'None',
        # The multiclass objective requires the class count to be given.
        'num_class': num_class,
    }
    num_round = 30000  # upper bound; early stopping normally ends training sooner

    for fold_, (trn_idx, val_idx) in enumerate(folds.split(X_data.values, y_data.values)):
        logger.info("fold n°{}".format(fold_))
        X_val = X_data.iloc[val_idx]
        trn_data = lgb.Dataset(X_data.iloc[trn_idx], y_data.iloc[trn_idx])
        val_data = lgb.Dataset(X_val, y_data.iloc[val_idx], reference=trn_data)

        clf = lgb.train(params,
                        trn_data,
                        num_round,
                        valid_sets=[trn_data, val_data],
                        feval=lgb_f1_score,
                        verbose_eval=10,
                        early_stopping_rounds=50)

        # Out-of-fold class scores for the rows held out in this fold.
        oof[val_idx] = clf.predict(X_val, num_iteration=clf.best_iteration)

        # Accuracy on this fold only; argmax is taken on the fold slice rather
        # than on the whole out-of-fold matrix.
        score = accuracy_score(y_data.values[val_idx], oof[val_idx].argmax(axis=1))
        logger.info(f'fold n{fold_}, best_iter:{clf.best_iteration}, score:{score:6.4f} val shape:{X_data.iloc[val_idx].shape}')

        if cv:
            predictions += clf.predict(X_test, num_iteration=clf.best_iteration)
        else:
            logger.info(f'CV is disable, will train with full train data with iter:{clf.best_iteration}')
            # Refit on every training row for the iteration count found above.
            all_train = lgb.Dataset(X_data, y_data)
            final_clf = lgb.train(params,
                                  all_train,
                                  num_boost_round=clf.best_iteration,
                                  valid_sets=[all_train],
                                  feval=lgb_f1_score,
                                  verbose_eval=10)
            predictions += final_clf.predict(X_test, num_iteration=final_clf.best_iteration)
            # Only the first fold is needed when CV averaging is disabled.
            break

    # Average over the number of models that contributed (1 when cv is false).
    predictions = predictions / (fold_ + 1)
    if cv:
        # Replace the last fold's score with the overall out-of-fold accuracy.
        score = accuracy_score(y_data.values, oof.argmax(axis=1))
    return predictions, score

# Wrap the iris arrays in pandas objects: one column per feature, plus a
# single-column frame named 'label' for the targets.
train_data = pd.DataFrame(data=iris.data, columns=iris.feature_names)
train_target = pd.DataFrame(data=iris.target, columns=['label'])

# Hold out 30% of the rows as a test split (fixed seed for repeatability).
X_train, X_test, y_train, y_test = train_test_split(
    train_data,
    train_target,
    test_size=0.3,
    random_state=0,
)

# Run without fold averaging and report the prediction shape and the score.
predictions, score = train(X_train, y_train, X_test, cv=False)

print(predictions.shape, score)
 

2019-04-17 23:33:08,893 <ipython-input-1-ce4a77eab952>[46] INFO fold n°0
  'precision', 'predicted', average, warn_for)
2019-04-17 23:33:09,098 <ipython-input-1-ce4a77eab952>[77] INFO fold n0, best_iter:16, score:0.8571 val shape:(21, 4)
2019-04-17 23:33:09,108 <ipython-input-1-ce4a77eab952>[87] INFO CV is disable, will train with full train data with iter:16


Training until validation scores don't improve for 50 rounds.
[10]	training's f1: 0.9881	valid_1's f1: 0.8544
[20]	training's f1: 1	valid_1's f1: 0.8584
[30]	training's f1: 1	valid_1's f1: 0.8584
[40]	training's f1: 1	valid_1's f1: 0.8584
[50]	training's f1: 1	valid_1's f1: 0.8584
[60]	training's f1: 1	valid_1's f1: 0.8584
Early stopping, best iteration is:
[16]	training's f1: 1	valid_1's f1: 0.8584
[10]	training's f1: 0.962
(45, 3) 0.8571428571428571
