In [2]:
#import dependencies
import numpy as np 
import pandas as pd
from sklearn.model_selection import train_test_split

In [3]:
#importing dataset :
dataset = pd.read_csv(r"/workspaces/machine_failure/dataset/ai4i2020.csv")
dataset.head()

Unnamed: 0,UDI,Product ID,Type,Air temperature [K],Process temperature [K],Rotational speed [rpm],Torque [Nm],Tool wear [min],Machine failure,TWF,HDF,PWF,OSF,RNF
0,1,M14860,M,298.1,308.6,1551,42.8,0,0,0,0,0,0,0
1,2,L47181,L,298.2,308.7,1408,46.3,3,0,0,0,0,0,0
2,3,L47182,L,298.1,308.5,1498,49.4,5,0,0,0,0,0,0
3,4,L47183,L,298.2,308.6,1433,39.5,7,0,0,0,0,0,0
4,5,L47184,L,298.2,308.7,1408,40.0,9,0,0,0,0,0,0


In [4]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 14 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   UDI                      10000 non-null  int64  
 1   Product ID               10000 non-null  object 
 2   Type                     10000 non-null  object 
 3   Air temperature [K]      10000 non-null  float64
 4   Process temperature [K]  10000 non-null  float64
 5   Rotational speed [rpm]   10000 non-null  int64  
 6   Torque [Nm]              10000 non-null  float64
 7   Tool wear [min]          10000 non-null  int64  
 8   Machine failure          10000 non-null  int64  
 9   TWF                      10000 non-null  int64  
 10  HDF                      10000 non-null  int64  
 11  PWF                      10000 non-null  int64  
 12  OSF                      10000 non-null  int64  
 13  RNF                      10000 non-null  int64  
dtypes: float64(3), int64(9)

In [5]:
dataset.describe()

Unnamed: 0,UDI,Air temperature [K],Process temperature [K],Rotational speed [rpm],Torque [Nm],Tool wear [min],Machine failure,TWF,HDF,PWF,OSF,RNF
count,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0
mean,5000.5,300.00493,310.00556,1538.7761,39.98691,107.951,0.0339,0.0046,0.0115,0.0095,0.0098,0.0019
std,2886.89568,2.000259,1.483734,179.284096,9.968934,63.654147,0.180981,0.067671,0.106625,0.097009,0.098514,0.04355
min,1.0,295.3,305.7,1168.0,3.8,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,2500.75,298.3,308.8,1423.0,33.2,53.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,5000.5,300.1,310.1,1503.0,40.1,108.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,7500.25,301.5,311.1,1612.0,46.8,162.0,0.0,0.0,0.0,0.0,0.0,0.0
max,10000.0,304.5,313.8,2886.0,76.6,253.0,1.0,1.0,1.0,1.0,1.0,1.0


In [6]:
target = ['TWF','HDF','PWF','OSF','RNF']
#not including machine failure in target.
input_features = ['Type','Air temperature [K]','Process temperature [K]','Rotational speed [rpm]','Torque [Nm]','Tool wear [min]']
sorted_dataset = pd.concat([dataset[input_features],dataset[target]],axis = 1)
sorted_dataset.head()

Unnamed: 0,Type,Air temperature [K],Process temperature [K],Rotational speed [rpm],Torque [Nm],Tool wear [min],TWF,HDF,PWF,OSF,RNF
0,M,298.1,308.6,1551,42.8,0,0,0,0,0,0
1,L,298.2,308.7,1408,46.3,3,0,0,0,0,0
2,L,298.1,308.5,1498,49.4,5,0,0,0,0,0
3,L,298.2,308.6,1433,39.5,7,0,0,0,0,0
4,L,298.2,308.7,1408,40.0,9,0,0,0,0,0


In [7]:
#feature engineering:
#introducing new feature sets here :
#del_T = process_temp-air_temp
sorted_dataset['del_T'] = sorted_dataset['Process temperature [K]'] - sorted_dataset['Air temperature [K]']

#power_proxy = rotation_speed*torque
sorted_dataset['power proxy'] = sorted_dataset['Rotational speed [rpm]'] * sorted_dataset['Torque [Nm]']

#wear_rate = tool_wear / rotational
sorted_dataset['wear_rate'] = sorted_dataset['Tool wear [min]'] / sorted_dataset['Rotational speed [rpm]']
sorted_dataset.head()

Unnamed: 0,Type,Air temperature [K],Process temperature [K],Rotational speed [rpm],Torque [Nm],Tool wear [min],TWF,HDF,PWF,OSF,RNF,del_T,power proxy,wear_rate
0,M,298.1,308.6,1551,42.8,0,0,0,0,0,0,10.5,66382.8,0.0
1,L,298.2,308.7,1408,46.3,3,0,0,0,0,0,10.5,65190.4,0.002131
2,L,298.1,308.5,1498,49.4,5,0,0,0,0,0,10.4,74001.2,0.003338
3,L,298.2,308.6,1433,39.5,7,0,0,0,0,0,10.4,56603.5,0.004885
4,L,298.2,308.7,1408,40.0,9,0,0,0,0,0,10.5,56320.0,0.006392


In [8]:
X = sorted_dataset.drop(target,axis = 1)
Y = sorted_dataset[target]
print(X.head())
Y.head()
print(X.shape)

  Type  Air temperature [K]  Process temperature [K]  Rotational speed [rpm]  \
0    M                298.1                    308.6                    1551   
1    L                298.2                    308.7                    1408   
2    L                298.1                    308.5                    1498   
3    L                298.2                    308.6                    1433   
4    L                298.2                    308.7                    1408   

   Torque [Nm]  Tool wear [min]  del_T  power proxy  wear_rate  
0         42.8                0   10.5      66382.8   0.000000  
1         46.3                3   10.5      65190.4   0.002131  
2         49.4                5   10.4      74001.2   0.003338  
3         39.5                7   10.4      56603.5   0.004885  
4         40.0                9   10.5      56320.0   0.006392  
(10000, 9)


In [9]:
num_cols = [col for col in X.columns if X[col].dtype != 'object']
num_cols

['Air temperature [K]',
 'Process temperature [K]',
 'Rotational speed [rpm]',
 'Torque [Nm]',
 'Tool wear [min]',
 'del_T',
 'power proxy',
 'wear_rate']

In [10]:
#splitting the dataset into train_test_split:
X_train,X_test,y_train,Y_test = train_test_split(X,Y,test_size = 0.2,random_state =42)
X_train.shape,X_test.shape

((8000, 9), (2000, 9))

In [11]:
# #creating a pipeline for data preprocessing:

from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
dp_pipeline = ColumnTransformer(
    transformers=[('ohe',OneHotEncoder(),['Type']),
     ('strd',StandardScaler(),num_cols)]
)

In [12]:
#ONE VS REST strategy:
# using different model for each label:
#logistic regression
# Random forest classification
# XGboost
# LightGBM
#creating pipeline for each model with preprocessor:
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
import lightgbm as lgb
model_lr = Pipeline([('preproc',dp_pipeline),('lr',LogisticRegression(max_iter = 2000,class_weight = 'balanced'))])
model_rfr = Pipeline([('preproc',dp_pipeline),('rfr',RandomForestClassifier(random_state = 42,n_jobs = -1,class_weight = 'balanced'))])
model_XGB = Pipeline([('preproc',dp_pipeline),('xgb',XGBClassifier(n_jobs = -1))])
model_lgb = Pipeline([('preproc',dp_pipeline),('lgb',lgb.LGBMClassifier(n_jobs = -1))])
models = {'lr' : model_lr,'rfr': model_rfr,'xgb' : model_XGB,'lgb' : model_lgb}


In [13]:
#now creating CV with multilabelstratifiedkfold:
from iterstrat.ml_stratifiers import MultilabelStratifiedKFold
from sklearn.metrics import f1_score
from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import StratifiedKFold

cv = StratifiedKFold(n_splits = 5,shuffle = True,random_state = 42)
results = {}
for trg in target:
    y = y_train[trg]
    results[trg] = {}
    if trg in ['RNF','TWF']:
        neg,pos = y.value_counts()[0],y.value_counts()[1]
        ratio = neg/pos
        print(f"Applying imbalancing handling for labels :{ratio:.2f}")
        #update model parametres:
        models['lr'].set_params(lr__class_weight = 'balanced')
        models['rfr'].set_params(rfr__class_weight = 'balanced')
        models['xgb'].set_params(xgb__scale_pos_weight = ratio)
        models['lgb'].set_params(lgb__scale_pos_weight = ratio)
        
    for name,model in models.items():
        oof_pred = cross_val_predict(model,X_train,y,cv = cv.split(X_train,y),method = 'predict')
        f1score = f1_score(oof_pred,y)
        results[trg][name] = f1score
print(pd.DataFrame(results)) 

Applying imbalancing handling for labels :227.57
[LightGBM] [Info] Number of positive: 28, number of negative: 6372
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002014 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1519
[LightGBM] [Info] Number of data points in the train set: 6400, number of used features: 11
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.004375 -> initscore=-5.427464
[LightGBM] [Info] Start training from score -5.427464
[LightGBM] [Info] Number of positive: 28, number of negative: 6372
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000140 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1517
[LightGBM] [Info] Number of data points in the train set: 6400, number of used features: 11
[LightGBM] [Info] [binary:BoostFromScore]: 



[LightGBM] [Info] Number of positive: 28, number of negative: 6372
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000389 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1519
[LightGBM] [Info] Number of data points in the train set: 6400, number of used features: 11
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.004375 -> initscore=-5.427464
[LightGBM] [Info] Start training from score -5.427464
[LightGBM] [Info] Number of positive: 28, number of negative: 6372
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000139 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1517
[LightGBM] [Info] Number of data points in the train set: 6400, number of used features: 11
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.004375 -> initscore=-5.427464
[LightGBM] [



[LightGBM] [Info] Number of positive: 79, number of negative: 6321
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000519 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1518
[LightGBM] [Info] Number of data points in the train set: 6400, number of used features: 11
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.012344 -> initscore=-4.382185
[LightGBM] [Info] Start training from score -4.382185
[LightGBM] [Info] Number of positive: 79, number of negative: 6321
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000389 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1518
[LightGBM] [Info] Number of data points in the train set: 6400, number of used features: 11
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.012344 -> initscore=-4.382185
[LightGBM] [Info] Start training from score -4.382185




[LightGBM] [Info] Number of positive: 78, number of negative: 6322
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000408 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1519
[LightGBM] [Info] Number of data points in the train set: 6400, number of used features: 11
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.012188 -> initscore=-4.395082
[LightGBM] [Info] Start training from score -4.395082
[LightGBM] [Info] Number of positive: 78, number of negative: 6322
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000391 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1519
[LightGBM] [Info] Number of data points in the train set: 6400, number of used features: 11
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.012188 -> initscore=-4.395082
[LightGBM] [Info] Start training from score -4.395082
[LightGBM] [Info] Numb



[LightGBM] [Info] Number of positive: 60, number of negative: 6340
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000453 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1517
[LightGBM] [Info] Number of data points in the train set: 6400, number of used features: 11
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.009375 -> initscore=-4.660289
[LightGBM] [Info] Start training from score -4.660289
[LightGBM] [Info] Number of positive: 60, number of negative: 6340
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002849 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1520
[LightGBM] [Info] Number of data points in the train set: 6400, number of used features: 11
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.009375 -> initscore=-4.660289
[LightGBM] [Info] Start training from score -4.660289




[LightGBM] [Info] Number of positive: 60, number of negative: 6340
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000414 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1518
[LightGBM] [Info] Number of data points in the train set: 6400, number of used features: 11
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.009375 -> initscore=-4.660289
[LightGBM] [Info] Start training from score -4.660289




[LightGBM] [Info] Number of positive: 60, number of negative: 6340
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000395 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1518
[LightGBM] [Info] Number of data points in the train set: 6400, number of used features: 11
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.009375 -> initscore=-4.660289
[LightGBM] [Info] Start training from score -4.660289
[LightGBM] [Info] Number of positive: 60, number of negative: 6340
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000388 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1519
[LightGBM] [Info] Number of data points in the train set: 6400, number of used features: 11
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.009375 -> initscore=-4.660289
[LightGBM] [Info] Start training from score -4.660289




[LightGBM] [Info] Number of positive: 64, number of negative: 6336
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000475 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1518
[LightGBM] [Info] Number of data points in the train set: 6400, number of used features: 11
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.010000 -> initscore=-4.595120
[LightGBM] [Info] Start training from score -4.595120
[LightGBM] [Info] Number of positive: 64, number of negative: 6336
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000389 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1521
[LightGBM] [Info] Number of data points in the train set: 6400, number of used features: 11
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.010000 -> initscore=-4.595120
[LightGBM] [Info] Start training from score -4.595120




[LightGBM] [Info] Number of positive: 64, number of negative: 6336
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000417 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1519
[LightGBM] [Info] Number of data points in the train set: 6400, number of used features: 11
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.010000 -> initscore=-4.595120
[LightGBM] [Info] Start training from score -4.595120
[LightGBM] [Info] Number of positive: 64, number of negative: 6336
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000627 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1519
[LightGBM] [Info] Number of data points in the train set: 6400, number of used features: 11
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.010000 -> initscore=-4.595120
[LightGBM] [Info] Start training from score -4.595120




[LightGBM] [Info] Number of positive: 64, number of negative: 6336
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000652 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1521
[LightGBM] [Info] Number of data points in the train set: 6400, number of used features: 11
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.010000 -> initscore=-4.595120
[LightGBM] [Info] Start training from score -4.595120
Applying imbalancing handling for labels :614.38




[LightGBM] [Info] Number of positive: 11, number of negative: 6389
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000552 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1520
[LightGBM] [Info] Number of data points in the train set: 6400, number of used features: 11
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.001719 -> initscore=-6.364438
[LightGBM] [Info] Start training from score -6.364438
[LightGBM] [Info] Number of positive: 11, number of negative: 6389
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000396 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1519
[LightGBM] [Info] Number of data points in the train set: 6400, number of used features: 11
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.001719 -> initscore=-6.364438
[LightGBM] [Info] Start training from score -6.364438
[LightGBM] [Info] Numb



[LightGBM] [Info] Number of positive: 10, number of negative: 6390
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000535 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1519
[LightGBM] [Info] Number of data points in the train set: 6400, number of used features: 11
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.001563 -> initscore=-6.459904
[LightGBM] [Info] Start training from score -6.459904
[LightGBM] [Info] Number of positive: 10, number of negative: 6390
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000578 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1517
[LightGBM] [Info] Number of data points in the train set: 6400, number of used features: 11
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.001563 -> initscore=-6.459904
[LightGBM] [Info] Start training from score -6.459904
          TWF       HD



In [14]:
print(pd.DataFrame(results))

          TWF       HDF       PWF       OSF       RNF
lr   0.084768  0.461176  0.462025  0.692641  0.005761
rfr  0.000000  0.939394  0.972973  0.828571  0.000000
xgb  0.086957  0.965174  0.869565  0.911243  0.000000
lgb  0.019538  1.000000  0.802817  0.895706  0.002432


In [15]:
cv = StratifiedKFold(n_splits = 5,shuffle = True,random_state = 42)
results = {}
for trg in target:
    y = Y_test[trg]
    results[trg] = {}
    if trg in ['RNF','TWF']:
        neg,pos = y.value_counts()[0],y.value_counts()[1]
        ratio = neg/pos
        print(f"Applying imbalancing handling for labels :{ratio:.2f}")
        #update model parametres:
        models['lr'].set_params(lr__class_weight = 'balanced')
        models['rfr'].set_params(rfr__class_weight = 'balanced')
        models['xgb'].set_params(xgb__scale_pos_weight = ratio)
        models['lgb'].set_params(lgb__scale_pos_weight = ratio)
        
    for name,model in models.items():
        oof_pred = cross_val_predict(model,X_test,y,cv = cv.split(X_test,y),method = 'predict')
        f1score = f1_score(oof_pred,y)
        results[trg][name] = f1score
print(pd.DataFrame(results)) 

Applying imbalancing handling for labels :180.82
[LightGBM] [Info] Number of positive: 9, number of negative: 1591
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000281 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1476
[LightGBM] [Info] Number of data points in the train set: 1600, number of used features: 11
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.005625 -> initscore=-5.174893
[LightGBM] [Info] Start training from score -5.174893
[LightGBM] [Info] Number of positive: 9, number of negative: 1591
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000115 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1479
[LightGBM] [Info] Number of data points in the train set: 1600, number of used features: 11
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.005625 -> initscore=-5.174893
[LightGBM] [Info] Start train



[LightGBM] [Info] Number of positive: 8, number of negative: 1592
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000116 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1479
[LightGBM] [Info] Number of data points in the train set: 1600, number of used features: 11
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.005000 -> initscore=-5.293305
[LightGBM] [Info] Start training from score -5.293305




[LightGBM] [Info] Number of positive: 14, number of negative: 1586
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000120 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1477
[LightGBM] [Info] Number of data points in the train set: 1600, number of used features: 11
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.008750 -> initscore=-4.729913
[LightGBM] [Info] Start training from score -4.729913
[LightGBM] [Info] Number of positive: 14, number of negative: 1586
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000115 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1478
[LightGBM] [Info] Number of data points in the train set: 1600, number of used features: 11
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.008750 -> initscore=-4.729913
[LightGBM] [Info] Start training from score -4.729913
[LightGBM] [Info] Numb



[LightGBM] [Info] Number of positive: 13, number of negative: 1587
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.015883 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1475
[LightGBM] [Info] Number of data points in the train set: 1600, number of used features: 11
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.008125 -> initscore=-4.804651
[LightGBM] [Info] Start training from score -4.804651




[LightGBM] [Info] Number of positive: 16, number of negative: 1584
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.015822 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1474
[LightGBM] [Info] Number of data points in the train set: 1600, number of used features: 11
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.010000 -> initscore=-4.595120
[LightGBM] [Info] Start training from score -4.595120
[LightGBM] [Info] Number of positive: 16, number of negative: 1584
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000116 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1480
[LightGBM] [Info] Number of data points in the train set: 1600, number of used features: 11
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.010000 -> initscore=-4.595120
[LightGBM] [Info] Start training from score -4.595120
[LightGBM] [Info] Numb



[LightGBM] [Info] Number of positive: 16, number of negative: 1584
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000117 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1482
[LightGBM] [Info] Number of data points in the train set: 1600, number of used features: 11
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.010000 -> initscore=-4.595120
[LightGBM] [Info] Start training from score -4.595120
[LightGBM] [Info] Number of positive: 16, number of negative: 1584
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000113 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1474
[LightGBM] [Info] Number of data points in the train set: 1600, number of used features: 11
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.010000 -> initscore=-4.595120
[LightGBM] [Info] Start training from score -4.595120




[LightGBM] [Info] Number of positive: 15, number of negative: 1585
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000116 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1477
[LightGBM] [Info] Number of data points in the train set: 1600, number of used features: 11
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.009375 -> initscore=-4.660289
[LightGBM] [Info] Start training from score -4.660289
[LightGBM] [Info] Number of positive: 15, number of negative: 1585
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000116 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1476
[LightGBM] [Info] Number of data points in the train set: 1600, number of used features: 11
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.009375 -> initscore=-4.660289
[LightGBM] [Info] Start training from score -4.660289
[LightGBM] [Info] Numb



[LightGBM] [Info] Number of positive: 14, number of negative: 1586
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000115 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1479
[LightGBM] [Info] Number of data points in the train set: 1600, number of used features: 11
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.008750 -> initscore=-4.729913
[LightGBM] [Info] Start training from score -4.729913
[LightGBM] [Info] Number of positive: 14, number of negative: 1586
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000117 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1475
[LightGBM] [Info] Number of data points in the train set: 1600, number of used features: 11
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.008750 -> initscore=-4.729913
[LightGBM] [Info] Start training from score -4.729913




Applying imbalancing handling for labels :332.33
[LightGBM] [Info] Number of positive: 5, number of negative: 1595
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000228 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1477
[LightGBM] [Info] Number of data points in the train set: 1600, number of used features: 11
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.003125 -> initscore=-5.765191
[LightGBM] [Info] Start training from score -5.765191
[LightGBM] [Info] Number of positive: 5, number of negative: 1595
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000124 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1479
[LightGBM] [Info] Number of data points in the train set: 1600, number of used features: 11
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.003125 -> initscore=-5.765191
[LightGBM] [Info] Start train



In [16]:
import joblib

# Save full pipeline
joblib.dump(model_lr, 'final_model_pipeline_lr.pkl')
joblib.dump(model_rfr, 'final_model_pipeline_rfr.pkl')
joblib.dump(model_XGB, 'final_model_pipeline_xgb.pkl')
joblib.dump(model_lgb, 'final_model_pipeline_lgb.pkl')



['final_model_pipeline_lgb.pkl']