In [None]:
pip install lightgbm xgboost catboost flaml

In [3]:
import pandas as pd
import numpy as np

from flaml import AutoML
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from sklearn.ensemble import RandomForestClassifier, HistGradientBoostingClassifier
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedKFold, train_test_split

In [4]:
## Read the four input files; the 'id' / 'UDI' columns are dropped right after reading
train = pd.read_csv('Data/train.csv').drop(columns = 'id')
original = pd.read_csv('Data/original.csv').drop(columns = 'UDI')
test = pd.read_csv('Data/test.csv').drop(columns = 'id')

## Submission template, read unchanged
sub = pd.read_csv('Data/sample_submission.csv')

In [5]:
## Columns are renamed POSITIONALLY, so the files must already have their columns in this order.
## NOTE(review): presumably this strips unit suffixes from the raw headers - confirm against the CSVs.
leading_cols = ['Product ID', 'Type', 'Air temperature', 'Process temperature', 'Rotational speed', 'Torque',
                'Tool wear']
flag_cols = ['TWF', 'HDF', 'PWF', 'OSF', 'RNF']

## train / original carry the target between the two groups; test has no target column
train.columns = leading_cols + ['Machine failure'] + flag_cols
original.columns = leading_cols + ['Machine failure'] + flag_cols
test.columns = leading_cols + flag_cols

In [6]:
## Preview the first rows of the training frame after the columns were renamed
train.head()

Unnamed: 0,Product ID,Type,Air temperature,Process temperature,Rotational speed,Torque,Tool wear,Machine failure,TWF,HDF,PWF,OSF,RNF
0,L50096,L,300.6,309.6,1596,36.1,140,0,0,0,0,0,0
1,M20343,M,302.6,312.1,1759,29.1,200,0,0,0,0,0,0
2,L49454,L,299.3,308.5,1805,26.5,25,0,0,0,0,0,0
3,L53355,L,301.0,310.9,1524,44.3,197,0,0,0,0,0,0
4,M24050,M,298.0,309.0,1641,35.4,34,0,0,0,0,0,0


### Feature Engineering:

In [7]:
## NOTE: run this cell once per kernel session. It reassigns `train` to train + original, so
## re-running it would append `original` a second time and re-encode the already encoded 'Type'.

## Creating 'generated' variable: 1 for the competition train/test rows, 0 for rows from `original`
train['generated'] = 1
original['generated'] = 0
test['generated'] = 1

## Concatenating training data with original data
train = pd.concat([train, original], axis = 0).reset_index(drop = True)

## Label encoding the 'Type' variable (encoder is fitted on train + original and reused for test)
LE = LabelEncoder()
train['Type'] = LE.fit_transform(train['Type'])
test['Type'] = LE.transform(test['Type'])

## Category encoding 'Product ID' variable with ONE dtype shared by train and test.
## Calling .astype('category') on each frame separately infers a different category list per frame,
## so the same Product ID would get a different integer code in train and in test; any model that
## consumes the category codes would then see inconsistent values at prediction time.
product_ids = pd.concat([train['Product ID'], test['Product ID']]).dropna().unique()
product_id_dtype = pd.CategoricalDtype(categories = sorted(product_ids))
train['Product ID'] = train['Product ID'].astype(product_id_dtype)
test['Product ID'] = test['Product ID'].astype(product_id_dtype)

## Some feature engineering (the same two features are built on train and test)
train['Temperature ratio'] = train['Process temperature'] / train['Air temperature']
train['Torque * Rotational speed'] = train['Torque'] * train['Rotational speed']

test['Temperature ratio'] = test['Process temperature'] / test['Air temperature']
test['Torque * Rotational speed'] = test['Torque'] * test['Rotational speed']

In [8]:
## Build the same derived columns, in the same order, on both frames so train and test stay aligned.
## 'temp_ratio' and 'power' repeat the formulas of 'Temperature ratio' and
## 'Torque * Rotational speed' created in the previous cell.
for frame in (train, test):
    air_temp, process_temp = frame['Air temperature'], frame['Process temperature']
    speed, torque = frame['Rotational speed'], frame['Torque']

    frame['temp_ratio'] = process_temp / air_temp
    ## Number of individual failure flags raised on the row
    frame['failure_sum'] = (frame['TWF'] + frame['HDF'] + frame['PWF'] + frame['OSF'] + frame['RNF'])
    frame['power'] = speed * torque
    frame['temp_diff'] = process_temp - air_temp
    frame['speed_to_torque_ratio'] = speed / torque
    frame['temp_sum'] = air_temp + process_temp
    frame['wear_rate'] = frame['Tool wear'] / speed

In [9]:
## Preview the training frame after the feature-engineering cells above
train.head()

Unnamed: 0,Product ID,Type,Air temperature,Process temperature,Rotational speed,Torque,Tool wear,Machine failure,TWF,HDF,...,generated,Temperature ratio,Torque * Rotational speed,temp_ratio,failure_sum,power,temp_diff,speed_to_torque_ratio,temp_sum,wear_rate
0,L50096,1,300.6,309.6,1596,36.1,140,0,0,0,...,1,1.02994,57615.6,1.02994,0,57615.6,9.0,44.210526,610.2,0.087719
1,M20343,2,302.6,312.1,1759,29.1,200,0,0,0,...,1,1.031395,51186.9,1.031395,0,51186.9,9.5,60.446735,614.7,0.113701
2,L49454,1,299.3,308.5,1805,26.5,25,0,0,0,...,1,1.030738,47832.5,1.030738,0,47832.5,9.2,68.113208,607.8,0.01385
3,L53355,1,301.0,310.9,1524,44.3,197,0,0,0,...,1,1.03289,67513.2,1.03289,0,67513.2,9.9,34.401806,611.9,0.129265
4,M24050,2,298.0,309.0,1641,35.4,34,0,0,0,...,1,1.036913,58091.4,1.036913,0,58091.4,11.0,46.355932,607.0,0.020719


In [10]:
## Preview the test frame after the feature-engineering cells above
test.head()

Unnamed: 0,Product ID,Type,Air temperature,Process temperature,Rotational speed,Torque,Tool wear,TWF,HDF,PWF,...,generated,Temperature ratio,Torque * Rotational speed,temp_ratio,failure_sum,power,temp_diff,speed_to_torque_ratio,temp_sum,wear_rate
0,L50896,1,302.3,311.5,1499,38.0,60,0,0,0,...,1,1.030433,56962.0,1.030433,0,56962.0,9.2,39.447368,613.8,0.040027
1,L53866,1,301.7,311.0,1713,28.8,17,0,0,0,...,1,1.030825,49334.4,1.030825,0,49334.4,9.3,59.479167,612.7,0.009924
2,L50498,1,301.3,310.4,1525,37.7,96,0,0,0,...,1,1.030202,57492.5,1.030202,0,57492.5,9.1,40.450928,611.7,0.062951
3,M21232,2,300.1,309.6,1479,47.6,5,0,0,0,...,1,1.031656,70400.4,1.031656,0,70400.4,9.5,31.071429,609.7,0.003381
4,M19751,2,303.4,312.3,1515,41.3,114,0,0,0,...,1,1.029334,62569.5,1.029334,0,62569.5,8.9,36.682809,615.7,0.075248


### Modelling:

In [11]:
## Raise the level of the logger exposed by flaml to WARNING so lower-level messages are not shown
from flaml import logger, logging
logger.setLevel(logging.WARNING)

In [12]:
## Split the modelling frame into predictors (X) and the 'Machine failure' target (Y)
Y = train['Machine failure']
X = train.drop(columns = ['Machine failure'])

## Result containers: each fold appends one CV score and one vector of test-set predictions
aml_lgb_cv_scores, aml_lgb_preds = [], []
aml_cat_cv_scores, aml_cat_preds = [], []
aml_xgb_cv_scores, aml_xgb_preds = [], []
ens_cv_scores, ens_preds = [], []

## One entry per FLAML search run inside a fold:
## (estimator name, message printed with the score, CV score list, test prediction list).
## The searches run in this order: LGBM, CatBoost, XGBoost.
flaml_runs = [
    ('lgbm', '==> FLAML (LGBM) oof ROC-AUC is ==>', aml_lgb_cv_scores, aml_lgb_preds),
    ('catboost', '==> FLAML (CatBoost) oof ROC-AUC is ==>', aml_cat_cv_scores, aml_cat_preds),
    ('xgboost', '==> FLAML (XGBoost) oof ROC-AUC is ==>', aml_xgb_cv_scores, aml_xgb_preds),
]

## Stratified 10-fold split with a fixed seed
skf = StratifiedKFold(n_splits = 10, random_state = 365, shuffle = True)

for i, (train_idx, valid_idx) in enumerate(skf.split(X, Y)):

    X_train, X_valid = X.iloc[train_idx], X.iloc[valid_idx]
    Y_train, Y_valid = Y.iloc[train_idx], Y.iloc[valid_idx]

    print('---------------------------------------------------------------')

    ## Positive-class probabilities of this fold, keyed by estimator name
    valid_pred_by_model, test_pred_by_model = {}, {}

    for estimator_name, score_message, cv_scores, test_preds in flaml_runs:

        ## Fresh AutoML search restricted to a single estimator family
        automl = AutoML()

        automl_settings = {'time_budget': 180,
                           'metric': 'roc_auc',
                           'task': 'classification',
                           'estimator_list': [estimator_name],
                           "log_file_name": '',
                          }

        automl.fit(X_train = X_train, y_train = Y_train, **automl_settings, verbose = False)

        ## Column 1 of predict_proba is kept as the prediction for the held-out fold and for test
        valid_pred_by_model[estimator_name] = automl.predict_proba(X_valid)[:, 1]
        test_pred_by_model[estimator_name] = automl.predict_proba(test)[:, 1]

        fold_score = roc_auc_score(Y_valid, valid_pred_by_model[estimator_name])
        cv_scores.append(fold_score)
        test_preds.append(test_pred_by_model[estimator_name])

        print('Fold', i+1, score_message, fold_score)

    ## Average ensemble: unweighted mean of the three models' probabilities
    ens_pred_1 = (valid_pred_by_model['lgbm'] + valid_pred_by_model['catboost']
                  + valid_pred_by_model['xgboost']) / 3
    ens_pred_2 = (test_pred_by_model['lgbm'] + test_pred_by_model['catboost']
                  + test_pred_by_model['xgboost']) / 3

    ens_score_fold = roc_auc_score(Y_valid, ens_pred_1)
    ens_cv_scores.append(ens_score_fold)
    ens_preds.append(ens_pred_2)

    print('Fold', i+1, '==> Average Ensemble oof ROC-AUC score is ==>', ens_score_fold)

---------------------------------------------------------------
Fold 1 ==> FLAML (LGBM) oof ROC-AUC is ==> 0.9743831863662337
Fold 1 ==> FLAML (CatBoost) oof ROC-AUC is ==> 0.9763144404979328




Fold 1 ==> FLAML (XGBoost) oof ROC-AUC is ==> 0.9584907113805197
Fold 1 ==> Average Ensemble oof ROC-AUC score is ==> 0.9772678125245102
---------------------------------------------------------------
Fold 2 ==> FLAML (LGBM) oof ROC-AUC is ==> 0.9750393561832624
Fold 2 ==> FLAML (CatBoost) oof ROC-AUC is ==> 0.977161788927607




Fold 2 ==> FLAML (XGBoost) oof ROC-AUC is ==> 0.9681055249918766
Fold 2 ==> Average Ensemble oof ROC-AUC score is ==> 0.9783933153312643
---------------------------------------------------------------
Fold 3 ==> FLAML (LGBM) oof ROC-AUC is ==> 0.9801325630436153
Fold 3 ==> FLAML (CatBoost) oof ROC-AUC is ==> 0.9750996482804917




Fold 3 ==> FLAML (XGBoost) oof ROC-AUC is ==> 0.9712433170224319
Fold 3 ==> Average Ensemble oof ROC-AUC score is ==> 0.9779534980271232
---------------------------------------------------------------
Fold 4 ==> FLAML (LGBM) oof ROC-AUC is ==> 0.9740105342866533
Fold 4 ==> FLAML (CatBoost) oof ROC-AUC is ==> 0.9742634564937531




Fold 4 ==> FLAML (XGBoost) oof ROC-AUC is ==> 0.9694611431693148
Fold 4 ==> Average Ensemble oof ROC-AUC score is ==> 0.9766526157429496
---------------------------------------------------------------
Fold 5 ==> FLAML (LGBM) oof ROC-AUC is ==> 0.9755795448014094
Fold 5 ==> FLAML (CatBoost) oof ROC-AUC is ==> 0.9733735553580167




Fold 5 ==> FLAML (XGBoost) oof ROC-AUC is ==> 0.9551715267349793
Fold 5 ==> Average Ensemble oof ROC-AUC score is ==> 0.9725158798316791
---------------------------------------------------------------
Fold 6 ==> FLAML (LGBM) oof ROC-AUC is ==> 0.9475269425625247
Fold 6 ==> FLAML (CatBoost) oof ROC-AUC is ==> 0.9611373101130378




Fold 6 ==> FLAML (XGBoost) oof ROC-AUC is ==> 0.9364893225814191
Fold 6 ==> Average Ensemble oof ROC-AUC score is ==> 0.9550088641351567
---------------------------------------------------------------
Fold 7 ==> FLAML (LGBM) oof ROC-AUC is ==> 0.9915991602926923
Fold 7 ==> FLAML (CatBoost) oof ROC-AUC is ==> 0.9917881893002048




Fold 7 ==> FLAML (XGBoost) oof ROC-AUC is ==> 0.9861234572861405
Fold 7 ==> Average Ensemble oof ROC-AUC score is ==> 0.9941480525408567
---------------------------------------------------------------
Fold 8 ==> FLAML (LGBM) oof ROC-AUC is ==> 0.9887937187125604
Fold 8 ==> FLAML (CatBoost) oof ROC-AUC is ==> 0.9879658972139775




Fold 8 ==> FLAML (XGBoost) oof ROC-AUC is ==> 0.9726924929117609
Fold 8 ==> Average Ensemble oof ROC-AUC score is ==> 0.9896737150073128
---------------------------------------------------------------
Fold 9 ==> FLAML (LGBM) oof ROC-AUC is ==> 0.9860694689275373
Fold 9 ==> FLAML (CatBoost) oof ROC-AUC is ==> 0.9823410635734545




Fold 9 ==> FLAML (XGBoost) oof ROC-AUC is ==> 0.9757673182656985
Fold 9 ==> Average Ensemble oof ROC-AUC score is ==> 0.9863847497813959
---------------------------------------------------------------
Fold 10 ==> FLAML (LGBM) oof ROC-AUC is ==> 0.987449127548665
Fold 10 ==> FLAML (CatBoost) oof ROC-AUC is ==> 0.9855842712241213




Fold 10 ==> FLAML (XGBoost) oof ROC-AUC is ==> 0.9769516420372288
Fold 10 ==> Average Ensemble oof ROC-AUC score is ==> 0.9862389458869512


In [11]:
## Mean out-of-fold ROC-AUC over the folds for the FLAML LGBM / CatBoost runs and the ensemble.
## NOTE(review): same summary as the later cell that also reports XGB; the execution count and
## stored output here appear to come from an earlier run - consider removing this cell.
flaml_lgb, flaml_cat, ens_cv_score = (np.mean(fold_scores) for fold_scores in
                                      (aml_lgb_cv_scores, aml_cat_cv_scores, ens_cv_scores))

for label, mean_score in (('LGBM: ', flaml_lgb), ('CAT: ', flaml_cat), ('ENSEMBLE: ', ens_cv_score)):
    print(label, mean_score)

LGBM:  0.9746216726897702
CAT:  0.9721700169775669
ENSEMBLE:  0.9767901656743851


In [41]:
## Fold-averaged ROC-AUC for LGBM, CatBoost and the average ensemble.
## NOTE(review): duplicate of the neighbouring summary cells; its stored output differs from
## theirs, so it presumably belongs to a different run - consider removing this cell.
summary = {'LGBM: ': np.mean(aml_lgb_cv_scores),
           'CAT: ': np.mean(aml_cat_cv_scores),
           'ENSEMBLE: ': np.mean(ens_cv_scores)}

flaml_lgb = summary['LGBM: ']
flaml_cat = summary['CAT: ']
ens_cv_score = summary['ENSEMBLE: ']

for label, mean_score in summary.items():
    print(label, mean_score)

LGBM:  0.9775494922632536
CAT:  0.9794116217013273
ENSEMBLE:  0.9804833594426844


In [13]:
## Average the per-fold ROC-AUC scores collected by the FLAML cross-validation loop
flaml_lgb, flaml_cat, flaml_xgb, ens_cv_score = [
    np.mean(fold_scores)
    for fold_scores in (aml_lgb_cv_scores, aml_cat_cv_scores, aml_xgb_cv_scores, ens_cv_scores)
]

## Report one line per model, then the ensemble
report = [('LGBM: ', flaml_lgb), ('CAT: ', flaml_cat), ('XGB: ', flaml_xgb), ('ENSEMBLE: ', ens_cv_score)]
for label, mean_score in report:
    print(label, mean_score)

LGBM:  0.9780583602725154
CAT:  0.9785029620982597
XGB:  0.9670496456381368
ENSEMBLE:  0.97942374488092


In [14]:
## Average each model's test predictions over the folds (one row per fold, one column per test row)
lgb_preds_test = pd.DataFrame(aml_lgb_preds).apply(np.mean, axis = 0)
cat_preds_test = pd.DataFrame(aml_cat_preds).apply(np.mean, axis = 0)
ens_preds_test = pd.DataFrame(ens_preds).apply(np.mean, axis = 0)

## Write one submission file per model, reusing the template frame `sub`.
## The XGBoost predictions are not exported by this cell.
submission_files = [(lgb_preds_test, 'Submissions/FLAML_LGBM_submission3.csv'),
                    (cat_preds_test, 'Submissions/FLAML_CatBoost_submission3.csv'),
                    (ens_preds_test, 'Submissions/FLAML_Ensemble_submission3.csv')]

for fold_mean_preds, csv_path in submission_files:
    sub['Machine failure'] = fold_mean_preds
    sub.to_csv(csv_path, index = False)

In [54]:
## Defining the input and target variables.
## 'id' is already removed when the CSV files are read, so only 'Product ID' (and the target) are
## dropped here; asking drop() for the missing 'id' column raises a KeyError on a fresh run.
X = train.drop(columns = ['Product ID', 'Machine failure'])
Y = train['Machine failure']

X_test = test.drop(columns = ['Product ID'])

## Defining lists to store results (one CV score and one test-prediction vector per fold).
## NOTE: ens_cv_scores / ens_preds are re-created here, replacing the lists of the FLAML run above.
lgbm_cv_scores, lgbm_preds = list(), list()
xgb_cv_scores, xgb_preds = list(), list()
cat_cv_scores, cat_preds = list(), list()
ens_cv_scores, ens_preds = list(), list()

## Performing KFold cross-validation
skf = StratifiedKFold(n_splits = 10, shuffle = True, random_state = 42)
    
for i, (train_idx, valid_idx) in enumerate(skf.split(X, Y)):
        
    X_train, X_valid = X.iloc[train_idx], X.iloc[valid_idx]
    Y_train, Y_valid = Y.iloc[train_idx], Y.iloc[valid_idx]
    
    print('---------------------------------------------')
    
    ## LightGBM
    ## NOTE(review): LightGBM only performs row subsampling when subsample_freq > 0 (default 0),
    ## so subsample = 0.7 is presumably inactive here - confirm the intent.
    lgbm_md = LGBMClassifier(n_estimators = 1000, max_depth = 10, learning_rate = 0.01, 
                             num_leaves = 70, reg_alpha = 3, reg_lambda = 3, subsample = 0.7, 
                             colsample_bytree = 0.7,).fit(X_train, Y_train)
    
    ## Column 1 of predict_proba is used as the failure probability
    lgbm_pred_valid = lgbm_md.predict_proba(X_valid)[:, 1] 
    lgbm_pred_test = lgbm_md.predict_proba(X_test)[:, 1] 
    
    lgbm_score_fold = roc_auc_score(Y_valid, lgbm_pred_valid)
    
    lgbm_cv_scores.append(lgbm_score_fold)
    lgbm_preds.append(lgbm_pred_test)
    
    print('Fold', i+1, '==> LightGBM oof ROC-AUC score is ==>', lgbm_score_fold)
    
    ## XGBoost
    xgb_md = XGBClassifier(colsample_bytree = 0.7, gamma = 0.8, learning_rate = 0.01, max_depth = 8, 
                           min_child_weight = 20, n_estimators = 1000, subsample = 0.7, 
                           verbosity = 0).fit(X_train, Y_train)
        
    xgb_pred_valid = xgb_md.predict_proba(X_valid)[:, 1]
    xgb_pred_test = xgb_md.predict_proba(X_test)[:, 1]
    
    xgb_score_fold = roc_auc_score(Y_valid, xgb_pred_valid)
    
    xgb_cv_scores.append(xgb_score_fold)
    xgb_preds.append(xgb_pred_test)
    
    print('Fold', i+1, '==> XGBoost oof ROC-AUC score is ==>', xgb_score_fold)
    
    ## CatBoost (library defaults, training log silenced)
    cat_md = CatBoostClassifier(verbose = False).fit(X_train, Y_train)
        
    cat_pred_valid = cat_md.predict_proba(X_valid)[:, 1]
    cat_pred_test = cat_md.predict_proba(X_test)[:, 1]
    
    cat_score_fold = roc_auc_score(Y_valid, cat_pred_valid)
    
    cat_cv_scores.append(cat_score_fold)
    cat_preds.append(cat_pred_test)
    
    print('Fold', i+1, '==> CatBoost oof ROC-AUC score is ==>', cat_score_fold)
    
    ## Ensemble: unweighted mean of the three models' probabilities
    ens_pred_valid = (lgbm_pred_valid + xgb_pred_valid + cat_pred_valid) / 3
    ens_pred_test = (lgbm_pred_test + xgb_pred_test + cat_pred_test) / 3
    
    ens_score_fold = roc_auc_score(Y_valid, ens_pred_valid)
    ens_cv_scores.append(ens_score_fold)
    ens_preds.append(ens_pred_test)
    
    print('Fold', i+1, '==> Ensemble oof ROC-AUC score is ==>', ens_score_fold)

---------------------------------------------
Fold 1 ==> LightGBM oof ROC-AUC score is ==> 0.96333051013186
Fold 1 ==> XGBoost oof ROC-AUC score is ==> 0.9581906363895127
Fold 1 ==> CatBoost oof ROC-AUC score is ==> 0.9560036411643653
Fold 0 ==> Ensemble oof ROC-AUC score is ==> 0.9621021739115306
---------------------------------------------
Fold 2 ==> LightGBM oof ROC-AUC score is ==> 0.9670710628953038
Fold 2 ==> XGBoost oof ROC-AUC score is ==> 0.9702033584803708
Fold 2 ==> CatBoost oof ROC-AUC score is ==> 0.956769783375245
Fold 1 ==> Ensemble oof ROC-AUC score is ==> 0.9684991790843154
---------------------------------------------
Fold 3 ==> LightGBM oof ROC-AUC score is ==> 0.966418140504742
Fold 3 ==> XGBoost oof ROC-AUC score is ==> 0.9677780202423261
Fold 3 ==> CatBoost oof ROC-AUC score is ==> 0.9586977229115143
Fold 2 ==> Ensemble oof ROC-AUC score is ==> 0.9670575541561887
---------------------------------------------
Fold 4 ==> LightGBM oof ROC-AUC score is ==> 0.96239651

In [55]:
## Mean of the per-fold ROC-AUC scores for each model and for the averaged ensemble
cv_report = [('LightGBM avg. CV score ==>', lgbm_cv_scores),
             ('XGBoost avg. CV score ==>', xgb_cv_scores),
             ('CatBoost avg. CV score ==>', cat_cv_scores),
             ('Ensemble avg. CV score ==>', ens_cv_scores)]

for message, fold_scores in cv_report:
    print(message, np.mean(fold_scores))

LightGBM avg. CV score ==> 0.9636069602674784
XGBoost avg. CV score ==> 0.9639418326980407
CatBoost avg. CV score ==> 0.9589694604850557
Ensemble avg. CV score ==> 0.9643090620945198


In [56]:
## Finalizing submissions data files.
## Step 1: average every model's test predictions across the folds (axis 0 = fold).
lgbm_preds_test = np.mean(lgbm_preds, axis = 0).tolist()
xgb_preds_test = np.mean(xgb_preds, axis = 0).tolist()
cat_preds_test = np.mean(cat_preds, axis = 0).tolist()
ens_preds_test = np.mean(ens_preds, axis = 0).tolist()

## Step 2: write one CSV per model, overwriting the target column of the template frame each time
outputs = [(lgbm_preds_test, 'Submissions/lgbm_2.csv'),
           (xgb_preds_test, 'Submissions/xgb_2.csv'),
           (cat_preds_test, 'Submissions/cat_2.csv'),
           (ens_preds_test, 'Submissions/ens_2.csv')]

for fold_mean_preds, csv_path in outputs:
    sub['Machine failure'] = fold_mean_preds
    sub.to_csv(csv_path, index = False)