In [None]:
pip install lightgbm xgboost catboost flaml

In [14]:
import os

import pandas as pd
import numpy as np

from flaml import AutoML
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from sklearn.ensemble import RandomForestClassifier, HistGradientBoostingClassifier
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedKFold, train_test_split

In [25]:
# Read the three input files: the training rows, the test rows, and the
# submission template that the prediction cells fill in and write back out.
train, test, sub = (
    pd.read_csv('Data/train.csv'),
    pd.read_csv('Data/test.csv'),
    pd.read_csv('Data/sample_submission.csv'),
)

In [26]:
# Overwrite the CSV headers positionally with short names. Both frames get the
# same names; the only difference is the 'Machine failure' target, which is
# present in `train` only.
_leading_cols = ['id', 'Product ID', 'Type', 'Air temperature', 'Process temperature', 'Rotational speed',
                 'Torque', 'Tool wear']
_trailing_cols = ['TWF', 'HDF', 'PWF', 'OSF', 'RNF']

train.columns = _leading_cols + ['Machine failure'] + _trailing_cols
test.columns = _leading_cols + _trailing_cols

In [27]:
# Preview the first rows to check that the renamed columns line up with the data.
train.head()

Unnamed: 0,id,Product ID,Type,Air temperature,Process temperature,Rotational speed,Torque,Tool wear,Machine failure,TWF,HDF,PWF,OSF,RNF
0,0,L50096,L,300.6,309.6,1596,36.1,140,0,0,0,0,0,0
1,1,M20343,M,302.6,312.1,1759,29.1,200,0,0,0,0,0,0
2,2,L49454,L,299.3,308.5,1805,26.5,25,0,0,0,0,0,0
3,3,L53355,L,301.0,310.9,1524,44.3,197,0,0,0,0,0,0
4,4,M24050,M,298.0,309.0,1641,35.4,34,0,0,0,0,0,0


### Feature Engineering:

In [28]:
# Integer-encode the 'Type' column. The encoder learns its classes from the
# training frame only and the test frame is mapped with those same classes, so
# both frames share one code per category.
LE = LabelEncoder().fit(train['Type'])

train['Type'] = LE.transform(train['Type'])
test['Type'] = LE.transform(test['Type'])

In [29]:
# Add the same two derived columns to both frames so their feature sets stay aligned.
for frame in (train, test):
    frame['Temperature ratio'] = frame['Process temperature'] / frame['Air temperature']
    frame['Torque * Rotational speed'] = frame['Torque'] * frame['Rotational speed']

In [51]:
# Second batch of engineered columns, applied identically to both frames.
# NOTE(review): 'Power' is the same product as the 'Torque * Rotational speed'
# column built in the earlier feature cell, so those two columns are identical.
for frame in (train, test):
    # Row-wise total of the five failure indicator columns.
    frame['Failure Sum'] = frame['TWF'] + frame['HDF'] + frame['PWF'] + frame['OSF'] + frame['RNF']
    frame['Power'] = frame['Rotational speed'] * frame['Torque']
    frame['Temp_diff'] = frame['Process temperature'] - frame['Air temperature']
    frame['Speed_to_Torque_ratio'] = frame['Rotational speed'] / frame['Torque']
    frame['Temp_sum'] = frame['Air temperature'] + frame['Process temperature']
    frame['Wear_rate'] = frame['Tool wear'] / frame['Rotational speed']


In [52]:
# NOTE(review): disabled experiment -- per-'Product ID' mean of an
# 'Efficiency_index' column merged back onto each frame. No visible cell
# creates 'Efficiency_index', so this code would fail if re-enabled as-is.
# The test-side mean is computed from `test` itself, not reused from `train`.
# efficiency_by_product = train.groupby('Product ID')['Efficiency_index'].agg([np.mean]).reset_index()
# efficiency_by_product.columns = ['Product ID','mean_efficiency']

# train = pd.merge(train, efficiency_by_product, how = 'left', on = 'Product ID')

# efficiency_by_product = test.groupby('Product ID')['Efficiency_index'].agg([np.mean]).reset_index()
# efficiency_by_product.columns = ['Product ID','mean_efficiency']

# test = pd.merge(test, efficiency_by_product, how = 'left', on = 'Product ID')

In [30]:
# Preview the training frame after feature engineering.
# NOTE(review): the saved output only shows 'Temperature ratio' and
# 'Torque * Rotational speed'; it appears to predate the cell that adds
# 'Failure Sum', 'Power', etc. -- re-run in order to refresh it.
train.head()

Unnamed: 0,id,Product ID,Type,Air temperature,Process temperature,Rotational speed,Torque,Tool wear,Machine failure,TWF,HDF,PWF,OSF,RNF,Temperature ratio,Torque * Rotational speed
0,0,L50096,1,300.6,309.6,1596,36.1,140,0,0,0,0,0,0,1.02994,57615.6
1,1,M20343,2,302.6,312.1,1759,29.1,200,0,0,0,0,0,0,1.031395,51186.9
2,2,L49454,1,299.3,308.5,1805,26.5,25,0,0,0,0,0,0,1.030738,47832.5
3,3,L53355,1,301.0,310.9,1524,44.3,197,0,0,0,0,0,0,1.03289,67513.2
4,4,M24050,2,298.0,309.0,1641,35.4,34,0,0,0,0,0,0,1.036913,58091.4


In [31]:
# Preview the test frame after feature engineering.
# NOTE(review): the saved output only shows 'Temperature ratio' and
# 'Torque * Rotational speed'; it appears to predate the cell that adds
# 'Failure Sum', 'Power', etc. -- re-run in order to refresh it.
test.head()

Unnamed: 0,id,Product ID,Type,Air temperature,Process temperature,Rotational speed,Torque,Tool wear,TWF,HDF,PWF,OSF,RNF,Temperature ratio,Torque * Rotational speed
0,136429,L50896,1,302.3,311.5,1499,38.0,60,0,0,0,0,0,1.030433,56962.0
1,136430,L53866,1,301.7,311.0,1713,28.8,17,0,0,0,0,0,1.030825,49334.4
2,136431,L50498,1,301.3,310.4,1525,37.7,96,0,0,0,0,0,1.030202,57492.5
3,136432,M21232,2,300.1,309.6,1479,47.6,5,0,0,0,0,0,1.031656,70400.4
4,136433,M19751,2,303.4,312.3,1515,41.3,114,0,0,0,0,0,1.029334,62569.5


### Modelling:

In [None]:
def _fit_flaml(estimator, X_fit, y_fit, time_budget = 120):
    """Run a FLAML search restricted to a single estimator and return it fitted.

    Parameters
    ----------
    estimator : str
        FLAML estimator name placed in `estimator_list` (this cell uses
        'lgbm' and 'catboost').
    X_fit, y_fit
        Training features and target of the current fold.
    time_budget : int, default 120
        Value forwarded as FLAML's `time_budget` setting.

    Returns
    -------
    AutoML
        The fitted search object; call `predict_proba` on it to score rows.
    """
    automl = AutoML()

    automl_settings = {'time_budget': time_budget,
                       'metric': 'roc_auc',
                       'task': 'classification',
                       'estimator_list': [estimator],
                       "log_file_name": '',
                      }

    automl.fit(X_train = X_fit, y_train = y_fit, **automl_settings, verbose = False)

    return automl


# Build FLAML-specific design matrices WITHOUT overwriting `train` / `test`.
# Reassigning the shared frames here (dropping 'id') made the later modelling
# cell fail with a KeyError when the notebook was run top to bottom, and made
# this cell itself non-repeatable.
# 'Product ID' is kept as a categorical feature for FLAML.
X = train.drop(columns = ['id', 'Machine failure']).astype({'Product ID': 'category'})
Y = train['Machine failure']

X_test_flaml = test.drop(columns = ['id']).astype({'Product ID': 'category'})

# Per-fold validation scores and per-fold test predictions for each model.
aml_lgb_cv_scores, aml_lgb_preds = list(), list()
aml_cat_cv_scores, aml_cat_preds = list(), list()

ens_cv_scores, ens_preds = list(), list()

# NOTE: 15 folds x 2 searches x a budget of 120 each -- if `time_budget` is in
# seconds this cell spends roughly an hour searching.
skf = StratifiedKFold(n_splits = 15, random_state = 42, shuffle = True)

for i, (train_idx, valid_idx) in enumerate(skf.split(X, Y)):

    X_train, X_valid = X.iloc[train_idx], X.iloc[valid_idx]
    Y_train, Y_valid = Y.iloc[train_idx], Y.iloc[valid_idx]

    print('---------------------------------------------------------------')

    ##################
    ## FLAML (LGBM) ##
    ##################

    automl = _fit_flaml('lgbm', X_train, Y_train)

    # Column 1 of predict_proba is taken as the positive-class probability.
    aml_lgb_pred_1 = automl.predict_proba(X_valid)[:, 1]
    aml_lgb_pred_2 = automl.predict_proba(X_test_flaml)[:, 1]

    aml_lgb_score_fold = roc_auc_score(Y_valid, aml_lgb_pred_1)
    aml_lgb_cv_scores.append(aml_lgb_score_fold)
    aml_lgb_preds.append(aml_lgb_pred_2)

    print('Fold', i+1, '==> FLAML (LGBM) oof ROC-AUC is ==>', aml_lgb_score_fold)

    ######################
    ## FLAML (CatBoost) ##
    ######################

    automl = _fit_flaml('catboost', X_train, Y_train)

    aml_cat_pred_1 = automl.predict_proba(X_valid)[:, 1]
    aml_cat_pred_2 = automl.predict_proba(X_test_flaml)[:, 1]

    aml_cat_score_fold = roc_auc_score(Y_valid, aml_cat_pred_1)
    aml_cat_cv_scores.append(aml_cat_score_fold)
    aml_cat_preds.append(aml_cat_pred_2)

    print('Fold', i+1, '==> FLAML (CatBoost) oof ROC-AUC is ==>', aml_cat_score_fold)

    ######################
    ## Average Ensemble ##
    ######################

    # Unweighted mean of the two models' probabilities.
    ens_pred_1 = (aml_lgb_pred_1 + aml_cat_pred_1 ) / 2
    ens_pred_2 = (aml_lgb_pred_2 + aml_cat_pred_2 ) / 2

    ens_score_fold = roc_auc_score(Y_valid, ens_pred_1)
    ens_cv_scores.append(ens_score_fold)
    ens_preds.append(ens_pred_2)

    print('Fold', i+1, '==> Average Ensemble oof ROC-AUC score is ==>', ens_score_fold)

In [None]:
# Mean out-of-fold ROC-AUC across folds for each FLAML model and for their
# averaged ensemble.
flaml_lgb, flaml_cat, ens_cv_score = (
    np.mean(fold_scores)
    for fold_scores in (aml_lgb_cv_scores, aml_cat_cv_scores, ens_cv_scores)
)

for label, avg_score in (('LGBM: ', flaml_lgb), ('CAT: ', flaml_cat), ('ENSEMBLE: ', ens_cv_score)):
    print(label, avg_score)

In [None]:
# Average each model's per-fold test predictions into one probability per test
# row (same fold-averaging form as the later submission cell).
lgb_preds_test = np.mean(aml_lgb_preds, axis = 0)
cat_preds_test = np.mean(aml_cat_preds, axis = 0)
ens_preds_test = np.mean(ens_preds, axis = 0)

# The submission template is loaded as `sub` at the top of the notebook; the
# previous name `submission` was never defined and raised a NameError.
for out_path, test_preds in (('FLAML_LGBM_submission.csv', lgb_preds_test),
                             ('FLAML_CatBoost_submission.csv', cat_preds_test),
                             ('FLAML_Ensemble_submission.csv', ens_preds_test)):
    sub['Machine failure'] = test_preds
    sub.to_csv(out_path, index = False)

In [54]:
def _fit_predict_score(model, X_fit, y_fit, X_val, y_val, X_new):
    """Fit `model` on one fold and score it.

    Parameters
    ----------
    model
        Unfitted classifier exposing `fit` and `predict_proba`; fitted in place.
    X_fit, y_fit
        Training features and target of the fold.
    X_val, y_val
        Held-out features and target of the fold.
    X_new
        Test features to predict with the fold model.

    Returns
    -------
    tuple
        (held-out probabilities, test probabilities, held-out ROC-AUC), where
        the probabilities are column 1 of `predict_proba`.
    """
    model.fit(X_fit, y_fit)

    val_pred = model.predict_proba(X_val)[:, 1]
    new_pred = model.predict_proba(X_new)[:, 1]

    return val_pred, new_pred, roc_auc_score(y_val, val_pred)


## Defining the input and target variables
# errors = 'ignore' keeps this cell runnable when an earlier cell has already
# removed 'id' from the shared frames; a missing target still fails loudly on
# the `Y = ...` line below.
X = train.drop(columns = ['id', 'Product ID', 'Machine failure'], errors = 'ignore')
Y = train['Machine failure']

X_test = test.drop(columns = ['id', 'Product ID'], errors = 'ignore')

## Defining lists to store results
lgbm_cv_scores, lgbm_preds = list(), list()
xgb_cv_scores, xgb_preds = list(), list()
cat_cv_scores, cat_preds = list(), list()
ens_cv_scores, ens_preds = list(), list()

## Performing KFold cross-validation
skf = StratifiedKFold(n_splits = 10, shuffle = True, random_state = 42)

for i, (train_idx, valid_idx) in enumerate(skf.split(X, Y)):

    X_train, X_valid = X.iloc[train_idx], X.iloc[valid_idx]
    Y_train, Y_valid = Y.iloc[train_idx], Y.iloc[valid_idx]

    print('---------------------------------------------')

    ## LightGBM
    lgbm_md = LGBMClassifier(n_estimators = 1000, max_depth = 10, learning_rate = 0.01,
                             num_leaves = 70, reg_alpha = 3, reg_lambda = 3, subsample = 0.7,
                             colsample_bytree = 0.7,)

    lgbm_pred_valid, lgbm_pred_test, lgbm_score_fold = _fit_predict_score(
        lgbm_md, X_train, Y_train, X_valid, Y_valid, X_test)

    lgbm_cv_scores.append(lgbm_score_fold)
    lgbm_preds.append(lgbm_pred_test)

    print('Fold', i+1, '==> LightGBM oof ROC-AUC score is ==>', lgbm_score_fold)

    ## XGBoost
    xgb_md = XGBClassifier(colsample_bytree = 0.7, gamma = 0.8, learning_rate = 0.01, max_depth = 8,
                           min_child_weight = 20, n_estimators = 1000, subsample = 0.7,
                           verbosity = 0)

    xgb_pred_valid, xgb_pred_test, xgb_score_fold = _fit_predict_score(
        xgb_md, X_train, Y_train, X_valid, Y_valid, X_test)

    xgb_cv_scores.append(xgb_score_fold)
    xgb_preds.append(xgb_pred_test)

    print('Fold', i+1, '==> XGBoost oof ROC-AUC score is ==>', xgb_score_fold)

    ## CatBoost
    cat_md = CatBoostClassifier(verbose = False)

    cat_pred_valid, cat_pred_test, cat_score_fold = _fit_predict_score(
        cat_md, X_train, Y_train, X_valid, Y_valid, X_test)

    cat_cv_scores.append(cat_score_fold)
    cat_preds.append(cat_pred_test)

    print('Fold', i+1, '==> CatBoost oof ROC-AUC score is ==>', cat_score_fold)

    ## Ensemble
    # Unweighted mean of the three models' probabilities.
    ens_pred_valid = (lgbm_pred_valid + xgb_pred_valid + cat_pred_valid) / 3
    ens_pred_test = (lgbm_pred_test + xgb_pred_test + cat_pred_test) / 3

    ens_score_fold = roc_auc_score(Y_valid, ens_pred_valid)
    ens_cv_scores.append(ens_score_fold)
    ens_preds.append(ens_pred_test)

    print('Fold', i+1, '==> Ensemble oof ROC-AUC score is ==>', ens_score_fold)
    
# NOTE(review): disabled stacking experiment -- a RandomForest meta-model over
# the three base models' probabilities. As written it is fitted on the fold's
# validation predictions and then scored on those same rows, so its ROC-AUC
# would be optimistic; the final print also labels that ROC-AUC as "log-loss".
#     X_train_ens = pd.DataFrame({'LGBM': lgbm_pred_valid.tolist(),  
#                                 'XGB': xgb_pred_valid.tolist(), 
#                                 'CAT': cat_pred_valid.tolist()})
#     X_test_ens = pd.DataFrame({'LGBM': lgbm_pred_test.tolist(), 
#                                'XGB': xgb_pred_test.tolist(), 
#                                'CAT': cat_pred_test.tolist()})
    
#     ens_md = RandomForestClassifier(max_depth = 3, n_estimators = 100, max_features = None).fit(X_train_ens, Y_valid)
    
#     ens_pred_valid = ens_md.predict_proba(X_train_ens)[:, 1]
#     ens_pred_test = ens_md.predict_proba(X_test_ens) [:, 1] 
    
#     ens_score_fold = roc_auc_score(Y_valid, ens_pred_valid)
    
#     ens_cv_scores.append(ens_score_fold)
#     ens_preds.append(ens_pred_test)
    
#     print('Fold', i+1, '==> Ensemble oof log-loss is ==>', ens_score_fold)

---------------------------------------------
Fold 1 ==> LightGBM oof ROC-AUC score is ==> 0.96333051013186
Fold 1 ==> XGBoost oof ROC-AUC score is ==> 0.9581906363895127
Fold 1 ==> CatBoost oof ROC-AUC score is ==> 0.9560036411643653
Fold 0 ==> Ensemble oof ROC-AUC score is ==> 0.9621021739115306
---------------------------------------------
Fold 2 ==> LightGBM oof ROC-AUC score is ==> 0.9670710628953038
Fold 2 ==> XGBoost oof ROC-AUC score is ==> 0.9702033584803708
Fold 2 ==> CatBoost oof ROC-AUC score is ==> 0.956769783375245
Fold 1 ==> Ensemble oof ROC-AUC score is ==> 0.9684991790843154
---------------------------------------------
Fold 3 ==> LightGBM oof ROC-AUC score is ==> 0.966418140504742
Fold 3 ==> XGBoost oof ROC-AUC score is ==> 0.9677780202423261
Fold 3 ==> CatBoost oof ROC-AUC score is ==> 0.9586977229115143
Fold 2 ==> Ensemble oof ROC-AUC score is ==> 0.9670575541561887
---------------------------------------------
Fold 4 ==> LightGBM oof ROC-AUC score is ==> 0.96239651

In [55]:
# Report the mean out-of-fold ROC-AUC of each model and of the averaged ensemble.
cv_report = {
    'LightGBM avg. CV score ==>': lgbm_cv_scores,
    'XGBoost avg. CV score ==>': xgb_cv_scores,
    'CatBoost avg. CV score ==>': cat_cv_scores,
    'Ensemble avg. CV score ==>': ens_cv_scores,
}

for label, fold_scores in cv_report.items():
    print(label, np.mean(fold_scores))

LightGBM avg. CV score ==> 0.9636069602674784
XGBoost avg. CV score ==> 0.9639418326980407
CatBoost avg. CV score ==> 0.9589694604850557
Ensemble avg. CV score ==> 0.9643090620945198


In [56]:
## Finalizing submissions data files
# Average each model's per-fold test predictions into one value per test row.
lgbm_preds_test = np.mean(lgbm_preds, axis = 0).tolist()
xgb_preds_test = np.mean(xgb_preds, axis = 0).tolist()
cat_preds_test = np.mean(cat_preds, axis = 0).tolist()
ens_preds_test = np.mean(ens_preds, axis = 0).tolist()

# Writing into a folder that does not exist fails, so make sure the output
# folder is there before the first write (no-op when it already exists).
os.makedirs('Submissions', exist_ok = True)

# Reuse the submission template: overwrite the target column, then write one
# file per model. `sub` ends up holding the ensemble predictions.
submission_files = {
    'Submissions/lgbm_2.csv': lgbm_preds_test,
    'Submissions/xgb_2.csv': xgb_preds_test,
    'Submissions/cat_2.csv': cat_preds_test,
    'Submissions/ens_2.csv': ens_preds_test,
}

for out_path, test_preds in submission_files.items():
    sub['Machine failure'] = test_preds
    sub.to_csv(out_path, index = False)