In [None]:
pip install lightgbm xgboost catboost

In [66]:
import pandas as pd
import numpy as np

from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedKFold, train_test_split

In [67]:
train = pd.read_csv('Data/train.csv')
test = pd.read_csv('Data/test.csv')
sub = pd.read_csv('Data/sample_submission.csv')

In [68]:
LE = LabelEncoder()

train['Type'] = LE.fit_transform(train['Type'])
test['Type'] = LE.transform(test['Type'])

In [69]:
train.columns = ['id', 'Product ID', 'Type', 'Air temperature', 'Process temperature', 'Rotational speed', 'Torque', 
                 'Tool wear', 'Machine failure', 'TWF', 'HDF', 'PWF', 'OSF', 'RNF']
test.columns = ['id', 'Product ID', 'Type', 'Air temperature', 'Process temperature', 'Rotational speed', 'Torque', 
                 'Tool wear', 'TWF', 'HDF', 'PWF', 'OSF', 'RNF']

In [70]:
train.head()

Unnamed: 0,id,Product ID,Type,Air temperature,Process temperature,Rotational speed,Torque,Tool wear,Machine failure,TWF,HDF,PWF,OSF,RNF
0,0,L50096,1,300.6,309.6,1596,36.1,140,0,0,0,0,0,0
1,1,M20343,2,302.6,312.1,1759,29.1,200,0,0,0,0,0,0
2,2,L49454,1,299.3,308.5,1805,26.5,25,0,0,0,0,0,0
3,3,L53355,1,301.0,310.9,1524,44.3,197,0,0,0,0,0,0
4,4,M24050,2,298.0,309.0,1641,35.4,34,0,0,0,0,0,0


In [71]:
test.head()

Unnamed: 0,id,Product ID,Type,Air temperature,Process temperature,Rotational speed,Torque,Tool wear,TWF,HDF,PWF,OSF,RNF
0,136429,L50896,1,302.3,311.5,1499,38.0,60,0,0,0,0,0
1,136430,L53866,1,301.7,311.0,1713,28.8,17,0,0,0,0,0
2,136431,L50498,1,301.3,310.4,1525,37.7,96,0,0,0,0,0
3,136432,M21232,2,300.1,309.6,1479,47.6,5,0,0,0,0,0
4,136433,M19751,2,303.4,312.3,1515,41.3,114,0,0,0,0,0


In [74]:
## Defining the input and target variables
X = train.drop(columns = ['id', 'Product ID', 'Machine failure'], axis = 1)
Y = train['Machine failure']

X_test = test.drop(columns = ['id', 'Product ID'], axis = 1)

## Defining lists to store results
lgbm_cv_scores, lgbm_preds = list(), list()
xgb_cv_scores, xgb_preds = list(), list()
cat_cv_scores, cat_preds = list(), list()
ens_cv_scores, ens_preds = list(), list()

## Performing KFold cross-validation
skf = StratifiedKFold(n_splits = 10, shuffle = True, random_state = 42)
    
for i, (train_idx, valid_idx) in enumerate(skf.split(X, Y)):
        
    X_train, X_valid = X.iloc[train_idx], X.iloc[valid_idx]
    Y_train, Y_valid = Y.iloc[train_idx], Y.iloc[valid_idx]
    
    print('---------------------------------------------')
    
    ## LightGBM
    lgbm_md = LGBMClassifier().fit(X_train, Y_train)
    
    lgbm_pred_valid = lgbm_md.predict_proba(X_valid)[:, 1] 
    lgbm_pred_test = lgbm_md.predict_proba(X_test)[:, 1] 
    
    lgbm_score_fold = roc_auc_score(Y_valid, lgbm_pred_valid)
    
    lgbm_cv_scores.append(lgbm_score_fold)
    lgbm_preds.append(lgbm_pred_test)
    
    print('Fold', i+1, '==> LightGBM oof log-loss is ==>', lgbm_score_fold)
    
    ## XGBoost
    xgb_md = XGBClassifier(verbosity = 0).fit(X_train, Y_train)
        
    xgb_pred_valid = xgb_md.predict_proba(X_valid)[:, 1]
    xgb_pred_test = xgb_md.predict_proba(X_test)[:, 1]
    
    xgb_score_fold = roc_auc_score(Y_valid, xgb_pred_valid)
    
    xgb_cv_scores.append(xgb_score_fold)
    xgb_preds.append(xgb_pred_test)
    
    print('Fold', i+1, '==> XGBoost oof log-loss is ==>', xgb_score_fold)
    
    ## CatBoost
    cat_md = CatBoostClassifier(verbose = False).fit(X_train, Y_train)
        
    cat_pred_valid = cat_md.predict_proba(X_valid)[:, 1]
    cat_pred_test = cat_md.predict_proba(X_test)[:, 1]
    
    cat_score_fold = roc_auc_score(Y_valid, cat_pred_valid)
    
    cat_cv_scores.append(cat_score_fold)
    cat_preds.append(cat_pred_test)
    
    print('Fold', i+1, '==> CatBoost oof log-loss is ==>', cat_score_fold)
    
    ## Ensemble
    X_train_ens = pd.DataFrame({'LGBM': lgbm_pred_valid.tolist(),  
                                'XGB': xgb_pred_valid.tolist(), 
                                'CAT': cat_pred_valid.tolist()})
    X_test_ens = pd.DataFrame({'LGBM': lgbm_pred_test.tolist(), 
                               'XGB': xgb_pred_test.tolist(), 
                               'CAT': cat_pred_test.tolist()})
    
    ens_md = RandomForestClassifier(max_depth = 3, n_estimators = 100, max_features = None).fit(X_train_ens, Y_valid)
    
    ens_pred_valid = ens_md.predict_proba(X_train_ens)[:, 1]
    ens_pred_test = ens_md.predict_proba(X_test_ens) [:, 1] 
    
    ens_score_fold = roc_auc_score(Y_valid, ens_pred_valid)
    
    ens_cv_scores.append(ens_score_fold)
    ens_preds.append(ens_pred_test)
    
    print('Fold', i+1, '==> Ensemble oof log-loss is ==>', ens_score_fold)

---------------------------------------------
Fold 1 ==> LightGBM oof log-loss is ==> 0.9612733079407587
Fold 1 ==> XGBoost oof log-loss is ==> 0.9649892859852057
Fold 1 ==> CatBoost oof log-loss is ==> 0.9595790390861457
Fold 1 ==> Ensemble oof log-loss is ==> 0.9781490121462618
---------------------------------------------
Fold 2 ==> LightGBM oof log-loss is ==> 0.9638992802266697
Fold 2 ==> XGBoost oof log-loss is ==> 0.9643729520405122
Fold 2 ==> CatBoost oof log-loss is ==> 0.9622771923990827
Fold 2 ==> Ensemble oof log-loss is ==> 0.9702987855989913
---------------------------------------------
Fold 3 ==> LightGBM oof log-loss is ==> 0.9701505358466516
Fold 3 ==> XGBoost oof log-loss is ==> 0.9660842321840513
Fold 3 ==> CatBoost oof log-loss is ==> 0.9582993882965827
Fold 3 ==> Ensemble oof log-loss is ==> 0.9750488392875698
---------------------------------------------
Fold 4 ==> LightGBM oof log-loss is ==> 0.9578918746666112
Fold 4 ==> XGBoost oof log-loss is ==> 0.95813191456

In [75]:
print('LightGBM avg. CV score ==>', np.mean(lgbm_cv_scores))
print('XGBoost avg. CV score ==>', np.mean(xgb_cv_scores))
print('CatBoost avg. CV score ==>', np.mean(cat_cv_scores))
print('Ensemble avg. CV score ==>', np.mean(ens_cv_scores))

LightGBM avg. CV score ==> 0.9626074186439345
XGBoost avg. CV score ==> 0.9616126099957016
CatBoost avg. CV score ==> 0.9601051018698147
Ensemble avg. CV score ==> 0.9693971659396858


In [76]:
lgbm_preds_test = np.mean(lgbm_preds, axis = 0).tolist()
xgb_preds_test = np.mean(xgb_preds, axis = 0).tolist()
cat_preds_test = np.mean(cat_preds, axis = 0).tolist()
ens_preds_test = np.mean(ens_preds, axis = 0).tolist()

In [78]:
## Finalizing submissions data file
sub['Machine failure'] = lgbm_preds_test
sub.to_csv('Submissions/lgbm_1.csv', index = False)

sub['Machine failure'] = xgb_preds_test
sub.to_csv('Submissions/xgb_1.csv', index = False)

sub['Machine failure'] = cat_preds_test
sub.to_csv('Submissions/cat_1.csv', index = False)

sub['Machine failure'] = ens_preds_test
sub.to_csv('Submissions/ens_1.csv', index = False)