# Import Libraries

In [1]:
import os
import pandas as pd
import numpy as np

import scipy.stats

import matplotlib.pyplot as plt
import seaborn as sns

from imblearn.over_sampling import SMOTE

from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, VotingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_score, StratifiedKFold
from sklearn.metrics import roc_auc_score

from category_encoders import CatBoostEncoder

from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier

from pycaret.classification import setup, evaluate_model, compare_models, plot_model
import optuna

import sys
sys.path.append(r'C:\Users\Anes3\OneDrive\Bureau\python_utils')
import classifier_utils

# Import data

In [2]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
original = pd.read_csv('original.xls')
sample_submission = pd.read_csv('sample_submission.csv')

train = train.drop(columns=['id'])
test = test.drop(columns=['id'])
original = original.drop(columns=['UDI'])

# Data Transformation

In [3]:
Enc = CatBoostEncoder(cols = ['Product ID', 'Type'])

encoded_train = Enc.fit_transform(train.drop('Machine failure', axis=1), train['Machine failure'])
encoded_original = Enc.transform(original.drop('Machine failure', axis=1))
encoded_test = Enc.transform(test)

In [4]:
encoded_train = pd.concat([encoded_train, train['Machine failure']], axis=1)
encoded_original = pd.concat([encoded_original, original['Machine failure']], axis=1)

In [5]:
combo_train = pd.concat([encoded_train, encoded_original])

# Oversampling

In [6]:
print('Shape of train data before oversampling:', encoded_train.shape)
sm = SMOTE(random_state=42)
X, y = sm.fit_resample(encoded_train.drop('Machine failure', axis=1), encoded_train['Machine failure'])
print('Shape of train data after oversampling:', X.shape)

Shape of train data before oversampling: (136429, 13)
Shape of train data after oversampling: (268562, 12)


# Model Selection

In [7]:
# train_final = pd.concat([X, pd.DataFrame(data=y, columns=['Machine failure'])], axis=1)

In [8]:
# _ = setup(data=train_final, target='Machine failure')

In [9]:
# best = compare_models(sort='AUC')

# Rename Columns

In [11]:
X = X.rename(columns={'Product ID': 'Product_ID', 
                      'Air temperature [K]': 'Air_temperature_K', 
                      'Process temperature [K]': 'Process_temperature_K',
                      'Rotational speed [rpm]': 'Rotational_speed_rpm', 
                      'Torque [Nm]': 'Torque_Nm', 
                      'Tool wear [min]': 'Tool_wear_min'})

test_final = encoded_test.rename(columns={'Product ID': 'Product_ID', 
                                          'Air temperature [K]': 'Air_temperature_K', 
                                          'Process temperature [K]': 'Process_temperature_K',
                                          'Rotational speed [rpm]': 'Rotational_speed_rpm', 
                                          'Torque [Nm]': 'Torque_Nm', 
                                          'Tool wear [min]': 'Tool_wear_min'})

In [12]:
catboost = lambda trial: classifier_utils.catboost(trial, X=X, y=y, scoring='roc_auc')
xgboost = lambda trial: classifier_utils.xgb(trial, X=X, y=y, scoring='roc_auc')
et = lambda trial: classifier_utils.et(trial, X=X, y=y, scoring='roc_auc')
rf = lambda trial: classifier_utils.rf(trial, X=X, y=y, scoring='roc_auc')
lightgbm = lambda trial: classifier_utils.lightgbm_binary(trial, X=X, y=y, scoring='roc_auc')

In [13]:
# study = optuna.create_study(direction='maximize')
# optuna.logging.set_verbosity(optuna.logging.WARNING)
# study.optimize(catboost, n_trials=10)
# study.best_params

[32m[I 2023-06-28 16:22:59,739][0m A new study created in memory with name: no-name-4132f79b-19c3-44c9-a46d-173df475672c[0m


{'iterations': 958,
 'depth': 8,
 'learning_rate': 0.08680079283148483,
 'l2_leaf_reg': 1.306080940677865}

In [14]:
# study = optuna.create_study(direction='maximize')
# optuna.logging.set_verbosity(optuna.logging.WARNING)
# study.optimize(rf, n_trials=10)
# study.best_params

{'max_depth': 79,
 'n_estimators': 166,
 'min_samples_leaf': 7,
 'min_samples_split': 80}

In [15]:
# study = optuna.create_study(direction='maximize')
# optuna.logging.set_verbosity(optuna.logging.WARNING)
# study.optimize(et, n_trials=10)
# study.best_params

{'max_depth': 54,
 'n_estimators': 125,
 'min_samples_leaf': 50,
 'min_samples_split': 76,
 'max_features': None}

In [16]:
# study = optuna.create_study(direction='maximize')
# optuna.logging.set_verbosity(optuna.logging.WARNING)
# study.optimize(xgboost, n_trials=10)
# study.best_params

{'n_estimators': 820,
 'max_depth': 6,
 'learning_rate': 0.059547775708231415,
 'subsample': 0.9373367199852446,
 'colsample_bytree': 0.47284101372584764,
 'reg_alpha': 1.140156329269644e-08,
 'reg_lambda': 1.0632393778773188e-07,
 'gamma': 5.870378459213762e-08,
 'min_child_weight': 6}

In [17]:
# study = optuna.create_study(direction='maximize')
# optuna.logging.set_verbosity(optuna.logging.WARNING)
# study.optimize(lightgbm, n_trials=10)
# study.best_params

{'n_estimators': 305,
 'max_depth': 11,
 'num_leaves': 41,
 'learning_rate': 0.04960022092726908,
 'min_child_samples': 22,
 'subsample': 0.8196117141025535,
 'colsample_bytree': 0.6291492182080078,
 'reg_alpha': 1.1444487016750517e-05,
 'reg_lambda': 3.935106524522179e-05}

In [21]:
catboost_params = {
    'iterations': 958,
    'depth': 8,
    'learning_rate': 0.08680079283148483,
    'l2_leaf_reg': 1.306080940677865,
    'logging_level': 'Silent'
}

rf_params = {
    'max_depth': 79,
    'n_estimators': 166,
    'min_samples_leaf': 7,
    'min_samples_split': 80,
    'verbose': 0
}

et_params = {
    'max_depth': 54,
    'n_estimators': 125,
    'min_samples_leaf': 50,
    'min_samples_split': 76,
    'max_features': None,
    'verbose': 0
}

xgb_params = {
    'n_estimators': 820,
    'max_depth': 6,
    'learning_rate': 0.059547775708231415,
    'subsample': 0.9373367199852446,
    'colsample_bytree': 0.47284101372584764,
    'reg_alpha': 1.140156329269644e-08,
    'reg_lambda': 1.0632393778773188e-07,
    'gamma': 5.870378459213762e-08,
    'min_child_weight': 6,
    'objective': 'binary:logistic', 
    'eval_metric': 'logloss',
    'verbosity': 0
}

lightgbm_params = {
    'n_estimators': 305,
    'max_depth': 11,
    'num_leaves': 41,
    'learning_rate': 0.04960022092726908,
    'min_child_samples': 22,
    'subsample': 0.8196117141025535,
    'colsample_bytree': 0.6291492182080078,
    'reg_alpha': 1.1444487016750517e-05,
    'reg_lambda': 3.935106524522179e-05,
    'verbose': -1
}

In [22]:
cat_model = CatBoostClassifier(**catboost_params)
rf_model = RandomForestClassifier(**rf_params)
et_model = ExtraTreesClassifier(**et_params)
xgb_model = XGBClassifier(**xgb_params)
lightgbm_model = LGBMClassifier(**lightgbm_params)

# Models Evaluation

In [23]:
models = {
    'cat': cat_model,
    'rf': rf_model,
    'et': et_model,
    'xgb': xgb_model,
    'lightgbm': lightgbm_model
}

In [24]:
results_ensemble_models = {}

cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=0)

for name, model in models.items():
    res=[]
    for i, (train_index, test_index) in enumerate(cv.split(X, y)):
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y[train_index], y[test_index]

        model.fit(X_train, y_train)
        y_pred = model.predict_proba(X_test)[:, 1]
        res.append(roc_auc_score(y_test, y_pred))
    results_ensemble_models[name] = res

In [25]:
for name, result in results_ensemble_models.items():
    print("----------\n" + name)
    print(np.mean(result))
    print(np.std(result))

----------
cat
0.9996860975484216
6.343712455224433e-05
----------
rf
0.9940120573357978
0.0003685570913911628
----------
et
0.9870239386166645
0.0006556740180882242
----------
xgb
0.9996184200223925
8.291416503034048e-05
----------
lightgbm
0.9993383709144087
0.0001063998572408514


In [27]:
final_model = VotingClassifier(estimators=[('cat', cat_model),
                                           ('rf', rf_model),
                                           ('et', et_model),
                                           ('xgb', xgb_model),
                                           ('lightgbm', lightgbm_model)], 
                               voting='soft')

results_ensemble = []

for i, (train_index, test_index) in enumerate(cv.split(X, y)):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y[train_index], y[test_index]
    
    final_model.fit(X_train, y_train)
    y_pred = final_model.predict_proba(X_test)[:, 1]
    results_ensemble.append(roc_auc_score(y_test, y_pred))

print(np.mean(results_ensemble))

0.9995349337808486


# Model Training

In [28]:
final_model.fit(X, y)

VotingClassifier(estimators=[('cat',
                              <catboost.core.CatBoostClassifier object at 0x0000024505358BC8>),
                             ('rf',
                              RandomForestClassifier(max_depth=79,
                                                     min_samples_leaf=7,
                                                     min_samples_split=80,
                                                     n_estimators=166)),
                             ('et',
                              ExtraTreesClassifier(max_depth=54,
                                                   max_features=None,
                                                   min_samples_leaf=50,
                                                   min_samples_split=76,
                                                   n_estimators=125)),
                             ('xgb',
                              XGBClassifier(base_score=0...
                                            subsample=0.937

# Prediction

In [29]:
y_pred_1 = final_model.predict(test_final)

# Submission

In [30]:
sample_submission['Machine failure'] = y_pred_1

In [31]:
sample_submission.to_csv('bagging_submission_late_1.csv', index=False)