# Import Libraries

In [1]:
import os
import pandas as pd
import numpy as np

import scipy.stats

import matplotlib.pyplot as plt
import seaborn as sns

from imblearn.over_sampling import SMOTE

from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_curve, roc_auc_score

from category_encoders import CatBoostEncoder

# Import data

In [2]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
original = pd.read_csv('original.xls')
sample_submission = pd.read_csv('sample_submission.csv')

train = train.drop(columns=['id'])
test = test.drop(columns=['id'])
original = original.drop(columns=['UDI'])

# Data Cleaning

In [3]:
train.isnull().sum()

Product ID                 0
Type                       0
Air temperature [K]        0
Process temperature [K]    0
Rotational speed [rpm]     0
Torque [Nm]                0
Tool wear [min]            0
Machine failure            0
TWF                        0
HDF                        0
PWF                        0
OSF                        0
RNF                        0
dtype: int64

# Data Transformation

In [4]:
Enc = CatBoostEncoder(cols = ['Product ID', 'Type'])

encoded_train = Enc.fit_transform(train.drop('Machine failure', axis=1), train['Machine failure'])
encoded_original = Enc.transform(original.drop('Machine failure', axis=1))
encoded_test = Enc.transform(test)

In [5]:
encoded_train = pd.concat([encoded_train, train['Machine failure']], axis=1)
encoded_original = pd.concat([encoded_original, original['Machine failure']], axis=1)

In [6]:
combo_train = pd.concat([encoded_train, encoded_original])

# Oversampling

In [7]:
print('Shape of train data before oversampling:', encoded_train.shape)
sm = SMOTE(random_state=42)
X, y = sm.fit_resample(encoded_train.drop('Machine failure', axis=1), encoded_train['Machine failure'])
print('Shape of train data after oversampling:', X.shape)

Shape of train data before oversampling: (136429, 13)
Shape of train data after oversampling: (268562, 12)


# Base Model

In [9]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2, random_state=0)

In [10]:
model = RandomForestClassifier(random_state=0)

In [11]:
model.fit(X_train, y_train)

RandomForestClassifier(random_state=0)

In [12]:
y_pred = model.predict_proba(X_test)[:, 1]

# Model Evaluation

In [13]:
# Calculate the area under the ROC curve
roc_auc = roc_auc_score(y_test, y_pred)
print("Area under ROC curve:", roc_auc)

Area under ROC curve: 0.9993607849279302


# Prediction

In [14]:
model = RandomForestClassifier(random_state=0)
model.fit(X, y)

RandomForestClassifier(random_state=0)

In [16]:
# test_final = pd.DataFrame(scaler.transform(encoded_test), index=encoded_test.index, columns=encoded_test.columns)
y_pred_1 = model.predict(encoded_test)

# Submission

In [17]:
sample_submission['Machine failure'] = y_pred_1

In [18]:
sample_submission.to_csv('base_submission_late_4.csv', index=False)

In [19]:
# late_1 no scaling and no oversampling                       Private: 0.90572    Public: 0.91618
# late_2 oversampling and no scaling                          Private: 0.918      Public: 0.9216
# late_3 oversampling and scaling                             Private: 0.83053    Public: 0.8364
# late_4 oversampling and no scaling, train on all data       Private: 0.91963    Public: 0.91317