In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score, accuracy_score
import matplotlib.pyplot as plt

In [None]:
!py -m pip install scikit-learn



In [None]:
df = pd.read_csv('/kaggle/input/predictive-maintenance-dataset-ai4i-2020/ai4i2020.csv')

In [None]:
df

In [None]:
df.shape

In [None]:
df.info()

In [None]:
df.describe().T

In [None]:
df["Machine failure"].value_counts()

In [None]:
print(df.columns)

In [None]:
df.isnull().sum()

In [None]:
df["Type"].value_counts()

# EDA

In [None]:
# Replace the categorical 'Type' column with integer codes from a LabelEncoder.
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
le.fit(df['Type'])                      # learn the distinct categories
df['Type'] = le.transform(df['Type'])   # overwrite the column with their codes
print(df['Type'].value_counts())


In [None]:
df = df.drop('Product ID', axis=1)

In [None]:
### Correlation between every pair of columns, shown as a heatmap.
### 'Product ID' is not included because it was dropped from df earlier.

import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Pairwise correlation of all columns currently in df.
correlation_matrix = df.corr()

# Draw the annotated heatmap and title the axes it was drawn on.
plt.figure(figsize=(12, 10))
sns.heatmap(
    correlation_matrix,
    annot=True,
    cmap='coolwarm',
    fmt=".2f",
).set_title('Correlation Matrix of Features')
plt.show()


In [None]:
def draw_histograms(dataframe, features, rows, cols):
    """Draw one 20-bin histogram per feature on a ``rows`` x ``cols`` subplot grid.

    Parameters
    ----------
    dataframe : column-indexable object
        Indexed as ``dataframe[feature]``; each result must provide ``.hist``.
    features : iterable
        Column labels to plot, one subplot each, filled in iteration order.
    rows, cols : int
        Grid dimensions passed to ``add_subplot``.

    The figure is shown with ``plt.show()``; nothing is returned.
    """
    figure = plt.figure(figsize=(20, 20))
    # Subplot positions are 1-based, so start the counter at 1.
    for position, column in enumerate(features, start=1):
        axis = figure.add_subplot(rows, cols, position)
        dataframe[column].hist(bins=20, ax=axis, facecolor='Blue')
        axis.set_title(column, fontsize=20, color='darkgreen')

    figure.tight_layout()
    plt.show()

In [None]:
draw_histograms(df,df.columns,5,3)#calling the function

In [None]:
# Count plot of 'Type', with bars split by the 'Machine failure' value.

plt.figure(figsize=(8, 6))
# countplot returns the Axes it drew on; set the title and labels on it directly.
sns.countplot(data=df, x='Type', hue='Machine failure').set(
    title='Univariate Analysis: Type vs. Machine Failure',
    xlabel='Type',
    ylabel='Count',
)
plt.show()


In [None]:
# Histogram of 'Tool wear [min]' for the rows where 'Machine failure' equals 1.
plt.figure(figsize=(8, 6))
sns.histplot(
    data=df[df['Machine failure'] == 1],
    x='Tool wear [min]',
    hue='Machine failure',
).set(
    title='Tool Wear Distribution for Machine Failures',
    xlabel='Tool Wear [min]',
    ylabel='Frequency',
)
plt.show()


In [None]:
sns.countplot(x="Machine failure", hue="Machine failure", data=df)

In [None]:
# Remove the 'UDI' column by rebinding df to the reduced frame.
df = df.drop(columns="UDI")

In [None]:
df = df.drop(['TWF', 'HDF', 'PWF', 'OSF', 'RNF'], axis=1)

In [None]:
df.head()

In [None]:
X = df.drop(["Machine failure"], axis = 1)
y = df["Machine failure"] # target column

In [3]:
from imblearn.over_sampling import RandomOverSampler
oversample = RandomOverSampler(sampling_strategy='minority', random_state = 1)

In [None]:
X1,y1 = oversample.fit_resample(X, y)# Balancing the dependent variable(y) and independent variable(X)

In [None]:
overdf = pd.DataFrame(y1.value_counts())
overdf

In [None]:
# Split the ORIGINAL data first so the test set holds only real, unseen rows.
# Splitting after oversampling puts copies of the same minority rows in both
# the training and the test set, which leaks information and inflates scores.
# stratify=y keeps the class ratio the same in both parts.
x_train, x_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)
# Balance the classes in the training part only; the test part keeps its
# natural class distribution.
x_train, y_train = oversample.fit_resample(x_train, y_train)

# Model Building

In [None]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import cross_val_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve

In [None]:
class Model:
    """Fit one classifier on the notebook's train/test split and report its metrics.

    The methods read the module-level ``x_train``, ``x_test``, ``y_train`` and
    ``y_test`` variables, so those must be defined before ``predict`` is called.
    """

    # Shared by every instance: each evaluated model appends one entry per key,
    # so the dict can later be turned into a comparison table.
    scores = {'Model':[], 'Accuracy':[], 'CV_Score':[], 'auc':[]}

    def __init__(self, model, model_name):
        """Keep the estimator and the name it is recorded under in ``Model.scores``."""
        self.model = model
        self.model_name = model_name

    def predict(self):
        """Fit the estimator, evaluate it on the test split and show all results.

        Prints accuracy, mean cross-validation score and the classification
        report, then displays the confusion matrix and the ROC curve.
        """
        self.model.fit(x_train, y_train)
        pred = self.model.predict(x_test)
        # Cross-validate on the training split; the test split stays held out
        # for the final evaluation below.
        cv_score = np.mean(cross_val_score(self.model, x_train, y_train, cv=5))
        # ROC/AUC rank samples by a continuous score; hard 0/1 labels would
        # collapse the curve to a single operating point.
        y_score = self._decision_scores(x_test, pred)
        auc = roc_auc_score(y_test, y_score)
        fpr, tpr, _ = roc_curve(y_test, y_score)

        self.performance(pred, cv_score, auc)
        plt.show()  # display the confusion-matrix heatmap drawn by performance()
        self.plot_roc_curve(fpr, tpr)

    def _decision_scores(self, features, fallback):
        """Return a continuous positive-class score for ``features`` if available.

        Uses ``predict_proba`` when the estimator has it, otherwise
        ``decision_function``; ``fallback`` is returned unchanged when the
        estimator offers neither.
        """
        if hasattr(self.model, 'predict_proba'):
            # Assumes a binary problem in which the second column belongs to
            # the positive class.
            return self.model.predict_proba(features)[:, 1]
        if hasattr(self.model, 'decision_function'):
            return self.model.decision_function(features)
        return fallback

    def performance(self, pred, cv_score, auc):
        """Record the metrics in ``Model.scores``, print them and draw the confusion matrix.

        Parameters
        ----------
        pred : predicted labels for ``x_test``
        cv_score : mean cross-validation score to record and print
        auc : ROC AUC value to record
        """
        accuracy = accuracy_score(y_test, pred)

        Model.scores['Model'].append(self.model_name)
        Model.scores['Accuracy'].append(accuracy)
        Model.scores['CV_Score'].append(cv_score)
        Model.scores['auc'].append(auc)

        print(f'Accuracy Score: {accuracy}')
        print(f'Mean Cross Validation Score: {cv_score}\n')
        # True labels go first; reversing the arguments swaps precision and recall.
        print(f'Classification Report\n{classification_report(y_test, pred)}')

        self.confusion_matrix(pred)

    def confusion_matrix(self, pred):
        """Draw the confusion matrix of ``y_test`` vs ``pred`` as a heatmap and return its Axes."""
        # Inside the method body the bare name resolves to the module-level
        # confusion_matrix function, not to this method.
        cm = confusion_matrix(y_test, pred)
        return sns.heatmap(cm, annot=True, fmt='d', cmap="YlGnBu")

    def plot_roc_curve(self, fpr, tpr):
        """Plot the ROC curve given false/true positive rates, with a diagonal reference line."""
        plt.plot(fpr, tpr, color='orange', label='ROC')
        plt.plot([0, 1], [0, 1], color='darkblue', linestyle='--')  # chance-level diagonal
        plt.xlabel('False Positive Rate')
        plt.ylabel('True Positive Rate')
        plt.title('Receiver Operating Characteristic (ROC) Curve')
        plt.legend()
        plt.show()

# LogisticRegression

In [None]:
from sklearn.linear_model import LogisticRegression
model = Model(LogisticRegression(random_state = 20), 'Logistic Regression')
model.predict()

# KNeighborsClassifier

In [None]:
from sklearn.neighbors import KNeighborsClassifier
model = Model(KNeighborsClassifier(), 'KNN')
model.predict()

# DecisionTreeClassifier

In [None]:
from sklearn.tree import DecisionTreeClassifier
model = Model(DecisionTreeClassifier(random_state = 20), 'Decision Tree')
model.predict()

# GaussianNB

In [None]:
from sklearn.naive_bayes import GaussianNB
model = Model(GaussianNB(), 'Gaussian NB')
model.predict()

# SVM

In [None]:
from sklearn.svm import SVC

model = Model(SVC(random_state = 20), 'SVC')
model.predict()

# GradientBoostingClassifier

In [None]:
from sklearn.ensemble import GradientBoostingClassifier
model = Model(GradientBoostingClassifier(random_state = 20), 'GradientBoostingClassifier')
model.predict()

In [None]:
# Comparison table of all evaluated models, highest accuracy first,
# renumbered from 0 after sorting.
performance = (
    pd.DataFrame(Model.scores)
    .sort_values(by='Accuracy', ascending=False)
    .reset_index(drop=True)
)
performance