In [32]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

from sklearn.metrics import classification_report, accuracy_score
from sklearn.pipeline import Pipeline

from sklearn.preprocessing import LabelEncoder

In [21]:
df = pd.read_csv('/kaggle/input/steel-plate-fault/faults.csv')
df.head()

Unnamed: 0,X_Minimum,X_Maximum,Y_Minimum,Y_Maximum,Pixels_Areas,X_Perimeter,Y_Perimeter,Sum_of_Luminosity,Minimum_of_Luminosity,Maximum_of_Luminosity,...,Edges_X_Index,Edges_Y_Index,Outside_Global_Index,LogOfAreas,Log_X_Index,Log_Y_Index,Orientation_Index,Luminosity_Index,SigmoidOfAreas,target
0,42,50.0,270900,270944,267,17,44,24220,76,108,...,0.4706,1.0,1.0,2.4265,0.9031,1.6435,0.8182,-0.2913,0.5822,Pastry
1,645,651.0,2538079,2538108,108,10,30,11397,84,123,...,0.6,0.9667,1.0,2.0334,0.7782,1.4624,0.7931,-0.1756,0.2984,Pastry
2,829,835.0,1553913,1553931,71,8,19,7972,99,125,...,0.75,0.9474,1.0,1.8513,0.7782,1.2553,0.6667,-0.1228,0.215,Pastry
3,853,860.0,369370,369415,176,13,45,18996,99,126,...,0.5385,1.0,1.0,2.2455,0.8451,1.6532,0.8444,-0.1568,0.5212,Pastry
4,1289,1306.0,498078,498335,2409,60,260,246930,37,126,...,0.2833,0.9885,1.0,3.3818,1.2305,2.4099,0.9338,-0.1992,1.0,Pastry


In [22]:
df.dropna(inplace=True)

In [23]:
df['target'].unique()

array(['Pastry', 'Z_Scratch', 'K_Scatch', 'Stains', 'Dirtiness', 'Bumps',
       'Other_Faults'], dtype=object)

In [25]:
label_encoder = LabelEncoder()
df['target'] = label_encoder.fit_transform(df['target'])

In [27]:
X = df.drop('target', axis = 1)
y = df['target']

In [28]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

scaler = StandardScaler()

X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [29]:
# Create a Logistic Regression model
log_reg = LogisticRegression(max_iter=1000)

# Train the model
log_reg.fit(X_train, y_train)

# Make predictions
y_pred_log_reg = log_reg.predict(X_test)

# Evaluate the model
print("Logistic Regression")
print(classification_report(y_test, y_pred_log_reg))
print("Accuracy:", accuracy_score(y_test, y_pred_log_reg))

Logistic Regression
              precision    recall  f1-score   support

           0       0.63      0.61      0.62        76
           1       0.55      0.60      0.57        10
           2       0.96      0.93      0.94        95
           3       0.64      0.62      0.63       128
           4       0.50      0.44      0.47        18
           5       0.83      1.00      0.91        10
           6       0.76      0.92      0.83        37

    accuracy                           0.73       374
   macro avg       0.69      0.73      0.71       374
weighted avg       0.73      0.73      0.73       374

Accuracy: 0.7272727272727273


In [30]:
# Create a Random Forest model
rf_clf = RandomForestClassifier(n_estimators=100, random_state=42)

# Train the model
rf_clf.fit(X_train, y_train)

# Make predictions
y_pred_rf = rf_clf.predict(X_test)

# Evaluate the model
print("Random Forest")
print(classification_report(y_test, y_pred_rf))
print("Accuracy:", accuracy_score(y_test, y_pred_rf))

Random Forest
              precision    recall  f1-score   support

           0       0.69      0.72      0.71        76
           1       0.89      0.80      0.84        10
           2       0.96      0.97      0.96        95
           3       0.73      0.77      0.75       128
           4       0.70      0.39      0.50        18
           5       1.00      0.90      0.95        10
           6       0.94      0.92      0.93        37

    accuracy                           0.81       374
   macro avg       0.84      0.78      0.81       374
weighted avg       0.81      0.81      0.81       374

Accuracy: 0.8101604278074866


In [33]:
# Create a Support Vector Machine model
svm_clf = SVC()

# Train the model
svm_clf.fit(X_train, y_train)

# Make predictions
y_pred_svm = svm_clf.predict(X_test)

# Evaluate the model
print("Support Vector Machine")
print(classification_report(y_test, y_pred_svm))
print("Accuracy:", accuracy_score(y_test, y_pred_svm))

Support Vector Machine
              precision    recall  f1-score   support

           0       0.63      0.76      0.69        76
           1       0.88      0.70      0.78        10
           2       0.98      0.93      0.95        95
           3       0.73      0.73      0.73       128
           4       0.71      0.28      0.40        18
           5       0.91      1.00      0.95        10
           6       0.89      0.92      0.91        37

    accuracy                           0.79       374
   macro avg       0.82      0.76      0.77       374
weighted avg       0.80      0.79      0.79       374

Accuracy: 0.7887700534759359


In [34]:
from sklearn.pipeline import make_pipeline

# Pipeline for Logistic Regression
pipe_log_reg = make_pipeline(StandardScaler(), LogisticRegression(max_iter=1000))
pipe_log_reg.fit(X_train, y_train)
y_pred_pipe_log_reg = pipe_log_reg.predict(X_test)
print("Pipeline Logistic Regression")
print(classification_report(y_test, y_pred_pipe_log_reg))
print("Accuracy:", accuracy_score(y_test, y_pred_pipe_log_reg))

# Pipeline for Random Forest
pipe_rf = make_pipeline(StandardScaler(), RandomForestClassifier(n_estimators=100, random_state=42))
pipe_rf.fit(X_train, y_train)
y_pred_pipe_rf = pipe_rf.predict(X_test)
print("Pipeline Random Forest")
print(classification_report(y_test, y_pred_pipe_rf))
print("Accuracy:", accuracy_score(y_test, y_pred_pipe_rf))

# Pipeline for Support Vector Machine
pipe_svm = make_pipeline(StandardScaler(), SVC())
pipe_svm.fit(X_train, y_train)
y_pred_pipe_svm = pipe_svm.predict(X_test)
print("Pipeline Support Vector Machine")
print(classification_report(y_test, y_pred_pipe_svm))
print("Accuracy:", accuracy_score(y_test, y_pred_pipe_svm))


Pipeline Logistic Regression
              precision    recall  f1-score   support

           0       0.63      0.61      0.62        76
           1       0.55      0.60      0.57        10
           2       0.96      0.93      0.94        95
           3       0.64      0.62      0.63       128
           4       0.50      0.44      0.47        18
           5       0.83      1.00      0.91        10
           6       0.76      0.92      0.83        37

    accuracy                           0.73       374
   macro avg       0.69      0.73      0.71       374
weighted avg       0.73      0.73      0.73       374

Accuracy: 0.7272727272727273
Pipeline Random Forest
              precision    recall  f1-score   support

           0       0.69      0.72      0.71        76
           1       0.89      0.80      0.84        10
           2       0.96      0.97      0.96        95
           3       0.73      0.77      0.75       128
           4       0.70      0.39      0.50        