In [1]:
import pandas as pd
import numpy as np

from sklearn.datasets import make_multilabel_classification
from sklearn.model_selection import train_test_split
from sklearn.multioutput import MultiOutputClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.preprocessing import MinMaxScaler

In [2]:
df = pd.read_csv("Dataset No.1.csv")

In [3]:
df.head()

Unnamed: 0,ASTM Material,C,Mn,Si,P,S,Cr,Mo,Ni,N,...,Sulfuric acid (H2SO4),Nitric acid (HNO3),Hydrocloric acid (HCl),Citric acid(HOC(CH2COOH)2COOH.H2O,KHSO4,KNO3,MgCl2.6H2O,Temperature,Behavior,Pitting_label
0,403,0.15,1.0,0.5,0.04,0.03,11.5,0.0,0.6,0.0,...,0.0,0,0.0,0,0,0,0,20.0,Poor,0
1,405,0.08,1.0,1.0,0.04,0.03,11.5,0.0,0.6,0.0,...,0.0,0,0.0,0,0,0,0,20.0,Poor,0
2,S41050,0.04,1.0,1.0,0.045,0.03,10.5,0.0,0.6,0.1,...,0.0,0,0.0,0,0,0,0,20.0,Poor,0
3,416,0.15,1.25,1.0,0.06,0.35,12.0,0.0,0.0,0.0,...,0.0,0,0.0,0,0,0,0,20.0,Poor,0
4,410,0.15,1.0,1.0,0.04,0.03,11.5,0.0,0.75,0.0,...,0.0,0,0.0,0,0,0,0,20.0,Poor,0


In [4]:
df = df.drop("ASTM Material", axis = 1)

In [5]:
df["Behavior"].unique()

array(['Poor', 'Good', 'Resistant', 'Severe'], dtype=object)

In [6]:
df.isna().sum()

C                                    0
Mn                                   0
Si                                   0
P                                    0
S                                    0
Cr                                   0
Mo                                   0
Ni                                   0
N                                    0
Ti                                   0
Nb                                   0
Al                                   0
Fe balance (%)                       0
Formic acid (HCOOH)                  0
Ammonium chloride(NH4Cl)             0
Acetic acid(CH3COOH)                 0
Potasium Hydroxide (KOH)             0
Lactic acid                          0
oxalic acid (COOH)2.2H2O             0
Phosphric acid (H3PO4)               0
Sulfuric acid (H2SO4)                0
Nitric acid (HNO3)                   0
Hydrocloric acid (HCl)               0
Citric acid(HOC(CH2COOH)2COOH.H2O    0
KHSO4                                0
KNO3                     

In [7]:
def preprocess_data(df):
    
    # Separate features and target variable
    X = df.drop('Behavior', axis=1)
    y = df['Behavior']
    
    # One-hot encode the "Behavior" column
    y = pd.get_dummies(y, columns=['Behavior'])
    print(y.columns)
    
    # Min-max scale numerical features
    scaler = MinMaxScaler()
    X_scaled = scaler.fit_transform(X)
    
    return X_scaled, y


In [8]:
# Assuming df is your DataFrame
X_scaled, y = preprocess_data(df)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)



Index(['Good', 'Poor', 'Resistant', 'Severe'], dtype='object')


In [9]:
def train_classifier(X_train, y_train, model):
    model = MultiOutputClassifier(model)
    model.fit(X_train, y_train)
    return model.predict(X_test)


In [10]:
model = LogisticRegression()

y_pred = train_classifier(X_train, y_train, model)

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.00      0.00      0.00       150
           1       0.00      0.00      0.00       141
           2       0.77      0.63      0.69       247
           3       0.79      0.65      0.71       266

   micro avg       0.77      0.41      0.53       804
   macro avg       0.39      0.32      0.35       804
weighted avg       0.50      0.41      0.45       804
 samples avg       0.40      0.41      0.41       804



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [11]:
model = RandomForestClassifier(n_estimators=400)

y_pred = train_classifier(X_train, y_train, model)

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.82      0.53      0.64       150
           1       0.79      0.67      0.73       141
           2       0.91      0.97      0.94       247
           3       0.94      0.88      0.91       266

   micro avg       0.89      0.81      0.85       804
   macro avg       0.87      0.76      0.80       804
weighted avg       0.88      0.81      0.84       804
 samples avg       0.80      0.81      0.80       804



  _warn_prf(average, modifier, msg_start, len(result))


In [12]:
model = GradientBoostingClassifier(n_estimators = 400)

y_pred = train_classifier(X_train, y_train, model)

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.85      0.57      0.69       150
           1       0.74      0.63      0.68       141
           2       0.99      0.98      0.99       247
           3       0.99      0.97      0.98       266

   micro avg       0.93      0.84      0.88       804
   macro avg       0.89      0.79      0.83       804
weighted avg       0.92      0.84      0.87       804
 samples avg       0.83      0.84      0.83       804



  _warn_prf(average, modifier, msg_start, len(result))
