In [2]:
import pandas as pd
import numpy as np
import sklearn
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
import plotly.express as px
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score, recall_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB

from sklearn.neighbors import KNeighborsClassifier



from sklearn.model_selection import RandomizedSearchCV
# Silence every warning category for the rest of the kernel session.
# NOTE(review): a blanket 'ignore' also hides library warnings raised by the
# cells below (e.g. during model fitting) — consider narrowing the filter.
warnings.filterwarnings('ignore')

In [3]:
# Working dataset: the 'Class' column is the target, all remaining columns are features.
data = pd.read_csv('../../Data/our_data.csv')
y = data['Class']
X = data.drop(columns='Class')

# Second dataset read from validator_data.csv, separated into features/target the same way.
df_for_validators = pd.read_csv('../../Data/validator_data.csv')
y_for_validators = df_for_validators['Class']
X_for_validators = df_for_validators.drop(columns='Class')

# First stratified split: 70% of the rows go to training, 30% are held out
# (temporarily stored under the X_val / y_val names).
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.3, stratify=y, random_state=42
)
# Second stratified split of the held-out rows: 70% stay as the validation
# set, the remaining 30% become the test set.
X_val, X_test, y_val, y_test = train_test_split(
    X_val, y_val, test_size=0.3, stratify=y_val, random_state=42
)

# Feature columns removed from every split before scaling.
DROPPED_FEATURES = ['Compactness', 'EquivDiameter', 'Area']
X_train = X_train.drop(columns=DROPPED_FEATURES)
X_val = X_val.drop(columns=DROPPED_FEATURES)
X_test = X_test.drop(columns=DROPPED_FEATURES)
X_for_validators = X_for_validators.drop(columns=DROPPED_FEATURES)
cols = X_train.columns

# Box-Cox power transform: fitted on the training split only, then applied
# with the same fitted parameters to the test, validation and validator data.
scaling = sklearn.preprocessing.PowerTransformer(method='box-cox')

# Each transformed result is wrapped in a DataFrame that carries the
# remaining feature names.
X_train = pd.DataFrame(scaling.fit_transform(X_train), columns=cols)
X_test = pd.DataFrame(scaling.transform(X_test), columns=cols)
X_val = pd.DataFrame(scaling.transform(X_val), columns=cols)
X_for_validators = pd.DataFrame(scaling.transform(X_for_validators), columns=cols)

# One-hot encode the target. The categories are learned from the training
# labels only; the fitted encoder is then applied to every split.
enc = OneHotEncoder(handle_unknown='ignore')
enc.fit(y_train.to_frame())

# Output column names ("Class_<label>"), computed once and shared by all frames.
onehot_cols = enc.get_feature_names_out(['Class'])

y_encoded = pd.DataFrame(
    enc.transform(y_train.to_frame()).toarray(), columns=onehot_cols
)
y_val_encoded = pd.DataFrame(
    enc.transform(y_val.to_frame()).toarray(), columns=onehot_cols
)
y_test_encoded = pd.DataFrame(
    enc.transform(y_test.to_frame()).toarray(), columns=onehot_cols
)
y_for_validators_encoded = pd.DataFrame(
    enc.transform(y_for_validators.to_frame()).toarray(), columns=onehot_cols
)

# Standard integer encoding of the target (0, 1, 2, ...).
# The encoder is fitted ONCE, on the training labels, and that same mapping is
# reused for every other split. Calling fit_transform per split would re-fit
# the encoder each time: if a split is missing a class, the remaining classes
# get renumbered and the codes no longer match what the models were trained
# on. With transform(), a label unseen during training raises a ValueError
# instead of being silently assigned a code.
labelencoder = sklearn.preprocessing.LabelEncoder()
y_encoded2 = pd.DataFrame(labelencoder.fit_transform(y_train))
y_val_encoded2 = pd.DataFrame(labelencoder.transform(y_val))
y_test_encoded2 = pd.DataFrame(labelencoder.transform(y_test))
y_for_validators_encoded2 = pd.DataFrame(labelencoder.transform(y_for_validators))


# Human-readable class labels used when printing reports.
# NOTE(review): expected to be in the same order as labelencoder.classes_
# (sorted label order) — confirm before using it to index integer codes.
class_names = ['BARBUNYA', 'BOMBAY', 'CALI', 'DERMASON', 'HOROZ', 'SEKER', 'SIRA']

In [4]:
# Display the one-hot encoded training labels (one "Class_<label>" column per class).
y_encoded

Unnamed: 0,Class_BARBUNYA,Class_BOMBAY,Class_CALI,Class_DERMASON,Class_HOROZ,Class_SEKER,Class_SIRA
0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2,0.0,0.0,0.0,1.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,1.0,0.0,0.0
4,0.0,0.0,0.0,0.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...
7616,0.0,0.0,0.0,0.0,0.0,0.0,1.0
7617,0.0,0.0,0.0,0.0,0.0,1.0,0.0
7618,1.0,0.0,0.0,0.0,0.0,0.0,0.0
7619,0.0,0.0,0.0,1.0,0.0,0.0,0.0


In [5]:
# Display the integer-encoded training labels (single column of class codes).
y_encoded2

Unnamed: 0,0
0,6
1,5
2,3
3,4
4,4
...,...
7616,6
7617,5
7618,0
7619,3


In [7]:
# Fit four classifiers on the training split and print a per-class
# classification report for each one on the validation split.
# Every model is created with random_state=42:
#   - Logistic Regression: L2 penalty, C=100
#   - SVM: RBF kernel, C=1000, gamma=0.1
#   - Random Forest: n_estimators=200, max_depth=25, min_samples_split=5, criterion='gini'
#   - Decision Tree: min_samples_split=10, criterion='entropy', max_depth=20
models=[LogisticRegression(penalty='l2', C=100, random_state=42),
        SVC(kernel='rbf', C=1000, gamma=0.1, random_state=42),
        RandomForestClassifier(n_estimators=200, max_depth=25, min_samples_split=5, criterion='gini', random_state=42),
        DecisionTreeClassifier(min_samples_split=10, criterion='entropy', max_depth=20, random_state=42)]

names=['Logistic Regression', 'SVM', 'Random Forest', 'Decision Tree']

# The integer-encoded labels are stored as single-column DataFrames, while the
# estimators expect a 1-D target; flatten once here so fit() is not handed a
# column vector (which scikit-learn flags with a DataConversionWarning).
train_labels = y_encoded2.to_numpy().ravel()
val_labels = y_val_encoded2.to_numpy().ravel()

for model, name in zip(models, names):
    model.fit(X_train, train_labels)
    y_pred = model.predict(X_val)
    print(name)
    # class_names is the label list defined in the data-preparation cell.
    print(classification_report(val_labels, y_pred, target_names=class_names))



Logistic Regression
              precision    recall  f1-score   support

    BARBUNYA       0.93      0.92      0.93       222
      BOMBAY       1.00      1.00      1.00        85
        CALI       0.97      0.91      0.94       276
    DERMASON       0.93      0.92      0.93       604
       HOROZ       0.92      0.95      0.94       319
       SEKER       0.95      0.96      0.96       339
        SIRA       0.86      0.88      0.87       441

    accuracy                           0.93      2286
   macro avg       0.94      0.94      0.94      2286
weighted avg       0.93      0.93      0.93      2286
SVM
              precision    recall  f1-score   support

    BARBUNYA       0.91      0.92      0.92       222
      BOMBAY       1.00      1.00      1.00        85
        CALI       0.93      0.91      0.92       276
    DERMASON       0.89      0.94      0.91       604
       HOROZ       0.94      0.93      0.94       319
       SEKER       0.97      0.92      0.95       339
 