# UWAGA: TE RZECZY SIĘ DŁUGO ROBIĄ!!!

## 1. Packages

In [25]:
import pandas as pd
import numpy as np
import sklearn 
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
import plotly.express as px
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score, recall_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB



from sklearn.model_selection import RandomizedSearchCV
warnings.filterwarnings('ignore')

## 2. Wczytywanie danych

In [5]:
# Load the dataset; 'Class' is the target column, every other column is a feature.
data = pd.read_csv('../Data/our_data.csv')
y = data['Class']
X = data.drop(columns='Class')

# Stratified 70/30 split into train and hold-out; the hold-out part is then
# split 70/30 again (also stratified) into validation and test.
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.3, stratify=y, random_state=42
)
X_val, X_test, y_val, y_test = train_test_split(
    X_val, y_val, test_size=0.3, stratify=y_val, random_state=42
)

# Remove the same three features from every split.
X_train = X_train.drop(columns=['Compactness', 'EquivDiameter', 'Area'])
X_val = X_val.drop(columns=['Compactness', 'EquivDiameter', 'Area'])
X_test = X_test.drop(columns=['Compactness', 'EquivDiameter', 'Area'])
cols = X_train.columns

## 3. Scaling

In [58]:
# Box-Cox power transform: fitted on the training split only, then applied
# unchanged to the test and validation splits. Each result is wrapped back
# into a DataFrame carrying the original column names.
# NOTE(review): Box-Cox is only defined for strictly positive inputs --
# assumes every remaining feature is > 0; confirm against the data.
scaling = sklearn.preprocessing.PowerTransformer(method='box-cox')
X_train = pd.DataFrame(scaling.fit_transform(X_train), columns=cols)
X_test = pd.DataFrame(scaling.transform(X_test), columns=cols)
X_val = pd.DataFrame(scaling.transform(X_val), columns=cols)

## 4. Encoding

In [59]:
# One-hot encoding of the target, fitted on the training labels only.
enc = OneHotEncoder(handle_unknown='ignore')
enc.fit(y_train.to_frame())
y_encoded = pd.DataFrame(enc.transform(y_train.to_frame()).toarray(), columns=enc.get_feature_names_out(['Class']))
y_val_encoded = pd.DataFrame(enc.transform(y_val.to_frame()).toarray(), columns=enc.get_feature_names_out(['Class']))
y_test_encoded = pd.DataFrame(enc.transform(y_test.to_frame()).toarray(), columns=enc.get_feature_names_out(['Class']))

# Standard integer encoding 0, 1, 2, ...
# The encoder is fitted on the training labels only; validation and test labels
# are mapped with transform() so that all three splits share one label -> code
# mapping (re-fitting per split would only agree if every split happened to
# contain exactly the same set of classes).
labelencoder = sklearn.preprocessing.LabelEncoder()
y_encoded2 = pd.DataFrame(labelencoder.fit_transform(y_train))
y_val_encoded2 = pd.DataFrame(labelencoder.transform(y_val))
y_test_encoded2 = pd.DataFrame(labelencoder.transform(y_test))


In [9]:
# Class labels, listed in alphabetical order.
# NOTE(review): presumably these are exactly the values of the 'Class' column -- confirm against the data.
class_names = ['BARBUNYA', 'BOMBAY', 'CALI', 'DERMASON', 'HOROZ', 'SEKER', 'SIRA']

## 5. Strojenie parametrów i kroswalidacja

### 5.1 Strojenie parametrów

In [6]:
def hyperparameters_tuner(estimator, param_distributions, X, y, cv=5, n_iter=10, random_state=42):
    """Run a randomized hyper-parameter search and return the winning settings.

    Args:
        estimator: Estimator object handed to ``RandomizedSearchCV``.
        param_distributions: Search space passed through to the search.
        X: Feature data used for fitting.
        y: Target data used for fitting.
        cv: Cross-validation setting forwarded to the search.
        n_iter: Number of parameter settings that are sampled.
        random_state: Seed forwarded to the search.

    Returns:
        The ``best_params_`` attribute of the fitted search.
    """
    search = RandomizedSearchCV(
        estimator,
        param_distributions=param_distributions,
        n_iter=n_iter,
        cv=cv,
        random_state=random_state,
    )
    # fit() returns the search object itself, so the attribute can be read directly.
    return search.fit(X, y).best_params_

In [7]:
def plot_confusion_matrix(y_true, y_pred, class_names):
    """Display a confusion-matrix heatmap with the count written in every cell.

    Args:
        y_true: Ground-truth labels.
        y_pred: Predicted labels.
        class_names: Labels that fix the row/column order of the matrix and
            are used as the tick labels on both axes.

    Returns:
        None. The figure is shown via ``fig.show()``.
    """
    cm = confusion_matrix(y_true, y_pred, labels=class_names)
    # `labels` only carries axis / colorbar titles. The stray `text_auto` key
    # that used to live in this dict was not a title and had no effect; the
    # per-cell counts are written by the explicit annotations below.
    fig = px.imshow(
        cm,
        labels=dict(x="Predicted", y="Actual", color="Count"),
        x=class_names,
        y=class_names,
    )
    fig.update_xaxes(side="top")
    # Rows of `cm` are actual classes, columns are predicted classes.
    for i, actual in enumerate(class_names):
        for j, predicted in enumerate(class_names):
            fig.add_annotation(x=predicted, y=actual, text=str(cm[i, j]), showarrow=False)

    fig.show()

### 5.2 Kroswalidacja

In [8]:
def train_evaluate_encoded2(estimator, param_distributions, X_train, y_train, X_val, y_val, cv=5, class_names=None):
    """Tune, fit and report on an estimator trained on integer-encoded labels.

    The integer predictions and validation labels are mapped back to their
    original class labels with the module-level ``labelencoder`` before the
    confusion matrix and classification report are produced.

    Args:
        estimator: Estimator to tune. It is modified in place via
            ``set_params`` and then fitted.
        param_distributions: Search space for ``hyperparameters_tuner``.
        X_train: Training features.
        y_train: Integer-encoded training labels.
        X_val: Validation features.
        y_val: Integer-encoded validation labels.
        cv: Cross-validation setting used for the search and for both
            ``cross_val_score`` calls.
        class_names: Class labels used for the confusion matrix and the
            report. Defaults to the seven bean classes.

    Returns:
        None. All results are printed / plotted.
    """
    # Build the default inside the call instead of sharing one mutable list
    # object across calls.
    if class_names is None:
        class_names = ['BARBUNYA', 'BOMBAY', 'CALI', 'DERMASON', 'HOROZ', 'SEKER', 'SIRA']

    # Forward `cv` so the search uses the same setting as the scoring below.
    best_params = hyperparameters_tuner(estimator, param_distributions, X_train, y_train, cv=cv)
    best_model = estimator.set_params(**best_params)
    best_model.fit(X_train, y_train)

    y_pred = best_model.predict(X_val)

    # Accuracy is computed on the integer codes, before decoding.
    accuracy = accuracy_score(y_val, y_pred)

    cv_results = cross_val_score(best_model, X_train, y_train, cv=cv)
    # Cross-validation run on the validation data itself (fit and scored on its folds).
    cv_val_results = cross_val_score(best_model, X_val, y_val, cv=cv)

    # Decode integer codes back to the original class labels for reporting.
    y_pred = labelencoder.inverse_transform(y_pred)
    y_val = labelencoder.inverse_transform(y_val)

    print(f"Best parameters: {best_params}")
    print('__________________________________________________________')
    print(f"Accuracy: {accuracy}")
    plot_confusion_matrix(y_val, y_pred, class_names)
    # Use the caller-supplied class names (previously a hard-coded list) and
    # pin the row order to them so names and rows cannot get out of step.
    print(classification_report(y_val, y_pred, labels=class_names, target_names=class_names))
    print("__________________________________________________________")
    print(f"Cross-validation results: {cv_results}")
    print(f"Mean accuracy: {cv_results.mean()}")
    print(f"Cross-validation results on validation set: {cv_val_results}")
    print(f"Mean accuracy on validation set: {cv_val_results.mean()}")



In [19]:
def train_evaluate(estimator, param_distributions, X_train, y_train, X_val, y_val, cv=5, class_names=None):
    """Tune, fit and report on an estimator trained on the labels as given.

    The labels are used directly for the confusion matrix and the
    classification report, without any decoding step.

    Args:
        estimator: Estimator to tune. It is modified in place via
            ``set_params`` and then fitted.
        param_distributions: Search space for ``hyperparameters_tuner``.
        X_train: Training features.
        y_train: Training labels.
        X_val: Validation features.
        y_val: Validation labels.
        cv: Cross-validation setting used for the search and for both
            ``cross_val_score`` calls.
        class_names: Class labels used for the confusion matrix and the
            report. Defaults to the seven bean classes.

    Returns:
        None. All results are printed / plotted.
    """
    # Build the default inside the call instead of sharing one mutable list
    # object across calls.
    if class_names is None:
        class_names = ['BARBUNYA', 'BOMBAY', 'CALI', 'DERMASON', 'HOROZ', 'SEKER', 'SIRA']

    # Forward `cv` so the search uses the same setting as the scoring below.
    best_params = hyperparameters_tuner(estimator, param_distributions, X_train, y_train, cv=cv)
    best_model = estimator.set_params(**best_params)
    best_model.fit(X_train, y_train)

    y_pred = best_model.predict(X_val)

    accuracy = accuracy_score(y_val, y_pred)

    cv_results = cross_val_score(best_model, X_train, y_train, cv=cv)
    # Cross-validation run on the validation data itself (fit and scored on its folds).
    cv_val_results = cross_val_score(best_model, X_val, y_val, cv=cv)

    print(f"Best parameters: {best_params}")
    print('__________________________________________________________')
    print(f"Accuracy: {accuracy}")
    plot_confusion_matrix(y_val, y_pred, class_names)
    # Use the caller-supplied class names (previously a hard-coded list) and
    # pin the row order to them so names and rows cannot get out of step.
    print(classification_report(y_val, y_pred, labels=class_names, target_names=class_names))
    print("__________________________________________________________")
    print(f"Cross-validation results: {cv_results}")
    print(f"Mean accuracy: {cv_results.mean()}")
    print(f"Cross-validation results on validation set: {cv_val_results}")
    print(f"Mean accuracy on validation set: {cv_val_results.mean()}")

## 6. Modelowanie

### 6.1 Regresja logistyczna

In [64]:
# Multinomial logistic regression (saga solver): search the regularisation
# strength C over powers of ten from 1e-4 to 1e2 and both L2 / L1 penalties.
lr = LogisticRegression(max_iter=1000, solver='saga', multi_class='multinomial')
dist = {
    'C': [10 ** exponent for exponent in range(-4, 3)],
    'penalty': ['l2', 'l1'],
}
train_evaluate_encoded2(lr, dist, X_train, y_encoded2, X_val, y_val_encoded2, cv=5)



Best parameters: {'penalty': 'l1', 'C': 100}
__________________________________________________________
Accuracy: 0.9291338582677166


              precision    recall  f1-score   support

    BARBUNYA       0.94      0.92      0.93       222
      BOMBAY       1.00      1.00      1.00        85
        CALI       0.96      0.92      0.94       276
    DERMASON       0.93      0.93      0.93       604
       HOROZ       0.93      0.95      0.94       319
       SEKER       0.96      0.96      0.96       339
        SIRA       0.87      0.88      0.88       441

    accuracy                           0.93      2286
   macro avg       0.94      0.94      0.94      2286
weighted avg       0.93      0.93      0.93      2286

__________________________________________________________
Cross-validation results: [0.92590164 0.93110236 0.93044619 0.92519685 0.92650919]
Mean accuracy: 0.927831246504023
Cross-validation results on validation set: [0.91484716 0.92778993 0.93435449 0.95185996 0.91466083]
Mean accuracy on validation set: 0.9287024738899028


Model działa dobrze; kroswalidacja nie wykrywa ani overfittingu, ani underfittingu.

### 6.2 Random Forest

In [21]:
# Reload the dataset from disk and rebuild the train / validation / test splits
# with the same stratified 70/30 + 70/30 scheme and the same random_state.
data = pd.read_csv('../Data/our_data.csv')
X = data.drop('Class', axis=1)
y = data['Class']

X_train, X_val, y_train, y_val = train_test_split(
    X, y, stratify=y, test_size=0.3, random_state=42
)
X_val, X_test, y_val, y_test = train_test_split(
    X_val, y_val, stratify=y_val, test_size=0.3, random_state=42
)

# Copies of each split without the three listed features.
# NOTE(review): the *_filtered frames are not referenced anywhere else in the
# visible cells -- the scaling and modelling cells below use the unfiltered
# X_train / X_val / X_test. Confirm whether that is intentional.
X_train_filtered = X_train.drop(['Compactness','EquivDiameter', 'Area'], axis=1)
X_val_filtered = X_val.drop(['Compactness','EquivDiameter','Area'], axis=1)
X_test_filtered = X_test.drop(['Compactness','EquivDiameter','Area'], axis=1)
# Column names are taken from the unfiltered training frame.
cols = X_train.columns

In [22]:
# Standardise the features: the scaler is fitted on the training split only
# and then applied unchanged to the test and validation splits. Each result is
# wrapped back into a DataFrame carrying the original column names.
scaling = sklearn.preprocessing.StandardScaler()
X_train = pd.DataFrame(scaling.fit_transform(X_train), columns=cols)
X_test = pd.DataFrame(scaling.transform(X_test), columns=cols)
X_val = pd.DataFrame(scaling.transform(X_val), columns=cols)

In [23]:
# One-hot encoding of the target, fitted on the training labels only.
enc = OneHotEncoder(handle_unknown='ignore')
enc.fit(y_train.to_frame())
y_encoded = pd.DataFrame(enc.transform(y_train.to_frame()).toarray(), columns=enc.get_feature_names_out(['Class']))
y_val_encoded = pd.DataFrame(enc.transform(y_val.to_frame()).toarray(), columns=enc.get_feature_names_out(['Class']))
y_test_encoded = pd.DataFrame(enc.transform(y_test.to_frame()).toarray(), columns=enc.get_feature_names_out(['Class']))

# Standard integer encoding 0, 1, 2, ...
# The encoder is fitted on the training labels only; validation and test labels
# are mapped with transform() so that all three splits share one label -> code
# mapping (re-fitting per split would only agree if every split happened to
# contain exactly the same set of classes).
labelencoder = sklearn.preprocessing.LabelEncoder()
y_encoded2 = pd.DataFrame(labelencoder.fit_transform(y_train))
y_val_encoded2 = pd.DataFrame(labelencoder.transform(y_val))
y_test_encoded2 = pd.DataFrame(labelencoder.transform(y_test))

In [68]:
# Random forest: randomized search over forest size, split criterion, tree
# depth and the minimum number of samples needed to split a node.
# min_samples_split starts at 2: an integer value of 1 is outside the range
# scikit-learn accepts, so any sampled candidate using it would fail to fit
# and waste one of the search iterations.
dist = dict(n_estimators=[5, 10, 25, 50, 100, 200, 250, 500, 1000],
            criterion=['gini', 'entropy', 'log_loss'],
            max_depth=[1, 5, 10, 25, 50, 100, 150],
            min_samples_split=[2, 5, 10, 25, 50, 100, 250, 500])
rf = RandomForestClassifier(random_state=42)
train_evaluate_encoded2(rf, dist, X_train, y_encoded2, X_val, y_val_encoded2, cv=5)

Best parameters: {'n_estimators': 200, 'min_samples_split': 5, 'max_depth': 25, 'criterion': 'log_loss'}
__________________________________________________________
Accuracy: 0.9243219597550306


              precision    recall  f1-score   support

    BARBUNYA       0.93      0.93      0.93       222
      BOMBAY       1.00      0.99      0.99        85
        CALI       0.95      0.92      0.94       276
    DERMASON       0.91      0.93      0.92       604
       HOROZ       0.94      0.95      0.94       319
       SEKER       0.96      0.96      0.96       339
        SIRA       0.87      0.86      0.87       441

    accuracy                           0.92      2286
   macro avg       0.94      0.93      0.94      2286
weighted avg       0.92      0.92      0.92      2286

__________________________________________________________
Cross-validation results: [0.9147541  0.92782152 0.92454068 0.92388451 0.92716535]
Mean accuracy: 0.9236332343702939
Cross-validation results on validation set: [0.91266376 0.9059081  0.92997812 0.92997812 0.89496718]
Mean accuracy on validation set: 0.9146990530610685


### 6.3 SVC

In [17]:
# Support vector classifier: randomized search over C and gamma.
# Only the RBF kernel is searched.
svc = SVC()
dist = dict(
    C=[0.1, 1, 10, 100, 1000, 10000],
    gamma=[1, 0.01, 0.0001],
    kernel=['rbf'],
)
train_evaluate_encoded2(svc, dist, X_train, y_encoded2, X_val, y_val_encoded2, cv=5)

Best parameters: {'kernel': 'rbf', 'gamma': 0.01, 'C': 10000}
__________________________________________________________
Accuracy: 0.9278215223097113


              precision    recall  f1-score   support

    BARBUNYA       0.92      0.92      0.92       222
      BOMBAY       1.00      1.00      1.00        85
        CALI       0.95      0.91      0.93       276
    DERMASON       0.91      0.95      0.93       604
       HOROZ       0.94      0.95      0.94       319
       SEKER       0.97      0.95      0.96       339
        SIRA       0.89      0.86      0.88       441

    accuracy                           0.93      2286
   macro avg       0.94      0.94      0.94      2286
weighted avg       0.93      0.93      0.93      2286

__________________________________________________________
Cross-validation results: [0.92459016 0.93569554 0.92913386 0.93175853 0.92782152]
Mean accuracy: 0.9297999225506647
Cross-validation results on validation set: [0.89519651 0.90809628 0.93654267 0.92560175 0.9059081 ]
Mean accuracy on validation set: 0.9142690606098247


In [21]:
from xgboost import XGBClassifier

# Starting configuration of the booster. The keyword names were previously
# misspelled (`nestimators`, `ma_depth`), so the intended number of trees and
# tree depth were never applied; they are now spelled `n_estimators` and
# `max_depth`. Every parameter listed in `param_grid` is overridden during
# the randomized search anyway.
xgb = XGBClassifier(subsample=0.8, n_estimators=300, max_depth=7, learning_rate=0.1, gamma=0.1, colsample_bytree=0.5)
param_grid = {
    'n_estimators': [100, 200, 300],  # Number of boosting rounds
    'max_depth': [3, 5, 7],  # Maximum depth of the tree
    'learning_rate': [0.01, 0.1, 0.3],  # Step size shrinkage used in update to prevent overfitting
    'subsample': [0.5, 0.8, 1.0],  # Subsample ratio of the training instance
    'colsample_bytree': [0.5, 0.8, 1.0],  # Subsample ratio of columns when constructing each tree
    'gamma': [0, 0.1, 0.2]  # Minimum loss reduction required to make a further partition on a leaf node of the tree
}
train_evaluate_encoded2(xgb, param_grid, X_train, y_encoded2, X_val, y_val_encoded2, cv=5)




Best parameters: {'subsample': 1.0, 'n_estimators': 100, 'max_depth': 3, 'learning_rate': 0.1, 'gamma': 0, 'colsample_bytree': 1.0}
__________________________________________________________
Accuracy: 0.9282589676290464


              precision    recall  f1-score   support

    BARBUNYA       0.94      0.92      0.93       222
      BOMBAY       1.00      1.00      1.00        85
        CALI       0.97      0.92      0.94       276
    DERMASON       0.91      0.94      0.93       604
       HOROZ       0.93      0.96      0.95       319
       SEKER       0.96      0.96      0.96       339
        SIRA       0.88      0.86      0.87       441

    accuracy                           0.93      2286
   macro avg       0.94      0.94      0.94      2286
weighted avg       0.93      0.93      0.93      2286

__________________________________________________________
Cross-validation results: [0.92721311 0.93897638 0.92847769 0.92716535 0.92388451]
Mean accuracy: 0.9291434103523946
Cross-validation results on validation set: [0.90829694 0.89715536 0.93435449 0.93654267 0.89277899]
Mean accuracy on validation set: 0.9138256906156537


### 6.4 Naive Bayes

In [26]:
# Gaussian Naive Bayes: search var_smoothing over 100 log-spaced values
# running from 1e0 down to 1e-9.
dist = dict(var_smoothing=np.logspace(0, -9, num=100))
nb = GaussianNB()
train_evaluate_encoded2(nb, dist, X_train, y_encoded2, X_val, y_val_encoded2, cv=5)

Best parameters: {'var_smoothing': 2.848035868435799e-08}
__________________________________________________________
Accuracy: 0.9002624671916011


              precision    recall  f1-score   support

    BARBUNYA       0.89      0.85      0.87       222
      BOMBAY       1.00      1.00      1.00        85
        CALI       0.91      0.89      0.90       276
    DERMASON       0.93      0.88      0.90       604
       HOROZ       0.91      0.96      0.93       319
       SEKER       0.93      0.96      0.94       339
        SIRA       0.82      0.85      0.84       441

    accuracy                           0.90      2286
   macro avg       0.91      0.91      0.91      2286
weighted avg       0.90      0.90      0.90      2286

__________________________________________________________
Cross-validation results: [0.89180328 0.89895013 0.9015748  0.89238845 0.89501312]
Mean accuracy: 0.8959459575749753
Cross-validation results on validation set: [0.8930131  0.9059081  0.90809628 0.9059081  0.88402626]
Mean accuracy on validation set: 0.8993903662580145


Słabo, ale co zrobić – nie będziemy dalej używać tego modelu.

### 6.5 DecisionTree

In [30]:
# Decision tree: randomized search over split criterion, tree depth and the
# minimum number of samples needed to split a node.
# min_samples_split starts at 2: an integer value of 1 is outside the range
# scikit-learn accepts, so any sampled candidate using it would fail to fit
# and waste one of the search iterations.
dt = DecisionTreeClassifier()
dist = dict(criterion=['gini', 'entropy', 'log_loss'],
            max_depth=[1, 5, 7, 9, 10, 11, 15, 20, 50, 75, 100],
            min_samples_split=[2, 3, 4, 5, 7, 10, 100, 250, 500])
train_evaluate_encoded2(dt, dist, X_train, y_encoded2, X_val, y_val_encoded2, cv=5)

Best parameters: {'min_samples_split': 7, 'max_depth': 10, 'criterion': 'entropy'}
__________________________________________________________
Accuracy: 0.899825021872266


              precision    recall  f1-score   support

    BARBUNYA       0.89      0.89      0.89       222
      BOMBAY       1.00      1.00      1.00        85
        CALI       0.92      0.91      0.91       276
    DERMASON       0.90      0.91      0.90       604
       HOROZ       0.91      0.93      0.92       319
       SEKER       0.94      0.92      0.93       339
        SIRA       0.85      0.83      0.84       441

    accuracy                           0.90      2286
   macro avg       0.91      0.91      0.91      2286
weighted avg       0.90      0.90      0.90      2286

__________________________________________________________
Cross-validation results: [0.89901639 0.90616798 0.9015748  0.89566929 0.89632546]
Mean accuracy: 0.8997507852502042
Cross-validation results on validation set: [0.8930131  0.88183807 0.89934354 0.87527352 0.85776805]
Mean accuracy on validation set: 0.881447259037008
