In [2]:
import pandas as pd
import numpy as np
import sklearn 
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
import plotly.express as px
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score, recall_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier

from sklearn.model_selection import RandomizedSearchCV
warnings.filterwarnings('ignore')

In [3]:
# Load the dataset and separate the 'Class' target from the feature columns.
data = pd.read_csv('../Data/our_data.csv')
y = data['Class']
X = data.drop(columns='Class')

# First stratified split: 70% training data, 30% held out.
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.3, stratify=y, random_state=42
)
# Second stratified split of the held-out part: 70% of it stays as the
# validation set, the remaining 30% becomes the test set.
X_val, X_test, y_val, y_test = train_test_split(
    X_val, y_val, test_size=0.3, stratify=y_val, random_state=42
)

# Reduced-feature copies of every split (taken before any scaling is applied).
X_train_filtered, X_val_filtered, X_test_filtered = (
    split.drop(columns=['Compactness', 'EquivDiameter', 'Area'])
    for split in (X_train, X_val, X_test)
)
# Full feature names, kept so transformed arrays can be wrapped back into DataFrames.
cols = X_train.columns

In [4]:
# Box-Cox power transform: parameters are estimated on the training split only
# and then applied unchanged to the test and validation splits.
# NOTE: this cell overwrites X_train / X_test / X_val, so it must be run exactly
# once after the split cell. Wrapping the returned arrays in a DataFrame restores
# the column names but gives each frame a fresh 0..n-1 index.
scaling = sklearn.preprocessing.PowerTransformer(method='box-cox')

X_train = pd.DataFrame(scaling.fit_transform(X_train), columns=cols)
X_test = pd.DataFrame(scaling.transform(X_test), columns=cols)
X_val = pd.DataFrame(scaling.transform(X_val), columns=cols)

In [5]:
# One-hot encoding of the target. The encoder is fitted on the training labels
# only; validation and test labels are transformed with that same fitted encoder
# (labels unseen during fit become all-zero rows because handle_unknown='ignore').
enc = OneHotEncoder(handle_unknown='ignore')
enc.fit(y_train.to_frame())
y_encoded = pd.DataFrame(enc.transform(y_train.to_frame()).toarray(), columns=enc.get_feature_names_out(['Class']))
y_val_encoded = pd.DataFrame(enc.transform(y_val.to_frame()).toarray(), columns=enc.get_feature_names_out(['Class']))
y_test_encoded = pd.DataFrame(enc.transform(y_test.to_frame()).toarray(), columns=enc.get_feature_names_out(['Class']))

# Integer encoding 0, 1, 2, ...
# The label -> integer mapping is learned ONCE, from the training labels, and
# reused for the other splits via transform(). Re-fitting on y_val / y_test
# (as fit_transform would) could silently produce a different mapping whenever
# a split misses a class, and would leave `labelencoder` fitted on the test
# labels for any later inverse_transform() call.
labelencoder = sklearn.preprocessing.LabelEncoder()
y_encoded2 = pd.DataFrame(labelencoder.fit_transform(y_train))
y_val_encoded2 = pd.DataFrame(labelencoder.transform(y_val))
y_test_encoded2 = pd.DataFrame(labelencoder.transform(y_test))



In [6]:
# Target class labels, listed alphabetically; used to fix the row/column order
# of the confusion matrix and its axis labels.
class_names = [
    'BARBUNYA',
    'BOMBAY',
    'CALI',
    'DERMASON',
    'HOROZ',
    'SEKER',
    'SIRA',
]

In [18]:
# Baseline: L1-penalised multinomial logistic regression trained on all
# (power-transformed) features and evaluated on the validation split.
reg = LogisticRegression(
    max_iter=50,  # NOTE(review): low for saga; any ConvergenceWarning is hidden by the global warnings filter
    solver='saga',
    multi_class='multinomial',
    penalty='l1',
    C=0.1,
    fit_intercept=True,
    tol=1e-4,
    random_state=42,  # saga shuffles the data; seed it so the reported scores are reproducible
)
# The integer-encoded targets are single-column DataFrames; flatten them to the
# 1-D shape scikit-learn expects instead of relying on an implicit conversion.
reg.fit(X_train, y_encoded2.values.ravel())
y_pred_encoded = reg.predict(X_val)
y_val_codes = y_val_encoded2.values.ravel()

print("BEFORE FEATURE SELECTION:\n")
# Score the predictions already computed rather than predicting a second time.
print(f"Accuracy: {accuracy_score(y_val_codes, y_pred_encoded)}")
report = classification_report(y_val_codes, y_pred_encoded)
print("Report:\n", report)

# Map the integer predictions back to class names so they can be compared with
# the raw string labels in y_val. `labelencoder` must use the same class
# ordering that produced y_encoded2.
y_pred = labelencoder.inverse_transform(y_pred_encoded)
cm1 = confusion_matrix(y_val, y_pred, labels=class_names)

# Plot confusion matrix (cells hold integer counts, so no decimal formatting)
fig = px.imshow(
    cm1,
    labels=dict(x="Predicted", y="Actual", color="Count"),
    x=class_names,
    y=class_names,
    text_auto=True,
)
fig.update_xaxes(side="top")
fig.show()



BEFORE FEATURE SELECTION:

Accuracy: 0.926071741032371
Report:
               precision    recall  f1-score   support

           0       0.94      0.91      0.93       222
           1       1.00      1.00      1.00        85
           2       0.94      0.91      0.93       276
           3       0.93      0.93      0.93       604
           4       0.92      0.95      0.94       319
           5       0.96      0.96      0.96       339
           6       0.87      0.88      0.87       441

    accuracy                           0.93      2286
   macro avg       0.94      0.94      0.94      2286
weighted avg       0.93      0.93      0.93      2286



## Strojenie parametrów

In [8]:
def hyperparameters_tuner(estimator, param_distributions, X, y, cv=5, n_iter=10, random_state=42):
    """Run a randomized hyperparameter search and return the best setting found.

    Parameters
    ----------
    estimator : estimator object handed to ``RandomizedSearchCV``.
    param_distributions : search space forwarded as ``param_distributions``.
    X, y : data the search is fitted on.
    cv : cross-validation setting forwarded to the search (default 5).
    n_iter : number of sampled parameter settings (default 10).
    random_state : seed forwarded to the search (default 42).

    Returns
    -------
    The ``best_params_`` attribute of the fitted search.
    """
    search_options = {
        "param_distributions": param_distributions,
        "n_iter": n_iter,
        "cv": cv,
        "random_state": random_state,
    }
    searcher = RandomizedSearchCV(estimator, **search_options)
    searcher.fit(X, y)
    return searcher.best_params_

### Regresja logistyczna

In [None]:
# Search space: C on a log grid from 10**-4 up to 10**2, and the penalty type.
dist = {
    'C': [10 ** exponent for exponent in range(-4, 3)],
    'penalty': ['l2', 'l1'],
}
