In [9]:
import matplotlib.pyplot as plt
import time
from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from sklearn.metrics import (confusion_matrix, 
                             ConfusionMatrixDisplay, 
                             classification_report, 
                             RocCurveDisplay,
                             roc_curve,
                             r2_score,
                             mean_squared_error,
                             auc, accuracy_score)
from sklearn.feature_selection import SequentialFeatureSelector
import tqdm
import warnings
warnings.filterwarnings('ignore')
import pandas as pd
from scipy.io.arff import loadarff 

## Pre-settings

Carrega base:

In [10]:
# Location of the ARFF dataset file. Kept in a single named constant so the path
# can be changed in one place when the notebook is run on another machine.
DATA_PATH = r'C:\Users\LCSJUNI\OneDrive - Embraer\PES\Projeto ML II\dataset_6_letter.arff'

# loadarff returns a (records, metadata) pair; only the records are used here.
arff_records = loadarff(DATA_PATH)[0]
df_data = pd.DataFrame(arff_records)

# The 'class' column is loaded as bytes values (e.g. b'Z'). Decode them directly
# rather than regex-parsing the text of the bytes repr, which would silently turn
# any label that is not a single A-Z letter into NaN.
df_data['class'] = df_data['class'].str.decode('utf-8')

In [11]:
# Display the first rows as a quick check of the loaded columns and class values.
df_data.head()

Unnamed: 0,x-box,y-box,width,high,onpix,x-bar,y-bar,x2bar,y2bar,xybar,x2ybr,xy2br,x-ege,xegvy,y-ege,yegvx,class
0,2.0,4.0,4.0,3.0,2.0,7.0,8.0,2.0,9.0,11.0,7.0,7.0,1.0,8.0,5.0,6.0,Z
1,4.0,7.0,5.0,5.0,5.0,5.0,9.0,6.0,4.0,8.0,7.0,9.0,2.0,9.0,7.0,10.0,P
2,7.0,10.0,8.0,7.0,4.0,8.0,8.0,5.0,10.0,11.0,2.0,8.0,2.0,5.0,5.0,10.0,S
3,4.0,9.0,5.0,7.0,4.0,7.0,7.0,13.0,1.0,7.0,6.0,8.0,3.0,8.0,0.0,8.0,H
4,6.0,7.0,8.0,5.0,4.0,7.0,6.0,3.0,7.0,10.0,7.0,9.0,3.0,8.0,3.0,7.0,H


Converte coluna de classificação de categórica para numérica:

In [12]:
# Map the class values to integers (for decision-region visualisation purposes).
# factorize numbers the classes in order of first appearance and also returns the
# distinct original values; keep them so that integer predictions can be translated
# back: class_labels[code] is the original label for `code`.
# NOTE: that mapping is only meaningful on the first run of this cell. Re-running it
# on the already numeric column leaves the codes unchanged, but class_labels then
# holds the codes themselves instead of the original labels.
class_codes, class_labels = pd.factorize(df_data['class'])
df_data['class'] = class_codes

Separa colunas de features (X) e coluna de classificação (y):

In [13]:
# Feature matrix: every column except the last one.
X = df_data[df_data.columns[:-1]]
# Target vector: the last column of the frame.
y = df_data[df_data.columns[-1]]

Divide os dados em conjunto de treinamento (80%) e conjunto de teste (20%):

In [14]:
# Hold out 20% of the rows for testing, stratified on y so that the train and test
# parts keep the same class proportions. The split is seeded (with the same seed as
# the cross-validation splitter used further down) so that re-running the notebook
# reproduces the same partition instead of drawing a new random one each time.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=199
)

In [15]:
# Classifiers to evaluate, as (display name, unfitted estimator) pairs.
# The commented-out entries are other candidates with fixed hyperparameters that are
# currently disabled; uncomment a line to include that model in the experiment.
models_functions = [
    # ("Decision Tree",DecisionTreeClassifier(criterion='log_loss',max_depth=100,splitter='best')),
    # ("KNN",KNeighborsClassifier(n_neighbors=1,metric='euclidean')),
    # ("Random Forest", RandomForestClassifier(criterion='entropy',max_depth=1000,n_estimators=10)),
    # ("Logistic Regression", LogisticRegression(penalty='l2',C=0.5,solver='newton-cg')),
    # ("Gaussian", GaussianNB()),

    # random_state pins the network's random initialisation so that repeated runs
    # give the same result.
    # NOTE(review): per the scikit-learn docs, learning_rate is only used when
    # solver='sgd'; with the default solver this 'invscaling' setting has no effect.
    # Confirm which behaviour was intended.
    ("MLP", MLPClassifier(activation='tanh', hidden_layer_sizes=(16, 26),
                          learning_rate='invscaling', random_state=199)),

    # NOTE: decision_function_shape only affects the shape of decision_function()
    # output; it does not change what predict() returns.
    ("SVM", SVC(C=20, decision_function_shape='ovo', kernel='rbf')),
]

# SequentialFeatureSelector - Oficial

In [16]:
# Forward sequential feature selection followed by a hold-out evaluation, for every
# classifier in `models_functions` and for each requested number of features.

# Hold out the test rows BEFORE selecting features: fitting the selector on the full
# data would let the test rows influence which features are chosen and bias the
# reported scores upwards. The split is seeded and shared by every model / feature
# count, so all results are measured on the same test rows and are comparable.
X_train_full, X_test_full, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=199
)

# Shuffled, seeded stratified folds for the selector's internal cross-validation.
kf = StratifiedKFold(n_splits=10, shuffle=True, random_state=199)

for name, model in tqdm.tqdm(models_functions):

    for n_features in range(8, 13, 4):  # yields 8 and 12

        start = time.time()

        # Classifier name, framed by a border as wide as the header itself
        header = f"*{name} with - {n_features} features*"
        border = "*" * len(header)
        print(border)
        print(header)
        print(border)
        print("-"*50)

        # Select features using the training rows only, scored with the folds above
        sfs = SequentialFeatureSelector(model, n_features_to_select=n_features, direction='forward', cv=kf)
        sfs.fit(X_train_full, y_train)

        best_features_cv = sfs.get_feature_names_out()
        print(f"Best features to {name} with {n_features}: {best_features_cv}")

        # Keep only the best features
        X_best_cv = df_data[best_features_cv]
        X_train = X_train_full[best_features_cv]
        X_test = X_test_full[best_features_cv]

        # Fit the classifier on the reduced training set and score the held-out rows
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)

        print()
        print(classification_report(y_test, y_pred))
        print()

        # NOTE(review): the class codes are arbitrary integers produced by
        # pd.factorize, so MSE and R2 measure distances between unrelated labels.
        # They are kept so the printed output matches earlier runs, but accuracy and
        # the classification report are the meaningful metrics here.
        print(f"MSE to {name} with - {n_features} features:", mean_squared_error(y_test, y_pred))
        print(f"R2_score {name} with - {n_features} features:", r2_score(y_test, y_pred))
        print(f"Accuracy score {name} with - {n_features} features:", accuracy_score(y_test,y_pred))
        print()

        # Optional confusion-matrix plot (disabled):
        # print(f"Confusion Matrix to {name} with - {n_features} features:")
        # ConfusionMatrixDisplay(confusion_matrix=confusion_matrix(y_test, y_pred, labels=df_data['class'].unique()),display_labels=df_data['class'].unique()).plot()
        # print()

        end = time.time()

        print(f"Execution time to {name}: {(end-start):.2f}s")
        

  0%|          | 0/2 [00:00<?, ?it/s]

*****
*MLP with - 8 features*
*****
--------------------------------------------------
