
# - Análise dos resultados dos testes -

In [15]:
# Imported here as well because this cell sits above the notebook's import
# cell; without it a fresh "Restart & Run All" fails with NameError on `pd`.
import pandas as pd

# Ranking CSVs to combine, one file per name suffix.
arquivos_ranking = [
    f"ranking_melhores_{sufixo}.csv"
    for sufixo in ("Decision_Tree", "KNN", "MLP", "Random_Forest", "SVM")
]

# Stack all rankings into one table and order it from the highest to the
# lowest F1-Score. The original row indices of each file are kept as-is.
df = pd.concat([pd.read_csv(arquivo) for arquivo in arquivos_ranking])
df = df.sort_values(by="F1-Score", ascending=False)

display(df)

Unnamed: 0,Encoder,Scaler,Modelo,Melhores Params,Accuracy,Precision,Recall,F1-Score,Erro
0,OrdinalEncoder,StandardScaler,Random Forest,"{'bootstrap': True, 'criterion': 'gini', 'max_...",0.951066,0.951026,0.951066,0.950869,
1,OrdinalEncoder,MinMaxScaler,Random Forest,"{'bootstrap': True, 'criterion': 'gini', 'max_...",0.949128,0.949053,0.949128,0.94894,
0,OrdinalEncoder,MinMaxScaler,Decision Tree,"{'criterion': 'gini', 'max_depth': 10, 'max_fe...",0.932171,0.932045,0.932171,0.932093,
1,OrdinalEncoder,StandardScaler,Decision Tree,"{'criterion': 'gini', 'max_depth': 10, 'max_fe...",0.931202,0.931165,0.931202,0.931182,
0,OrdinalEncoder,MinMaxScaler,SVM,"{'C': 10, 'gamma': 'scale', 'kernel': 'rbf'}",0.923934,0.924448,0.923934,0.923114,
2,GetDummies,MinMaxScaler,Random Forest,"{'bootstrap': True, 'criterion': 'gini', 'max_...",0.922965,0.924891,0.922965,0.921723,
3,GetDummies,StandardScaler,Random Forest,"{'bootstrap': True, 'criterion': 'gini', 'max_...",0.921996,0.923399,0.921996,0.920862,
1,OrdinalEncoder,StandardScaler,SVM,"{'C': 10, 'gamma': 'scale', 'kernel': 'rbf'}",0.921512,0.92186,0.921512,0.920708,
0,GetDummies,MinMaxScaler,MLP Neural Net,"{'activation': 'relu', 'alpha': 0.001, 'hidden...",0.916667,0.917189,0.916667,0.915692,
1,OneHotEncoder,MinMaxScaler,MLP Neural Net,"{'activation': 'relu', 'alpha': 0.0001, 'hidde...",0.916667,0.917731,0.916667,0.915503,


Durante esses testes, foi definido que o modelo com melhor desempenho foi o Random Forest, levando em consideração suas métricas. Portanto, ele será o modelo utilizado para o modelo preditivo na matéria de Ciência de Dados, com os parâmetros especificados pelos testes aqui feitos:

# - Testes -

## 1. Importação de bibliotecas

In [2]:
import pandas as pd
import numpy as np

import warnings
from sklearn.exceptions import ConvergenceWarning

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import (
    StandardScaler,
    MinMaxScaler,
    LabelEncoder,
    OrdinalEncoder,
    OneHotEncoder,
)
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

from sklearn.pipeline import Pipeline

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier

import os

warnings.filterwarnings("ignore", category=ConvergenceWarning)

def salvar_melhores(df, nome_base="ranking_melhores", pasta="."):
    """Save ``df`` as the next numbered ``<nome_base>_<n>.csv`` file in ``pasta``.

    The new number is one more than the highest number already used by a file
    named ``<nome_base>_<integer>.csv`` in ``pasta``; numbering starts at 1
    when no such file exists. Files that share the prefix but do not end in an
    integer (e.g. ``ranking_melhores_Decision_Tree.csv``) are ignored instead
    of crashing the integer conversion.

    Args:
        df: DataFrame to write; it is saved without its index.
        nome_base: File name prefix placed before the ``_<n>.csv`` suffix.
        pasta: Directory that is scanned for existing files and receives the
            new one.

    Raises:
        OSError: If ``pasta`` cannot be listed or the file cannot be written.
    """
    prefixo = f"{nome_base}_"
    extensao = ".csv"

    numeros_usados = []
    for nome in os.listdir(pasta):
        if not (nome.startswith(prefixo) and nome.endswith(extensao)):
            continue
        sufixo = nome[len(prefixo):-len(extensao)]
        # Only plain ASCII digit suffixes are run numbers; anything else
        # (model names, empty suffix, unicode digits) is another kind of file.
        if sufixo.isascii() and sufixo.isdigit():
            numeros_usados.append(int(sufixo))

    novo_num = max(numeros_usados, default=0) + 1

    nome_arquivo = os.path.join(pasta, f"{nome_base}_{novo_num}.csv")
    df.to_csv(nome_arquivo, index=False)
    print(f"Arquivo salvo em: {nome_arquivo}")


## 2. Carregando o dataset

In [None]:
# Load the cleaned dataset from disk.
df = pd.read_csv("base_limpa_1.csv")

# Every column except the last is a feature; the last column is the target.
X, y = df.iloc[:, :-1], df.iloc[:, -1]

# A target stored with object dtype is converted to integer class codes.
if y.dtype == "object":
    y = LabelEncoder().fit_transform(y)

# Preview of the first rows (rendered as the cell output).
df.head()

## 3. Definindo Encoders e Scalers

In [None]:
# Encoding strategies compared in the experiment, keyed by the label that is
# written to the results table. "GetDummies" carries no encoder object: the
# test loop handles that label with pd.get_dummies instead.
encoders = dict(
    OrdinalEncoder=OrdinalEncoder(),
    OneHotEncoder=OneHotEncoder(drop="first", sparse_output=False),
    GetDummies=None,
)

# Feature scalers compared in the experiment, keyed by their label.
scalers = dict(
    StandardScaler=StandardScaler(),
    MinMaxScaler=MinMaxScaler(),
)

## 4. Definindo Modelos e Parâmetros

In [None]:
# Candidate models, keyed by the label written to the results table.
# Each value is a pair: (base estimator, hyper-parameter grid for GridSearchCV).
#
# The estimators that draw random numbers (Decision Tree, Random Forest, MLP)
# receive a fixed random_state so that re-running the notebook reproduces the
# same ranking; 42 matches the seed used by train_test_split in the test loop.
modelos = {
    "Decision Tree": (
        DecisionTreeClassifier(class_weight="balanced", random_state=42),
        {
            "criterion": ["gini", "entropy"],
            "splitter": ["best", "random"],
            "max_depth": [None, 5, 10],
            "min_samples_split": [2, 5],
            "min_samples_leaf": [1, 2],
            "max_features": [None, "sqrt"],
        },
    ),
    "Random Forest": (
        RandomForestClassifier(class_weight="balanced", random_state=42),
        {
            "n_estimators": [100, 200],
            "criterion": ["gini"],
            "max_depth": [None, 10],
            "min_samples_split": [2, 5],
            "min_samples_leaf": [1, 2],
            "max_features": ["sqrt"],
            "bootstrap": [True],
        },
    ),
    "SVM": (
        SVC(),
        {
            "C": [0.1, 1, 10],
            "kernel": ["linear", "rbf"],
            "gamma": ["scale"],
        },
    ),
    "KNN": (
        KNeighborsClassifier(),
        {
            "n_neighbors": [3, 5, 7],
            "weights": ["uniform"],
            "p": [2],
        },
    ),
    "MLP Neural Net": (
        # early_stopping holds out part of the training data for validation;
        # the seed also fixes that internal split and the weight initialisation.
        MLPClassifier(max_iter=2000, early_stopping=True, random_state=42),
        {
            "hidden_layer_sizes": [(50,), (100,)],
            "activation": ["relu", "tanh"],
            "solver": ["adam"],
            "alpha": [0.0001, 0.001],
            "learning_rate": ["constant", "adaptive"],
        },
    ),
}


## 5. Loop de Testes com GridSearchCV

In [None]:
# One record per (encoder, scaler, model) combination. Every record has the
# same keys, so the resulting table has a stable schema whether or not any
# combination failed.
resultados = []

for enc_name, encoder in encoders.items():
    # NOTE(review): the encoder is fitted on the full X, before the
    # train/test split, so categories seen only in the test rows are known
    # to the encoder. Confirm whether this is acceptable for the evaluation.
    if enc_name == "GetDummies":
        X_enc = pd.get_dummies(X, drop_first=True)
    else:
        X_enc = encoder.fit_transform(X)
        if isinstance(X_enc, np.ndarray):
            X_enc = pd.DataFrame(X_enc)

    # The split depends only on the encoded data and a fixed seed, so it is
    # computed once per encoder instead of once per scaler.
    X_train_enc, X_test_enc, y_train, y_test = train_test_split(
        X_enc, y, test_size=0.3, random_state=42
    )

    for sc_name, scaler in scalers.items():
        # The scaler is fitted on the training part only and then applied to
        # the test part.
        X_train = scaler.fit_transform(X_train_enc)
        X_test = scaler.transform(X_test_enc)

        for model_name, (modelo, param_grid) in modelos.items():
            # Template record: metrics stay None unless the run succeeds.
            registro = {
                "Encoder": enc_name,
                "Scaler": sc_name,
                "Modelo": model_name,
                "Melhores Params": None,
                "Accuracy": None,
                "Precision": None,
                "Recall": None,
                "F1-Score": None,
                "Erro": None,
            }

            try:
                grid = GridSearchCV(
                    modelo, param_grid, cv=5, scoring="f1_weighted", n_jobs=-1
                )
                grid.fit(X_train, y_train)
                y_pred = grid.predict(X_test)

                # The dict literal is fully evaluated before update() runs,
                # so a failing metric leaves the template record untouched.
                registro.update({
                    "Melhores Params": grid.best_params_,
                    "Accuracy": accuracy_score(y_test, y_pred),
                    "Precision": precision_score(y_test, y_pred, average="weighted"),
                    "Recall": recall_score(y_test, y_pred, average="weighted"),
                    "F1-Score": f1_score(y_test, y_pred, average="weighted"),
                })
            except Exception as e:
                # Best-effort run: a failing combination is recorded with its
                # error message instead of aborting the whole experiment.
                registro["Erro"] = str(e)

            resultados.append(registro)

## 6. Ranking Final dos Modelos

In [None]:
# Turn the collected records into a table, one row per tested combination.
resultados_df = pd.DataFrame(resultados)

# Order the table from the highest to the lowest F1-Score.
melhores = resultados_df.sort_values(by=["F1-Score"], ascending=[False])

print("Ranking final:")
display(melhores)

In [None]:
# Persist the sorted ranking: with its default arguments, salvar_melhores
# writes it to a new numbered "ranking_melhores_<n>.csv" in the current folder.
salvar_melhores(melhores)