# - Importação de bibliotecas -

In [None]:
import pandas as pd
import numpy as np

import warnings
from sklearn.exceptions import ConvergenceWarning

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import (
    StandardScaler,
    MinMaxScaler,
    LabelEncoder,
    OrdinalEncoder,
    OneHotEncoder,
)
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

from sklearn.pipeline import Pipeline

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier

import pickle
import ast

import os

# Ignore scikit-learn ConvergenceWarning messages for the rest of the session.
warnings.filterwarnings("ignore", category=ConvergenceWarning)

def salvar_melhores(df, nome_base="ranking_melhores", pasta="."):
    """Write one CSV per model, each holding only that model's rows.

    Args:
        df: DataFrame with a 'Modelo' column naming the model of each row.
        nome_base: Prefix used for every generated file name.
        pasta: Directory in which the files are written.

    Each file is named ``{nome_base}_{modelo}.csv`` (the model label is used
    verbatim) and its path is printed once the file has been written.
    """
    for modelo in df['Modelo'].unique():
        df_modelo = df[df['Modelo'] == modelo]
        if df_modelo.empty:
            # e.g. a missing (NaN) label never matches the equality filter.
            continue
        nome_arquivo = os.path.join(pasta, f"{nome_base}_{modelo}.csv")
        # Save the filtered rows: writing the full `df` here would make every
        # per-model file an identical copy of the whole ranking.
        df_modelo.to_csv(nome_arquivo, index=False)
        print(f"Arquivo salvo em: {nome_arquivo}")


# - Análise dos resultados dos testes -

In [None]:
# Stack the per-model ranking files into one table, ordered by the
# "F1-Score" column from highest to lowest.
df = pd.concat(
    [
        pd.read_csv(caminho)
        for caminho in (
            "ranking_melhores_Decision_Tree.csv",
            "ranking_melhores_KNN.csv",
            "ranking_melhores_MLP.csv",
            "ranking_melhores_Random_Forest.csv",
            "ranking_melhores_SVM.csv",
        )
    ]
).sort_values(by="F1-Score", ascending=False)

display(df)

Durante esses testes, definiu-se que o modelo com melhor desempenho foi o Random Forest, levando em consideração suas métricas. Por isso, ele será o modelo utilizado no modelo preditivo da matéria de Ciência de Dados, com os parâmetros especificados pelos testes aqui feitos:

# - Criando PKL para o melhor modelo de cada um -

In [None]:
# Unfitted estimators keyed by the label stored in the "Modelo" column of the
# ranking files. The stochastic ones get a fixed seed so that retraining with
# the same parameters yields the same model.
modelos = {
    "Decision Tree": DecisionTreeClassifier(class_weight="balanced", random_state=42),
    "Random Forest": RandomForestClassifier(class_weight="balanced", random_state=42),
    "SVM": SVC(),
    "KNN": KNeighborsClassifier(),
    "MLP Neural Net": MLPClassifier(max_iter=2000, early_stopping=True, random_state=42),
}

# Only the per-model ranking files are processed; any other CSV in the folder
# (including the dataset itself) lacks the ranking columns read below.
csv_files = [
    f for f in os.listdir(".")
    if f.startswith("ranking_melhores_") and f.endswith(".csv")
]

# The dataset is the same for every ranking file, so it is loaded once.
df_base = pd.read_csv("base_limpa_1.csv")
X = df_base.iloc[:, :-1]
y = df_base.iloc[:, -1]  # the last column is the target

if y.dtype == "object":
    y = LabelEncoder().fit_transform(y)

for file in csv_files:
    # Rows of failed grid searches carry no score/parameters and cannot be
    # retrained, so they are discarded before picking the winner.
    ranking_df = pd.read_csv(file).dropna(subset=["F1-Score", "Melhores Params"])
    if ranking_df.empty:
        print(f"Nenhum resultado válido em {file}; arquivo ignorado")
        continue

    melhor = ranking_df.sort_values(by="F1-Score", ascending=False).iloc[0]
    model_name = melhor["Modelo"]
    # The parameters were written to the CSV as the text form of a dict.
    best_params = ast.literal_eval(melhor["Melhores Params"])

    encoder_name = melhor["Encoder"]
    if encoder_name == "GetDummies":
        X_enc = pd.get_dummies(X, drop_first=True)
    else:
        if encoder_name == "OrdinalEncoder":
            enc = OrdinalEncoder()
        else:
            enc = OneHotEncoder(drop="first", sparse_output=False)
        X_enc = enc.fit_transform(X)
        if not isinstance(X_enc, (pd.DataFrame, pd.Series)):
            X_enc = pd.DataFrame(X_enc)

    if melhor["Scaler"] == "StandardScaler":
        scaler = StandardScaler()
    else:
        scaler = MinMaxScaler()

    X_train, X_test, y_train, y_test = train_test_split(
        X_enc, y, test_size=0.3, random_state=42
    )

    # Fit the scaler on the training split only, exactly as the grid-search
    # loop does; fitting it on the full data leaks test-set statistics.
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

    modelo = modelos[model_name]
    modelo.set_params(**best_params)
    modelo.fit(X_train, y_train)

    # NOTE: only the estimator is pickled; the encoder and scaler fitted above
    # are not saved alongside it.
    nome_pkl = f"melhor_{model_name}.pkl"
    with open(nome_pkl, "wb") as f:
        pickle.dump(modelo, f)

    print(f"Modelo {model_name} treinado e salvo em {nome_pkl}")

# - Testes -

## 1. Carregando o dataset

In [None]:
# Read the cleaned dataset: every column except the last is a feature and the
# last column is the target.
df = pd.read_csv("base_limpa_1.csv")
X, y = df.iloc[:, :-1], df.iloc[:, -1]

# A target stored as text is replaced by integer class codes.
y = LabelEncoder().fit_transform(y) if y.dtype == "object" else y

df.head()

## 2. Definindo Encoders e Scalers

In [None]:
# Encoding strategies by name. "GetDummies" has no encoder object (None); the
# grid-search loop calls pd.get_dummies for it instead.
encoders = dict(
    OrdinalEncoder=OrdinalEncoder(),
    OneHotEncoder=OneHotEncoder(drop="first", sparse_output=False),
    GetDummies=None,
)

# Scaling strategies, keyed by the scaler's class name.
scalers = {classe.__name__: classe() for classe in (StandardScaler, MinMaxScaler)}

## 3. Definindo Modelos e Parâmetros

In [None]:
# Every candidate model as (estimator, hyperparameter grid), keyed by the label
# written to the "Modelo" column of the results. Estimators that use randomness
# get a fixed seed so the ranking can be reproduced.
modelos = {
    "Decision Tree": (
        DecisionTreeClassifier(class_weight="balanced", random_state=42),
        {
            "criterion": ["gini", "entropy"],
            "splitter": ["best", "random"],
            "max_depth": [None, 5, 10],
            "min_samples_split": [2, 5],
            "min_samples_leaf": [1, 2],
            "max_features": [None, "sqrt"],
        },
    ),
    "Random Forest": (
        RandomForestClassifier(class_weight="balanced", random_state=42),
        {
            "n_estimators": [100, 200],
            "criterion": ["gini"],
            "max_depth": [None, 10],
            "min_samples_split": [2, 5],
            "min_samples_leaf": [1, 2],
            "max_features": ["sqrt"],
            "bootstrap": [True],
        },
    ),
    "SVM": (
        SVC(),
        {
            "C": [0.1, 1, 10],
            "kernel": ["linear", "rbf"],
            "gamma": ["scale"],
        },
    ),
    "KNN": (
        KNeighborsClassifier(),
        {
            "n_neighbors": [3, 5, 7],
            "weights": ["uniform"],
            "p": [2],
        },
    ),
    "MLP Neural Net": (
        MLPClassifier(max_iter=2000, early_stopping=True, random_state=42),
        {
            "hidden_layer_sizes": [(50,), (100,)],
            "activation": ["relu", "tanh"],
            "solver": ["adam"],
            "alpha": [0.0001, 0.001],
            # `learning_rate` is only used by solver="sgd"; with "adam" every
            # schedule trains the same network, so a single value is searched.
            "learning_rate": ["constant"],
        },
    ),
}


### 3.1 Separação por modelo (para rodar um por vez) 

In [None]:
# Decision Tree
# Replaces `modelos` so the grid-search loop evaluates this model alone.
# The seed makes splitter="random" / max_features="sqrt" runs reproducible.
modelos = {
    "Decision Tree": (
        DecisionTreeClassifier(class_weight="balanced", random_state=42),
        {
            "criterion": ["gini", "entropy"],
            "splitter": ["best", "random"],
            "max_depth": [None, 5, 10],
            "min_samples_split": [2, 5],
            "min_samples_leaf": [1, 2],
            "max_features": [None, "sqrt"],
        },
    ),
}

In [None]:
# Random Forest
# Replaces `modelos` so the grid-search loop evaluates this model alone.
# The seed fixes the bootstrap samples and feature subsets between runs.
modelos = {
    "Random Forest": (
        RandomForestClassifier(class_weight="balanced", random_state=42),
        {
            "n_estimators": [100, 200],
            "criterion": ["gini"],
            "max_depth": [None, 10],
            "min_samples_split": [2, 5],
            "min_samples_leaf": [1, 2],
            "max_features": ["sqrt"],
            "bootstrap": [True],
        },
    ),
}

In [None]:
# SVM
# Replaces `modelos` so the grid-search loop evaluates this model alone.
modelos = {
    "SVM": (
        SVC(),
        dict(C=[0.1, 1, 10], kernel=["linear", "rbf"], gamma=["scale"]),
    ),
}

In [None]:
# KNN
# Replaces `modelos` so the grid-search loop evaluates this model alone.
modelos = {
    "KNN": (
        KNeighborsClassifier(),
        dict(n_neighbors=[3, 5, 7], weights=["uniform"], p=[2]),
    ),
}

In [None]:
# MLP
# Replaces `modelos` so the grid-search loop evaluates this model alone.
# The grid is a list with one sub-grid per solver: `learning_rate_init` and
# `batch_size` only affect "adam", so crossing them with "lbfgs" would refit
# the same configuration several times. GridSearchCV accepts a list of grids.
modelos = {
    "MLP Neural Net": (
        MLPClassifier(max_iter=2000, early_stopping=True, random_state=42),
        [
            {
                "hidden_layer_sizes": [(50,), (100,), (200,), (100,50)],
                "activation": ["relu", "tanh", "logistic"],
                "solver": ["adam"],
                "alpha": [0.0001, 0.001],
                "learning_rate_init": [0.0001, 0.001, 0.01],
                "batch_size": [32, 64],
            },
            {
                "hidden_layer_sizes": [(50,), (100,), (200,), (100,50)],
                "activation": ["relu", "tanh", "logistic"],
                "solver": ["lbfgs"],
                "alpha": [0.0001, 0.001],
            },
        ],
    ),
}


## 4. Loop de Testes com GridSearchCV

In [None]:
# One row per (encoder, scaler, model) combination: the best parameters found
# by the grid search and the metrics on the held-out split, or the error text
# when that combination failed.
resultados = []

for enc_name, encoder in encoders.items():
    if enc_name == "GetDummies":
        X_enc = pd.get_dummies(X, drop_first=True)
    else:
        X_enc = encoder.fit_transform(X)
        if isinstance(X_enc, np.ndarray):
            X_enc = pd.DataFrame(X_enc)

    # The split depends only on the encoded data and the fixed seed, so it is
    # done once per encoder and shared by every scaler.
    X_train_enc, X_test_enc, y_train, y_test = train_test_split(
        X_enc, y, test_size=0.3, random_state=42
    )

    for sc_name, scaler in scalers.items():
        # The scaler is fitted on the training rows only and then applied to
        # the test rows.
        X_train = scaler.fit_transform(X_train_enc)
        X_test = scaler.transform(X_test_enc)

        for model_name, (modelo, param_grid) in modelos.items():
            linha = {
                "Encoder": enc_name,
                "Scaler": sc_name,
                "Modelo": model_name,
            }
            try:
                grid = GridSearchCV(
                    modelo, param_grid, cv=5, scoring="f1_weighted", n_jobs=-1
                )
                grid.fit(X_train, y_train)
                y_pred = grid.predict(X_test)

                linha.update({
                    "Melhores Params": grid.best_params_,
                    "Accuracy": accuracy_score(y_test, y_pred),
                    "Precision": precision_score(y_test, y_pred, average="weighted"),
                    "Recall": recall_score(y_test, y_pred, average="weighted"),
                    "F1-Score": f1_score(y_test, y_pred, average="weighted"),
                })

            except Exception as e:
                # Best effort: a failing combination is recorded with its error
                # message so the remaining combinations still run.
                linha.update({
                    "Melhores Params": None,
                    "Accuracy": None,
                    "Precision": None,
                    "Recall": None,
                    "F1-Score": None,
                    "Erro": str(e),
                })

            resultados.append(linha)

## 5. Ranking Final dos Modelos

In [None]:
# Turn the collected result rows into a table and order it by the "F1-Score"
# column, highest first.
resultados_df = pd.DataFrame(resultados)
melhores = resultados_df.sort_values("F1-Score", ascending=False)

print("Ranking final:")
display(melhores)

In [None]:
# Write the ranking to CSV, one file per model, using the default file-name
# prefix and the current directory.
salvar_melhores(melhores)