In [1]:
import pandas as pd
import os
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, f1_score, recall_score
from sklearn.preprocessing import StandardScaler, MinMaxScaler, MaxAbsScaler, RobustScaler
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier
from sklearn.feature_selection import SelectKBest, f_classif
from imblearn.over_sampling import SMOTE

In [2]:
def load(file_path):
    df = pd.read_csv(file_path)
    return df

In [3]:
def train_evaluate_model(model, X_train, X_test, y_train, y_test):
    model.fit(X_train, y_train)

    train_pred = model.predict(X_train)
    test_pred = model.predict(X_test)

    accuracy_train = accuracy_score(y_train, train_pred)
    accuracy_test = accuracy_score(y_test, test_pred)

    precision_train = precision_score(y_train, train_pred, average=None)
    precision_test = precision_score(y_test, test_pred, average=None)

    recall_train = recall_score(y_train, train_pred, average=None)
    recall_test = recall_score(y_test, test_pred, average=None)

    f1_train = f1_score(y_train, train_pred, average=None)
    f1_test = f1_score(y_test, test_pred, average=None)

    return {  
        "Accuracy_train": accuracy_train,
        "Accuracy_test": accuracy_test,
        "Precision_train": precision_train,
        "Precision_test": precision_test,
        "Recall_train": recall_train,
        "Recall_test": recall_test,
        "F1_train": f1_train,
        "F1_test": f1_test,
    }

In [4]:
def prepare_data(data, features, target, test_size=0.2, scaler=None, k_best=10):
    X = data[features]
    y = data[target]

    smote = SMOTE(random_state=101)
    X, y = smote.fit_resample(X, y)

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=101)

    if scaler:
        X_train = scaler.fit_transform(X_train)
        X_test = scaler.transform(X_test)

    selector = SelectKBest(score_func=f_classif, k=k_best)
    X_train = selector.fit_transform(X_train, y_train)
    X_test = selector.transform(X_test)

    return X_train, X_test, y_train, y_test

In [11]:
scalers = {
    "StandardScaler": StandardScaler(),
    "RobustScaler": RobustScaler(),
    "MinMaxScaler": MinMaxScaler(),
    "MaxAbsScaler": MaxAbsScaler()
}

models = {
    "Logistic": LogisticRegression(random_state=101),
    "SVC": SVC(random_state=101, max_iter=8000),
    "MLP": MLPClassifier(random_state=101, max_iter=2500),
    "GaussianNB": GaussianNB(),
    "KNN": KNeighborsClassifier(),
    "GradientBoosting": GradientBoostingClassifier(random_state=101),
    "RandomForest": RandomForestClassifier(random_state=101),
    "DecisionTree": DecisionTreeClassifier(random_state=101),
    "XGBoost": XGBClassifier(random_state=101)
}

In [12]:
file_path = os.path.join("C:/", "Users", "caioc", "Documents", "ProjetosData", "Obesidade", "data", "obesidade_encoded.csv")
if os.path.exists(file_path):
    print(f"O arquivo existe no caminho: {file_path}")

    data = load(file_path=file_path)

    features = ['Idade', 'Genero', 'Altura', 'Peso', 'Freq_Alcool',
       'Freq_Densidade_Calorica', 'Freq_Vegetais', 'Freq_Refeicoes',
       'Monitoramento_Calorias_Diarias', 'Fumante', 'Agua_Diaria',
       'Historico_Familiar', 'Atividade_Fisica', 'Freq_Tecnologia',
       'Alimento_Entre_Refeicoes_Frequentemente',
       'Alimento_Entre_Refeicoes_Negativo',
       'Alimento_Entre_Refeicoes_Ocasionalmente',
       'Alimento_Entre_Refeicoes_Sempre', 'Meio_Transporte_Automovel',
       'Meio_Transporte_Bicicleta', 'Meio_Transporte_Caminhada',
       'Meio_Transporte_Moto', 'Meio_Transporte_Transporte_Publico']
    target = 'Nivel_Obesidade'

    results = {}

    for scaler_name, scaler in scalers.items():
        print(f"\nUsando o Scaler: {scaler_name}")
        X_train, X_test, y_train, y_test = prepare_data(data, features, target, test_size=0.2, scaler=scaler, k_best=10)

        scaler_results = {}

        for model_name, model in models.items():
            print(f"\nTreinando: {model_name}...")
            model_results = train_evaluate_model(model, X_train, X_test, y_train, y_test)
            scaler_results[model_name] = model_results

        results[scaler_name] = scaler_results

    for scaler_name, scaler_result in results.items():
        print(f"\nResultados com {scaler_name}:")
        for model_name, metrics in scaler_result.items():
            print(f"\n{model_name}:")
            for metric_name, value in metrics.items():
                print(f"{metric_name}: {value}")

    example = X_test[[0]]
    for model_name, model in models.items():
        prediction = model.predict(example)
        actual = y_test.iloc[0]
        print(f"\n{model_name} - Exemplo de Previsão vs Realidade:")
        print(f"Previsão: {prediction[0]}")
        print(f"Realidade: {actual:.4f}")
else:
    print(f"O arquivo não foi encontrado no caminho: {file_path}")


O arquivo existe no caminho: C:/Users\caioc\Documents\ProjetosData\Obesidade\data\obesidade_encoded.csv

Usando o Scaler: StandardScaler

Treinando: Logistic...

Treinando: SVC...

Treinando: MLP...

Treinando: GaussianNB...

Treinando: KNN...

Treinando: GradientBoosting...

Treinando: RandomForest...

Treinando: DecisionTree...

Treinando: XGBoost...

Usando o Scaler: RobustScaler

Treinando: Logistic...

Treinando: SVC...

Treinando: MLP...

Treinando: GaussianNB...

Treinando: KNN...

Treinando: GradientBoosting...

Treinando: RandomForest...

Treinando: DecisionTree...

Treinando: XGBoost...

Usando o Scaler: MinMaxScaler

Treinando: Logistic...

Treinando: SVC...


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(



Treinando: MLP...

Treinando: GaussianNB...

Treinando: KNN...

Treinando: GradientBoosting...

Treinando: RandomForest...

Treinando: DecisionTree...

Treinando: XGBoost...

Usando o Scaler: MaxAbsScaler

Treinando: Logistic...

Treinando: SVC...


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(



Treinando: MLP...

Treinando: GaussianNB...

Treinando: KNN...

Treinando: GradientBoosting...

Treinando: RandomForest...

Treinando: DecisionTree...

Treinando: XGBoost...

Resultados com StandardScaler:

Logistic:
Accuracy_train: 0.9099236641221374
Accuracy_test: 0.8943089430894309
Precision_train: [0.95571956 0.94237288 0.98958333 0.90584416 0.91286307 0.80487805
 0.85818182]
Precision_test: [0.96296296 0.90909091 1.         0.91780822 0.9122807  0.75268817
 0.875     ]
Recall_train: [0.89619377 0.99285714 1.         0.98239437 0.78853047 0.85239852
 0.85198556]
Recall_test: [0.83870968 0.98591549 1.         1.         0.72222222 0.875
 0.85135135]
F1_train: [0.925      0.96695652 0.9947644  0.94256757 0.84615385 0.82795699
 0.85507246]
F1_test: [0.89655172 0.94594595 1.         0.95714286 0.80620155 0.80924855
 0.8630137 ]

SVC:
Accuracy_train: 0.9246819338422392
Accuracy_test: 0.8760162601626016
Precision_train: [0.96071429 0.98571429 0.99298246 0.94137931 0.83941606 0.86231884
