In [78]:
import pandas as pd
import wandb
import params
from pycaret import *

def _create_table(file_path: str, class_labels):
    """
    Cria uma tabela com os parâmetros especificados e adiciona os dados do arquivo CSV.
    """
    # Colunas especificadas
    columns = [str(class_labels[_lab]) for _lab in list(class_labels)]
    table = wandb.Table(columns=["id"] + columns)

    # Ler os dados do arquivo CSV
    data = pd.read_csv(file_path)

    # Adicionar os dados na tabela
    for idx, row in data.iterrows():
      row_id = f"sample_{idx}"
      table.add_data(row_id, *[row[col] for col in columns])

    return table

def save_table_wb(file_path, nome):
    run = wandb.init(project=params.WANDB_PROJECT, entity=params.ENTITY, job_type="upload", config=params.CONFIG)
    
    raw_data_at = wandb.Artifact(name=params.RAW_DATA_AT, type="raw_data")
    
    table = _create_table(file_path, params.BDD_CLASSES)
    
    raw_data_at.add(table, nome)
    
    run.log_artifact(raw_data_at)
    run.finish()

save_table_wb('Data/diabetes_prediction_dataset.csv', 'diabetes_prediction__table')

In [79]:
from sklearn.utils import resample
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OrdinalEncoder, StandardScaler
from sklearn.model_selection import train_test_split
import pandas as pd


def load_df():
    df = pd.read_csv('Data/diabetes_prediction_dataset.csv')
    return df

def balance_df():
    df = load_df()
    class_majority = df[df['diabetes'] == 0]
    class_minority = df[df['diabetes'] == 1]
    
    class_majority_downsampled = resample(class_majority,
                                          replace=False,
                                          n_samples=len(class_minority),
                                          random_state=42)
    
    df = pd.concat([class_majority_downsampled, class_minority])
    df.to_csv('Data/dataset_balanceado.csv', index=False)
    save_table_wb('Data/dataset_balanceado.csv', "diabetes_prediction__table_balanceada")
    return df


In [80]:
import great_expectations as ge

def set_expectations(df):
    diabetes_expectation_suite = ge.core.ExpectationSuite(
        expectation_suite_name="diabetes_expectation_suite"
    )
    
    diabetes_expectation_suite.add_expectation(
        ge.core.ExpectationConfiguration(
            expectation_type="expect_column_min_to_be_between",
            kwargs={
                "column": "blood_glucose_level",
                "min_value": 69,
                "max_value": 500,
                "strict_min": True
            }
        )
    )
    
    diabetes_expectation_suite.add_expectation(
        ge.core.ExpectationConfiguration(
            expectation_type="expect_column_values_to_be_between",
            kwargs={
                "column": "bmi",
                "min_value": 9,
                "max_value": 150
            }
        )
    )
    
    diabetes_expectation_suite.add_expectation(
        ge.core.ExpectationConfiguration(
            expectation_type="expect_column_values_to_be_in_set",
            kwargs={
                "column": "hypertension",
                "value_set": [0, 1]
            }
        )
    )
    
    data_asset = ge.from_pandas(df)
    validation_results = data_asset.validate(expectation_suite=diabetes_expectation_suite)
    
    assert validation_results.success, "Interrompendo execução: validação falhou."


In [91]:
from pycaret.classification import *
import os
from sklearn.metrics import accuracy_score
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score, roc_auc_score
import wandb

def upload_metrics_to_wandb(accuracy, recall, precision, f1):
    metrics_table = wandb.Table(columns=["Metric", "Value"])
    metrics_table.add_data("Accuracy", accuracy)
    metrics_table.add_data("Recall", recall)
    metrics_table.add_data("Precision", precision)
    metrics_table.add_data("F1-Score", f1)
    wandb.log({"Model Metrics": metrics_table})

def find_model():
    run = wandb.init(project=params.WANDB_PROJECT, entity=params.ENTITY, job_type="upload", config=params.CONFIG)
    
    exp = ClassificationExperiment()
    exp.setup(df, target='diabetes', session_id=123)
    
    best = exp.compare_models()
    best_tuned = exp.tune_model(best)
    print(best_tuned)

    exp.plot_model(best, plot='feature', save=True)
    wandb.log({"feature_importance": wandb.Image('Feature Importance.png')})
    
    final_model = exp.finalize_model(best_tuned)
    exp.save_model(final_model, 'Model/diabetes_model')
    
    evaluation_results = exp.evaluate_model(final_model)
    
    filename = "confusion_matrix.png"
    exp.plot_model(final_model, plot="confusion_matrix", save=True)
    wandb.log({"Matriz de Confusão": wandb.Image('Confusion Matrix.png')})
    
    predictions = exp.predict_model(final_model)
    
    y_true = predictions['diabetes'] 
    y_pred = predictions['prediction_label'] 
    
    accuracy = accuracy_score(y_true, y_pred)
    recall = recall_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred)
    f1 = f1_score(y_true, y_pred)

    upload_metrics_to_wandb(accuracy, recall, precision, f1)

    with open('Results/metrics.txt', 'w') as f:
        f.write(f"Acuracia do modelo: {round(accuracy, 3)}\n")
        f.write(f"Recall: {round(recall, 3)}\n")
        f.write(f"Precisão: {round(precision, 3)}\n")
        f.write(f"F1-Score: {round(f1, 3)}\n")
    
    model_filename = 'Model/diabetes_model.pkl'
    artifact = wandb.Artifact('diabetes_model', type='model')
    artifact.add_file(model_filename)
    wandb.log_artifact(artifact)
    
    wandb.finish()



In [92]:
def build_model():
    df = balance_df()
    set_expectations(df)
    find_model()

build_model()

Unnamed: 0,Description,Value
0,Session id,123
1,Target,diabetes
2,Target type,Binary
3,Original data shape,"(17000, 9)"
4,Transformed data shape,"(17000, 16)"
5,Transformed train set shape,"(11900, 16)"
6,Transformed test set shape,"(5100, 16)"
7,Numeric features,6
8,Categorical features,2
9,Preprocess,True


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
ada,Ada Boost Classifier,0.9134,0.979,0.9173,0.9104,0.9137,0.8267,0.827,0.052
gbc,Gradient Boosting Classifier,0.9129,0.9796,0.9225,0.9054,0.9138,0.8259,0.8263,0.119
lightgbm,Light Gradient Boosting Machine,0.9097,0.9779,0.9161,0.9047,0.9103,0.8195,0.8197,0.133
rf,Random Forest Classifier,0.9013,0.9722,0.904,0.8993,0.9016,0.8027,0.8028,0.097
et,Extra Trees Classifier,0.8952,0.9643,0.8982,0.8931,0.8955,0.7904,0.7906,0.095
lda,Linear Discriminant Analysis,0.8862,0.9617,0.8771,0.8935,0.8851,0.7724,0.7729,0.016
ridge,Ridge Classifier,0.8861,0.9617,0.877,0.8935,0.885,0.7723,0.7727,0.014
lr,Logistic Regression,0.8851,0.9629,0.8834,0.8866,0.8848,0.7703,0.7706,0.113
dt,Decision Tree Classifier,0.8773,0.8774,0.8768,0.8779,0.8772,0.7546,0.7548,0.016
svm,SVM - Linear Kernel,0.876,0.9612,0.8467,0.9029,0.8712,0.7519,0.7571,0.025


Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.9059,0.9771,0.921,0.894,0.9073,0.8118,0.8121
1,0.9042,0.9776,0.8941,0.9125,0.9032,0.8084,0.8086
2,0.9101,0.9788,0.9378,0.8885,0.9125,0.8202,0.8214
3,0.9008,0.9751,0.9193,0.8865,0.9026,0.8017,0.8022
4,0.9143,0.9795,0.9277,0.9034,0.9154,0.8286,0.8289
5,0.9034,0.9729,0.9345,0.8797,0.9063,0.8067,0.8083
6,0.8882,0.9732,0.9059,0.875,0.8902,0.7765,0.777
7,0.8983,0.9716,0.9008,0.8963,0.8986,0.7966,0.7966
8,0.9109,0.979,0.9597,0.8744,0.9151,0.8218,0.8258
9,0.9092,0.9771,0.916,0.9038,0.9098,0.8185,0.8186


Fitting 10 folds for each of 10 candidates, totalling 100 fits
Original model was better than the tuned model, hence it will be returned. NOTE: The display metrics are for the tuned model (not the original one).
AdaBoostClassifier(algorithm='SAMME.R', estimator=None, learning_rate=1.0,
                   n_estimators=50, random_state=123)


Transformation Pipeline and Model Successfully Saved


interactive(children=(ToggleButtons(description='Plot Type:', icons=('',), options=(('Pipeline Plot', 'pipelin…

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Ada Boost Classifier,0.9125,0.9802,0.9137,0.9116,0.9127,0.8251,0.8251


VBox(children=(Label(value='0.060 MB of 0.060 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

In [83]:
!pytest Tests/test_diabetes_classification.py

platform win32 -- Python 3.11.9, pytest-8.3.4, pluggy-1.5.0
rootdir: C:\Users\abia0\OneDrive\Área de Trabalho\Projetos\Faculdade\topicos_especiais\Diabetes-Classification\Diabetes-Prediction
plugins: anyio-4.6.2.post1, dash-2.18.2
collected 6 items

Tests\test_diabetes_classification.py ......                             [100%]

