In [1]:
from pathlib import Path

import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (
    classification_report, confusion_matrix, accuracy_score,
    f1_score, precision_score, recall_score, balanced_accuracy_score
)
from sklearn.model_selection import train_test_split

<h1>Leitura e Importação do Dataset</h1>

In [2]:
# Location of the symptoms/diseases CSV, relative to the notebook's working directory.
DATA_PATH = 'Data/Final_Augmented_dataset_Diseases_and_Symptoms.csv'

# Load the whole file into a DataFrame; later cells read it through `df`.
df = pd.read_csv(filepath_or_buffer=DATA_PATH)

<h1>Tratamento do Dataset</h1>

In [3]:
# Target column: use 'diseases' when present, otherwise fall back to the first column.
target_col = 'diseases' if 'diseases' in df.columns else df.columns[0]

# Fill missing labels BEFORE casting to str. On an object column, astype(str)
# turns NaN into the literal string 'nan', so a fillna() placed after the cast
# never replaces anything.
y = df[target_col].fillna('Unknown').astype(str)

# Keep only binary (0/1) symptom columns.
# {0, 1} already matches 0.0/1.0 (equal values hash alike), so a single subset
# test is enough and each column's unique values are computed only once.
feature_cols = [c for c in df.columns if c != target_col]
binary_cols = [
    c for c in feature_cols
    if set(df[c].dropna().unique()).issubset({0, 1})
]

<h1>Treinamento com as Doenças Mais Frequentes</h1>

In [4]:
# Seed shared by the sampling, the train/test split and the forest.
random_state = 42

# Keep only the most frequent diseases.
# Labels are derived from `df` inside this cell, so the class ranking and the
# row filter compare the same (string) values and the cell does not rely on
# the `y` left behind by an earlier cell.
max_classes = 50
labels = df[target_col].astype(str)
has_label = df[target_col].notna()  # rows without a label cannot be a class
top_classes = labels[has_label].value_counts().nlargest(max_classes).index
df = df[has_label & labels.isin(top_classes)].copy()

# Down-sample (balanced): at most max_samples // max_classes rows per disease.
# Each group is sampled explicitly and the pieces are concatenated. This keeps
# the same per-class `sample` call as before while avoiding
# DataFrameGroupBy.apply on the grouping column, which pandas has deprecated.
max_samples = 16000
per_class = max_samples // max_classes
if len(df) > max_samples:
    df_sample = pd.concat([
        group.sample(n=min(len(group), per_class), random_state=random_state)
        for _, group in df.groupby(target_col)
    ])
else:
    df_sample = df

# Build X and y once, from whichever frame was selected above.
X = df_sample[binary_cols].fillna(0).astype(int)
y = df_sample[target_col].astype(str)

# Train/test split, stratified so every disease keeps its share in both parts.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=random_state, stratify=y
)

# Model training.
model = RandomForestClassifier(
    n_estimators=80,
    max_depth=18,
    min_samples_split=3,
    min_samples_leaf=2,
    class_weight='balanced',
    n_jobs=-1,
    random_state=random_state
)

model.fit(X_train, y_train)

  df_sample = df.groupby(target_col, group_keys=False).apply(


<h1>Exibição das Métricas</h1>

In [5]:
# Evaluation on the held-out test split.
preds = model.predict(X_test)

# Plain scores first, then the three macro-averaged ones, which all share the
# same keyword arguments. Insertion order fixes the order of the printout.
metrics = {
    'accuracy': accuracy_score(y_test, preds),
    'balanced_accuracy': balanced_accuracy_score(y_test, preds),
}
macro_scorers = (
    ('precision_macro', precision_score),
    ('recall_macro', recall_score),
    ('f1_macro', f1_score),
)
for metric_name, scorer in macro_scorers:
    metrics[metric_name] = scorer(y_test, preds, average='macro', zero_division=0)

print("\n📈 MÉTRICAS GERAIS DO MODELO:")
for metric_name, metric_value in metrics.items():
    print(f"  - {metric_name}: {metric_value:.4f}")

# Class labels the fitted model can predict.
used_diseases = list(model.classes_)

# Symptom importances, most important first.
importance_table = pd.DataFrame({
    'Symptom': list(X.columns),
    'Importance': model.feature_importances_,
})
feature_importances = importance_table.sort_values(by='Importance', ascending=False)


📈 MÉTRICAS GERAIS DO MODELO:
  - accuracy: 0.9209
  - balanced_accuracy: 0.9209
  - precision_macro: 0.9272
  - recall_macro: 0.9209
  - f1_macro: 0.9213


<h1>Salvar Modelo Treinado e Dados</h1>

In [6]:
# Output locations for the trained model and the two companion CSV files.
MODEL_PATH = 'Model/modelo_diagnostico.joblib'
DISEASES_PATH = 'Data/doencas_previstas.csv'
IMPORTANCE_PATH = 'Data/importancia_sintomas.csv'

# Create the destination folders first: neither joblib.dump nor to_csv creates
# missing directories, so without this a fresh checkout that has no 'Model/'
# folder fails with FileNotFoundError.
for output_path in (MODEL_PATH, DISEASES_PATH, IMPORTANCE_PATH):
    Path(output_path).parent.mkdir(parents=True, exist_ok=True)

# Persist the fitted model, the list of predictable diseases and the symptom ranking.
joblib.dump(model, MODEL_PATH)
pd.DataFrame({'Disease': used_diseases}).to_csv(DISEASES_PATH, index=False)
feature_importances.to_csv(IMPORTANCE_PATH, index=False)