In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import classification_report, accuracy_score
import joblib
import json
import warnings
import time
warnings.filterwarnings('ignore')

# ==============================================================================
# 1. CARREGAMENTO E PRÉ-PROCESSAMENTO
# ==============================================================================

print("=" * 80)
print("SISTEMA DE DIAGNÓSTICO MÉDICO - STACKING ULTRA-RÁPIDO")
print("=" * 80)

# Wall-clock reference for the total training time reported after evaluation.
start_time = time.time()

print("\n[1/7] Carregando dataset...")
df = pd.read_csv('Final_Augmented_dataset_Diseases_and_Symptoms.csv')

# Drop diseases that occur only once. The counts are computed a single time
# and reused for both the threshold test and the index lookup.
disease_counts = df['diseases'].value_counts()
df = df[df['diseases'].isin(disease_counts[disease_counts > 1].index)]

print(f"   ✓ Dataset: {df.shape[0]} amostras, {df.shape[1]} colunas")
print(f"   ✓ Doenças: {df['diseases'].nunique()} | Sintomas: {df.shape[1] - 1}")

# Separate features and target. The feature names come from the column index
# directly, so the full frame is copied by drop() only once (for X).
feature_names = df.columns.drop('diseases').tolist()
X = df.drop('diseases', axis=1).values.astype(np.int8)  # int8 to save memory
y = df['diseases'].values

# Encode the disease names as integer class labels.
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

# ==============================================================================
# 2. DIVISÃO DOS DADOS
# ==============================================================================

print("\n[2/7] Dividindo dados...")

# Stratified 80/20 hold-out split with a fixed seed, so every disease keeps
# the same proportion in both partitions and the split is reproducible.
split_options = {'test_size': 0.2, 'random_state': 42, 'stratify': y_encoded}
X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, **split_options)

n_train_rows, n_test_rows = X_train.shape[0], X_test.shape[0]
print(f"   ✓ Treino: {n_train_rows:,} | Teste: {n_test_rows:,}")

# ==============================================================================
# 3. BASE LEARNER 1: RANDOM FOREST (PARALELO)
# ==============================================================================

print("\n[3/7] Treinando Base Learner 1: Random Forest...")
t1 = time.time()

# Hyper-parameters are kept in one mapping so the configuration reads as data.
rf_params = {
    'n_estimators': 30,        # reduced from 50
    'max_depth': 12,           # reduced from 15
    'min_samples_split': 20,
    'min_samples_leaf': 10,
    'max_features': 'sqrt',
    'max_samples': 0.7,        # each tree is built from 70% of the rows
    'random_state': 42,
    'n_jobs': -1,
    'verbose': 0,
    'warm_start': False,
}
rf_model = RandomForestClassifier(**rf_params)
rf_model.fit(X_train, y_train)

# Hold-out accuracy of this base learner on its own.
rf_test_pred = rf_model.predict(X_test)
rf_acc = accuracy_score(y_test, rf_test_pred)

print(f"   ✓ Concluído em {time.time()-t1:.1f}s | Acurácia: {rf_acc:.4f} ({rf_acc*100:.2f}%)")

# ==============================================================================
# 4. BASE LEARNER 2: EXTRA TREES (MUITO MAIS RÁPIDO QUE GB)
# ==============================================================================

print("\n[4/7] Treinando Base Learner 2: Extra Trees...")
t2 = time.time()

# Same budget as the random forest; bootstrap is switched on explicitly
# alongside max_samples so each tree is built from a 70% row sample.
et_params = {
    'n_estimators': 30,
    'max_depth': 12,
    'min_samples_split': 20,
    'min_samples_leaf': 10,
    'max_features': 'sqrt',
    'max_samples': 0.7,
    'random_state': 42,
    'n_jobs': -1,
    'verbose': 0,
    'bootstrap': True,
}
et_model = ExtraTreesClassifier(**et_params)
et_model.fit(X_train, y_train)

# Hold-out accuracy of this base learner on its own.
et_test_pred = et_model.predict(X_test)
et_acc = accuracy_score(y_test, et_test_pred)

print(f"   ✓ Concluído em {time.time()-t2:.1f}s | Acurácia: {et_acc:.4f} ({et_acc*100:.2f}%)")

# ==============================================================================
# 5. BASE LEARNER 3: NAIVE BAYES (INSTANTÂNEO)
# ==============================================================================

print("\n[5/7] Treinando Base Learner 3: Naive Bayes...")
t3 = time.time()

# Gaussian Naive Bayes with default settings, fitted on the raw feature matrix.
nb_model = GaussianNB()
nb_model.fit(X_train, y_train)

# Hold-out accuracy of this base learner on its own.
nb_test_pred = nb_model.predict(X_test)
nb_acc = accuracy_score(y_test, nb_test_pred)

print(f"   ✓ Concluído em {time.time()-t3:.1f}s | Acurácia: {nb_acc:.4f} ({nb_acc*100:.2f}%)")

# ==============================================================================
# 6. STACKING: META-LEARNER
# ==============================================================================

print("\n[6/7] Criando Stacking Ensemble...")
t4 = time.time()

# Imported here so this section is self-contained. cross_val_predict clones
# the estimator for every fold, so the already-fitted base learners are left
# untouched and remain usable for test-time predictions.
from sklearn.model_selection import cross_val_predict

# Number of folds used to build the out-of-fold training meta-features.
META_CV_FOLDS = 3

print("   → Gerando predições dos base learners...")

# Training meta-features must be OUT-OF-FOLD predictions. Feeding the
# meta-learner probabilities that the base learners produced for the very rows
# they were fitted on leaks the target: those probabilities are far more
# confident than anything seen at inference time, so the meta-learner learns
# to over-trust them. With an integer cv and a classifier, scikit-learn uses
# stratified folds; a class missing from a training fold gets probability 0
# for that fold, which keeps the column layout identical across folds.
rf_proba_train = cross_val_predict(
    rf_model, X_train, y_train, cv=META_CV_FOLDS, method='predict_proba'
)
et_proba_train = cross_val_predict(
    et_model, X_train, y_train, cv=META_CV_FOLDS, method='predict_proba'
)
nb_proba_train = cross_val_predict(
    nb_model, X_train, y_train, cv=META_CV_FOLDS, method='predict_proba'
)

# Test meta-features come from the base learners fitted on the full training
# split; the test rows were never seen by them, so no leakage occurs here.
rf_proba_test = rf_model.predict_proba(X_test)
et_proba_test = et_model.predict_proba(X_test)
nb_proba_test = nb_model.predict_proba(X_test)

# Concatenate the per-model class probabilities as meta-learner features.
X_train_meta = np.hstack([rf_proba_train, et_proba_train, nb_proba_train])
X_test_meta = np.hstack([rf_proba_test, et_proba_test, nb_proba_test])

print(f"   → Features meta: {X_train_meta.shape[1]:,} colunas")

# Train the meta-learner on the stacked probabilities.
print("   → Treinando Meta-Learner (Logistic Regression)...")
meta_learner = LogisticRegression(
    max_iter=300,
    random_state=42,
    n_jobs=-1,
    solver='lbfgs',
    verbose=0
)
meta_learner.fit(X_train_meta, y_train)

print(f"   ✓ Stacking concluído em {time.time()-t4:.1f}s")

# ==============================================================================
# 7. AVALIAÇÃO FINAL
# ==============================================================================

print("\n[7/7] Avaliando Stacking Ensemble...")

# Score the meta-learner on both partitions of the meta-feature matrices.
y_pred_train, y_pred_test = (
    meta_learner.predict(meta_matrix)
    for meta_matrix in (X_train_meta, X_test_meta)
)

train_acc = accuracy_score(y_train, y_pred_train)
test_acc = accuracy_score(y_test, y_pred_test)

print(f"\n   🎯 ACURÁCIA TREINO: {train_acc:.4f} ({train_acc*100:.2f}%)")
print(f"   🎯 ACURÁCIA TESTE:  {test_acc:.4f} ({test_acc*100:.2f}%)")

# Detailed metrics: only the support-weighted averages are displayed.
report = classification_report(y_test, y_pred_test, output_dict=True, zero_division=0)
weighted_avg = report['weighted avg']
print("\n   📊 Métricas Médias (Weighted):")
for caption, metric_key in (
    ("Precisão:  ", 'precision'),
    ("Recall:    ", 'recall'),
    ("F1-Score:  ", 'f1-score'),
):
    print(f"      {caption}{weighted_avg[metric_key]:.4f}")

# Elapsed time since the script's start_time reference.
total_time = time.time() - start_time
print(f"\n   ⏱️  Tempo total de treinamento: {total_time:.1f}s ({total_time/60:.1f} min)")

# ==============================================================================
# 8. SALVAR MODELOS
# ==============================================================================

print("\n" + "=" * 80)
print("SALVANDO MODELOS...")
print("=" * 80)

# Persist every fitted object under the file name the prediction function
# reads back; the order matches the original sequence of dumps.
artifacts = (
    ('rf_model.pkl', rf_model),
    ('et_model.pkl', et_model),
    ('nb_model.pkl', nb_model),
    ('meta_learner.pkl', meta_learner),
    ('label_encoder.pkl', label_encoder),
)
for artifact_path, artifact in artifacts:
    joblib.dump(artifact, artifact_path)

# Feature order is stored as JSON so input vectors can be rebuilt at
# prediction time in the same column order used for training.
with open('feature_names.json', 'w') as names_file:
    names_file.write(json.dumps(feature_names))

print("   ✓ Todos os modelos salvos!")

# ==============================================================================
# 9. FUNÇÃO DE PREDIÇÃO
# ==============================================================================

def predict_disease_from_json(json_input):
    """Predict a disease from a JSON description of symptoms.

    Symptoms that are not supplied default to 0 (absent). Keys that do not
    match a stored feature name are ignored.

    Args:
        json_input: dict mapping symptom name to a value convertible with
            ``int()``, or a JSON string encoding such a dict.

    Returns:
        dict with keys ``'predicted_disease'`` (decoded label),
        ``'confidence'`` (meta-learner probability of that label) and
        ``'top_5_predictions'`` (list of ``{'disease', 'probability'}``
        dicts for the five most probable classes, most probable first).
    """
    # Artifacts are re-read from the working directory on every call.
    # NOTE(review): joblib.load unpickles its input; only point it at files
    # written by this pipeline, never at untrusted data.
    rf = joblib.load('rf_model.pkl')
    et = joblib.load('et_model.pkl')
    nb = joblib.load('nb_model.pkl')
    meta = joblib.load('meta_learner.pkl')
    encoder = joblib.load('label_encoder.pkl')

    with open('feature_names.json', 'r') as f:
        features = json.load(f)

    # Accept either a parsed dict or its JSON text.
    if isinstance(json_input, str):
        json_input = json.loads(json_input)

    # name -> column position, built once so each symptom is an O(1) lookup
    # instead of a linear `in` test plus a linear `list.index` scan.
    # setdefault keeps the first position for a repeated name, matching
    # what list.index would return.
    feature_index = {}
    for position, name in enumerate(features):
        feature_index.setdefault(name, position)

    # Build the input vector (default = 0 for every symptom).
    feature_vector = np.zeros(len(features), dtype=np.int8)
    for symptom, value in json_input.items():
        position = feature_index.get(symptom)
        if position is not None:
            feature_vector[position] = int(value)

    feature_vector = feature_vector.reshape(1, -1)

    # Base-learner probabilities, concatenated in the same order used when
    # the meta-learner was trained (RF, ET, NB).
    meta_features = np.hstack(
        [model.predict_proba(feature_vector) for model in (rf, et, nb)]
    )

    # Final prediction.
    prediction = meta.predict(meta_features)[0]
    probabilities = meta.predict_proba(meta_features)[0]

    # predict_proba columns follow meta.classes_. Translate between column
    # positions and class labels explicitly rather than assuming the labels
    # are exactly 0..n-1.
    class_labels = meta.classes_
    predicted_column = int(np.flatnonzero(class_labels == prediction)[0])

    # Five most probable columns, most probable first; decode them in one call.
    top_5_columns = np.argsort(probabilities)[-5:][::-1]
    top_5_names = encoder.inverse_transform(class_labels[top_5_columns])
    top_5 = [
        {'disease': name, 'probability': float(probabilities[column])}
        for name, column in zip(top_5_names, top_5_columns)
    ]

    return {
        'predicted_disease': encoder.inverse_transform([prediction])[0],
        'confidence': float(probabilities[predicted_column]),
        'top_5_predictions': top_5
    }

print("\n✅ Função criada: predict_disease_from_json()")

# ==============================================================================
# 10. EXAMPLE
# ==============================================================================

heavy_rule = "=" * 80
light_rule = "-" * 80

print("\n" + heavy_rule)
print("TESTE DE PREDIÇÃO")
print(heavy_rule)

# Sample request: every listed symptom is marked as present (1).
example = {
    symptom: 1
    for symptom in ("fever", "cough", "fatigue", "headache", "shortness of breath")
}

print("\n📝 Sintomas:")
for symptom_name, flag in example.items():
    print(f"   • {symptom_name}: {flag}")

result = predict_disease_from_json(example)

print("\n" + light_rule)
print("🏥 RESULTADO:")
print(light_rule)
print(f"\nDoença Prevista: {result['predicted_disease']}")
print(f"Confiança: {result['confidence']*100:.2f}%")

print("\n📋 Top 5 Diagnósticos Mais Prováveis:")
rank = 0
for candidate in result['top_5_predictions']:
    rank += 1
    probability = candidate['probability']
    # Text bar scaled so that a probability of 1.0 spans 50 characters.
    bar = "█" * int(probability * 50)
    print(f"{rank}. {candidate['disease']}")
    print(f"   {bar} {probability*100:.2f}%")

# Closing banner with a short usage reminder and the model line-up.
closing_lines = (
    "\n" + heavy_rule,
    "✅ SISTEMA PRONTO PARA USO!",
    heavy_rule,
    "\n💡 Uso:",
    "   resultado = predict_disease_from_json({'fever': 1, 'cough': 1})",
    "   resultado = predict_disease_from_json({})  # Todos sintomas = 0",
    "\n⚡ Modelos:",
    "   • Random Forest (30 árvores)",
    "   • Extra Trees (30 árvores) - Substitui Gradient Boosting",
    "   • Naive Bayes",
    "   • Meta-Learner: Logistic Regression",
    heavy_rule,
)
for line in closing_lines:
    print(line)

SISTEMA DE DIAGNÓSTICO MÉDICO - STACKING ULTRA-RÁPIDO

[1/7] Carregando dataset...
   ✓ Dataset: 246926 amostras, 378 colunas
   ✓ Doenças: 754 | Sintomas: 377

[2/7] Dividindo dados...
   ✓ Treino: 197,540 | Teste: 49,386

[3/7] Treinando Base Learner 1: Random Forest...
   ✓ Concluído em 19.5s | Acurácia: 0.3916 (39.16%)

[4/7] Treinando Base Learner 2: Extra Trees...
   ✓ Concluído em 18.5s | Acurácia: 0.3912 (39.12%)

[5/7] Treinando Base Learner 3: Naive Bayes...
   ✓ Concluído em 256.6s | Acurácia: 0.8667 (86.67%)

[6/7] Criando Stacking Ensemble...
   → Gerando predições dos base learners...
   → Features meta: 2,262 colunas
   → Treinando Meta-Learner (Logistic Regression)...
