In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import classification_report, accuracy_score
import joblib
import json
import warnings
warnings.filterwarnings('ignore')

# ==============================================================================
# 1. DATA LOADING AND PRE-PROCESSING
# ==============================================================================

banner = "=" * 80
print(banner)
print("SISTEMA DE DIAGN√ìSTICO M√âDICO - STACKING OTIMIZADO")
print(banner)

# Load the dataset
print("\n[1/7] Carregando dataset...")
df = pd.read_csv('Final_Augmented_dataset_Diseases_and_Symptoms.csv')

# Drop rare diseases (those that occur exactly once): keep only rows whose
# label appears more than once in the 'diseases' column.
# Presumably needed so the stratified split further down has at least two
# samples per class -- confirm.
disease_counts = df['diseases'].value_counts()
recurring_diseases = disease_counts[disease_counts > 1].index
df = df[df['diseases'].isin(recurring_diseases)]

n_rows, n_cols = df.shape
print(f"   ‚úì Dataset: {n_rows} amostras, {n_cols} colunas")
print(f"   ‚úì Doen√ßas √∫nicas: {df['diseases'].nunique()}")
# Every column except the 'diseases' target is counted as a symptom.
print(f"   ‚úì Sintomas: {n_cols - 1}")

# Split into target (y) and feature matrix (X)
y = df['diseases']
X = df.drop(columns='diseases')

# Encode the disease names as integer class labels; the fitted encoder is
# kept so predictions can be mapped back to names later.
label_encoder = LabelEncoder().fit(y)
y_encoded = label_encoder.transform(y)

# ==============================================================================
# 2. TRAIN / TEST SPLIT
# ==============================================================================

print("\n[2/7] Dividindo dados...")

# 80/20 split, stratified on the encoded label so both partitions keep the
# same class proportions; the fixed seed makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y_encoded, stratify=y_encoded, test_size=0.2, random_state=42
)

print(f"   ‚úì Treino: {len(X_train)} | Teste: {len(X_test)}")

# ==============================================================================
# 3. FAST TRAINING OF THE BASE LEARNERS (NO CV)
# ==============================================================================

# Each base learner is fitted on the training split and immediately scored on
# the held-out test split; the accuracy is only printed, not used for any
# model selection.

print("\n[3/7] Treinando Base Learner 1: Random Forest...")
rf_params = {
    'n_estimators': 50,       # reduced from 100
    'max_depth': 15,          # reduced from 20
    'min_samples_split': 10,
    'min_samples_leaf': 4,
    'max_features': 'sqrt',
    'random_state': 42,
    'n_jobs': -1,
    'verbose': 0,
}
rf_model = RandomForestClassifier(**rf_params).fit(X_train, y_train)
rf_acc = accuracy_score(y_true=y_test, y_pred=rf_model.predict(X_test))
print(f"   ‚úì Random Forest treinado | Acur√°cia: {rf_acc:.4f}")

print("\n[4/7] Treinando Base Learner 2: Gradient Boosting...")
# NOTE(review): for multiclass targets GradientBoostingClassifier fits one
# regression tree per class at every boosting stage, so with several hundred
# diseases this is likely by far the slowest step. Consider
# HistGradientBoostingClassifier if training time is a problem -- confirm.
gb_params = {
    'n_estimators': 50,       # reduced from 100
    'max_depth': 5,           # reduced from 10
    'learning_rate': 0.1,
    'subsample': 0.7,
    'random_state': 42,
    'verbose': 0,
}
gb_model = GradientBoostingClassifier(**gb_params).fit(X_train, y_train)
gb_acc = accuracy_score(y_true=y_test, y_pred=gb_model.predict(X_test))
print(f"   ‚úì Gradient Boosting treinado | Acur√°cia: {gb_acc:.4f}")

print("\n[5/7] Treinando Base Learner 3: Naive Bayes...")
nb_model = GaussianNB().fit(X_train, y_train)
nb_acc = accuracy_score(y_true=y_test, y_pred=nb_model.predict(X_test))
print(f"   ‚úì Naive Bayes treinado | Acur√°cia: {nb_acc:.4f}")

# ==============================================================================
# 4. BUILD FEATURES FOR THE META-LEARNER (MANUAL STACKING)
# ==============================================================================

print("\n[6/7] Criando features para Meta-Learner (Stacking)...")

# NOTE(review): the training meta-features below are the base learners'
# probabilities on the very rows they were fitted on (in-sample). That leaks
# the base learners' training fit into the meta-learner and inflates the
# training accuracy reported later. Out-of-fold predictions
# (cross_val_predict / StackingClassifier) are the usual remedy, at the cost
# of refitting every base learner once per fold.

# Class-probability predictions of every base learner on the training split
rf_proba_train, gb_proba_train, nb_proba_train = (
    learner.predict_proba(X_train) for learner in (rf_model, gb_model, nb_model)
)

# Stack the three probability blocks side by side as the new feature matrix
X_train_meta = np.concatenate(
    [rf_proba_train, gb_proba_train, nb_proba_train], axis=1
)

# Same procedure for the test split
rf_proba_test, gb_proba_test, nb_proba_test = (
    learner.predict_proba(X_test) for learner in (rf_model, gb_model, nb_model)
)

X_test_meta = np.concatenate(
    [rf_proba_test, gb_proba_test, nb_proba_test], axis=1
)

print(f"   ‚úì Features meta criadas: {X_train_meta.shape[1]} colunas")

# Fit the meta-learner on the stacked probabilities
print("\n   Treinando Meta-Learner (Logistic Regression)...")
meta_params = {
    'max_iter': 500,          # reduced from 1000
    'random_state': 42,
    'n_jobs': -1,
    'solver': 'lbfgs',
    'verbose': 0,
}
meta_learner = LogisticRegression(**meta_params)
meta_learner.fit(X_train_meta, y_train)
print("   ‚úì Meta-Learner treinado!")

# ==============================================================================
# 5. EVALUATION OF THE STACKING MODEL
# ==============================================================================

print("\n[7/7] Avaliando Stacking Ensemble...")

# Final predictions of the meta-learner on both partitions
y_pred_train = meta_learner.predict(X_train_meta)
train_accuracy = accuracy_score(y_true=y_train, y_pred=y_pred_train)

y_pred_test = meta_learner.predict(X_test_meta)
test_accuracy = accuracy_score(y_true=y_test, y_pred=y_pred_test)

print(f"\n   ‚úì Acur√°cia TREINO: {train_accuracy:.4f} ({train_accuracy*100:.2f}%)")
print(f"   ‚úì Acur√°cia TESTE:  {test_accuracy:.4f} ({test_accuracy*100:.2f}%)")

# Summary report: only the support-weighted averages are printed.
# zero_division=0 reports 0 for classes that received no predictions.
report = classification_report(
    y_test, y_pred_test, zero_division=0, output_dict=True
)
weighted = report['weighted avg']
print("\n   M√©tricas M√©dias (Weighted):")
print(f"   - Precis√£o:  {weighted['precision']:.4f}")
print(f"   - Recall:    {weighted['recall']:.4f}")
print(f"   - F1-Score:  {weighted['f1-score']:.4f}")

# ==============================================================================
# 6. SAVE MODELS
# ==============================================================================

print("\n" + "=" * 80)
print("SALVANDO MODELOS...")
print("=" * 80)

# Persist every fitted object under its own file name, in this order
artifacts = {
    'rf_model.pkl': rf_model,
    'gb_model.pkl': gb_model,
    'nb_model.pkl': nb_model,
    'meta_learner.pkl': meta_learner,
    'label_encoder.pkl': label_encoder,
}
for filename, artifact in artifacts.items():
    joblib.dump(artifact, filename)

# Save the feature (column) names in training order so a prediction input can
# be laid out in the same column order later.
feature_names = X.columns.tolist()
with open('feature_names.json', 'w') as f:
    json.dump(feature_names, f)

print("   ‚úì Modelos salvos:")
for filename in (*artifacts, 'feature_names.json'):
    print(f"      - {filename}")

# ==============================================================================
# 7. JSON-BASED PREDICTION FUNCTION
# ==============================================================================

def predict_disease_from_json(json_input):
    """
    Predict a disease from a JSON mapping of symptoms.

    The persisted base learners, meta-learner, label encoder and feature-name
    list are loaded from the current working directory on every call.
    Symptoms that are not supplied default to 0 (false). Keys that do not
    match a stored feature name are silently ignored, so a misspelled or
    untranslated symptom name has no effect on the prediction.

    Args:
        json_input (dict or str): Symptoms as {name: value}, either as a dict
            or as a JSON-encoded string. Each value is converted with int().

    Returns:
        dict: 'predicted_disease' (decoded label), 'confidence' (meta-learner
        probability of that label) and 'top_5_predictions' (list of
        {'disease', 'probability'} dicts, most probable first).
    """
    # Load the persisted models.
    # NOTE(security): joblib.load unpickles arbitrary objects; only load
    # files that come from a trusted source.
    rf = joblib.load('rf_model.pkl')
    gb = joblib.load('gb_model.pkl')
    nb = joblib.load('nb_model.pkl')
    meta = joblib.load('meta_learner.pkl')
    encoder = joblib.load('label_encoder.pkl')

    with open('feature_names.json', 'r', encoding='utf-8') as f:
        features = json.load(f)

    # Accept a JSON string as well as an already-parsed dict
    if isinstance(json_input, str):
        json_input = json.loads(json_input)

    # Name -> column position, built once instead of scanning the list for
    # every symptom. setdefault keeps the first position for a repeated name,
    # matching list.index().
    column_of = {}
    for position, name in enumerate(features):
        column_of.setdefault(name, position)

    # Feature vector, every symptom defaulting to 0
    feature_vector = np.zeros(len(features))
    for symptom, value in json_input.items():
        position = column_of.get(symptom)
        if position is not None:
            feature_vector[position] = int(value)

    # The training script fits the base learners on a DataFrame, so the
    # sample is given the same column names instead of a bare array.
    sample = pd.DataFrame(feature_vector.reshape(1, -1), columns=features)

    # Class probabilities of each base learner, stacked in the same order the
    # meta-learner was trained with (rf, gb, nb)
    meta_features = np.hstack([
        rf.predict_proba(sample),
        gb.predict_proba(sample),
        nb.predict_proba(sample),
    ])

    # Final prediction
    prediction = meta.predict(meta_features)[0]
    probabilities = meta.predict_proba(meta_features)[0]

    # predict_proba columns follow meta.classes_, which are encoded labels and
    # not necessarily equal to the column positions; map explicitly.
    class_labels = meta.classes_
    predicted_pos = int(np.flatnonzero(class_labels == prediction)[0])

    disease = encoder.inverse_transform([prediction])[0]

    # Top 5 diseases, most probable first (fewer if there are fewer classes)
    top_5_indices = np.argsort(probabilities)[-5:][::-1]
    top_5_names = encoder.inverse_transform(class_labels[top_5_indices])
    top_5_diseases = [
        {
            'disease': name,
            'probability': float(probabilities[idx])
        }
        for name, idx in zip(top_5_names, top_5_indices)
    ]

    return {
        'predicted_disease': disease,
        'confidence': float(probabilities[predicted_pos]),
        'top_5_predictions': top_5_diseases
    }

print("\n‚úÖ Fun√ß√£o criada: predict_disease_from_json()")

# ==============================================================================
# 8. USAGE EXAMPLE
# ==============================================================================

print("\n" + "=" * 80)
print("EXEMPLO DE PREDI√á√ÉO")
print("=" * 80)

# Symptom keys must match the stored feature names exactly;
# predict_disease_from_json silently ignores keys it does not recognise.
example_json = {
    "fever": 1,
    "cough": 1,
    "fatigue": 1,
    "headache": 1,
    "shortness of breath": 1
}

print("\nSintomas de entrada:")
print(json.dumps(example_json, indent=2))

result = predict_disease_from_json(example_json)

print("\n" + "-" * 80)
print("RESULTADO:")
print("-" * 80)
print(f"\nüè• Doen√ßa Prevista: {result['predicted_disease']}")
print(f"üìä Confian√ßa: {result['confidence']:.4f} ({result['confidence']*100:.2f}%)")

print("\nüìã Top 5 Diagn√≥sticos:")
for i, pred in enumerate(result['top_5_predictions'], start=1):
    print(f"   {i}. {pred['disease']}: {pred['probability']*100:.2f}%")

print("\n" + "=" * 80)
print("‚úÖ SISTEMA PRONTO!")
print("=" * 80)
print("\nüí° Para usar:")
# The hint uses the same English symptom names as the example above; the
# previous Portuguese keys ('febre', 'tosse') would presumably match no
# feature and yield a prediction from an all-zero input -- confirm against
# the dataset's column names.
print("   resultado = predict_disease_from_json({'fever': 1, 'cough': 1})")
print("\n‚ö° Otimizado para velocidade e efici√™ncia de mem√≥ria!")
print("=" * 80)

SISTEMA DE DIAGNÓSTICO MÉDICO - STACKING OTIMIZADO

[1/7] Carregando dataset...
   ✓ Dataset: 246926 amostras, 378 colunas
   ✓ Doenças únicas: 754
   ✓ Sintomas: 377

[2/7] Dividindo dados...
   ✓ Treino: 197540 | Teste: 49386

[3/7] Treinando Base Learner 1: Random Forest...
   ✓ Random Forest treinado | Acurácia: 0.5166

[4/7] Treinando Base Learner 2: Gradient Boosting...
