In [8]:
from flask import Flask, request, jsonify
import pandas as pd
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import (
    RandomForestClassifier, GradientBoostingClassifier, ExtraTreesClassifier
)
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.pipeline import Pipeline
import joblib
import os
import numpy as np

# ------------------------------
# Load and clean data
# ------------------------------
df = pd.read_csv("Diseases_Symptoms.csv")  # Columns: Code, Name, Symptoms, Treatments

def clean_text(text):
    text = str(text).lower()
    text = re.sub(r'[^a-zA-Z0-9, ]', '', text)
    text = re.sub(r'\s+', ' ', text).strip()
    return text

df = df.dropna(subset=['Symptoms', 'Treatments', 'Name'])
df['Symptoms'] = df['Symptoms'].apply(clean_text)
df['Treatments'] = df['Treatments'].apply(clean_text)
df['Name'] = df['Name'].apply(clean_text)

# ------------------------------
# Grouping diseases into categories
# ------------------------------
category_map = {
    'heart': 'Cardiac',
    'attack': 'Cardiac',
    'asthma': 'Respiratory',
    'covid': 'Infectious',
    'flu': 'Infectious',
    'hiv': 'Infectious',
    'aids': 'Infectious',
    'cancer': 'Oncology',
    'tumor': 'Oncology',
    'stroke': 'Neurological',
    'seizure': 'Neurological',
    'diabetes': 'Metabolic',
    'arthritis': 'Autoimmune',
    'tb': 'Infectious',
    'pneumonia': 'Respiratory',
    'depression': 'Psychiatric',
    'anxiety': 'Psychiatric',
    'malaria': 'Infectious',
    'jaundice': 'Hepatic'
}

def map_to_category(disease_name):
    for keyword, category in category_map.items():
        if keyword in disease_name:
            return category
    return "Other"

df['Disease_Category'] = df['Name'].apply(map_to_category)

# ------------------------------
# Risk labeling
# ------------------------------
def classify_risk(symptoms):
    high_keywords = [
        'chest pain', 'seizure', 'stroke', 'unconscious',
        'shortness of breath', 'palpitation', 'bleeding', 'confusion',
        'heart attack', 'cancer', 'hiv', 'aids', 'tumor', 'malignancy'
    ]
    symptoms = symptoms.lower()
    for kw in high_keywords:
        if kw in symptoms:
            return 'HIGH'
    return 'LOW'

df['Risk'] = df['Symptoms'].apply(classify_risk)
df['Risk_Label'] = df['Risk'].map({'LOW': 0, 'HIGH': 1})

# ------------------------------
# Training function
# ------------------------------
def train_best_model(X, y, label=""):
    models = {
        "RandomForest": RandomForestClassifier(n_estimators=100, random_state=42),
        "DecisionTree": DecisionTreeClassifier(random_state=42),
        "KNN": KNeighborsClassifier(),
        "NaiveBayes": MultinomialNB(),
        "LogisticRegression": LogisticRegression(max_iter=1000),
        "SVM": SVC(kernel='linear', probability=True),
        "GradientBoosting": GradientBoostingClassifier(random_state=42),
        "ExtraTrees": ExtraTreesClassifier(random_state=42)
    }

    best_model = None
    best_score = 0
    best_model_name = ""

    print(f"\n🔍 Evaluating models for {label}...")
    for name, model in models.items():
        pipeline = Pipeline([
            ('tfidf', TfidfVectorizer()),
            ('clf', model)
        ])
        try:
            X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
            pipeline.fit(X_train, y_train)
            y_pred = pipeline.predict(X_test)
            score = accuracy_score(y_test, y_pred)
            print(f"{label} - {name}: {score:.4f}")
            if score > best_score:
                best_score = score
                best_model = pipeline
                best_model_name = name
        except Exception as e:
            print(f"{name} failed: {e}")
    
    print(f"✅ Best {label} Model: {best_model_name} ({best_score:.4f})")
    return best_model

# ------------------------------
# Train Risk Model
# ------------------------------
risk_model = train_best_model(df['Symptoms'], df['Risk_Label'], label="Risk")
joblib.dump(risk_model, "triage_model.pkl")

# ------------------------------
# Train Disease Category Model
# ------------------------------
disease_model = train_best_model(df['Symptoms'], df['Disease_Category'], label="Disease")
joblib.dump(disease_model, "disease_model.pkl")

# ------------------------------
# Flask API
# ------------------------------
app = Flask(__name__)
risk_model = joblib.load("triage_model.pkl")
disease_model = joblib.load("disease_model.pkl")
disease_classes = disease_model.classes_

@app.route("/predict", methods=["POST"])
def predict():
    data = request.get_json()
    symptoms = data.get("symptoms", "")
    if not symptoms:
        return jsonify({"error": "No symptoms provided"}), 400

    clean_input = clean_text(symptoms)
    
    risk_pred = risk_model.predict([clean_input])[0]
    
    disease_probs = disease_model.predict_proba([clean_input])[0]
    top_indices = np.argsort(disease_probs)[::-1][:3]
    top_diseases = [
        {"category": disease_classes[i], "probability": round(disease_probs[i], 4)}
        for i in top_indices
    ]

    match = df[df['Symptoms'].str.contains(clean_input.split()[0], na=False)]
    match_row = match.iloc[0] if not match.empty else df.sample(1).iloc[0]

    return jsonify({
        "input_symptoms": symptoms,
        "predicted_risk": "HIGH" if risk_pred else "LOW",
        "top_disease_categories": top_diseases,
        "treatment": match_row['Treatments'] if risk_pred == 0 else None,
        "recommend_doctor": "Consult Doctor" if risk_pred == 1 else None
    })

if __name__ == '__main__':
    port = int(os.environ.get("PORT", 5000))
    app.run(debug=True, host='0.0.0.0', port=port)



🔍 Evaluating models for Risk...
Risk - RandomForest: 0.9750
Risk - DecisionTree: 0.9875
KNN failed: 'NoneType' object has no attribute 'split'
Risk - NaiveBayes: 0.9250
Risk - LogisticRegression: 0.9250
Risk - SVM: 0.9875
Risk - GradientBoosting: 0.9750
Risk - ExtraTrees: 0.9875
✅ Best Risk Model: DecisionTree (0.9875)

🔍 Evaluating models for Disease...
Disease - RandomForest: 0.9125
Disease - DecisionTree: 0.8875
KNN failed: 'NoneType' object has no attribute 'split'
Disease - NaiveBayes: 0.9125
Disease - LogisticRegression: 0.9125
Disease - SVM: 0.9125
Disease - GradientBoosting: 0.8750
Disease - ExtraTrees: 0.9125
✅ Best Disease Model: RandomForest (0.9125)
 * Serving Flask app '__main__'
 * Debug mode: on


 * Running on all addresses (0.0.0.0)
 * Running on http://127.0.0.1:5000
 * Running on http://192.168.1.133:5000
Press CTRL+C to quit
 * Restarting with watchdog (windowsapi)


SystemExit: 1

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)
