In [None]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import make_pipeline
import joblib

# Read the training table: the symptoms column is the model input,
# the disease column is the label to predict.
df = pd.read_csv('disease_symptom_medicine_dataset.csv')
X, y = df['symptoms'], df['disease']

# Use named tokenizer function
def pipe_tokenizer(symptom_str):
    """Split a pipe-delimited symptom string into clean symptom tokens.

    Args:
        symptom_str: Symptoms joined by ``|``, e.g. ``"fever|cough"``.

    Returns:
        list[str]: One entry per symptom, with surrounding whitespace
        removed. Empty pieces (from a doubled or trailing ``|``) are dropped.
    """
    # Strip and skip blanks so "fever| cough|" tokenizes the same as "fever|cough".
    return [piece.strip() for piece in symptom_str.split('|') if piece.strip()]

# Create pipeline: bag-of-symptoms counts feeding a random forest.
pipeline = make_pipeline(
    # token_pattern=None: the default regex is ignored whenever a custom
    # tokenizer is supplied, and leaving it set makes scikit-learn warn.
    CountVectorizer(tokenizer=pipe_tokenizer, token_pattern=None),
    RandomForestClassifier(n_estimators=100, random_state=42)
)

# Train model on the full dataset (no rows are held out in this cell).
pipeline.fit(X, y)

# Save pipeline: joblib serializes the fitted vectorizer and classifier together.
joblib.dump(pipeline, 'model.pkl')
print(" Model trained and saved as model.pkl")


 Model trained and saved as model.pkl




In [None]:
# Load the saved pipeline and run a quick sanity-check prediction
import joblib

# NOTE: joblib.load unpickles the file, so only load model files you created yourself.
model = joblib.load('model.pkl')

# Pipe-delimited symptoms, the same format the pipeline was trained on.
symptoms = "fever|cough|guilt"
prediction = model.predict([symptoms])
print("Predicted:", prediction[0])


Predicted: COVID-19


In [None]:
# Phase 3: Load Medical Report → Extract Symptoms → Predict Disease → Recommend Treatment

import pandas as pd
import joblib

# Disease table, used below to look up medicines and specialists.
df = pd.read_csv("disease_symptom_medicine_dataset.csv")

# Trained pipeline (joblib.load unpickles the file, so only use trusted files).
model = joblib.load("model.pkl")

# Symptom phrases searched for in report text. The order here is the order
# in which matches appear in the extracted symptom string.
SYMPTOM_LIST = [
    "fever",
    "cough",
    "fatigue",
    "headache",
    "nausea",
    "blurred vision",
    "shortness of breath",
    "chest pain",
    "joint pain",
    "dizziness",
    "insomnia",
    "persistent cough",
    "cold hands",
    "night sweats",
    "weight loss",
]

# Function to read report text
def extract_text_from_txt(filepath):
    """Return the entire contents of a plain-text report file.

    Args:
        filepath: Path of the text file to read.

    Returns:
        str: The file's full text.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the file is not valid UTF-8.
    """
    # Decode as UTF-8 explicitly so the result does not depend on the
    # platform's default locale encoding.
    with open(filepath, "r", encoding="utf-8") as f:
        return f.read()

# Function to extract symptoms
def extract_symptoms_from_text(text, symptom_list=None):
    """Find known symptom phrases in free text and join them with ``|``.

    Matching is a case-insensitive substring test, so a short phrase such as
    "cough" is also reported when it only occurs inside a longer one such as
    "persistent cough".

    Args:
        text: Report text to search.
        symptom_list: Lower-case symptom phrases to look for. Defaults to the
            notebook-level ``SYMPTOM_LIST``.

    Returns:
        str: Matched phrases in ``symptom_list`` order, joined by ``|``
        (an empty string when nothing matches).
    """
    if symptom_list is None:
        symptom_list = SYMPTOM_LIST
    # Lower-case once rather than once per candidate symptom.
    lowered = text.lower()
    return "|".join(symptom for symptom in symptom_list if symptom in lowered)

# Function to recommend medicine and doctor
def recommend_treatment(disease_name):
    """Look up the medicines and specialist listed for a disease.

    Uses the first row of the notebook-level ``df`` whose ``disease`` column
    equals ``disease_name``.

    Args:
        disease_name: Disease label to look up.

    Returns:
        tuple: ``(meds, specialist)`` where ``meds`` is the row's ``medicine``
        value split on ``|`` and ``specialist`` is its ``doctor_specialist``
        value.

    Raises:
        IndexError: If no row in ``df`` has that disease.
    """
    matches = df[df['disease'] == disease_name]
    if matches.empty:
        # Same exception type .iloc[0] would raise, but with a message that
        # names the missing disease.
        raise IndexError(f"No treatment data found for disease {disease_name!r}")
    row = matches.iloc[0]
    meds = row['medicine'].split('|')
    specialist = row['doctor_specialist']
    return meds, specialist

# === Run on one sample report ===
report_path = "report_1.txt"  # Change as needed, e.g. "report_2.txt"
text = extract_text_from_txt(report_path)
print("Extracted Text:\n", text)

symptoms = extract_symptoms_from_text(text)
# "\n" here: the previous "\E" was an invalid escape sequence that printed a
# literal backslash and triggered a warning.
print("\nExtracted Symptoms:", symptoms)

# The pipeline expects a list of pipe-delimited symptom strings.
predicted_disease = model.predict([symptoms])[0]
print(" Predicted Disease:", predicted_disease)

meds, doctor = recommend_treatment(predicted_disease)
print("Recommended Medicines:", meds)
print("Consult Specialist:", doctor)

Extracted Text:
 Patient Name: Ravi Sharma
    Age: 27
    Gender: Male
    Symptoms: weight loss, blurred vision, fever, fatigue, cough
    Blood Pressure: 142/82
\Extracted Symptoms: fever|cough|fatigue|blurred vision|weight loss
 Predicted Disease: COVID-19
Recommended Medicines: ['Remdesivir', 'Azithromycin']
Consult Specialist: Infectious Disease Specialist


  print("\Extracted Symptoms:", symptoms)


In [None]:
# Extract all unique symptoms across the dataset
all_symptoms = set()

# Split comma-separated symptoms and clean
for symptom_list in df["symptoms"]:
    for s in symptom_list.split(","):
        all_symptoms.add(s.strip().lower())

# Sort symptom list for consistent feature order
all_symptoms = sorted(list(all_symptoms))
print(f"Total unique symptoms: {len(all_symptoms)}")


Total unique symptoms: 94


In [None]:
# Create multi-hot encoded features
def encode_symptoms(symptom_str):
    symptoms = set(s.strip().lower() for s in symptom_str.split(","))
    return [1 if symptom in symptoms else 0 for symptom in all_symptoms]

# Apply to all rows
X = df["symptoms"].apply(encode_symptoms).tolist()

# Target labels
y = df["disease"]

import numpy as np
X = np.array(X)


In [None]:
# Sanity check: rows x vocabulary size, and one label per row.
print(f"Feature matrix shape: {X.shape}")
print(f"Target vector shape: {len(y)}")


Feature matrix shape: (100, 94)
Target vector shape: 100


In [None]:
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, confusion_matrix

# Train-test split (80/20, fixed seed for repeatability)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train the model
# NOTE: this rebinds `model`, replacing the pipeline loaded from model.pkl earlier.
model = MultinomialNB()
model.fit(X_train, y_train)

# Evaluate
y_pred = model.predict(X_test)

# Evaluation report
# zero_division=0: classes that are never predicted get precision 0 without
# emitting an undefined-metric warning per average.
print("Classification Report:\n")
print(classification_report(y_test, y_pred, zero_division=0))

#  Confusion matrix (optional)
print("Confusion Matrix:\n")
print(confusion_matrix(y_test, y_pred))


Classification Report:

                  precision    recall  f1-score   support

          Anemia       0.00      0.00      0.00         1
       Arthritis       0.00      0.00      0.00         1
          Asthma       0.00      0.00      0.00         1
        COVID-19       0.22      1.00      0.36         4
      Depression       0.00      0.00      0.00         3
        Diabetes       1.00      1.00      1.00         1
    Hypertension       0.00      0.00      0.00         2
        Migraine       0.00      0.00      0.00         1
Thyroid Disorder       1.00      0.25      0.40         4
    Tuberculosis       0.00      0.00      0.00         2

        accuracy                           0.30        20
       macro avg       0.22      0.23      0.18        20
    weighted avg       0.29      0.30      0.20        20

Confusion Matrix:

[[0 0 0 1 0 0 0 0 0 0]
 [0 0 0 1 0 0 0 0 0 0]
 [0 0 0 1 0 0 0 0 0 0]
 [0 0 0 4 0 0 0 0 0 0]
 [0 0 0 3 0 0 0 0 0 0]
 [0 0 0 0 0 1 0 0 0 0]
 [0 

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [None]:
import os

def extract_symptoms_from_text(text, symptom_list):
    """Return the entries of ``symptom_list`` that occur in ``text``.

    Matching is a case-insensitive substring test against the lower-cased
    text, so entries are expected to be lower-case already.

    Args:
        text: Report text to search.
        symptom_list: Iterable of symptom phrases to look for.

    Returns:
        list: The matching phrases, in ``symptom_list`` order.
    """
    # Lower-case once rather than once per candidate symptom.
    lowered = text.lower()
    return [symptom for symptom in symptom_list if symptom in lowered]

# Test with one report
report_path = "report_2.txt"  # swap for "report_1.txt" to try the other sample

# Explicit encoding so decoding does not depend on the platform default.
with open(report_path, "r", encoding="utf-8") as file:
    report_text = file.read()

# Keep every dataset symptom that appears somewhere in the report text.
extracted_symptoms = extract_symptoms_from_text(report_text, all_symptoms)

print(" Report text:\n", report_text)




 Report text:
 Patient Name: Neha Patel
    Age: 21
    Gender: Female
    Symptoms: nausea, shortness of breath, cold hands
    Blood Pressure: 132/81


In [None]:
#  Convert extracted symptoms to multi-hot encoded input
def encode_extracted_symptoms(symptoms_found):
    """Build a 0/1 vector over ``all_symptoms`` from the symptoms found.

    Args:
        symptoms_found: Container of symptom names detected in a report.

    Returns:
        list[int]: One entry per name in ``all_symptoms``; 1 when that name is
        in ``symptoms_found``, otherwise 0.
    """
    encoded = []
    for known_symptom in all_symptoms:
        encoded.append(1 if known_symptom in symptoms_found else 0)
    return encoded

# Encode the extracted symptoms as a one-row feature matrix
encoded_row = encode_extracted_symptoms(extracted_symptoms)
X_new = np.array([encoded_row])

# Predict disease for that single row
predicted_disease = model.predict(X_new)[0]
print("Predicted Disease:", predicted_disease)

# Recommend medicine and specialist from the first dataset row for that disease
disease_rows = df[df["disease"] == predicted_disease]
recommendation = disease_rows.iloc[0]
print("Recommended Medicine:", recommendation["medicine"])
print("Specialist to Consult:", recommendation["doctor_specialist"])


Predicted Disease: COVID-19
Recommended Medicine: Remdesivir|Azithromycin
Specialist to Consult: Infectious Disease Specialist
