# Import required library

In [13]:
import pandas as pd
import numpy as np
import re
import json
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.multioutput import MultiOutputClassifier
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics import classification_report
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import Adam
from transformers import pipeline, AutoTokenizer, TFAutoModelForSequenceClassification
import warnings
warnings.filterwarnings('ignore')

  from .autonotebook import tqdm as notebook_tqdm


# Load the dataset

In [16]:
data = pd.read_csv(r'C:\Users\shary\OneDrive\Desktop\GenAI Project\medimind_india_raw_data.csv')

In [17]:
data.head()

Unnamed: 0,disease,age,gender,region,symptoms,medical_history,test_results,differential_diagnoses,medications,treatment_plan,follow_ups
0,stroke,5,female,urban,sudden weakness; facial droop,atrial fibrillation,CT brain: ischemic stroke,"[{'name': 'Ischemic stroke', 'probability': 0....",Aspirin 300 mg stat; Atenolol for BP; Alteplas...,Thrombolytic therapy if indicated; BP management,MRI brain in 24 hours; Neurology consult
1,dengue,30,male,rural,rash; muscle pain; headache,recent travel to endemic area,NS1 antigen: positive; hematocrit: elevated,"[{'name': 'Dengue fever', 'probability': 0.88}...",Avoid NSAIDs; Paracetamol 500 mg for fever,Monitor platelet count; IV fluids,Reassess in 48 hours; Repeat platelet count daily
2,copd,47,female,urban,wheezing; fatigue; chronic cough,smoking; chronic bronchitis; occupational dust...,spirometry: FEV1 67%; O2 saturation: 86%,"[{'name': 'COPD', 'probability': 0.85}, {'name...",Tiotropium 18 mcg daily; Salbutamol inhaler 20...,Pulmonary rehabilitation; Smoking cessation,Pulmonary function test in 3 months; Monitor O...
3,tuberculosis,69,male,rural,hemoptysis; fever; cough,HIV; close contact with TB patient; malnutriti...,chest X-ray: infiltrates,"[{'name': 'Tuberculosis', 'probability': 0.85}...",Rifampicin 10 mg/kg; Isoniazid 5 mg/kg; Ethamb...,DOTS therapy; Nutritional support,Chest X-ray in 6 months; Sputum test in 2 months
4,stroke,45,female,urban,slurred speech; facial droop,atrial fibrillation; smoking; diabetes,CT brain: ischemic stroke; glucose: 185 mg/dL,"[{'name': 'Ischemic stroke', 'probability': 0....",Alteplase if within 4.5 hours; Aspirin 300 mg ...,Thrombolytic therapy if indicated; BP management,Neurology consult; MRI brain in 24 hours


# Data Preprocessing

In [19]:
def preprocess_data(df):
    # Clean and parse symptoms
    df['symptoms'] = df['symptoms'].apply(lambda x: [s.strip().lower() for s in str(x).split(';')])

    # Clean and parse medical history
    df['medical_history'] = df['medical_history'].apply(lambda x: [m.strip().lower() for m in str(x).split(';')] if pd.notna(x) else [])
    
    # Parse differential diagnoses (stored as string representation of list of dicts)
    def parse_differential(diff_str):
        try:
            # Clean the string and convert to proper JSON format
            diff_str = diff_str.replace("'", '"')
            diff_str = diff_str.replace("None", "null")
            diff_list = json.loads(diff_str)
            return [d['name'] for d in diff_list if d['probability'] > 0.1]  # Only include diagnoses with >10% probability
        except:
            return []
        
    df['differential_diagnoses'] = df['differential_diagnoses'].apply(parse_differential)
    
    # Create binary columns for common symptoms
    all_symptoms = set()
    for symptoms in df['symptoms']:
        all_symptoms.update(symptoms)
    
    for symptom in all_symptoms:
        df[f'symptom_{symptom.replace(" ", "_")}'] = df['symptoms'].apply(lambda x: 1 if symptom in x else 0)
    
    return df

data = preprocess_data(data)
    

# Feature Engineering

In [None]:
def create_features(df):
    # Age bins
    df['age_group'] = pd.cut(df['age'], bins=[0, 12, 19, 40, 60, 100], 
                            labels=['child', 'teen', 'young_adult', 'middle_aged', 'senior'])
    
    # Gender encoding
    df['is_male'] = df['gender'].apply(lambda x: 1 if x == 'male' else 0)
    df['is_female'] = df['gender'].apply(lambda x: 1 if x == 'female' else 0)
    
    # Region encoding
    df['is_urban'] = df['region'].apply(lambda x: 1 if x == 'urban' else 0)
    
    # Medical history flags
    medical_conditions = ['diabetes', 'hypertension', 'smoking', 'hiv', 'atrial fibrillation', 
                         'chronic bronchitis', 'obesity', 'malnutrition']
    
    for condition in medical_conditions:
        df[f'has_{condition.replace(" ", "_")}'] = df['medical_history'].apply(
            lambda x: 1 if any(condition in m for m in x) else 0)
    
    return df

data = create_features(data)

medical_conditions = [col.replace('has_', '').replace('_', ' ')
                      for col in data.columns if col.startswith('has_')]   ########## gpt solution for medical condition not define 

# Prepare data for ML models
symptom_cols = [col for col in data.columns if col.startswith('symptom_')]
feature_cols = symptom_cols + ['age', 'is_male', 'is_female', 'is_urban'] + \
              [f'has_{cond.replace(" ", "_")}' for cond in medical_conditions]


# Multi-label target (differential diagnoses)
mlb = MultiLabelBinarizer()
diagnoses_encoded = mlb.fit_transform(data['differential_diagnoses'])
diagnoses_classes = mlb.classes_

# Split data
X_train, X_test, y_train, y_test = train_test_split(
    data[feature_cols], diagnoses_encoded, test_size=0.2, random_state=42)

# Traditional ML Model (Random Forest)
rf_model = MultiOutputClassifier(RandomForestClassifier(n_estimators=100, random_state=42))
rf_model.fit(X_train, y_train)

# Deep Learning Model

In [26]:
def build_dl_model(input_shape, output_shape):
    model = Sequential([
        Dense(256, activation='relu', input_shape=(input_shape,)),
        Dropout(0.3),
        Dense(128, activation='relu'),
        Dropout(0.3),
        Dense(64, activation='relu'),
        Dense(output_shape, activation='sigmoid')
    ])
    
    model.compile(optimizer=Adam(learning_rate=0.001),
                 loss='binary_crossentropy',
                 metrics=['accuracy'])
    return model

dl_model = build_dl_model(len(feature_cols), len(diagnoses_classes))
dl_model.fit(X_train, y_train, epochs=20, batch_size=32, validation_split=0.1)


Epoch 1/20
[1m225/225[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 8ms/step - accuracy: 0.2760 - loss: 0.4201 - val_accuracy: 0.9837 - val_loss: 0.0369
Epoch 2/20
[1m225/225[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 6ms/step - accuracy: 0.9558 - loss: 0.0411 - val_accuracy: 0.9987 - val_loss: 0.0044
Epoch 3/20
[1m225/225[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 7ms/step - accuracy: 0.9929 - loss: 0.0083 - val_accuracy: 0.9962 - val_loss: 0.0022
Epoch 4/20
[1m225/225[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 5ms/step - accuracy: 0.9953 - loss: 0.0051 - val_accuracy: 0.9975 - val_loss: 0.0015
Epoch 5/20
[1m225/225[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 5ms/step - accuracy: 0.9942 - loss: 0.0041 - val_accuracy: 0.9950 - val_loss: 0.0022
Epoch 6/20
[1m225/225[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 6ms/step - accuracy: 0.9955 - loss: 0.0031 - val_accuracy: 0.9937 - val_loss: 0.0019
Epoch 7/20
[1m225/225[0m 

<keras.src.callbacks.history.History at 0x206c73cb6d0>

# Transformer-based Model (using a smaller version for demonstration)

In [29]:
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
transformer_model = TFAutoModelForSequenceClassification.from_pretrained(
    "bert-base-uncased", num_labels=len(diagnoses_classes))

# Symptom Analysis Engine
class SymptomAnalyzer:
    def __init__(self, data):
        self.data = data
        self.symptom_vectorizer = TfidfVectorizer()
        self.symptom_corpus = [' '.join(symptoms) for symptoms in data['symptoms']]
        self.symptom_vectorizer.fit(self.symptom_corpus)

            
    def analyze_symptoms(self, symptoms):
        # Convert symptoms to TF-IDF vector
        symptoms_text = ' '.join(symptoms)
        vector = self.symptom_vectorizer.transform([symptoms_text])
        
        # Get similar cases
        from sklearn.metrics.pairwise import cosine_similarity
        corpus_vectors = self.symptom_vectorizer.transform(self.symptom_corpus)
        similarities = cosine_similarity(vector, corpus_vectors)
        similar_indices = similarities.argsort()[0][-5:][::-1]  # Top 5 most similar cases
        
        similar_cases = self.data.iloc[similar_indices]
        return similar_cases

symptom_analyzer = SymptomAnalyzer(data)




Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`





All PyTorch model weights were used when initializing TFBertForSequenceClassification.

Some weights or buffers of the TF 2.0 model TFBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


# GenAI Model (For Clinical Decision Support System) 

In [32]:
class ClinicalDecisionSupport:
    def __init__(self, rf_model, dl_model, transformer_model, tokenizer, mlb):
        self.rf_model = rf_model
        self.dl_model = dl_model
        self.transformer_model = transformer_model
        self.tokenizer = tokenizer
        self.mlb = mlb
        self.diagnosis_explainer = pipeline("text-generation", model="gpt2")
        
    def predict_diagnoses(self, patient_data):
        # Prepare features
        features = self._prepare_features(patient_data)
        
        # Get predictions from all models
        rf_pred = self.rf_model.predict([features])[0]
        dl_pred = (self.dl_model.predict(np.array([features])) > 0.5).astype(int)[0]

        # Combine predictions (simple voting)
        combined_pred = (rf_pred + dl_pred) >= 1
        
        # Get diagnosis names
        #diagnoses = self.mlb.inverse_transform([combined_pred])[0]
        diagnoses = self.mlb.inverse_transform(np.array([combined_pred]))[0]
        return diagnoses
    
    def explain_diagnosis(self, diagnosis):
        prompt = f"Explain the diagnosis {diagnosis} in simple terms for a patient, including common symptoms, causes, and general treatment approaches:"
        explanation = self.diagnosis_explainer(prompt, max_length=200, do_sample=True)
        return explanation[0]['generated_text']
    
    def generate_treatment_plan(self, diagnosis, patient_info):
        prompt = f"Generate a detailed treatment plan for a {patient_info['age']}-year-old {patient_info['gender']} patient with {diagnosis}. "
        prompt += f"The patient has the following medical history: {', '.join(patient_info['medical_history'])}. "
        prompt += "Include medications, lifestyle recommendations, and follow-up instructions:"
        
        treatment_plan = self.diagnosis_explainer(prompt, max_length=300, do_sample=True)
        return treatment_plan[0]['generated_text']
    
    def _prepare_features(self, patient_data):
        # Convert patient data to feature vector matching training data format
        features = []

        # Add symptom flags
        for col in symptom_cols:
            symptom = col.replace('symptom_', '').replace('_', ' ')
            features.append(1 if symptom in patient_data['symptoms'] else 0)
        
        # Add demographic features
        features.append(patient_data['age'])
        features.append(1 if patient_data['gender'] == 'male' else 0)
        features.append(1 if patient_data['gender'] == 'female' else 0)
        features.append(1 if patient_data['region'] == 'urban' else 0)
        
        # Add medical history flags
        for condition in medical_conditions:
            col = f'has_{condition.replace(" ", "_")}'
            features.append(1 if any(condition in mh.lower() for mh in patient_data['medical_history']) else 0)
        
        return features

cdss = ClinicalDecisionSupport(rf_model, dl_model, transformer_model, tokenizer, mlb)

All PyTorch model weights were used when initializing TFGPT2LMHeadModel.

All the weights of TFGPT2LMHeadModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFGPT2LMHeadModel for predictions without further training.
Device set to use 0


# Example Usage

In [33]:
if __name__ == "__main__":
    # Example patient
    patient = {
        'age': 45,
        'gender': 'male',
        'region': 'urban',
        'symptoms': ['headache', 'facial droop', 'sudden weakness'],
        'medical_history': ['hypertension', 'smoking', 'diabetes']
    }
    
    print("\n=== Symptom Analysis ===")
    similar_cases = symptom_analyzer.analyze_symptoms(patient['symptoms'])
    print("Similar historical cases:")
    print(similar_cases[['disease', 'age', 'gender', 'symptoms', 'medical_history']].head(3))
    
    print("\n=== Differential Diagnosis ===")
    diagnoses = cdss.predict_diagnoses(patient)
    print("Predicted diagnoses:", diagnoses)
    
    if len(diagnoses) > 0:
        print("\n=== Diagnosis Explanation ===")
        explanation = cdss.explain_diagnosis(diagnoses[0])
        print(explanation)
        
        print("\n=== Treatment Plan ===")
        treatment = cdss.generate_treatment_plan(diagnoses[0], patient)
        print(treatment)


=== Symptom Analysis ===
Similar historical cases:
     disease  age  gender                                   symptoms  \
1043  stroke   19  female  [facial droop, sudden weakness, headache]   
3940  stroke   51  female  [headache, sudden weakness, facial droop]   
772   stroke   37  female  [sudden weakness, headache, facial droop]   

                                        medical_history  
1043                                [smoking, diabetes]  
3940       [smoking, hypertension, atrial fibrillation]  
772   [smoking, hypertension, diabetes, atrial fibri...  

=== Differential Diagnosis ===
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1s/step


Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


Predicted diagnoses: ('Ischemic stroke',)

=== Diagnosis Explanation ===
Explain the diagnosis Ischemic stroke in simple terms for a patient, including common symptoms, causes, and general treatment approaches:

Seizure is a serious adverse event, including an emergency meeting of acute care and medical and technical needs.

Isolated strokes typically cause minimal mortality but result of severe, life threatening long-term disability.

Clotting, in contrast to peripheral stroke, is a primary cause of neurologic symptoms but also can also occur for all stroke disorders. This is due to lack of any major abnormalities or anomalies in the central nervous system.

The primary endpoint of neurological pathology results from stroke as a result of a complete failure of the central nervous system to recognize the absence of an abnormally high level of movement due to non-therapeutic factors or events such as hypoventilation/high blood pressure.

It requires a critical understanding of the diagn

# Evaluate models

In [None]:

print("Random Forest Performance:")
rf_pred = rf_model.predict(X_test)
print(classification_report(y_test, rf_pred, target_names=diagnoses_classes, zero_division=0))

print("\nDeep Learning Performance:")
dl_pred = (dl_model.predict(X_test) > 0.5).astype(int)
print(classification_report(y_test, dl_pred, target_names=diagnoses_classes, zero_division=0))

Random Forest Performance:
                             precision    recall  f1-score   support

Acute myocardial infarction       1.00      1.00      1.00       248
                       COPD       1.00      1.00      1.00       157
                   COVID-19       0.99      0.99      0.99       159
     Chronic kidney disease       1.00      1.00      1.00       146
               Dengue fever       1.00      1.00      1.00       206
             HMPV infection       0.99      0.99      0.99        98
                Hepatitis B       1.00      1.00      1.00       178
            Ischemic stroke       1.00      1.00      1.00       183
               Tuberculosis       1.00      1.00      1.00       275
   Type 2 diabetes mellitus       1.00      1.00      1.00       350

                  micro avg       1.00      1.00      1.00      2000
                  macro avg       1.00      1.00      1.00      2000
               weighted avg       1.00      1.00      1.00      2000
     