In [2]:
# Standard Libraries
import os
import re
import subprocess
import traceback
import pickle
from collections import Counter

# Data Handling and Processing
import pandas as pd
import numpy as np
import nltk
from nltk.tokenize import word_tokenize

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns  

# Scikit-Learn: Preprocessing and Model Selection
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer

# Scikit-Learn: Models
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from xgboost import XGBClassifier

# Scikit-Learn: Evaluation Metrics
from sklearn.metrics import (
    accuracy_score,
    confusion_matrix,
    classification_report,
    roc_auc_score
)

# Scikit-Learn: Class Weights
from sklearn.utils.class_weight import compute_class_weight

# Scikit-Learn: Feature Engineering
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline

# Imbalanced Data Handling
from imblearn.combine import SMOTEENN

# TensorFlow & Keras
import tensorflow as tf
from tensorflow.keras.models import Model, load_model
from tensorflow.keras.layers import (
    Input, Dense, Dropout, Concatenate, BatchNormalization
)
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import (
    EarlyStopping, ReduceLROnPlateau, ModelCheckpoint
)
from tensorflow.keras.regularizers import l2
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Joblib (for saving/loading models)
import joblib



In [3]:
# Fetch the Punkt tokenizer data that nltk's word_tokenize relies on
# (the download is skipped when the package is already up to date).
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/armandaraujo/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [None]:
# #read in .json file and convert to csv
# import json

# # Load the JSON file
# with open('Pivot_Resource/SQuAD_dataset.json', 'r') as file:
#     data = json.load(file)

# # Convert the JSON data to a pandas DataFrame
# df = pd.DataFrame(data)

# # Display the first few rows of the DataFrame
# df.head()



In [3]:
# Load the symptom/prognosis table from the project's resource folder
prognosis_df = pd.read_csv('Pivot_Resource/prognosis_df.csv')


In [None]:
# prognosis_df = pd.read_csv('Final_Final_Resources/prognosis_df.csv')


In [4]:
# Preview the first rows to confirm the table loaded as expected
prognosis_df.head()

Unnamed: 0,Prognosis,itching,skin_rash,nodal_skin_eruptions,continuous_sneezing,shivering,chills,joint_pain,stomach_pain,acidity,...,pus_filled_pimples,blackheads,scurring,skin_peeling,silver_like_dusting,small_dents_in_nails,inflammatory_nails,blister,red_sore_around_nose,yellow_crust_ooze
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
# Read the raw HealthCareMagic JSON export so it can be converted to CSV
import json

# Load the JSON file. JSON text is UTF-8 by specification, so the encoding is
# given explicitly instead of relying on the platform's default locale encoding.
with open('Pivot_Resource/HealthCareMagic-100k.json', 'r', encoding='utf-8') as file:
    data = json.load(file)

# Convert the JSON data to a pandas DataFrame
df = pd.DataFrame(data)

# Display the first few rows of the DataFrame
df.head()

In [None]:
# Save the DataFrame to a CSV file; index=False keeps the row index out of the file
df.to_csv('Pivot_Resource/HealthCareMagic-100k.csv', index=False)


In [6]:
# Load the HealthCareMagic conversations from their CSV copy
healthCareMagic_df = pd.read_csv('Pivot_Resource/HealthCareMagic-100k.csv')


In [7]:
# Preview the first rows (instruction / input / output columns)
healthCareMagic_df.head()

Unnamed: 0,instruction,input,output
0,"If you are a doctor, please answer the medical...",I woke up this morning feeling the whole room ...,"Hi, Thank you for posting your query. The most..."
1,"If you are a doctor, please answer the medical...",My baby has been pooing 5-6 times a day for a ...,Hi... Thank you for consulting in Chat Doctor....
2,"If you are a doctor, please answer the medical...","Hello, My husband is taking Oxycodone due to a...","Hello, and I hope I can help you today.First, ..."
3,"If you are a doctor, please answer the medical...",lump under left nipple and stomach pain (male)...,HI. You have two different problems. The lump ...
4,"If you are a doctor, please answer the medical...",I have a 5 month old baby who is very congeste...,Thank you for using Chat Doctor. I would sugge...


In [8]:
# Set display options for full view.
# NOTE: pd.set_option changes pandas' display settings for the rest of the
# session, not only for this cell.
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.width', None)
pd.set_option('display.max_colwidth', None)  # Show the full content of each cell
pd.set_option('display.expand_frame_repr', False)  # Don't wrap wide frames across lines

# Display the first 100 rows of the dataframe with untruncated text
healthCareMagic_df.head(100)

Unnamed: 0,instruction,input,output
0,"If you are a doctor, please answer the medical questions based on the patient's description.","I woke up this morning feeling the whole room is spinning when i was sitting down. I went to the bathroom walking unsteadily, as i tried to focus i feel nauseous. I try to vomit but it wont come out.. After taking panadol and sleep for few hours, i still feel the same.. By the way, if i lay down or sit down, my head do not spin, only when i want to move around then i feel the whole world is spinning.. And it is normal stomach discomfort at the same time? Earlier after i relieved myself, the spinning lessen so i am not sure whether its connected or coincidences.. Thank you doc!","Hi, Thank you for posting your query. The most likely cause for your symptoms is benign paroxysmal positional vertigo (BPPV), a type of peripheral vertigo. In this condition, the most common symptom is dizziness or giddiness, which is made worse with movements. Accompanying nausea and vomiting are common. The condition is due to problem in the ear, and improves in a few days on own. Betahistine tablets would help relieve your symptoms. Doing vestibular rehabilitation or adaptation exercises would prevent the recurrence of these symptoms. An ENT evaluation would also help. I hope it helps. Best wishes, Chat Doctor."
1,"If you are a doctor, please answer the medical questions based on the patient's description.",My baby has been pooing 5-6 times a day for a week. In the last few days it has increased to 7 and they are very watery with green stringy bits in them. He does not seem unwell i.e no temperature and still eating. He now has a very bad nappy rash from the pooing ...help!,Hi... Thank you for consulting in Chat Doctor. It seems your kid is having viral diarrhea. Once it starts it will take 5-7 days to completely get better. Unless the kids having low urine output or very dull or excessively sleepy or blood in motion or green bilious vomiting...you need not worry. There is no need to use antibiotics unless there is blood in the motion. Antibiotics might worsen if unnecessarily used causing antibiotic associated diarrhea. I suggest you use zinc supplements (Z&D Chat Doctor.
2,"If you are a doctor, please answer the medical questions based on the patient's description.","Hello, My husband is taking Oxycodone due to a broken leg/surgery. He has been taking this pain medication for one month. We are trying to conceive our second baby. Will this medication afect the fetus? Or the health of the baby? Or can it bring birth defects? Thank you.","Hello, and I hope I can help you today.First, there is no medication that can be taken by the father that has any way to get into your system or a baby if you conceive. Medications can only affect a fetus if you take it while pregnant. The only issue is that certain medications may decrease a men sperm count and affect fertility, however pain medications like Oxycodone do not have this effect. So there is no reason for you to worry about conceiving while taking this medication. The best way you can prepare for a healthy pregnancy is to follow a well-balanced diet, limit alcohol consumption and avoid cigarette smoke, and take a daily prenatal vitamin or folic acid, as folic acid supplements in early pregnancy helps to prevent certain types of birth defects. I hope this answers your question and best wishes for your upcoming pregnancy,"
3,"If you are a doctor, please answer the medical questions based on the patient's description.","lump under left nipple and stomach pain (male) Hi,I have recently noticed a few weeks ago a lump under my nipple, it hurts to touch and is about the size of a quarter. Also I have bern experiencing stomach pains that prevent me from eating. I immediatly feel full and have extreme pain. Please help","HI. You have two different problems. The lump under the nipple should be removed, biopsied. This will help you to get rid of the disease, and you get a diagnosis. Second problem looks a bit serious one: Pain that prevents you from eating-immediate fullness, and extreme pain. I would advise such a patient to undergo the upper GI endoscopy asap. This can be an ulcer with a problem at the pylorus. The exact diagnosis is the most important to start the right therapy. Do not waste time in starting the therapy."
4,"If you are a doctor, please answer the medical questions based on the patient's description.",I have a 5 month old baby who is very congested with a terrible cough. Its rattly/raspy and croupy sounding cough. She started choking on her coughs and the mucous that has come up. She also has a fever and runny nose. Should i take her to urgent care?,"Thank you for using Chat Doctor. I would suggest that you see your doctor. Your baby maybe having bronchiolitis which is a lung infection common to your kids age. It is commonly caused by a virus. Albuterol via nebulization should be utilized in order to alleviate the wheezing and also help with the congestion. A decongestant can also be used for the colds. Also, it would also advise doing a chest X-ray in order to rule out other diseases (ex. pneumonia)sincerely, Mark RosarioGeneral pediatrics/Pediatric Pulmonology"


In [None]:
# Report the (rows, columns) size of the raw conversation table
healthCareMagic_df.shape

In [5]:
# Load the cleaned conversations that were saved to disk.
# NOTE(review): this CSV is written by the cleaning cell further down
# (cleaned_df.to_csv), so that cell must have been run at least once before
# this one works -- confirm the intended run order.
cleanCareMagic_df = pd.read_csv('Pivot_Resource/cleaned_healthcare_data.csv')


In [None]:
def clean_text(text):
    """Clean and preprocess a single piece of conversation text.

    The text is lowercased, stripped of every character other than ASCII
    letters, digits, whitespace and the punctuation ``. , ! ? ( ) -`` (so
    apostrophes are removed as well), whitespace runs are collapsed to single
    spaces, and the standalone abbreviations ``doc``/``docs`` and ``meds`` are
    expanded to ``doctor``/``doctors`` and ``medications``.

    Args:
        text: Value to clean. Anything that is not a ``str`` (e.g. NaN/None)
            is treated as missing.

    Returns:
        The cleaned string, or ``""`` when ``text`` is not a string.
    """
    if not isinstance(text, str):
        return ""

    # Convert to lowercase
    text = text.lower()

    # Remove special characters but keep essential punctuation
    text = re.sub(r'[^a-zA-Z0-9\s.,!?()-]', '', text)

    # Collapse whitespace *after* stripping characters, otherwise removing a
    # symbol that sat between two spaces leaves a double space behind.
    text = ' '.join(text.split())

    # Standardize some medical terms (add more as needed).
    # Word boundaries are essential here: a plain substring replace would turn
    # "doctor" into "doctortor" and "document" into "doctorument".
    text = re.sub(r'\bdoc(s?)\b', r'doctor\1', text)
    text = re.sub(r'\bmeds\b', 'medications', text)

    return text

In [None]:


def prepare_dataset(df):
    """Prepare the dataset for chatbot training.

    Rows missing any of ``instruction``, ``input`` or ``output`` are dropped,
    the three columns are passed through ``clean_text``, rows whose cleaned
    ``input`` or ``output`` is 10 characters or shorter are removed, and a
    ``combined_input`` prompt column is added in the form::

        <instruction>\\n\\nPatient: <input>\\n\\nDoctor:

    Args:
        df: DataFrame with ``instruction``, ``input`` and ``output`` columns.

    Returns:
        A new DataFrame with the cleaned columns plus ``combined_input``.
        The frame passed in is left unmodified.
    """
    text_columns = ['instruction', 'input', 'output']

    # Remove rows where any of the important columns are empty. The explicit
    # copy makes the result an independent frame, so the column assignments
    # below neither touch the caller's data nor raise SettingWithCopyWarning.
    df_clean = df.dropna(subset=text_columns).copy()

    # Clean the text in each column
    for column in text_columns:
        df_clean[column] = df_clean[column].apply(clean_text)

    # Remove very short or empty questions/responses
    long_enough = (
        (df_clean['input'].str.len() > 10)
        & (df_clean['output'].str.len() > 10)
    )
    df_clean = df_clean[long_enough].copy()

    # Create the combined prompt with vectorized string concatenation; this
    # yields the same text as formatting row by row but avoids a Python-level
    # call per row.
    df_clean['combined_input'] = (
        df_clean['instruction']
        + "\n\nPatient: "
        + df_clean['input']
        + "\n\nDoctor:"
    )

    return df_clean

# Apply the cleaning to the raw HealthCareMagic conversations
cleaned_df = prepare_dataset(healthCareMagic_df)

# Display the first cleaned conversation to verify the cleaning
print("Sample cleaned conversation:")
print("\nCombined Input:")
print(cleaned_df['combined_input'].iloc[0])
print("\nOutput:")
print(cleaned_df['output'].iloc[0])

# Save cleaned dataset (without the row index) for reuse by other cells
cleaned_df.to_csv('Pivot_Resource/cleaned_healthcare_data.csv', index=False)

# Print some statistics: rows before/after cleaning and mean text lengths
print("\nDataset Statistics:")
print(f"Original rows: {len(healthCareMagic_df)}")
print(f"Cleaned rows: {len(cleaned_df)}")
print(f"Average input length: {cleaned_df['input'].str.len().mean():.0f} characters")
print(f"Average output length: {cleaned_df['output'].str.len().mean():.0f} characters")

In [None]:
# Optional: Add more sophisticated cleaning
# Make sure the Punkt tokenizer data used by word_tokenize is available
# (the imports section is followed by the same download call).
nltk.download('punkt')

def advanced_clean(df):
    """Apply stricter quality filters to a cleaned conversation frame.

    Steps, in order:
      1. drop rows that repeat an earlier (``input``, ``output``) pair;
      2. drop rows whose ``input`` and ``output`` are too similar
         (``difflib.SequenceMatcher`` ratio of 0.8 or more);
      3. add ``input_word_count`` / ``output_word_count`` columns holding the
         number of ``word_tokenize`` tokens in each text;
      4. keep only rows where both counts lie between 5 and 500 inclusive.

    Args:
        df: DataFrame with string columns ``input`` and ``output``.

    Returns:
        A new DataFrame containing the surviving rows plus the two word-count
        columns. An empty frame is handled and yields an empty result.
    """
    from difflib import SequenceMatcher

    def _token_counts(texts):
        # Build an explicit int64 Series so the dtype is the same whether or
        # not there are any rows to count.
        return pd.Series(
            [len(word_tokenize(text)) for text in texts],
            index=texts.index,
            dtype='int64',
        )

    # Remove duplicate conversations
    df = df.drop_duplicates(subset=['input', 'output'])

    # Remove conversations where input and output are too similar. The ratios
    # are computed pairwise into a standalone Series instead of a temporary
    # column: that avoids assigning into a filtered frame and also works when
    # the frame has no rows.
    similarity = pd.Series(
        [
            SequenceMatcher(None, question, answer).ratio()
            for question, answer in zip(df['input'], df['output'])
        ],
        index=df.index,
        dtype='float64',
    )
    df = df[similarity < 0.8].copy()

    # Add conversation length features
    df['input_word_count'] = _token_counts(df['input'])
    df['output_word_count'] = _token_counts(df['output'])

    # Filter out extremely long or short conversations
    df = df[
        (df['input_word_count'] >= 5) &
        (df['input_word_count'] <= 500) &
        (df['output_word_count'] >= 5) &
        (df['output_word_count'] <= 500)
    ]

    return df

# Apply advanced cleaning if needed.
# NOTE: this rebinds cleaned_df to the filtered result, so re-running the cell
# filters the already-filtered frame rather than the original one.
cleaned_df = advanced_clean(cleaned_df)

In [None]:
# Display random samples to verify quality.
# sample() is called without random_state, so each run shows different rows.
print("Random sample of cleaned conversations:")
random_samples = cleaned_df.sample(n=3)
for idx, row in random_samples.iterrows():
    # Print the full prompt followed by the reference answer for this row
    print("\n---Conversation---")
    print("Input:", row['combined_input'])
    print("\nOutput:", row['output'])
    print("\n")

In [None]:
def prepare_input(user_symptoms):
    """Wrap a symptom description in the instruction / Patient / Doctor prompt.

    Args:
        user_symptoms: Free-text description supplied by the user.

    Returns:
        The fixed doctor instruction, a blank line, ``Patient: <symptoms>``,
        a blank line, and a trailing ``Doctor:`` cue, as one string.
    """
    instruction = "If you are a doctor, please answer the medical questions based on the patient's description."
    return f"{instruction}\n\nPatient: {user_symptoms}\n\nDoctor:"


In [None]:
def get_medical_response(user_symptoms, model, input_tokenizer, output_tokenizer, maxlen_input=512):
    """Generate a text response for the given symptoms.

    The symptoms are wrapped with ``prepare_input``, tokenized with
    ``input_tokenizer``, post-padded to ``maxlen_input`` and passed to
    ``model.predict``. The arg-max index at each output position is mapped
    back to a word through ``output_tokenizer.index_word``; index 0 and
    unknown indices are skipped. A fixed disclaimer is appended to the text.

    Args:
        user_symptoms: Symptom description to answer.
        model: Object exposing ``predict(padded_input, verbose=0)``.
        input_tokenizer: Object exposing ``texts_to_sequences``.
        output_tokenizer: Object exposing an ``index_word`` mapping.
        maxlen_input: Length the input sequence is padded to.

    Returns:
        The decoded response followed by the disclaimer, or an apology
        message containing the error text if anything raises.
    """
    try:
        # Build the prompt, then tokenize and pad it in one go
        prompt = prepare_input(user_symptoms)
        padded_prompt = pad_sequences(
            input_tokenizer.texts_to_sequences([prompt]),
            maxlen=maxlen_input,
            padding='post',
        )

        # First (only) item of the batch -> best index per output position
        token_ids = model.predict(padded_prompt, verbose=0)[0].argmax(axis=-1)

        # Map indices back to words, skipping padding (0) and unknown indices
        decoded_words = (
            output_tokenizer.index_word.get(token_id, '')
            for token_id in token_ids
            if token_id > 0
        )
        response = ' '.join(word for word in decoded_words if word).strip()

        # Always close with the medical-advice disclaimer
        disclaimer = ("\n\nNOTE: This is an AI-generated response and should not be considered "
                     "as professional medical advice. Please consult with a qualified healthcare "
                     "provider for proper diagnosis and treatment.")

        return response + disclaimer

    except Exception as e:
        return (f"I apologize, but I encountered an error processing your request. "
                f"Please consult with a healthcare provider. Error: {str(e)}")

# Create an interactive chat interface
def medical_chat():
    """Run a minimal console chat loop around ``get_medical_response``.

    Reads lines until the user types ``quit``, ``exit`` or ``bye``. Inputs
    shorter than 10 characters (after trimming) get a request for more detail;
    anything else is answered using the notebook-level ``model``,
    ``input_tokenizer`` and ``output_tokenizer``.
    """
    print("Medical Symptom Chatbot (Type 'quit' to exit)")
    print("Please describe your symptoms:")

    exit_commands = ['quit', 'exit', 'bye']

    while True:
        user_input = input("\nYou: ")

        # Leave the loop (and the function) on any exit command
        if user_input.lower() in exit_commands:
            print("\nThank you for using the Medical Symptom Chatbot. Take care!")
            return

        if len(user_input.strip()) >= 10:
            response = get_medical_response(user_input, model, input_tokenizer, output_tokenizer)
            print("\nBot:", response)
        else:
            # Too little text to work with; ask again
            print("\nBot: Please provide more details about your symptoms for a better assessment.")

In [None]:
def preprocess_symptoms(symptoms):
    """Normalize a symptom description before validation.

    The text is lowercased, whitespace runs are collapsed to single spaces,
    and greeting/courtesy terms are removed. Terms are matched against whole
    whitespace-separated tokens, so a multi-word term such as ``thank you``
    is removed only when its words appear consecutively, and a token with
    attached punctuation (e.g. ``hi,``) is left in place.

    Args:
        symptoms: Raw text typed by the user.

    Returns:
        The normalized text with the non-medical terms removed.
    """
    # Remove any obvious non-medical terms (could be expanded)
    non_medical_terms = ['hi', 'hello', 'hey', 'thanks', 'thank you']

    # Compare token sequences rather than single tokens; a per-word membership
    # test can never match a term that contains a space. Longest terms are
    # tried first so a multi-word term wins over any shorter prefix.
    term_tokens = sorted(
        (tuple(term.split()) for term in non_medical_terms),
        key=len,
        reverse=True,
    )

    # Lowercasing + split() also takes care of collapsing extra whitespace
    words = symptoms.lower().split()

    # Basic spell checking could be added here

    kept = []
    position = 0
    while position < len(words):
        for term in term_tokens:
            if tuple(words[position:position + len(term)]) == term:
                position += len(term)  # skip the whole matched term
                break
        else:
            kept.append(words[position])
            position += 1

    return ' '.join(kept)

In [None]:
def validate_input(symptoms):
    """Check that a symptom description is neither too short nor too long.

    Args:
        symptoms: Text to validate.

    Returns:
        ``(True, "")`` when the text has at least 3 whitespace-separated words
        and at most 500 characters; otherwise ``(False, message)`` where
        ``message`` tells the user what to change. The word-count check takes
        precedence over the length check.
    """
    word_total = len(symptoms.split())

    if word_total < 3:
        problem = "Please provide more details about your symptoms."
    elif len(symptoms) > 500:
        problem = "Please provide a more concise description of your symptoms."
    else:
        problem = ""

    # An empty problem message means the input passed every check
    return problem == "", problem

In [None]:
# Enhanced chat interface with input validation
def enhanced_medical_chat():
    """Run the console chat loop with input preprocessing and validation.

    Each line typed by the user is normalized with ``preprocess_symptoms`` and
    checked with ``validate_input`` before being answered through
    ``get_medical_response`` (using the notebook-level ``model``,
    ``input_tokenizer`` and ``output_tokenizer``). The loop ends when the user
    types ``quit``, ``exit`` or ``bye``, or when the input stream is closed.
    Errors raised while processing a message are reported and the chat
    continues with the next message.
    """
    print("\nMedical Symptom Chatbot")
    print("------------------------")
    print("Please describe your symptoms in detail.")
    print("Type 'quit' to exit.")
    print("\nImportant: This is an AI assistant and not a replacement for professional medical advice.")

    while True:
        # Read input outside the broad handler below. If input() itself fails
        # on every call (e.g. stdin is closed), catching that inside the loop
        # would print the error forever instead of ending the chat.
        try:
            user_input = input("\nYou: ").strip()
        except EOFError:
            print("\nThank you for using the Medical Symptom Chatbot. Take care!")
            break

        if user_input.lower() in ['quit', 'exit', 'bye']:
            print("\nThank you for using the Medical Symptom Chatbot. Take care!")
            break

        try:
            # Preprocess input
            processed_input = preprocess_symptoms(user_input)

            # Validate input
            is_valid, message = validate_input(processed_input)
            if not is_valid:
                print(f"\nBot: {message}")
                continue

            # Get response
            response = get_medical_response(processed_input, model, input_tokenizer, output_tokenizer)

            print("\nBot:", response)

        except Exception as e:
            # Best effort: report the failure and keep the conversation going
            print("\nBot: I apologize, but I encountered an error. Please try again or seek medical attention if you're concerned.")
            print(f"Error: {str(e)}")

In [None]:
def check_emergency_symptoms(symptoms):
    """Report whether the text mentions any known emergency keyword.

    Matching is a case-insensitive substring test against a fixed keyword
    list.

    Args:
        symptoms: Text typed by the user.

    Returns:
        ``True`` if at least one emergency keyword occurs in the text,
        otherwise ``False``.
    """
    emergency_keywords = [
        'chest pain', 'heart attack', 'stroke', 'unconscious', 'breathing difficulty',
        'severe bleeding', 'head injury', 'suicide', 'poisoning', 'overdose'
    ]

    # Lowercase once, then look for any keyword inside the text
    lowered = symptoms.lower()
    return any(keyword in lowered for keyword in emergency_keywords)

In [None]:
def get_emergency_message():
    """Return the fixed multi-line notice telling the user to seek emergency care.

    The text is returned exactly as written in the literal below, including
    its leading newline and the indentation of each line.
    """
    return """
    EMERGENCY MEDICAL ATTENTION NEEDED
    ---------------------------------
    Based on the symptoms you've described, you should seek immediate medical attention:
    
    1. Call emergency services (911 in the US) or your local emergency number
    2. Go to the nearest emergency room
    3. Do not wait for symptoms to improve on their own
    
    This is not a situation for an AI chatbot. Please seek professional medical help immediately.
    """

In [None]:
# Enhanced chat interface with emergency detection
def safe_medical_chat():
    """Run the console chat loop with emergency detection.

    Each line typed by the user is first screened with
    ``check_emergency_symptoms``; a hit prints ``get_emergency_message()`` and
    skips the model. Other messages are normalized with
    ``preprocess_symptoms``, checked with ``validate_input`` and answered
    through ``get_medical_response`` (using the notebook-level ``model``,
    ``input_tokenizer`` and ``output_tokenizer``). The loop ends when the user
    types ``quit``, ``exit`` or ``bye``, or when the input stream is closed.
    Errors raised while processing a message are reported and the chat
    continues with the next message.
    """
    print("\nMedical Symptom Chatbot")
    print("------------------------")
    print("DISCLAIMER: This chatbot is for informational purposes only and is not a substitute for professional medical advice, diagnosis, or treatment.")
    print("In case of emergency, please call your local emergency services immediately.")
    print("\nPlease describe your symptoms (type 'quit' to exit):")

    while True:
        # Read input outside the broad handler below. If input() itself fails
        # on every call (e.g. stdin is closed), catching that inside the loop
        # would print the error forever instead of ending the chat.
        try:
            user_input = input("\nYou: ").strip()
        except EOFError:
            print("\nThank you for using the Medical Symptom Chatbot. Remember to consult with healthcare professionals for proper medical advice.")
            break

        if user_input.lower() in ['quit', 'exit', 'bye']:
            print("\nThank you for using the Medical Symptom Chatbot. Remember to consult with healthcare professionals for proper medical advice.")
            break

        try:
            # Check for emergencies first, on the raw text the user typed
            if check_emergency_symptoms(user_input):
                print(get_emergency_message())
                continue

            # Regular processing
            processed_input = preprocess_symptoms(user_input)
            is_valid, message = validate_input(processed_input)

            if not is_valid:
                print(f"\nBot: {message}")
                continue

            response = get_medical_response(processed_input, model, input_tokenizer, output_tokenizer)
            print("\nBot:", response)

        except Exception as e:
            # Best effort: report the failure and keep the conversation going
            print("\nBot: I apologize, but I encountered an error. Please try again or consult with a healthcare provider.")
            print(f"Error: {str(e)}")
In [None]:
# # Save the model and tokenizers
# model.save('medical_chatbot_model.h5')
# with open('input_tokenizer.pkl', 'wb') as f:
#     pickle.dump(input_tokenizer, f)
# with open('output_tokenizer.pkl', 'wb') as f:
#     pickle.dump(output_tokenizer, f)

# # Start the chat interface
# safe_medical_chat()