In [5]:
import pandas as pd
import numpy as np 
import seaborn as sns

import plotly.express as px
import plotly.graph_objects as go
import plotly.figure_factory as ff
from plotly.subplots import make_subplots

import matplotlib.pyplot as plt
%matplotlib inline 

import warnings
warnings.filterwarnings('ignore')

In [6]:
# Read the MedQuAD table from the working directory and report its (rows, columns).
df = pd.read_csv("medquad.csv")
row_count, column_count = df.shape
print((row_count, column_count))

(16412, 4)


In [7]:
import pandas as pd
import plotly.graph_objects as go

# Count rows per focus area and keep only the areas with more than 15 rows,
# so the pie is not dominated by a long tail of tiny slices.
value_counts = df['focus_area'].value_counts()
result = value_counts[value_counts.gt(15)]

# One pie trace: slice labels are the focus areas, slice sizes their row counts.
pie_trace = go.Pie(labels=result.index, values=result.values)
fig = go.Figure(data=[pie_trace])
fig.update_layout(
    title='Pie Chart of Focus Areas',
    width=1200,
    height=800,
    font={'size': 16},
)
fig.show()

In [8]:
import random
# ANSI foreground codes keyed by a rough colour name. Several names share a
# code, so those codes are drawn proportionally more often below.
color_codes = dict(
    blue=34,
    green=32,
    red=31,
    purple=35,
    orange=33,
    yellow=33,
    pink=35,
    brown=33,
    gray=37,
)

# Preview every 7th row up to and including the first index above 30
# (rows 0, 7, 14, 21, 28, 35), each in one randomly chosen bold colour.
for i in range(0, min(len(df), 36), 7):
    color = random.choice(list(color_codes.values()))
    print(f"\033[1;{color}mThe question is: {df['question'][i]}\033[0m\n\033[1;{color}m The answer is: {df['answer'][i]}\033[0m\n")

[1;35mThe question is: What is (are) Glaucoma ?[0m
[1;35m The answer is: Glaucoma is a group of diseases that can damage the eye's optic nerve and result in vision loss and blindness. While glaucoma can strike anyone, the risk is much greater for people over 60. How Glaucoma Develops  There are several different types of glaucoma. Most of these involve the drainage system within the eye. At the front of the eye there is a small space called the anterior chamber. A clear fluid flows through this chamber and bathes and nourishes the nearby tissues. (Watch the video to learn more about glaucoma. To enlarge the video, click the brackets in the lower right-hand corner. To reduce the video, press the Escape (Esc) button on your keyboard.) In glaucoma, for still unknown reasons, the fluid drains too slowly out of the eye. As the fluid builds up, the pressure inside the eye rises. Unless this pressure is controlled, it may cause damage to the optic nerve and other parts of the eye and resul

In [9]:
# Distinct focus areas among the first 500 rows, printed in bold blue.
unique_focus_area = df['focus_area'].head(500).unique()
print(f"\033[34m\033[1m{unique_focus_area}\033[0m")

[34m[1m['Glaucoma' 'High Blood Pressure' "Paget's Disease of Bone"
 'Urinary Tract Infections' 'Alcohol Use and Older Adults'
 'Osteoarthritis' 'Problems with Taste' 'Anxiety Disorders' 'Diabetes'
 'Medicare and Continuing Care' 'Knee Replacement' 'Balance Problems'
 'Quitting Smoking for Older Adults' 'Prostate Cancer' 'Dry Mouth'
 'Osteoporosis' 'Kidney Disease' "Alzheimer's Disease"
 'Rheumatoid Arthritis' 'Hearing Loss' 'Low Vision' 'COPD'
 'Age-related Macular Degeneration' 'Diabetic Retinopathy' 'Depression'
 'Problems with Smell' 'Breast Cancer' 'Colorectal Cancer'
 "Parkinson's Disease" 'Leukemia' 'Lung Cancer' 'Urinary Incontinence'][0m


In [10]:
import shutil

def get_terminal_width():
    """Return the terminal width in columns, as reported by ``shutil.get_terminal_size``."""
    columns, _lines = shutil.get_terminal_size()
    return columns

def wrap_text(text, width):
    """Greedily wrap ``text`` into lines of at most ``width`` characters.

    The text is split on whitespace, so runs of spaces and newlines collapse
    into single spaces. Words are never broken: a word longer than ``width``
    is placed on a line of its own, and that line exceeds ``width``.

    Args:
        text: The string to wrap.
        width: Maximum line length in characters.

    Returns:
        A list of lines without trailing newlines. Text containing no words
        yields ``['']``, so callers that print every line still emit one
        (blank) line.
    """
    lines = []
    current_line = []
    current_length = 0  # Always equal to len(' '.join(current_line)).
    for word in text.split():
        # A separating space is only needed when the line already holds a word;
        # counting it for the first word made the first line wrap one
        # character earlier than every later line.
        needed = len(word) + (1 if current_line else 0)
        # Only flush a non-empty line: an over-long first word must not
        # produce a spurious blank line ahead of it.
        if current_line and current_length + needed > width:
            lines.append(' '.join(current_line))
            current_line = [word]
            current_length = len(word)
        else:
            current_line.append(word)
            current_length += needed
    lines.append(' '.join(current_line))
    return lines

def display_full_width_row(df, index):
    """Print one DataFrame row as three banner-headed, word-wrapped sections.

    The 'question', 'answer' and 'focus_area' values found at ``index`` are
    each printed beneath a terminal-wide heading with a coloured background
    and followed by a blank line.

    Args:
        df: DataFrame providing 'question', 'answer' and 'focus_area' columns.
        index: Key used to look the row up in each of those columns.
    """
    # Read all three values up front so a missing column fails before any output.
    row_values = [df[column][index] for column in ('question', 'answer', 'focus_area')]

    width = get_terminal_width()
    # Each section wraps a fixed number of characters narrower than the terminal.
    margins = (12, 16, 14)
    wrapped_sections = [
        wrap_text(value, width - margin)
        for value, margin in zip(row_values, margins)
    ]

    banners = (
        ("\033[45m", "Question:"),    # Magenta background
        ("\033[46m", "Answer:"),      # Cyan background
        ("\033[43m", "Focus Area:"),  # Yellow background
    )
    for (background, heading), wrapped in zip(banners, wrapped_sections):
        # Pad the heading so the background colour spans the full terminal width.
        print(background + heading.ljust(width) + "\033[0m")
        for line in wrapped:
            print(line)
        print()

display_full_width_row(df, 1)

[45mQuestion:                                                                       [0m
What causes Glaucoma ?

[46mAnswer:                                                                         [0m
Nearly 2.7 million people have glaucoma, a leading cause of
blindness in the United States. Although anyone can get
glaucoma, some people are at higher risk. They include -
African-Americans over age 40 - everyone over age 60, especially
Hispanics/Latinos - people with a family history of glaucoma.
African-Americans over age 40 everyone over age 60, especially
Hispanics/Latinos people with a family history of glaucoma. In
addition to age, eye pressure is a risk factor. Whether you
develop glaucoma depends on the level of pressure your optic
nerve can tolerate without being damaged. This level is
different for each person. Thats why a comprehensive dilated eye
exam is very important. It can help your eye care professional
determine what level of eye pressure is normal for you. Another
r

In [11]:
import pandas as pd
from transformers import AutoTokenizer, AutoModelForQuestionAnswering
import torch
import warnings
warnings.filterwarnings('ignore')

# ANSI escape codes for text color: RED switches the foreground to bright red,
# RESET restores the terminal's default attributes.
RED = '\033[91m'
RESET = '\033[0m'

# NOTE(review): nothing is loaded in this cell; the message below is printed
# unconditionally, and `df` is expected to exist from an earlier cell.
print("Dataset loaded successfully.")

# Initialize the model and tokenizer.
# Both are created from the same checkpoint name so their vocabularies agree;
# `from_pretrained` may download the files on first use.
print("Initializing the model and tokenizer...")
model_name = "bert-large-uncased-whole-word-masking-finetuned-squad"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForQuestionAnswering.from_pretrained(model_name)
print("Model and tokenizer initialized.")

def answer_question(question, context):
    """Return the span of ``context`` that the QA model picks as the answer.

    Relies on the module-level ``tokenizer`` and ``model``. The question and
    context are encoded as one pair (truncated to 512 tokens), and the span
    runs from the highest-scoring start token to the highest-scoring end
    token, inclusive.

    Args:
        question: The question text.
        context: The passage to extract the answer from.

    Returns:
        The extracted answer text, or a fixed apology message when the
        predicted span is empty, consists only of special tokens, or is
        nothing but punctuation.
    """
    no_answer = "I'm sorry, I couldn't find a specific answer in the given context."

    # Tokenize the question/context pair (calling the tokenizer directly is the
    # supported replacement for encode_plus).
    inputs = tokenizer(question, context, return_tensors="pt", max_length=512, truncation=True)

    # Inference only: no gradients needed.
    with torch.no_grad():
        outputs = model(**inputs)

    # Most likely start and end positions; +1 makes the end usable as a slice bound.
    answer_start = int(torch.argmax(outputs.start_logits))
    answer_end = int(torch.argmax(outputs.end_logits)) + 1

    # Tokens for the entire encoded input (question, context and special tokens).
    all_tokens = tokenizer.convert_ids_to_tokens(inputs["input_ids"][0])

    # Drop special tokens so a span landing on e.g. [CLS]/[SEP] is reported as
    # "no answer" rather than returned as literal marker text.
    special_tokens = set(tokenizer.all_special_tokens)
    answer_tokens = [
        token for token in all_tokens[answer_start:answer_end]
        if token not in special_tokens
    ]
    # Empty when the end precedes the start or only special tokens were selected.
    if not answer_tokens:
        return no_answer

    # Merge word pieces back into readable text.
    answer = tokenizer.convert_tokens_to_string(answer_tokens).strip()

    # If the answer is empty or just punctuation, return the fallback message.
    if not answer or answer.strip('.,;:!?') == '':
        return no_answer

    return answer

def print_colored_answer(answer):
    """Print ``answer`` in red under an 'Answer:' heading, framed by two 50-character '=' rules."""
    rule = f"{RED}{'=' * 50}{RESET}"
    for line in (f"{RED}Answer:{RESET}", rule, f"{RED}{answer}{RESET}", rule):
        print(line)

print("\nProcessing the first question from the dataset as an example:")
# Demo on row 0: its question is asked against its own reference answer as context.
first_row = df.iloc[0]
question, context = first_row['question'], first_row['answer']

print(f"Question: {question}")
print(f"Context: {context}")  # The context is shown in full, not truncated

answer = answer_question(question, context)
print_colored_answer(answer)
print()

print("Now entering interactive mode. You can ask your own medical questions.")
# Interactive loop: keep answering until the user types 'quit'.
while True:
    user_question = input("Enter your medical question (or 'quit' to exit): ").strip()
    if user_question.lower() == 'quit':
        break
    if not user_question:
        # An empty pattern matches every row; ask again instead of walking
        # through the whole dataset.
        print("Please enter a question.")
        print()
        continue

    # Find candidate contexts: dataset questions containing the user's text as
    # a case-insensitive literal substring. regex=False matters because dataset
    # questions contain '(', ')' and '?', which would otherwise be interpreted
    # as regex syntax (and unbalanced brackets would raise).
    is_match = df['question'].str.contains(user_question, case=False, na=False, regex=False)
    # Rows without an answer cannot serve as context, so leave them out.
    relevant_rows = df[is_match & df['answer'].notna()]

    if not relevant_rows.empty:
        for _, row in relevant_rows.iterrows():
            context = row['answer']
            print(f"\nFound relevant context: {context[:100]}...")  # Print first 100 characters of context
            print("Full context:")
            print(context)
            answer = answer_question(user_question, context)
            print_colored_answer(answer)

            user_input = input("Is this answer helpful? (yes/no): ").strip().lower()
            if user_input == 'yes':
                break
        else:
            # Reached only when no candidate was accepted (the loop never hit `break`).
            print("Sorry, none of the answers were helpful.")
    else:
        print("Sorry, I couldn't find a relevant context for your question.")
    print()

print(f"{RESET}Thank you for using the MedQuAD Question Answering System!")

Dataset loaded successfully.
Initializing the model and tokenizer...


model.safetensors:   0%|          | 0.00/1.34G [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-large-uncased-whole-word-masking-finetuned-squad were not used when initializing BertForQuestionAnswering: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Model and tokenizer initialized.

Processing the first question from the dataset as an example:
Question: What is (are) Glaucoma ?
Context: Glaucoma is a group of diseases that can damage the eye's optic nerve and result in vision loss and blindness. While glaucoma can strike anyone, the risk is much greater for people over 60. How Glaucoma Develops  There are several different types of glaucoma. Most of these involve the drainage system within the eye. At the front of the eye there is a small space called the anterior chamber. A clear fluid flows through this chamber and bathes and nourishes the nearby tissues. (Watch the video to learn more about glaucoma. To enlarge the video, click the brackets in the lower right-hand corner. To reduce the video, press the Escape (Esc) button on your keyboard.) In glaucoma, for still unknown reasons, the fluid drains too slowly out of the eye. As the fluid builds up, the pressure inside the eye rises. Unless this pressure is controlled, it may caus