# Import necessary libraries

In [None]:

"""
This cell imports all the required libraries for dataset handling, preprocessing, and downloading.
"""
import ast
import json
import os

import nltk
import numpy as np
import pandas as pd
import requests
import torch
from datasets import load_dataset
from transformers import pipeline


# Load MedQA-USMLE dataset

In [None]:

"""
Downloads the MedQA-USMLE (4-option) dataset and writes its train and test splits as CSV files
under the project directory.
"""
dataset = load_dataset("GBaker/MedQA-USMLE-4-options")
base_dir = "NLP_Project/MedQA_USMLE"

# Make sure both split folders exist before anything is written.
for split_name in ("train", "test"):
    os.makedirs(f"{base_dir}/{split_name}", exist_ok=True)

# One CSV per split, stored inside the folder of the split it holds.
for split_name, csv_name in (
    ("train", "medqa_usmle_train.csv"),
    ("test", "medqa_usmle_test.csv"),
):
    dataset[split_name].to_csv(f"{base_dir}/{split_name}/{csv_name}")


# Download files from PubMedQA repository

In [None]:
"""
Fetches data files from the PubMedQA GitHub repository and saves them to the specified directory.

The GitHub contents API is asked for the listing of the repository's ``data`` folder; every
listed file is then downloaded and written under ``output_dir``. Entries without a download URL
are skipped, and files whose download fails are reported instead of being written.
"""
repo_url = "https://api.github.com/repos/pubmedqa/pubmedqa/contents/data"
output_dir = "NLP_Project/Pubmedqa_data/data"
os.makedirs(output_dir, exist_ok=True)

# Without a timeout a stalled connection would block the cell indefinitely.
response = requests.get(repo_url, timeout=30)
if response.status_code == 200:
    failed_files = []
    for file_info in response.json():
        file_url = file_info.get("download_url")
        file_name = file_info["name"]
        # Sub-directory entries in the listing carry no download URL; there is nothing to fetch.
        if not file_url:
            continue
        file_response = requests.get(file_url, timeout=30)
        # Only store real file content, never the body of an error response.
        if file_response.status_code != 200:
            failed_files.append(file_name)
            continue
        with open(f"{output_dir}/{file_name}", 'wb') as f:
            f.write(file_response.content)
    if failed_files:
        print(f"Failed to download: {', '.join(failed_files)}")
    else:
        print("All files downloaded successfully!")
else:
    print("Failed to fetch file list.")


# Merge PubMedQA datasets

In [None]:

"""
Attaches the ground-truth label of every PubMedQA entry to that entry and stores the combined
dataset as a single JSON file.
"""
file1_path = "NLP_Project/Pubmedqa_data/ori_pqal/ori_pqal.json"
file2_path = "NLP_Project/Pubmedqa_data/ori_pqal/test_ground_truth.json"
output_path = "NLP_Project/Pubmedqa_data/ori_pqal.json"

with open(file1_path, "r") as entries_file:
    file1_data = json.load(entries_file)
with open(file2_path, "r") as labels_file:
    file2_data = json.load(labels_file)

# Entries that have no label in the ground-truth file receive the string "None".
for entry_id, entry in file1_data.items():
    entry["ground_truth"] = file2_data.get(entry_id, "None")

with open(output_path, "w") as merged_file:
    json.dump(file1_data, merged_file, indent=4)

print(f"Merged dataset saved to: {output_path}")


# Extract and display PubMedQA fields


In [None]:
"""
Reads the merged PubMedQA dataset and prints each entry's 'QUESTION' and 'ground_truth' values.
"""
input_path = "NLP_Project/Pubmedqa_data/ori_pqal.json"
with open(input_path, "r") as merged_file:
    data = json.load(merged_file)

# Keep only the two fields of interest, skipping entries that lack either one.
extracted_data = {}
for entry_id, entry in data.items():
    if "QUESTION" not in entry or "ground_truth" not in entry:
        continue
    extracted_data[entry_id] = {
        "QUESTION": entry["QUESTION"],
        "ground_truth": entry["ground_truth"],
    }

separator = "-" * 50
for entry_id, fields in extracted_data.items():
    print(f"{entry_id}:")
    print(f"  QUESTION: {fields['QUESTION']}")
    print(f"  ground_truth: {fields['ground_truth']}")
    print(separator)


# Load and display datasets


In [None]:
"""
Previews the datasets: the first rows of the MedQA-USMLE train/test and Medical Meadow CSV files,
followed by the first entry of the PubMedQA ori_pqau JSON file.
"""
medqa_train_path = "NLP_Project/MedQA_USMLE/train/medqa_usmle_train.csv"
medqa_test_path = "NLP_Project/MedQA_USMLE/test/medqa_usmle_test.csv"
medical_meadow_path = "NLP_Project/Medical_Meadow/train/medical_meadow_train.csv"
pubmedqa_ori_pqau_path = "NLP_Project/Pubmedqa_data/ori_pqau.json"


def _preview(title, frame):
    """Print a heading followed by the first rows of the given DataFrame."""
    print(title)
    print(frame.head())


# Each CSV is loaded and shown before the next one is touched.
medqa_train = pd.read_csv(medqa_train_path)
_preview("MedQA-USMLE Train Dataset:", medqa_train)

medqa_test = pd.read_csv(medqa_test_path)
_preview("\nMedQA-USMLE Test Dataset:", medqa_test)

medical_meadow = pd.read_csv(medical_meadow_path)
_preview("\nMedical Meadow Train Dataset:", medical_meadow)

with open(pubmedqa_ori_pqau_path, "r") as pqau_file:
    pubmedqa_data = json.load(pqau_file)
print("\nPubMedQA ori_pqau.json First Entry:")
first_key = list(pubmedqa_data)[0]
first_entry = pubmedqa_data[first_key]
print(json.dumps(first_entry, indent=4))


# Define text cleaning functions


In [None]:
"""
Defines functions for general text cleaning, dictionary cleaning, and list cleaning.
"""
import re

def clean_text(text):
    """Lowercase, remove special characters, and normalize whitespace.

    Args:
        text: Value to clean. It is converted with ``str()`` first, so
            non-string input is cleaned via its string form (for example
            ``None`` becomes ``"none"``).

    Returns:
        The cleaned string with single spaces between words and no leading
        or trailing whitespace, or the placeholder ``"None"`` when nothing
        is left after cleaning.
    """
    text = str(text).lower()
    # Remove special characters (anything that is neither a word character
    # nor whitespace) BEFORE collapsing whitespace: deleting a symbol that
    # sits between two spaces ("a - b") would otherwise leave a double space.
    text = re.sub(r'[^\w\s]', '', text)
    text = re.sub(r'\s+', ' ', text).strip()  # Normalize whitespace
    return text or "None"

def clean_dict(data_dict):
    """Return a new dict with the same keys and every value passed through clean_text."""
    cleaned = {}
    for field, raw_value in data_dict.items():
        cleaned[field] = clean_text(raw_value)
    return cleaned

def clean_list(data_list):
    """Clean each item in a list using clean_text.

    Args:
        data_list: Iterable of values to clean, or ``None``.

    Returns:
        A new list holding the cleaned form of every item; an empty list
        when ``data_list`` is ``None``.
    """
    # Callers hand in dict.get() results directly, so a missing field arrives
    # as None; treat it as "no items" instead of failing on iteration.
    if data_list is None:
        return []
    return [clean_text(item) for item in data_list]


# Preprocess MedQA-USMLE dataset


In [None]:
"""
Applies text cleaning to fields in the MedQA-USMLE dataset and saves the preprocessed data.

The 'question' and 'meta_info' columns are cleaned as plain text; each 'options' cell is decoded
into a dict and cleaned value by value. The result is written as CSV without the index column.
"""
medqa_path = "NLP_Project/MedQA_USMLE/MedQA_USMLE_dataset.csv"
medqa_df = pd.read_csv(medqa_path)


def _clean_options(raw_options):
    """Clean one 'options' cell, decoding its CSV text form into a dict first."""
    # read_csv yields this column as text, while clean_dict needs a real dict.
    # NOTE(review): assumes each cell holds a Python-literal dict (the form to_csv writes) — confirm.
    if isinstance(raw_options, str):
        raw_options = ast.literal_eval(raw_options)
    return clean_dict(raw_options)


medqa_df['question'] = medqa_df['question'].apply(clean_text)
medqa_df['options'] = medqa_df['options'].apply(_clean_options)
medqa_df['meta_info'] = medqa_df['meta_info'].apply(clean_text)

print("\nAfter Preprocessing:")
print(medqa_df[['question', 'options', 'answer', 'answer_idx']].head())

output_path = "NLP_Project/Preprocessed/medqa_usmle_preprocessed.csv"
# to_csv does not create missing parent folders, so make sure the target folder exists.
os.makedirs(os.path.dirname(output_path), exist_ok=True)
medqa_df.to_csv(output_path, index=False)
print(f"\nPreprocessed MedQA-USMLE dataset saved to: {output_path}")


# Preprocess Medical Meadow dataset


In [None]:
"""
Applies text cleaning to fields in the Medical Meadow dataset and saves the preprocessed data.

The 'input' and 'output' columns are always cleaned; 'instruction' is cleaned only when the
column is present. The result is written as CSV without the index column.
"""
medical_meadow_df = pd.read_csv(medical_meadow_path)

medical_meadow_df['input'] = medical_meadow_df['input'].apply(clean_text)
medical_meadow_df['output'] = medical_meadow_df['output'].apply(clean_text)

if 'instruction' in medical_meadow_df.columns:
    medical_meadow_df['instruction'] = medical_meadow_df['instruction'].apply(clean_text)

output_path = "NLP_Project/Preprocessed/medical_meadow_preprocessed.csv"
# to_csv does not create missing parent folders, so make sure the target folder exists.
os.makedirs(os.path.dirname(output_path), exist_ok=True)
medical_meadow_df.to_csv(output_path, index=False)
print(f"\nPreprocessed Medical Meadow dataset saved to: {output_path}")


# Preprocess PubMedQA dataset


In [None]:
"""
Cleans the text and list fields of every PubMedQA entry and writes the result as JSON into the
preprocessed-data folder.
"""
pubmedqa_path = "NLP_Project/Pubmedqa_data/Pubmedqa_data.json"
with open(pubmedqa_path, "r") as source_file:
    pubmedqa_data = json.load(source_file)

# Field name paired with the cleaner applied to it, in the order the fields are rewritten.
field_cleaners = (
    ('QUESTION', clean_text),
    ('CONTEXTS', clean_list),
    ('LONG_ANSWER', clean_text),
    ('LABELS', clean_list),
    ('MESHES', clean_list),
    ('YEAR', clean_text),
    ('reasoning_required_pred', clean_text),
    ('reasoning_free_pred', clean_text),
    ('final_decision', clean_text),
    ('ground_truth', clean_text),
)
for entry in pubmedqa_data.values():
    for field_name, cleaner in field_cleaners:
        entry[field_name] = cleaner(entry.get(field_name))

preprocessed_dir = "NLP_Project/Preprocessed"
os.makedirs(preprocessed_dir, exist_ok=True)

output_path = os.path.join(preprocessed_dir, "pubmedqa_preprocessed.json")
with open(output_path, "w") as target_file:
    json.dump(pubmedqa_data, target_file, indent=4)

print(f"\nPreprocessed PubMedQA dataset saved to: {output_path}")
