In [16]:
import pandas as pd
import numpy as np

In [21]:
# Import an xml file
from lxml import etree

In [23]:
# Read the XML file
def read_xml(file_path):
    """
    Reads an XML file and returns a DataFrame.
    
    Args:
        file_path (str): Path to the XML file.
        
    Returns:
        pd.DataFrame: DataFrame containing the data from the XML file.
    """
    # Parse the XML file
    tree = etree.parse(file_path)
    root = tree.getroot()
    data = []
    # Iterate through each child element of the root
    for child in root:
        row = {}
        for elem in child:
            row[elem.tag] = elem.text
        data.append(row)
    # Convert the list of dictionaries to a DataFrame
    df = pd.DataFrame(data)
    return df

In [24]:
# Read the XML file
file_path = '/Users/casey/Documents/GitHub/MedQuAD/TREC-2017-LiveQA-Medical-Test.xml'
df = read_xml(file_path)
# Display the first few rows of the DataFrame
print(df.head())

  Original-Question                                    NIST-PARAPHRASE  \
0          \n\t\t\t  What is the relationship between Noonan syndro...   
1          \n\t\t\t      Do 5 mg. Zolmitriptan tabkets contain gluten?   
2          \n\t\t\t  Are amphetamine salts of 20 mg dosage gluten f...   
3          \n\t\t\t  What are the treatments and precautions for VD...   
4          \n\t\t\t           How much glucagon is in my GlucaGen kit?   

  ANNOTATIONS ReferenceAnswers  
0    \n\t\t\t         \n\t\t\t  
1    \n\t\t\t         \n\t\t\t  
2    \n\t\t\t         \n\t\t\t  
3    \n\t\t\t         \n\t\t\t  
4    \n\t\t\t         \n\t\t\t  


In [25]:
# Count number of rows and columns
num_rows, num_cols = df.shape
print(f"Number of rows: {num_rows}")
print(f"Number of columns: {num_cols}")

Number of rows: 104
Number of columns: 4


In [26]:
# Show one full row
print("One full row:")
print(df.iloc[0])
# Display the column names
print("Column names:")
print(df.columns.tolist())
# Display the data types of each column
print("Data types:")

One full row:
Original-Question                                             \n\t\t\t
NIST-PARAPHRASE      What is the relationship between Noonan syndro...
ANNOTATIONS                                                   \n\t\t\t
ReferenceAnswers                                              \n\t\t\t
Name: 0, dtype: object
Column names:
['Original-Question', 'NIST-PARAPHRASE', 'ANNOTATIONS', 'ReferenceAnswers']
Data types:


# Section 2

In [28]:
# Read the XML file
file_path = '/Users/casey/Documents/GitHub/MedQuAD/TREC-2017-LiveQA-Medical-Test-Questions-w-summaries.xml'
df1 = read_xml(file_path)
# Display the first few rows of the DataFrame
print(df1.head())

  Original-Question                                    NIST-PARAPHRASE  \
0          \n\t\t\t  What is the relationship between Noonan syndro...   
1          \n\t\t\t      Do 5 mg. Zolmitriptan tabkets contain gluten?   
2          \n\t\t\t  Are amphetamine salts of 20 mg dosage gluten f...   
3          \n\t\t\t  What are the treatments and precautions for VD...   
4          \n\t\t\t           How much glucagon is in my GlucaGen kit?   

                                         NLM-Summary ANNOTATIONS  \
0  What is the relationship between Noonan syndro...    \n\t\t\t   
1   Do Zolmitriptan 5mg tablets manufactured by G...    \n\t\t\t   
2  Do amphetamine salts 20mg tablets contain gluten?    \n\t\t\t   
3  What are the treatments and precautions for VD...    \n\t\t\t   
4  How much glucagon is in the GlucaGen HypoKit a...    \n\t\t\t   

  ReferenceAnswers  
0         \n\t\t\t  
1         \n\t\t\t  
2         \n\t\t\t  
3         \n\t\t\t  
4         \n\t\t\t  


In [29]:
# Count number of rows and columns
num_rows, num_cols = df1.shape
print(f"Number of rows: {num_rows}")
print(f"Number of columns: {num_cols}")

Number of rows: 104
Number of columns: 5


In [30]:
# Show one full row
print("One full row:")
print(df.iloc[0])

One full row:
Original-Question                                             \n\t\t\t
NIST-PARAPHRASE      What is the relationship between Noonan syndro...
NLM-Summary          What is the relationship between Noonan syndro...
ANNOTATIONS                                                   \n\t\t\t
ReferenceAnswers                                              \n\t\t\t
Name: 0, dtype: object


In [31]:
# First NIST-PARAPHRASE 
print("First NIST-PARAPHRASE:")
print(df1.iloc[0]['NIST-PARAPHRASE'])

First NIST-PARAPHRASE:
What is the relationship between Noonan syndrome and polycystic renal disease?


In [32]:
# First NLM-Summary
print("First NLM-Summary:")
print(df1.iloc[0]['NLM-Summary'])   

First NLM-Summary:
What is the relationship between Noonan syndrome and polycystic renal disease?


# Section 3

In [33]:
# Import txt file /Users/casey/Documents/GitHub/MedQuAD/TREC-2017-LiveQA-Medical-qrels-NIST-692.txt
def read_txt(file_path):
    """
    Reads a text file and returns a DataFrame.
    
    Args:
        file_path (str): Path to the text file.
        
    Returns:
        pd.DataFrame: DataFrame containing the data from the text file.
    """
    # Read the text file into a DataFrame
    df = pd.read_csv(file_path, sep="\t", header=None)
    return df
# Read the text file
file_path = '/Users/casey/Documents/GitHub/MedQuAD/TREC-2017-LiveQA-Medical-qrels-NIST-692.txt'
df2 = read_txt(file_path)

In [34]:
# Display the first few rows of the DataFrame
print(df2.head())

                                                   0
0  1 3 Noonan syndrome is a relatively common aut...
1  1 3 Noonan syndrome is a relatively common aut...
2  1 1 My grandson has Noonan and also albino. He...
3  1 3 Background: Noonan syndrome was first reco...
4  1 2 The syndrome is named after Dr Jacqueline ...


In [35]:
# Count number of rows and columns
num_rows, num_cols = df2.shape
print(f"Number of rows: {num_rows}")
print(f"Number of columns: {num_cols}")

Number of rows: 692
Number of columns: 1


In [36]:
# Show one full row
print("One full row:")
print(df2.iloc[0])

One full row:
0    1 3 Noonan syndrome is a relatively common aut...
Name: 0, dtype: object


# Mashqa_data

In [37]:
# Import the json file
import json
# Read the JSON file
def read_json(file_path):
    """
    Reads a JSON file and returns a DataFrame.
    
    Args:
        file_path (str): Path to the JSON file.
        
    Returns:
        pd.DataFrame: DataFrame containing the data from the JSON file.
    """
    # Read the JSON file into a DataFrame
    df = pd.read_json(file_path)
    return df
# Read the JSON file
file_path = '/Users/casey/Documents/GitHub/MedQuAD/test_webmd_squad_v2_full.json'
df3 = read_json(file_path)

In [38]:
# Display the first few rows of the DataFrame
print(df3.head())

   version                                               data
0        2  {'title': 'https://www.webmd.com/a-to-z-guides...
1        2  {'title': 'https://www.webmd.com/cancer/polycy...
2        2  {'title': 'https://www.webmd.com/vitamins-and-...
3        2  {'title': 'https://www.webmd.com/cholesterol-m...
4        2  {'title': 'https://www.webmd.com/arthritis/cli...


In [39]:
# Count number of rows and columns
num_rows, num_cols = df3.shape
print(f"Number of rows: {num_rows}")
print(f"Number of columns: {num_cols}")

Number of rows: 521
Number of columns: 2
