<a href="https://colab.research.google.com/github/AryanPatial/HealthAdvisor-AI/blob/main/NLP_PROJECT_7_Data_Scraping.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

#STEP 0 DATA COLLECTION

In [23]:
import pandas as pd
import numpy as np


In [1]:
!git clone https://github.com/abachaa/MedQuAD.git


Cloning into 'MedQuAD'...
remote: Enumerating objects: 11310, done.[K
remote: Counting objects: 100% (10/10), done.[K
remote: Compressing objects: 100% (5/5), done.[K
remote: Total 11310 (delta 7), reused 5 (delta 5), pack-reused 11300 (from 1)[K
Receiving objects: 100% (11310/11310), 11.01 MiB | 6.12 MiB/s, done.
Resolving deltas: 100% (6807/6807), done.
Updating files: 100% (11277/11277), done.


In [2]:
import os
os.chdir('/content/MedQuAD')


In [3]:
os.listdir('/content/MedQuAD')


['readme.txt',
 '10_MPlus_ADAM_QA',
 '11_MPlusDrugs_QA',
 '6_NINDS_QA',
 '3_GHR_QA',
 '.git',
 '5_NIDDK_QA',
 '12_MPlusHerbsSupplements_QA',
 'QA-TestSet-LiveQA-Med-Qrels-2479-Answers.zip',
 '4_MPlus_Health_Topics_QA',
 '2_GARD_QA',
 '1_CancerGov_QA',
 'LICENSE.txt',
 '9_CDC_QA',
 '7_SeniorHealth_QA',
 '8_NHLBI_QA_XML']

In [4]:
qa_folders = [folder for folder in os.listdir() if os.path.isdir(folder)]
print(qa_folders)  # Check the folder names


['10_MPlus_ADAM_QA', '11_MPlusDrugs_QA', '6_NINDS_QA', '3_GHR_QA', '.git', '5_NIDDK_QA', '12_MPlusHerbsSupplements_QA', '4_MPlus_Health_Topics_QA', '2_GARD_QA', '1_CancerGov_QA', '9_CDC_QA', '7_SeniorHealth_QA', '8_NHLBI_QA_XML']


#Extracting URLs from the XML files

In [5]:
import xml.etree.ElementTree as ET
import os

# Function to extract URLs from an individual XML file
def extract_url_from_xml(file_path):
    tree = ET.parse(file_path)
    root = tree.getroot()

    # Extract URL from the 'Document' element's 'url' attribute
    url = root.attrib.get('url', None)

    # If a URL is found, return it
    if url:
        return url
    return None

# Loop through all the QA folders and extract URLs
def extract_urls_from_qa_folders(base_path='/content/MedQuAD'):
    all_urls = []

    # List all the folders (e.g., '10_MPlus_ADAM_QA', etc.)
    qa_folders = [folder for folder in os.listdir(base_path) if os.path.isdir(os.path.join(base_path, folder))]

    for folder in qa_folders:
        folder_path = os.path.join(base_path, folder)
        for xml_file in os.listdir(folder_path):
            if xml_file.endswith('.xml'):
                file_path = os.path.join(folder_path, xml_file)
                url = extract_url_from_xml(file_path)
                if url:
                    all_urls.append(url)

    return all_urls

# Example: Get all URLs from the QA folders
all_urls = extract_urls_from_qa_folders('/content/MedQuAD')
print(all_urls[:5])  # Print the first 5 URLs to verify


['https://www.nlm.nih.gov/medlineplus/ency/patientinstructions/000578.htm', 'https://www.nlm.nih.gov/medlineplus/ency/article/000878.htm', 'https://www.nlm.nih.gov/medlineplus/ency/patientinstructions/000864.htm', 'https://www.nlm.nih.gov/medlineplus/ency/article/001625.htm', 'https://www.nlm.nih.gov/medlineplus/ency/article/003764.htm']


In [6]:
len(all_urls)

11274

#we want to work with 2000 different xml file for step 0 to collect the data that we will be choosing at random, so what we can do is that since we have all the urls now we can just use random sampling and choose 2000 urls at random and scrap those.

In [7]:
import random

# Randomly select 2000 URLs from the list of all URLs
sampled_urls = random.sample(all_urls, 2000)

# Verify by printing the first 5 sampled URLs
print(sampled_urls[:5])


['https://www.nlm.nih.gov/medlineplus/druginfo/meds/a682821.html', 'https://www.nlm.nih.gov/medlineplus/ency/article/000769.htm', 'https://www.nlm.nih.gov/medlineplus/druginfo/meds/a601006.html', 'https://www.nlm.nih.gov/medlineplus/druginfo/meds/a616009.html', 'https://www.nlm.nih.gov/medlineplus/diets.html']


In [31]:
len(sampled_urls)

2000

# Scraping

In [11]:
import requests
from bs4 import BeautifulSoup
import time

# Function to extract main content from a single URL
def extract_main_content(url):
    try:
        response = requests.get(url)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')

        article_tag = soup.find('article')  # Extract content from <article>
        return article_tag.get_text(strip=True) if article_tag else "No <article> tag found."
    except requests.exceptions.RequestException as e:
        return f"Error fetching {url}: {e}"

# Function to scrape multiple URLs
def scrape_urls(urls, output_file):
    with open(output_file, 'w', encoding='utf-8') as file:
        for url in urls:
            print(f"Scraping {url}...")
            content = extract_main_content(url)
            file.write(f"URL: {url}\nContent: {content}\n\n")
            time.sleep(1)  # Delay to prevent overloading servers

# Main script
if __name__ == "__main__":
    sampled_urls = sampled_urls  # Replace with your pre-selected 2000 URLs
    output_file = "scraped_data.txt"

    scrape_urls(sampled_urls, output_file)
    print(f"Scraping completed. Data saved to {output_file}.")


Scraping https://www.nlm.nih.gov/medlineplus/druginfo/meds/a682821.html...
Scraping https://www.nlm.nih.gov/medlineplus/ency/article/000769.htm...
Scraping https://www.nlm.nih.gov/medlineplus/druginfo/meds/a601006.html...
Scraping https://www.nlm.nih.gov/medlineplus/druginfo/meds/a616009.html...
Scraping https://www.nlm.nih.gov/medlineplus/diets.html...
Scraping https://rarediseases.info.nih.gov/gard/7578/richter-syndrome...
Scraping https://www.nlm.nih.gov/medlineplus/ency/article/002137.htm...
Scraping https://www.nlm.nih.gov/medlineplus/ency/patientinstructions/000346.htm...
Scraping https://rarediseases.info.nih.gov/gard/12943/x-linked-hypophosphatemia...
Scraping https://www.nlm.nih.gov/medlineplus/druginfo/meds/a608018.html...
Scraping https://www.nlm.nih.gov/medlineplus/traumaticbraininjury.html...
Scraping https://www.nlm.nih.gov/medlineplus/ency/patientinstructions/000857.htm...
Scraping https://www.nlm.nih.gov/medlineplus/ency/patientinstructions/000373.htm...
Scraping https:

In [12]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


#Converting txt file to JSON, cause im too lazy to check what im writing in my code.

In [17]:
import json

# Input and output file paths
txt_file = "scraped_data.txt"
json_file = "scraped_data.json"

# List to store extracted data
data = []

# Read the text file and convert to JSON format
with open(txt_file, "r", encoding="utf-8") as file:
    lines = file.readlines()

current_entry = {}

for line in lines:
    line = line.strip()  # Remove leading/trailing spaces
    if line.startswith("URL: "):
        if current_entry:
            data.append(current_entry)  # Store previous entry before starting a new one
        current_entry = {"url": line.replace("URL: ", "")}  # Extract URL
    elif line.startswith("Content: "):
        current_entry["content"] = line.replace("Content: ", "")  # Extract content

# Add the last entry
if current_entry:
    data.append(current_entry)

# Write to JSON file
with open(json_file, "w", encoding="utf-8") as file:
    json.dump(data, file, indent=4, ensure_ascii=False)

print(f"Conversion completed. Data saved to {json_file}.")


Conversion completed. Data saved to scraped_data.json.


In [20]:
import json

json_file = "/content/drive/My Drive/scraped_data.json"  # Save directly to Google Drive

with open(json_file, "w", encoding="utf-8") as file:
    json.dump(data, file, indent=4, ensure_ascii=False)

print(f"JSON file saved to Google Drive: {json_file}")


JSON file saved to Google Drive: /content/drive/My Drive/scraped_data.json


In [18]:
import json

json_file = "scraped_data.json"

with open(json_file, "r", encoding="utf-8") as file:
    data = json.load(file)

# Pretty print JSON data
print(json.dumps(data[:5], indent=4, ensure_ascii=False))  # Shows first 5 entries


[
    {
        "url": "https://www.nlm.nih.gov/medlineplus/druginfo/meds/a682821.html",
        "content": "Thiotepa Injectionpronounced as (thye'' oh tep' a)To use the sharing features on this page, please enable JavaScript.Why is this medication prescribed?How should this medicine be used?Other uses for this medicineWhat special precautions should I follow?What special dietary instructions should I follow?What side effects can this medication cause?In case of emergency/overdoseWhat other information should I know?Brand namesWhy is this medication prescribed?Thiotepa is used to treat certain types of ovarian cancer (cancer that begins in the female reproductive organs where eggs are formed), breast, and bladder cancer. It is also used to treat malignant effusions (a condition when fluid collects in the lungs or around the heart) that are caused by cancerous tumors. Thiotepa is in a class of medications called alkylating agents. It works by slowing or stopping the growth of cancer cel

In [24]:
df = pd.DataFrame(data)

In [25]:
df.head()

Unnamed: 0,url,content
0,https://www.nlm.nih.gov/medlineplus/druginfo/m...,Thiotepa Injectionpronounced as (thye'' oh tep...
1,https://www.nlm.nih.gov/medlineplus/ency/artic...,Metastatic brain tumorTo use the sharing featu...
2,https://www.nlm.nih.gov/medlineplus/druginfo/m...,Hydrocodone Combination Productspronounced as ...
3,https://www.nlm.nih.gov/medlineplus/druginfo/m...,"Necitumumab Injectionpronounced as (ne"" si too..."
4,https://www.nlm.nih.gov/medlineplus/diets.html,DietsOn this pageBasicsSummaryStart HereLearn ...


In [30]:
import pandas as pd

# Load your DataFrame (assuming 'content' is the column name)
total_words = df['content'].str.split().str.len().sum()

print(f"Total number of words in 'content' column: {total_words}")


Total number of words in 'content' column: 1219515
