In [None]:
# Install crawl4ai, upgrading it if an older version is already present (-U).
!pip install -U crawl4ai

In [None]:
# Run the crawl4ai-setup command once after installation.
!crawl4ai-setup

In [None]:
import os

In [None]:
# Smoke test: crawl a single disease page and emit it as markdown.
!crwl https://www.1mg.com/diseases/acidity-42 -o markdown #checking the crawler


In [None]:
import requests
from bs4 import BeautifulSoup

# Index page listing diseases; without a label it shows the "A" entries.
base_url = "https://www.1mg.com/all-diseases"

# The timeout keeps a stalled connection from hanging the kernel, and
# raise_for_status() stops an HTTP error page from being parsed as if it
# were the real index.
response = requests.get(base_url, timeout=30)
response.raise_for_status()
soup = BeautifulSoup(response.content, "html.parser")

# Every anchor that carries an href; disease pages are picked out below.
disease_links = soup.find_all("a", href=True)

# Collect the slug after "/diseases/" (e.g. "acidity-42"). First-seen order
# is kept, and repeated or empty slugs are skipped so that each disease page
# is queued for crawling only once.
disease_names = []
seen_disease_ids = set()
for link in disease_links:
    href = link['href']
    if '/diseases/' in href:
        disease_id = href.split("/diseases/")[-1]
        if disease_id and disease_id not in seen_disease_ids:
            seen_disease_ids.add(disease_id)
            disease_names.append(disease_id)


print(disease_names[:10])


['acidity-42', 'acne-261', 'addisons-disease-120', 'airplane-ear-970', 'allergic-conditions-10', 'alzheimers-disease-179', 'amenorrhea-467', 'anal-fissure-74', 'anaphylaxis-947', 'anorexia-nervosa-928']


In [None]:
# Display every disease ID collected so far.
disease_names

In [None]:
import string
# Index labels "b" through "z"; the unlabelled index page already covers "a".
label_list = [letter for letter in string.ascii_lowercase if letter != "a"]

In [None]:
# Walk the index page of every remaining letter and extend disease_names.
# IDs that are already present are skipped, so re-running this cell (or a link
# repeated on a page) does not queue the same disease twice.
seen_ids = set(disease_names)

for label in label_list:
    base_url = f"https://www.1mg.com/all-diseases?label={label}"  # index page for this letter

    try:
        # Timeout + status check: never hang on, or parse, a failed response.
        response = requests.get(base_url, timeout=30)
        response.raise_for_status()
    except requests.RequestException as exc:
        # One bad letter should not discard the IDs gathered for the others.
        print(f"Skipping label {label!r}: {exc}")
        continue

    soup = BeautifulSoup(response.content, "html.parser")
    disease_links = soup.find_all("a", href=True)

    for link in disease_links:
        href = link['href']
        if '/diseases/' in href:
            # Slug after "/diseases/", e.g. "acidity-42".
            disease_id = href.split("/diseases/")[-1]
            if disease_id and disease_id not in seen_ids:
                seen_ids.add(disease_id)
                disease_names.append(disease_id)

print(disease_names[:10])


In [None]:
# Display the disease IDs collected from all index pages.
disease_names

# CRAWL4AI

In [None]:
import os
import re
import json
from langchain.schema import Document
from subprocess import run

In [None]:
base_url = "https://www.1mg.com/diseases/"
output_dir = "markdown_output"

# Create the output directory (no error if it already exists).
os.makedirs(output_dir, exist_ok=True)

# Crawl every disease page and store its markdown as <output_dir>/<slug>.md.
for disease in disease_names:
    url = base_url + disease
    filename = os.path.join(output_dir, f"{disease}.md")

    print(f"Crawling full page for: {url}")
    try:
        # Pass an argument list instead of a shell string: the slug comes from
        # scraped hrefs, so characters such as "&", "?", ";" or spaces must
        # reach crwl verbatim rather than being interpreted by a shell.
        run(["crwl", "crawl", url, "-o", "markdown", "-O", filename], check=True)
    except Exception as e:
        # Best effort: report the failure and carry on with the next page.
        print(f"Failed to crawl {url}: {e}")


In [None]:
def extract_sections_from_markdown(md_text):
    """Pick the known disease sections out of a page's markdown.

    The text is split at level-2 (``##``) headings. A heading is assigned to
    the first wanted section whose name occurs (case-insensitively) anywhere
    in the heading text; headings that match nothing are dropped.

    Args:
        md_text: Markdown text of one page.

    Returns:
        dict mapping the wanted section name to the stripped text between its
        heading and the next ``##`` heading (or the end of the text). When
        several headings map to the same section, the last one wins.
    """
    wanted = (
        "Overview", "Key Facts", "Symptoms", "Causes", "Risk factors",
        "Diagnosis", "Prevention",
        "Treatment", "Home-care", "Complications",
    )

    # All level-2 headings, in document order.
    heading_matches = list(re.finditer(r'^##\s+(.*?)\s*$', md_text, re.MULTILINE))
    # Each section body stops where the next heading starts; the last one
    # runs to the end of the text.
    stops = [m.start() for m in heading_matches[1:]] + [len(md_text)]

    sections = {}
    for heading, stop in zip(heading_matches, stops):
        heading_text = heading.group(1).strip().lower()
        label = next((name for name in wanted if name.lower() in heading_text), None)
        if label is not None:
            sections[label] = md_text[heading.end():stop].strip()

    return sections

In [None]:
markdown_folder = "markdown_output"
json_output_folder = "structured_json"
os.makedirs(json_output_folder, exist_ok=True)

# Turn every crawled markdown page into a JSON file of {section name: raw text}.
# Sorting makes the processing (and log) order the same on every run.
for fname in sorted(os.listdir(markdown_folder)):
    if not fname.endswith(".md"):
        continue

    with open(os.path.join(markdown_folder, fname), "r", encoding="utf-8") as f:
        md_content = f.read()

    extracted_data = extract_sections_from_markdown(md_content)

    # splitext swaps only the trailing extension; str.replace would also
    # rewrite a ".md" that happens to occur earlier in the name.
    out_path = os.path.join(json_output_folder, os.path.splitext(fname)[0] + ".json")
    with open(out_path, "w", encoding="utf-8") as out_f:
        # ensure_ascii=False keeps non-ASCII text readable in the file, the
        # same way 1mgrag_documents.json is written later on.
        json.dump(extracted_data, out_f, ensure_ascii=False, indent=2)

    # Plain ASCII arrow: the previous arrow character was mis-encoded.
    print(f"Extracted: {fname} -> {out_path}")


## For RAG

In [None]:
def clean_text(text):
    """Flatten a markdown fragment into a single line of plain text.

    Args:
        text: Markdown text of one section.

    Returns:
        The text with links reduced to their label, list bullets, emphasis
        markers and ``#`` characters removed, line breaks replaced by a space
        and runs of spaces/tabs collapsed, stripped at both ends.
    """
    text = re.sub(r'\[([^\]]+)\]\([^)]+\)', r'\1', text)  # [label](url) -> label
    # Bullets go first: otherwise a leading "*" bullet is taken as the opening
    # emphasis marker and pairs up with the next "*" or "_" in the text.
    text = re.sub(r'^[ \t]*[-*][ \t]+', '', text, flags=re.MULTILINE)
    text = re.sub(r'[*_]{1,3}([^*_]+)[*_]{1,3}', r'\1', text)  # *x*, **x**, _x_ -> x
    # Join lines with a space; deleting the newline outright would glue the
    # last word of one line to the first word of the next.
    text = re.sub(r'\n+', ' ', text)
    text = re.sub(r'#', '', text)  # leftover heading markers
    text = re.sub(r'[ \t]+', ' ', text)  # collapse repeated spaces/tabs

    return text.strip()


In [None]:
# Question template per supported section; "{disease}" is a placeholder for
# the disease name. The keys double as the list of sections kept when the RAG
# documents are built.
instruction_map = dict([
    ("Overview", "What is {disease}?"),
    ("Key Facts", "What are some key facts about {disease}?"),
    ("Symptoms", "What are the symptoms of {disease}?"),
    ("Causes", "What causes {disease}?"),
    ("Risk factors", "What are the risk factors for {disease}?"),
    ("Diagnosis", "How is {disease} diagnosed?"),
    ("Treatment", "How is {disease} treated?"),
    ("Home-care", "What are the home remedies and care tips for {disease}?"),
    ("Complications", "What complications can arise from {disease}?"),
    ("Prevention", "How can {disease} be prevented?"),
])

def extract_disease_name(filename):
    """Derive a display name from a slug-style file name.

    The extension and a trailing ``-<digits>`` ID are dropped, the remaining
    hyphens become spaces and the result is title-cased, e.g.
    ``"addisons-disease-120.json"`` -> ``"Addisons Disease"``.
    """
    stem, _extension = os.path.splitext(filename)
    words = re.sub(r'-\d+$', '', stem).split('-')
    return ' '.join(words).title()

def load_rag_documents(json_dir):
    """Build one ``Document`` per recognised, non-empty section of every JSON file.

    Args:
        json_dir: Directory containing ``*.json`` files, each a mapping of
            section name to section text.

    Returns:
        list of ``Document`` objects. ``page_content`` is the cleaned section
        text; the metadata holds the ``disease`` name (derived from the file
        name) and the ``section`` name. Sections that are not keys of
        ``instruction_map``, or that are empty once cleaned, are skipped.
    """
    documents = []
    # os.listdir order is arbitrary; sorting keeps the corpus order
    # reproducible across runs and machines.
    for fname in sorted(os.listdir(json_dir)):
        if not fname.endswith(".json"):
            continue
        with open(os.path.join(json_dir, fname), "r", encoding="utf-8") as f:
            data = json.load(f)

        disease_name = extract_disease_name(fname)

        for section, content in data.items():
            if section not in instruction_map:
                continue
            # Check emptiness after cleaning: a section made only of markup
            # (e.g. stray "#" or bullets) would otherwise become a blank document.
            page_content = clean_text(content.strip())
            if not page_content:
                continue
            documents.append(
                Document(
                    page_content=page_content,
                    metadata={
                        "disease": disease_name,
                        "section": section
                    }
                )
            )
    return documents


In [None]:
# Build the RAG documents from the per-page JSON files in structured_json/.
documents = load_rag_documents("structured_json")

In [None]:
# Display the loaded documents.
documents

In [None]:
# Persist the documents as plain JSON: one {"page_content", "metadata"} dict each.
documents_data = []
for doc in documents:
    documents_data.append({"page_content": doc.page_content, "metadata": doc.metadata})

with open("1mgrag_documents.json", "w", encoding="utf-8") as json_file:
    json.dump(documents_data, json_file, ensure_ascii=False, indent=2)


## Medicine

In [None]:
import requests
from bs4 import BeautifulSoup

# Index page listing medicines; without a label it shows the "A" entries.
base_url = "https://www.1mg.com/drugs-all-medicines"

# The timeout keeps a stalled connection from hanging the kernel, and
# raise_for_status() stops an HTTP error page from being parsed as if it
# were the real index.
response = requests.get(base_url, timeout=30)
response.raise_for_status()
soup = BeautifulSoup(response.content, "html.parser")

# Every anchor that carries an href; drug pages are picked out below.
drug_links = soup.find_all("a", href=True)

# Collect the slug after "/drugs/" (e.g. "avastin-100mg-injection-135666").
# First-seen order is kept, and repeated or empty slugs are skipped so that
# each drug page is queued for crawling only once.
drug_names = []
seen_drug_ids = set()
for link in drug_links:
    href = link['href']
    if '/drugs/' in href:
        drug_id = href.split("/drugs/")[-1]
        if drug_id and drug_id not in seen_drug_ids:
            seen_drug_ids.add(drug_id)
            drug_names.append(drug_id)


print(drug_names[:10])


['avastin-100mg-injection-135666', 'actorise-40-injection-227647', 'actorise-25-injection-228329', 'actorise-60-injection-403206', 'azel-80-capsule-682932', 'avastin-400mg-injection-341311', 'azel-40mg-capsule-429353', 'actorise-200-injection-342162', 'actorise-100-injection-369472', 'azacytin-injection-341892']


In [None]:
import string
# Index labels "b" through "z"; the unlabelled index page already covers "a".
label_list = [letter for letter in string.ascii_lowercase if letter != "a"]

In [None]:
# Walk the medicine index page of every remaining letter and extend drug_names.
# IDs that are already present are skipped, so re-running this cell (or a link
# repeated on a page) does not queue the same drug twice.
seen_drug_ids = set(drug_names)

for label in label_list:

  # Medicine index page for this letter.
  base_url = f"https://www.1mg.com/drugs-all-medicines?label={label}"

  try:
      # Timeout + status check: never hang on, or parse, a failed response.
      response = requests.get(base_url, timeout=30)
      response.raise_for_status()
  except requests.RequestException as exc:
      # One bad letter should not discard the IDs gathered for the others.
      print(f"Skipping label {label!r}: {exc}")
      continue

  soup = BeautifulSoup(response.content, "html.parser")

  # Every anchor that carries an href; drug pages are picked out below.
  drug_links = soup.find_all("a", href=True)

  for link in drug_links:
      href = link['href']
      if '/drugs/' in href:
          # Slug after "/drugs/", e.g. "avastin-100mg-injection-135666".
          drug_id = href.split("/drugs/")[-1]
          if drug_id and drug_id not in seen_drug_ids:
              seen_drug_ids.add(drug_id)
              drug_names.append(drug_id)


print(drug_names[:10])


['avastin-100mg-injection-135666', 'actorise-40-injection-227647', 'actorise-25-injection-228329', 'actorise-60-injection-403206', 'azel-80-capsule-682932', 'avastin-400mg-injection-341311', 'azel-40mg-capsule-429353', 'actorise-200-injection-342162', 'actorise-100-injection-369472', 'azacytin-injection-341892']


In [None]:
# Number of drug IDs collected across all index pages.
len(drug_names)

780

In [None]:
import os
import re
import json
from langchain.schema import Document
from subprocess import run

In [None]:
base_url = "https://www.1mg.com/drugs/"
# NOTE(review): this is the same folder the disease pages were crawled into, so
# the cells below that read every *.md file in it also see the disease pages.
# A dedicated folder would keep the two datasets apart, but it has to be changed
# here and in both reader cells together.
output_dir = "markdown_output"

# Create the output directory (no error if it already exists).
os.makedirs(output_dir, exist_ok=True)

# Crawl every drug page and store its markdown as <output_dir>/<slug>.md.
for drug in drug_names:
    url = base_url + drug
    filename = os.path.join(output_dir, f"{drug}.md")

    print(f"Crawling full page for: {url}")
    try:
        # Pass an argument list instead of a shell string: the slug comes from
        # scraped hrefs, so characters such as "&", "?", ";" or spaces must
        # reach crwl verbatim rather than being interpreted by a shell.
        run(["crwl", "crawl", url, "-o", "markdown", "-O", filename], check=True)
    except Exception as e:
        # Best effort: report the failure and carry on with the next page.
        print(f"Failed to crawl {url}: {e}")


In [None]:
import os
import re
import json


In [None]:
def extract_drug_sections(md_text):
    """Pick the known drug sections out of a page's markdown.

    The text is split at level-2 (``##``) headings. A heading is assigned to
    the first entry of ``section_titles`` that occurs in the heading text
    (case-insensitively) starting at a word boundary; other headings are
    ignored.

    Args:
        md_text: Markdown text of one page.

    Returns:
        dict mapping the section title to the stripped text between its
        heading and the next ``##`` heading (or the end of the text). When
        several headings map to the same section, the last one wins.
    """
    section_titles = [
    "Product introduction",
    "Uses",
    "How to use",
    "How drug works",
    "Side effects",
    "Safety advice",
    "Interaction with drugs",
    "FAQs"
    ]
    # Anchor each title at a word boundary: a plain substring test lets "Uses"
    # match inside a heading such as "Causes of ...".
    title_patterns = [
        (section, re.compile(r'\b' + re.escape(section.lower())))
        for section in section_titles
    ]

    extracted = {}
    headers = list(re.finditer(r'^##\s+(.*?)\s*$', md_text, re.MULTILINE))

    for i, match in enumerate(headers):
        clean_title = match.group(1).strip().lower()
        start = match.end()
        # The section body runs up to the next heading, or to the end of the text.
        end = headers[i + 1].start() if i + 1 < len(headers) else len(md_text)
        content = md_text[start:end].strip()

        for section, pattern in title_patterns:
            if pattern.search(clean_title):
                extracted[section] = content
                break

    return extracted

In [None]:
def clean_text(text):
    """Flatten a markdown fragment into a single line of plain text.

    Args:
        text: Markdown text of one section.

    Returns:
        The text with links reduced to their label, list bullets, emphasis
        markers and ``#`` characters removed, line breaks replaced by a space
        and runs of spaces/tabs collapsed, stripped at both ends.
    """
    text = re.sub(r'\[([^\]]+)\]\([^)]+\)', r'\1', text)  # [label](url) -> label
    # Bullets go first (indented ones included): otherwise a leading "*"
    # bullet is taken as the opening emphasis marker and pairs up with the
    # next "*" or "_" in the text.
    text = re.sub(r'^[ \t]*[-*][ \t]+', '', text, flags=re.MULTILINE)
    text = re.sub(r'[*_]{1,3}([^*_]+)[*_]{1,3}', r'\1', text)  # *x*, **x**, _x_ -> x
    text = re.sub(r'\n+', ' ', text)  # join lines with a single space
    text = re.sub(r'#', '', text)  # leftover heading markers
    text = re.sub(r'[ \t]+', ' ', text)  # collapse repeated spaces/tabs
    return text.strip()


In [None]:
# NOTE(review): this folder also receives the disease pages crawled earlier in
# the notebook, so they are processed here too - confirm that is intended.
markdown_folder = "markdown_output"
json_output_folder = "drug_json"
os.makedirs(json_output_folder, exist_ok=True)

# Turn every crawled markdown page into a JSON file of {section: cleaned text}.
# Sorting makes the processing (and log) order the same on every run.
for fname in sorted(os.listdir(markdown_folder)):
    if not fname.endswith(".md"):
        continue

    with open(os.path.join(markdown_folder, fname), "r", encoding="utf-8") as f:
        md_content = f.read()

    extracted = extract_drug_sections(md_content)
    cleaned = {k: clean_text(v) for k, v in extracted.items()}

    # splitext swaps only the trailing extension; str.replace would also
    # rewrite a ".md" that happens to occur earlier in the name.
    out_path = os.path.join(json_output_folder, os.path.splitext(fname)[0] + ".json")
    with open(out_path, "w", encoding="utf-8") as out_f:
        # ensure_ascii=False keeps non-ASCII text readable in the file, the
        # same way 1mgrag_documents.json is written.
        json.dump(cleaned, out_f, ensure_ascii=False, indent=2)

    # Plain ASCII arrow: the previous arrow character was mis-encoded.
    print(f"Extracted: {fname} -> {out_path}")


In [None]:
def extract_drug_sections(md_text):
    """List the known drug sections found in a page's markdown.

    The text is split at level-2 (``##``) headings. A heading is assigned to
    the first entry of ``section_titles`` that occurs in the heading text
    (case-insensitively) starting at a word boundary; other headings are
    ignored.

    Args:
        md_text: Markdown text of one page.

    Returns:
        list of ``(section title, content)`` tuples in document order, where
        content is the stripped text between the heading and the next ``##``
        heading (or the end of the text). A section title appears once per
        matching heading, so it can occur more than once.
    """
    section_titles = [
        "Product introduction", "Uses", "How to use", "How drug works",
        "Side effects", "Safety advice", "Interaction with drugs", "FAQs"
    ]
    # Anchor each title at a word boundary: a plain substring test lets "Uses"
    # match inside a heading such as "Causes of ...".
    title_patterns = [
        (section, re.compile(r'\b' + re.escape(section.lower())))
        for section in section_titles
    ]

    extracted = []
    headers = list(re.finditer(r'^##\s+(.*?)\s*$', md_text, re.MULTILINE))

    for i, match in enumerate(headers):
        title_clean = match.group(1).strip().lower()
        start = match.end()
        # The section body runs up to the next heading, or to the end of the text.
        end = headers[i + 1].start() if i + 1 < len(headers) else len(md_text)
        content = md_text[start:end].strip()

        for section, pattern in title_patterns:
            if pattern.search(title_clean):
                extracted.append((section, content))
                break

    return extracted


def clean_text(text):
    """Flatten a markdown fragment into a single line of plain text.

    Args:
        text: Markdown text of one section.

    Returns:
        The text with links reduced to their label, list bullets, emphasis
        markers and ``#`` characters removed, line breaks replaced by a space
        and runs of spaces/tabs collapsed, stripped at both ends.
    """
    text = re.sub(r'\[([^\]]+)\]\([^)]+\)', r'\1', text)  # [label](url) -> label
    # Bullets go first (indented ones included): otherwise a leading "*"
    # bullet is taken as the opening emphasis marker and pairs up with the
    # next "*" or "_" in the text.
    text = re.sub(r'^[ \t]*[-*][ \t]+', '', text, flags=re.MULTILINE)
    text = re.sub(r'[*_]{1,3}([^*_]+)[*_]{1,3}', r'\1', text)  # *x*, **x**, _x_ -> x
    text = re.sub(r'\n+', ' ', text)  # join lines with a single space
    text = re.sub(r'#', '', text)  # leftover heading markers
    text = re.sub(r'[ \t]+', ' ', text)  # collapse repeated spaces/tabs
    return text.strip()


In [None]:
# NOTE(review): this folder also receives the disease pages crawled earlier in
# the notebook, so they are scanned here too - confirm that is intended.
markdown_folder = "markdown_output"
output_path = "drug_rag_documents.json"

rag_entries = []

# Sorting makes the order of the entries the same on every run.
for fname in sorted(os.listdir(markdown_folder)):
    if not fname.endswith(".md"):
        continue

    with open(os.path.join(markdown_folder, fname), "r", encoding="utf-8") as f:
        md_content = f.read()

    # "avastin-100mg-injection-135666.md" -> "Avastin 100Mg Injection": drop the
    # extension and the trailing numeric ID (as is done for disease names), so
    # the ID does not end up inside the drug name.
    slug = re.sub(r'-\d+$', '', os.path.splitext(fname)[0])
    drug_name = slug.replace("-", " ").title()
    sections = extract_drug_sections(md_content)

    for section_title, raw_content in sections:
        cleaned_content = clean_text(raw_content)
        if not cleaned_content:
            # A heading with no usable text would only add a blank entry.
            continue
        rag_entries.append({
            "page_content": cleaned_content,
            "metadata": {
                "drug": drug_name,
                "section": section_title
            }
        })

# Save everything as a single JSON file. ensure_ascii=False keeps non-ASCII
# text readable, the same way 1mgrag_documents.json is written.
with open(output_path, "w", encoding="utf-8") as out_f:
    json.dump(rag_entries, out_f, ensure_ascii=False, indent=2)

# Plain ASCII arrow: the previous arrow character was mis-encoded.
print(f"Generated {len(rag_entries)} drug RAG entries -> {output_path}")


Generated 4951 drug RAG entries ΓåÆ drug_rag_documents.json


In [None]:
output_file = "combined_rag_documents.jsonl"
file1 = "1mgrag_documents.json"
file2 = "drug_rag_documents.json"

# Concatenate both JSON arrays into one JSON Lines file: one entry per line,
# file1's entries first, then file2's.
with open(output_file, 'w', encoding='utf-8') as outfile:
    for source_path in (file1, file2):
        with open(source_path, 'r', encoding='utf-8') as infile:
            for entry in json.load(infile):
                outfile.write(json.dumps(entry, ensure_ascii=False) + '\n')

print(f"Combined {file1} and {file2} into {output_file}")


Combined 1mgrag_documents.json and drug_rag_documents.json into combined_rag_documents.jsonl
