**Installation of Pre-Requisite Libraries**

In [None]:
!pip install pandas requests beautifulsoup4 scikit-learn numpy lxml

**Scraping Using BeautifulSoup**

Here we scrape the Mayo Clinic diseases index, one alphabet letter at a time, and save the results to a CSV file.

This script iterates through all letters A-Z and scrapes disease names and their URLs from the Mayo Clinic website. For each letter, it fetches the corresponding diseases page, extracts the disease links, and writes the results (a letter, disease name, and URL) to a CSV file.

In [None]:
import requests
from bs4 import BeautifulSoup
from csv import writer
import string


from urllib.parse import urljoin

# Abort a stalled request instead of letting it hang the notebook forever.
REQUEST_TIMEOUT_SECONDS = 30

# Scrape the Mayo Clinic A-Z disease index and write one CSV row per disease:
# the index letter, the disease name and the link to the disease page.
with open('new_mayo_clinic_diseases.csv', 'w', newline='', encoding='utf-8') as f:
    csv_writer = writer(f)
    csv_writer.writerow(['Letter', 'Disease Name','URL'])

    for letter in string.ascii_uppercase:
        url = f"https://www.mayoclinic.org/diseases-conditions/index?letter={letter}"
        # Pages already fetched for this letter. Together with advancing `url`
        # below, this guarantees the pagination loop terminates even if a
        # "Next page" link points back to a page we have already seen.
        visited_urls = set()

        while url and url not in visited_urls:
            visited_urls.add(url)
            page = requests.get(url, timeout=REQUEST_TIMEOUT_SECONDS)
            soup = BeautifulSoup(page.content, 'html.parser')
            disease_links = soup.find_all('a', class_='cmp-anchor--plain cmp-button cmp-button__link cmp-result-name__link')

            # No results (or an error page): nothing more to collect for this letter.
            if not disease_links:
                break

            for link in disease_links:
                href = link.get('href')
                # Resolve relative links against the current page so that the
                # stored URL can be requested directly later on; absolute links
                # pass through. A missing href is written as an empty cell.
                url1 = urljoin(url, href) if href else ''
                disease_name = link.text.strip()
                csv_writer.writerow([letter, disease_name,url1])
                print(f"Letter {letter} - Added: {disease_name} and URL")

            # Move on to the next results page, if the current page links to one.
            next_page = soup.find('a', {'aria-label': 'Next page'})
            next_href = next_page.get('href') if next_page else None
            url = urljoin(url, next_href) if next_href else None

print("Scraping completed successfully!")


**Getting the Symptoms**

Scraping the symptoms of each disease listed in the CSV file, using the disease's associated URL, and saving the result to a new CSV file.

In [8]:
import pandas as pd
import requests
from bs4 import BeautifulSoup

# Load the disease index (columns: Letter, Disease Name, URL) that the
# scraping cell wrote to disk.
df = pd.read_csv('new_mayo_clinic_diseases.csv')

def scrape_symptoms(url, timeout=30):
    """Return the text of the first list following a page's "Symptoms" heading.

    Args:
        url: Address of the disease page. Anything that is not a non-empty
            string (e.g. the NaN pandas produces for a blank CSV cell) is
            treated as "no page".
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The list's text with items separated by single spaces, or "" when the
        URL is unusable, the request fails, or the page has no "Symptoms"
        heading followed by a list.
    """
    if not isinstance(url, str) or not url.strip():
        return ""

    try:
        response = requests.get(url, timeout=timeout)
        # Treat HTTP error pages as failures instead of parsing them.
        response.raise_for_status()
    except requests.RequestException as exc:
        # Best effort: report the failure and keep the rest of the scrape going.
        print(f"Could not fetch {url}: {exc}")
        return ""

    soup = BeautifulSoup(response.text, 'html.parser')
    symptoms_header = soup.find('h2', string='Symptoms')
    if symptoms_header is None:
        return ""

    # find_next returns None when no <ul> follows the heading.
    symptoms_list = symptoms_header.find_next('ul')
    if symptoms_list is None:
        return ""

    return symptoms_list.get_text(separator=' ', strip=True)


# Call scrape_symptoms once per row (it issues one HTTP request per URL) and
# store the returned text in a new 'Symptoms' column.
df['Symptoms'] = df['URL'].apply(scrape_symptoms)
# index=False keeps the DataFrame's row index out of the output file.
df.to_csv('diseases_with_symptoms.csv', index=False)

**Adding Rarity**

Adding a rarity flag: a disease whose overview describes it as rare or uncommon is marked as rare, so that it can be given a lower probability of occurring.

In [9]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import re

# Start again from the plain disease index (columns: Letter, Disease Name, URL)
# written by the first scraping cell.
df = pd.read_csv('new_mayo_clinic_diseases.csv')

def scrape_symptoms_and_rarity(url, timeout=30):
    """Scrape a disease page for its symptom list and a rarity flag.

    The symptoms are the items of the first list following the "Symptoms"
    heading. The rarity flag is 1 when any paragraph between the "Overview"
    and "Symptoms" headings contains the word "rare", "uncommon" or "rarely".

    Args:
        url: Address of the disease page. Anything that is not a non-empty
            string (e.g. the NaN pandas produces for a blank CSV cell) is
            treated as "no page".
        timeout: Seconds to wait for the server before giving up.

    Returns:
        A ``(symptoms, is_rare)`` tuple: a list of symptom strings and 0 or 1.
        ``([], 0)`` is returned when the URL is unusable or the request fails.
    """
    if not isinstance(url, str) or not url.strip():
        return [], 0

    try:
        response = requests.get(url, timeout=timeout)
        # Treat HTTP error pages as failures instead of parsing them.
        response.raise_for_status()
    except requests.RequestException as exc:
        # Best effort: report the failure and keep the rest of the scrape going.
        print(f"Could not fetch {url}: {exc}")
        return [], 0

    soup = BeautifulSoup(response.text, 'html.parser')
    symptoms_header = soup.find('h2', string='Symptoms')
    overview_header = soup.find('h2', string='Overview')

    symptoms = []
    if symptoms_header is not None:
        # find_next returns None when no <ul> follows the heading.
        symptoms_list = symptoms_header.find_next('ul')
        if symptoms_list is not None:
            symptoms = [li.get_text(strip=True) for li in symptoms_list.find_all('li')]

    # Default to "not rare". Without this default, pages that do not mention
    # rarity raised UnboundLocalError at the return statement, and the scraped
    # symptoms were lost.
    is_rare = 0
    if overview_header is not None and symptoms_header is not None:
        paragraphs = []
        current = overview_header.find_next()
        # Walk the document from the Overview heading up to the Symptoms
        # heading itself (identity check, not structural equality).
        while current is not None and current is not symptoms_header:
            if getattr(current, 'name', None) == 'p':
                paragraphs.append(current.get_text())
            current = current.find_next()
        content = ' '.join(paragraphs)
        if re.search(r'\b(rare|uncommon|rarely)\b', content.lower()):
            is_rare = 1

    return symptoms, is_rare

# scrape_symptoms_and_rarity returns a (symptoms, is_rare) pair for each URL;
# wrapping the pair in pd.Series makes apply() expand it into two columns,
# which are stored as 'Symptoms' and 'IsRare'.
df[['Symptoms', 'IsRare']] = df['URL'].apply(
    lambda x: pd.Series(scrape_symptoms_and_rarity(x))
)
# index=False keeps the DataFrame's row index out of the output file.
df.to_csv('prefinal_diseases_with_symptoms_enhanced.csv', index=False)
