**Import Key Libraries**

In [5]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time
from bs4 import BeautifulSoup
import lxml
import pandas as pd
import re
from datetime import datetime
from selenium.webdriver.common.keys import Keys 

### Step 1: Collect the FAQ page link for every specialty

In [14]:
# Load the FAQ landing page in a real browser (the links are rendered by JS),
# grab the HTML, and always close the browser -- even if loading fails --
# so no orphaned Chrome process is left behind.
driver = webdriver.Chrome()  # Or webdriver.Firefox()
try:
    driver.get('https://www.maxhealthcare.in/frequently-asked-questions')
    time.sleep(3)  # Wait for JS to load
    page_html = driver.page_source
finally:
    driver.quit()

soup = BeautifulSoup(page_html, 'html.parser')
# Prefer the dedicated FAQ section; fall back to the whole page if the
# section is not present in the markup.
faq_section = soup.find('section', class_='faq-list-section')
if faq_section is None:
    faq_section = soup

# Collect (specialty name, absolute FAQ url) pairs.
specialty_links = []
for a in faq_section.select('a[href*="/our-specialities/"][href*="/faqs"]'):
    href = a['href']
    if href.startswith('/'):
        # Site-relative link: make it absolute.
        href = 'https://www.maxhealthcare.in' + href
    specialty = a.get_text(strip=True)
    if specialty and href:
        specialty_links.append((specialty, href))

# De-duplicate while keeping first-seen order. (A plain set() would reorder
# the links differently on every kernel run because of string hash
# randomisation, making the scrape order non-reproducible.)
specialty_links = list(dict.fromkeys(specialty_links))
for specialty, link in specialty_links:
    print(f"{specialty}: {link}")


Physiotherapy: https://www.maxhealthcare.in/our-specialities/physiotherapy/faqs
Aesthetics and Reconstructive: https://www.maxhealthcare.in/our-specialities/aesthetics-and-reconstructive/faqs
Pulmonology: https://www.maxhealthcare.in/our-specialities/pulmonology/faqs
Health and Wellness: https://www.maxhealthcare.in/our-specialities/health-and-wellness/faqs
Podiatry: https://www.maxhealthcare.in/our-specialities/podiatry/faqs
Cancer: https://www.maxhealthcare.in/our-specialities/cancer/faqs
Obstetrics And Gynaecology: https://www.maxhealthcare.in/our-specialities/obstetrics-and-gynaecology/faqs
Minimal Access / laparoscopic Surgery: https://www.maxhealthcare.in/our-specialities/minimal-access-laparoscopic-surgery/faqs
Internal Medicine: https://www.maxhealthcare.in/our-specialities/internal-medicine/faqs
Mental Health and Behavioral: https://www.maxhealthcare.in/our-specialities/mental-health-and-behavioral/faqs
Nephrology: https://www.maxhealthcare.in/our-specialities/nephrology/faqs


### Step 2: Scrape the FAQ question-answer pairs for each specialty

In [16]:
import pandas as pd
import time
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from datetime import datetime


def _element_text(element):
    """Return the element's visible text, falling back to its raw textContent.

    Selenium's ``.text`` only reports text that is rendered on screen, so the
    content of a collapsed/hidden panel comes back empty; the DOM
    ``textContent`` property still carries it. The fallback is only used when
    ``.text`` is empty, so already-working pages are unaffected.
    """
    visible = element.text.strip()
    if visible:
        return visible
    return (element.get_attribute('textContent') or '').strip()


session_start_time = datetime.now().time()
print(f"Session Start Time: {session_start_time} -----------------------------> ")

driver = webdriver.Chrome()
faq_rows = []                 # one [specialty, link, question, answer] per FAQ
failed_specialty_links = []   # pages that errored or had no FAQ cards
success_count = 0
fail_count = 0

try:
    for specialty, link in specialty_links:
        try:
            driver.get(link)
            WebDriverWait(driver, 15).until(lambda d: d.execute_script('return document.readyState') == 'complete')
            time.sleep(2)  # extra settle time for content injected after load

            faq_blocks = driver.find_elements(By.CSS_SELECTOR, 'div.card')
            if not faq_blocks:
                print(f"No FAQ blocks found for: {specialty}")
                failed_specialty_links.append(link)
                # Keep the counter in step with failed_specialty_links so the
                # summary below does not under-report failures.
                fail_count += 1
                continue

            skipped_blocks = 0
            for block in faq_blocks:
                try:
                    # Question lives in the card's <button>.
                    question = _element_text(block.find_element(By.TAG_NAME, 'button'))

                    # Answer lives in <div class="site-content ...">;
                    # normalise runs of whitespace to single spaces.
                    answer = _element_text(block.find_element(By.CSS_SELECTOR, 'div.site-content'))
                    answer = ' '.join(answer.split())

                    faq_rows.append([
                        specialty, link, question, answer
                    ])
                except Exception:
                    # Best effort: a card without the expected markup is
                    # skipped, but counted so the loss is visible.
                    skipped_blocks += 1

            if skipped_blocks:
                print(f"Skipped {skipped_blocks} card(s) without a question/answer on: {link}")

            success_count += 1
            print(f"URL {success_count} completed: {link}")
        except Exception as e:
            print(f"Failed for {link}: {e}")
            failed_specialty_links.append(link)
            fail_count += 1
            print(f"Failed URL Count {fail_count}")
finally:
    # Always release the browser, even if the loop is interrupted.
    driver.quit()

df_faq = pd.DataFrame(faq_rows, columns=['specialty', 'link', 'question', 'answer'])
df_faq.to_csv('maxhealthcare_faqs.csv', index=False)

df_failed = pd.DataFrame(failed_specialty_links, columns=['failed_link'])
df_failed.to_csv('failed_specialty_links.csv', index=False)

print("Total specialty pages processed: ", len(specialty_links))
print("Successful pages: ", success_count)
print("Failed pages: ", fail_count)
print("Total Q&A Pairs: ", len(df_faq))

session_end_time = datetime.now().time()
print(f"Session End Time: {session_end_time} -----------------------------> ")


Session Start Time: 21:39:57.908678 -----------------------------> 
URL 1 completed: https://www.maxhealthcare.in/our-specialities/physiotherapy/faqs
URL 2 completed: https://www.maxhealthcare.in/our-specialities/aesthetics-and-reconstructive/faqs
URL 3 completed: https://www.maxhealthcare.in/our-specialities/pulmonology/faqs
URL 4 completed: https://www.maxhealthcare.in/our-specialities/health-and-wellness/faqs
URL 5 completed: https://www.maxhealthcare.in/our-specialities/podiatry/faqs
URL 6 completed: https://www.maxhealthcare.in/our-specialities/cancer/faqs
URL 7 completed: https://www.maxhealthcare.in/our-specialities/obstetrics-and-gynaecology/faqs
URL 8 completed: https://www.maxhealthcare.in/our-specialities/minimal-access-laparoscopic-surgery/faqs
URL 9 completed: https://www.maxhealthcare.in/our-specialities/internal-medicine/faqs
URL 10 completed: https://www.maxhealthcare.in/our-specialities/mental-health-and-behavioral/faqs
URL 11 completed: https://www.maxhealthcare.in/ou

In [41]:
# Reload the scraped FAQ table from disk so the cleaning steps below can be
# re-run without repeating the scrape; the bare name renders the frame.
df = pd.read_csv(filepath_or_buffer="maxhealthcare_faqs.csv")
df

Unnamed: 0,specialty,link,question,answer
0,Physiotherapy,https://www.maxhealthcare.in/our-specialities/...,What can I expect at my first appointment?,Physiotherapy is a rehabilitation program that...
1,Physiotherapy,https://www.maxhealthcare.in/our-specialities/...,What can a physical therapist do?,A physical therapist is a highly-trained medic...
2,Physiotherapy,https://www.maxhealthcare.in/our-specialities/...,What are the treatment options available?,"Depending on your condition, there are several..."
3,Physiotherapy,https://www.maxhealthcare.in/our-specialities/...,Which physiotherapy technique will you use?,"As is with any course of treatment, the type o..."
4,Physiotherapy,https://www.maxhealthcare.in/our-specialities/...,What should I do for nerve pain?,Nerve pain is due to a pinched nerve or damage...
...,...,...,...,...
396,Bone Marrow Transplant,https://www.maxhealthcare.in/our-specialities/...,Will I ever be able to lead a normal life?,"Yes, patient can return to his normal routine ..."
397,Bone Marrow Transplant,https://www.maxhealthcare.in/our-specialities/...,Are there any complications?,You can experience: • Graft versus host diseas...
398,Bone Marrow Transplant,https://www.maxhealthcare.in/our-specialities/...,Do I require continuous follow-ups?,It depends upon the disease and type of transp...
399,Bone Marrow Transplant,https://www.maxhealthcare.in/our-specialities/...,What if my patient does not have a matched sib...,If the patient does not have a matched sibling...


In [42]:
import re
import html

# Unicode punctuation -> ASCII, applied in a single pass by str.translate.
# U+F0D8 is a private-use glyph the site uses as a list bullet; it becomes a
# line break followed by "- " so each bullet starts on its own line.
_ASCII_PUNCTUATION = str.maketrans({
    '\u2018': "'", '\u2019': "'",   # curly apostrophes / single quotes
    '\u201c': '"', '\u201d': '"',   # curly double quotes
    '\u2013': '-', '\u2014': '-',   # en dash, em dash
    '\u2026': '...',                # ellipsis
    '\u2022': '-',                  # inline bullet
    '\uf0d8': '\n- ',               # private-use list bullet
})


def preprocess(raw_answer):
    """Clean one scraped FAQ answer into plain ASCII-punctuated text.

    Steps, in order:
      1. decode HTML entities;
      2. collapse every run of whitespace to a single space;
      3. map curly quotes, dashes, ellipsis and bullets to ASCII, turning the
         private-use bullet (U+F0D8) into a new line starting with "- ";
      4. tidy spaces around those inserted line breaks and strip the ends;
      5. drop any leading colons/quotes left at the very start.

    Args:
        raw_answer: The raw answer text. ``None`` or a float NaN (a missing
            value) is treated as an empty answer.

    Returns:
        The cleaned string. The only newlines in the result are the ones
        introduced for U+F0D8 bullets.
    """
    if raw_answer is None or (isinstance(raw_answer, float) and raw_answer != raw_answer):
        return ""

    text = html.unescape(str(raw_answer))
    # Collapse whitespace *before* inserting bullet line breaks. Doing it
    # afterwards (as `\s+` -> ' ') would erase the newlines again.
    text = re.sub(r'\s+', ' ', text)
    text = text.translate(_ASCII_PUNCTUATION)
    # Clean up around the inserted breaks: squeeze horizontal whitespace only,
    # then remove the single space that may sit on either side of a newline.
    text = re.sub(r'[^\S\n]+', ' ', text)
    text = re.sub(r' ?\n ?', '\n', text).strip()
    # Remove stray leading colons/quotes (anchored at the start of the text).
    text = re.sub(r"^[:'\"]+\s*", "", text)
    return text


# Empty answers come back from the CSV as NaN; blank them first so that
# astype(str) does not turn them into the literal text "nan".
df["answer"] = df["answer"].fillna("").astype(str).apply(preprocess)


In [51]:
# Write the cleaned FAQ table out, without the pandas row index.
df.to_csv(path_or_buf="final.csv", index=False)

#### Static data for part 3: IITGN Medical Center doctor schedule

In [52]:
# Hand-maintained schedule for the IITGN Medical Center, one row per
# doctor / working-days slot. An empty `break` field means no break is listed.
csv_data = """name,department,doctor,designation,working_days,working_hours,break,appointment_booking,contact_number,address,website,emergency_number
IITGN Medical Center,General,Dr. Deepa Shah,Medical Consultant,Mon-Fri,13:30-16:30,,https://iitgn.ac.in/medicalcenter/book-appointment,079-2395-1116,"IIT Gandhinagar, Palaj, Gandhinagar, GJ 382355",https://iitgn.ac.in/medicalcenter,+91-70-69-79-5000
IITGN Medical Center,General,Dr. Bhavesh Panchal,Medical Consultant,Mon-Fri,16:30-18:30,,https://iitgn.ac.in/medicalcenter/book-appointment,079-2395-1116,"IIT Gandhinagar, Palaj, Gandhinagar, GJ 382355",https://iitgn.ac.in/medicalcenter,+91-70-69-79-5000
IITGN Medical Center,General,Dr. Bhavesh Panchal,Medical Consultant,Sat,13:30-15:30,,https://iitgn.ac.in/medicalcenter/book-appointment,079-2395-1116,"IIT Gandhinagar, Palaj, Gandhinagar, GJ 382355",https://iitgn.ac.in/medicalcenter,+91-70-69-79-5000
IITGN Medical Center,General,Dr. Vinita Bhoja Shetty,Medical Officer,Mon-Fri,09:00-17:30,13:00-13:30,https://iitgn.ac.in/medicalcenter/book-appointment,079-2395-1116,"IIT Gandhinagar, Palaj, Gandhinagar, GJ 382355",https://iitgn.ac.in/medicalcenter,+91-70-69-79-5000
IITGN Medical Center,Physiotherapy,Dr. Himani Patel,Visiting Physio Consultant,Tue-Thu,09:30-12:30,,https://iitgn.ac.in/medicalcenter/book-appointment,079-2395-1116,"IIT Gandhinagar, Palaj, Gandhinagar, GJ 382355",https://iitgn.ac.in/medicalcenter,+91-70-69-79-5000
IITGN Medical Center,Physiotherapy,Dr. Himani Patel,Visiting Physio Consultant,Fri-Sat,16:00-19:00,,https://iitgn.ac.in/medicalcenter/book-appointment,079-2395-1116,"IIT Gandhinagar, Palaj, Gandhinagar, GJ 382355",https://iitgn.ac.in/medicalcenter,+91-70-69-79-5000
IITGN Medical Center,Physiotherapy,Dr. Darshan Patel,Visiting Physio Consultant,Sat,16:00-18:00,,https://iitgn.ac.in/medicalcenter/book-appointment,079-2395-1116,"IIT Gandhinagar, Palaj, Gandhinagar, GJ 382355",https://iitgn.ac.in/medicalcenter,+91-70-69-79-5000
IITGN Medical Center,General,Dr. Navdeep Tiwari,Resident Medical Doctor,Mon-Fri,17:00-21:00,,https://iitgn.ac.in/medicalcenter/book-appointment,079-2395-1116,"IIT Gandhinagar, Palaj, Gandhinagar, GJ 382355",https://iitgn.ac.in/medicalcenter,+91-70-69-79-5000
IITGN Medical Center,General,Dr. Navdeep Tiwari,Resident Medical Doctor,Sun,10:00-13:00,,https://iitgn.ac.in/medicalcenter/book-appointment,079-2395-1116,"IIT Gandhinagar, Palaj, Gandhinagar, GJ 382355",https://iitgn.ac.in/medicalcenter,+91-70-69-79-5000
IITGN Medical Center,Psychiatry,Dr. Harshil Shah,Consultant Psychiatrist,Wed,09:30-11:30,,https://iitgn.ac.in/medicalcenter/book-appointment,079-2395-1116,"IIT Gandhinagar, Palaj, Gandhinagar, GJ 382355",https://iitgn.ac.in/medicalcenter,+91-70-69-79-5000
IITGN Medical Center,Gynecology,Dr. Jainisha Patel,Gynecologist,Wed & Fri,15:00-16:30,,https://iitgn.ac.in/medicalcenter/book-appointment,079-2395-1116,"IIT Gandhinagar, Palaj, Gandhinagar, GJ 382355",https://iitgn.ac.in/medicalcenter,+91-70-69-79-5000
IITGN Medical Center,Pediatrics,Dr. Maulik Kapadia,Pediatrician,Tue & Thu,15:00-16:30,,https://iitgn.ac.in/medicalcenter/book-appointment,079-2395-1116,"IIT Gandhinagar, Palaj, Gandhinagar, GJ 382355",https://iitgn.ac.in/medicalcenter,+91-70-69-79-5000
"""

# Dump the literal verbatim to disk as UTF-8 text.
with open("iitgn_medicalcenter.csv", mode="w", encoding="utf-8") as f:
    f.write(csv_data)
