In [4]:
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from bs4 import BeautifulSoup
import time

In [6]:
def scrape_dynamic_text(url):
    options = Options()
    options.add_argument("--headless")  # Run in headless mode
    driver = webdriver.Chrome(options=options)

    driver.get(url)
    time.sleep(5)  # Give time for page to load JS content

    soup = BeautifulSoup(driver.page_source, 'html.parser')
    driver.quit()

    paragraphs = soup.find_all('p')
    text = ""
    for p in paragraphs:
        cleaned = p.get_text(strip=True)
        if len(cleaned) > 30:
            text += cleaned + "\n"

    return text

In [20]:
# Scrape
url = "https://thegraceandmoxielife.com/21-health-and-fitness-questions-you-probably-want-answers-to/"
content = scrape_dynamic_text(url)

# Save to file
with open("FAQs.txt", "w", encoding="utf-8") as f:
    f.write(content)

print("Done. Data saved to uae_health_data.txt")

Done. Data saved to uae_health_data.txt


Cleaning and Preprocessing

In [27]:
import re

def smart_preprocess(text):
    lines = text.splitlines()
    blocks = []
    buffer = []
    current_title = None
    is_question = False

    for line in lines:
        line = line.strip()
        if not line:
            continue  # skip empty lines

        match = re.match(r"^(\d{1,2}\.|-)\s*(.*)", line)
        if match:
            # Save previous block before starting a new one
            if current_title:
                blocks.append({
                    "type": "qna" if is_question else "info",
                    "title": current_title,
                    "content": " ".join(buffer).strip()
                })
                buffer = []

            current_title = match.group(2).strip()
            is_question = "?" in current_title
        else:
            buffer.append(line)

    # Save last block
    if current_title:
        blocks.append({
            "type": "qna" if is_question else "info",
            "title": current_title,
            "content": " ".join(buffer).strip()
        })

    return blocks

# === STEP 1: Load your raw file ===
input_path = "FAQs2.txt"   # Change this to your file name
output_path = "preprocessed_output2.txt"

with open(input_path, "r", encoding="utf-8") as f:
    raw_text = f.read()

# === STEP 2: Preprocess ===
processed_blocks = smart_preprocess(raw_text)

# === STEP 3: Save the clean output ===
with open(output_path, "w", encoding="utf-8") as f:
    for block in processed_blocks:
        f.write(f"[Type] {block['type'].upper()}\n")
        f.write(f"[Title] {block['title']}\n")
        f.write(f"[Content] {block['content']}\n")
        f.write("-" * 50 + "\n")

print("✅ Preprocessed file saved to:", output_path)


✅ Preprocessed file saved to: preprocessed_output2.txt
