In [None]:

import re
import time
import undetected_chromedriver as uc
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException, NoSuchElementException

#  CONFIG
# Text file that every scraped story is appended to.
OUTPUT_FILE = "dataset_clean.txt"

# First listing page of the category, and the URL template for listing
# pages 2..N ("{page}" is filled in with the page number).
CATEGORY_BASE_URL = "https://www.urdupoint.com/kids/category/moral-stories.html"
PAGE_URL_TEMPLATE = "https://www.urdupoint.com/kids/category/moral-stories-page{page}.html"

# Anchors whose href matches this pattern are treated as story detail pages.
STORY_LINK_PATTERN = re.compile(
    r"https://www\.urdupoint\.com/kids/detail/moral-stories/[^\"']+\.html"
)

# Upper bound on the number of listing pages to crawl. A falsy value
# (None or 0) disables the cap, so the page count detected from the
# "Last Page" link is used as-is. Set e.g. to 5 to collect only the
# first 5 pages (≈60 stories).
MAX_PAGES = 67

#  TEXT PROCESSING 
def normalize_urdu(text):
    """Strip non-Urdu characters from *text* and tidy its whitespace.

    The substitutions run in order:

    1. Latin letters and ASCII digits are removed.
    2. Every character that is not in the U+0600-U+06FF block, not
       whitespace and not ``!`` is removed.
    3. Runs of spaces/tabs collapse to a single space.
    4. Three or more consecutive newlines collapse to two.

    The result is returned with leading/trailing whitespace stripped.
    """
    substitutions = (
        (r'[A-Za-z0-9]+', ''),
        (r'[^\u0600-\u06FF\s۔،؟!\n]', ''),
        (r'[ \t]+', ' '),
        (r'\n{3,}', '\n\n'),
    )
    cleaned = text
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)
    return cleaned.strip()

def sentence_split(paragraph):
    """Split *paragraph* into sentences, each ending with its terminator.

    Sentences are delimited by ``۔``, ``؟`` or ``!``. Each returned item is
    the stripped sentence text followed by the terminator that ended it.
    Text after the last terminator (or a paragraph that contains no
    terminator at all) is kept and closed with a default ``۔``.
    Empty fragments, e.g. between repeated terminators, are skipped.

    Returns:
        A list of sentence strings; empty when *paragraph* has no text.
    """
    # The capturing group makes re.split return
    # [text, punct, text, punct, ..., tail]: text at even indexes, the
    # terminator that followed it at the next odd index.
    parts = re.split(r'([۔؟!])', paragraph)
    sentences = []
    # Walk every even index, including the final tail fragment, which has
    # no terminator after it.
    for i in range(0, len(parts), 2):
        sent = parts[i].strip()
        if not sent:
            continue
        punct = parts[i + 1] if i + 1 < len(parts) else '۔'
        sentences.append(sent + punct)
    return sentences

def clean_title(title):
    """Return *title* without its "تحریر نمبر <digits>" tag and hyphens.

    The tag (the phrase, optional whitespace, then one or more digits) is
    removed first, then every ``-`` character, and finally surrounding
    whitespace is stripped.
    """
    without_tag = re.sub(r'تحریر نمبر\s*\d+', '', title)
    return without_tag.replace('-', '').strip()

#  BROWSER SETUP
# Start an undetected-chromedriver Chrome session and create the shared
# `driver` and `wait` objects used by the rest of the script.
print("Initializing browser...")
options=uc.ChromeOptions()
# NOTE(review): per the Selenium docs, 'eager' returns from driver.get() once
# the DOM is ready instead of waiting for every sub-resource — confirm this is
# the intent here.
options.page_load_strategy='eager'
# NOTE(review): version_main=None presumably lets undetected_chromedriver pick
# the Chrome major version itself — verify against the library docs.
driver=uc.Chrome(options=options, version_main=None)
# Abort any single page load that takes longer than 15 seconds.
driver.set_page_load_timeout(15)
# Shared explicit wait (10 s) used when locating listing links, titles and
# story bodies.
wait=WebDriverWait(driver, 10)

# Pass Cloudflare on first load
print("Passing Cloudflare check...")
driver.get(CATEGORY_BASE_URL)
# Fixed 7-second pause after the first load. Nothing below verifies that the
# page is actually usable: the success message is printed unconditionally.
time.sleep(7)
print("Cloudflare passed!\n")

#  PHASE 1: COLLECT ALL STORY URLs FROM LISTING PAGES 
def get_total_pages(driver):
    """Return the number of listing pages advertised by the 'Last Page' link.

    Looks on the page currently loaded in *driver* for an anchor whose text
    contains 'Last Page' and parses the page number out of its href
    (``...page<N>.html``).

    Returns:
        The parsed page number, or 1 when the link is missing, has no href,
        or its href does not contain a page number.
    """
    try:
        last_page_link = driver.find_element(
            By.XPATH, "//a[contains(text(),'Last Page')]"
        )
    except NoSuchElementException:
        return 1

    # get_attribute() returns None when the anchor has no href; fall back to
    # an empty string so re.search does not raise TypeError.
    href = last_page_link.get_attribute("href") or ""
    match = re.search(r'page(\d+)\.html', href)
    return int(match.group(1)) if match else 1

def collect_story_urls_from_page(driver):
    """Return all story detail URLs found on the current listing page.

    Waits up to 10 seconds for at least one story link to appear on the page
    loaded in *driver*, then gathers the href of every matching anchor.

    Returns:
        A list of unique URLs matching STORY_LINK_PATTERN, in the order they
        first appear on the page. Empty when no story link is present.
    """
    selector = "a[href*='/kids/detail/moral-stories/']"
    try:
        # Wait on the driver that was passed in rather than a module-level
        # wait object, so the function works with any driver instance.
        WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.CSS_SELECTOR, selector))
        )
    except TimeoutException:
        # Deliberate best effort: a page without story links simply yields
        # an empty result from the lookup below.
        pass

    # A dict keeps first-seen order while giving O(1) duplicate checks.
    found = {}
    for anchor in driver.find_elements(By.CSS_SELECTOR, selector):
        href = anchor.get_attribute("href") or ""
        if STORY_LINK_PATTERN.match(href):
            found.setdefault(href, None)
    return list(found)

# Detect total pages from the first (already loaded) listing page
total_pages = get_total_pages(driver)
if MAX_PAGES:
    # A falsy MAX_PAGES (None/0) means "no cap": keep the detected count.
    total_pages = min(total_pages, MAX_PAGES)
print(f"Listing pages to crawl: {total_pages}")

# Story URLs gathered from every listing page, in crawl order (may still
# contain duplicates across pages).
story_urls = []

for page_num in range(1, total_pages + 1):
    # Page 1 is already loaded in the browser; only later pages need a fetch.
    if page_num > 1:
        url = PAGE_URL_TEMPLATE.format(page=page_num)
        try:
            driver.get(url)
        except TimeoutException:
            # One slow listing page must not abort the whole crawl; skip it
            # so the links of the previously loaded page are not re-read.
            print(f"  Page {page_num}/{total_pages} → load timed out, skipping")
            continue
        time.sleep(1.5)   # polite delay; increase if you see Cloudflare blocks

    page_urls = collect_story_urls_from_page(driver)
    story_urls.extend(page_urls)
    print(f"  Page {page_num}/{total_pages} → {len(page_urls)} stories (total so far: {len(story_urls)})")

# Drop repeated URLs, keeping the position of each first occurrence
# (dict keys are unique and preserve insertion order).
unique_story_urls = list(dict.fromkeys(story_urls))

print(f"\nCollected {len(unique_story_urls)} unique story URLs\n")

#  PHASE 2: SCRAPE EACH STORY
# Visit every collected story URL, extract its title and body, and append one
# record per story to OUTPUT_FILE in the form:
#   <title>, blank line, one "<sentence> <EOS>" line per sentence,
#   "<EOP>" after each paragraph, "<EOT>" plus a blank line after the story.
driver.set_page_load_timeout(10)

try:
    for idx, url in enumerate(unique_story_urls, start=1):
        print(f"[{idx}/{len(unique_story_urls)}] {url}")
        try:
            driver.get(url)

            # Title
            title_element = wait.until(
                EC.presence_of_element_located((By.CSS_SELECTOR, "h2.urdu"))
            )
            title = clean_title(title_element.text)

            # Story body
            story_div = wait.until(
                EC.presence_of_element_located((By.CSS_SELECTOR, "div.txt_detail"))
            )
            story_text = normalize_urdu(story_div.text)
            stripped_lines = (line.strip() for line in story_text.split('\n'))
            paragraphs = [line for line in stripped_lines if line]

            # Build the whole record first so it is written in one call.
            record = [title + "\n\n"]
            for para in paragraphs:
                record.extend(sent + " <EOS>\n" for sent in sentence_split(para))
                record.append("<EOP>\n")
            record.append("<EOT>\n\n")

            # Append mode: each story is persisted as soon as it is scraped.
            with open(OUTPUT_FILE, "a", encoding="utf-8") as f:
                f.writelines(record)

            print(f"  {title}")

        except Exception as e:
            # Top-level per-story boundary: report the failure (truncated to
            # 80 characters) and move on to the next story.
            print(f"  ✗ Error: {str(e)[:80]}")
finally:
    # Always close the browser, even when the loop is interrupted
    # (e.g. Ctrl-C), so no Chrome process is left behind.
    driver.quit()

print("\nDone!")


Initializing browser...
Passing Cloudflare check...
✓ Cloudflare passed!

Listing pages to crawl: 67
  Page 1/67 → 12 stories (total so far: 12)
  Page 2/67 → 12 stories (total so far: 24)
  Page 3/67 → 12 stories (total so far: 36)
  Page 4/67 → 12 stories (total so far: 48)
  Page 5/67 → 12 stories (total so far: 60)
  Page 6/67 → 12 stories (total so far: 72)
  Page 7/67 → 12 stories (total so far: 84)
  Page 8/67 → 12 stories (total so far: 96)
  Page 9/67 → 12 stories (total so far: 108)
  Page 10/67 → 12 stories (total so far: 120)
  Page 11/67 → 12 stories (total so far: 132)
  Page 12/67 → 12 stories (total so far: 144)
  Page 13/67 → 12 stories (total so far: 156)
  Page 14/67 → 12 stories (total so far: 168)
  Page 15/67 → 12 stories (total so far: 180)
  Page 16/67 → 12 stories (total so far: 192)
  Page 17/67 → 12 stories (total so far: 204)
  Page 18/67 → 12 stories (total so far: 216)
  Page 19/67 → 12 stories (total so far: 228)
  Page 20/67 → 12 stories (total so far: 2