In [18]:
# Install required packages
!pip install webdriver-manager selenium

# Check Chrome browser version
import subprocess
import sys

def check_chrome_version():
    try:
        # Try to get Chrome version on Windows
        result = subprocess.run([
            'reg', 'query', 'HKEY_CURRENT_USER\\Software\\Google\\Chrome\\BLBeacon', 
            '/v', 'version'
        ], capture_output=True, text=True, shell=True)
        
        if result.returncode == 0:
            version = result.stdout.split()[-1]
            print(f"Chrome version: {version}")
        else:
            print("Could not detect Chrome version from registry")
            
        # Alternative method
        result2 = subprocess.run([
            'powershell', '-command', 
            '(Get-Item "C:\\Program Files\\Google\\Chrome\\Application\\chrome.exe").VersionInfo.ProductVersion'
        ], capture_output=True, text=True, shell=True)
        
        if result2.returncode == 0 and result2.stdout.strip():
            version2 = result2.stdout.strip()
            print(f"Chrome version (alternative): {version2}")
            
    except Exception as e:
        print(f"Could not check Chrome version: {e}")
        print("Please ensure Google Chrome is installed")

check_chrome_version()

Chrome version: 139.0.7258.128
Chrome version (alternative): 139.0.7258.128
Chrome version (alternative): 139.0.7258.128


In [27]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import Select, WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from webdriver_manager.chrome import ChromeDriverManager
import time
import json
import os

# TESTING MODE: Set to True to test with only first 5 letters
TESTING_MODE = True
MAX_LETTERS_FOR_TESTING = 5

chrome_options = Options()
# Add stability options
chrome_options.add_argument("--no-sandbox")
chrome_options.add_argument("--disable-dev-shm-usage")
chrome_options.add_argument("--disable-gpu")
chrome_options.add_argument("--disable-extensions")
chrome_options.add_argument("--remote-debugging-port=9222")
# chrome_options.add_argument("--headless=new")  # optional headless mode

try:
    # Use webdriver-manager to automatically download and manage ChromeDriver
    driver_path = ChromeDriverManager().install()
    print(f"ChromeDriver path: {driver_path}")
    
    # Verify the driver file exists and is executable
    if not os.path.exists(driver_path):
        raise FileNotFoundError(f"ChromeDriver not found at {driver_path}")
    
    service = Service(driver_path)
    driver = webdriver.Chrome(service=service, options=chrome_options)
    print("Chrome browser started successfully!")
    
except Exception as e:
    print(f"Error starting Chrome: {e}")
    print("Trying alternative approach...")
    try:
        # Try without specifying service (let Selenium find Chrome automatically)
        driver = webdriver.Chrome(options=chrome_options)
        print("Chrome started with default driver!")
    except Exception as e2:
        print(f"Alternative approach failed: {e2}")
        print("Please ensure Chrome browser is installed and updated to the latest version.")
        raise

driver.get("https://corpus.quran.com/qurandictionary.jsp")
print("Page loaded successfully!")

if TESTING_MODE:
    print(f"🧪 TESTING MODE: Will process only first {MAX_LETTERS_FOR_TESTING} letters")

# Wait for page to fully load
time.sleep(3)

data = []

# Debug: Check page content first
print("Debugging: Checking page content...")
page_title = driver.title
print(f"Page title: {page_title}")

# Look for Arabic alphabet letters specifically
print("Looking for Arabic alphabet letters...")

# Collect letter information (text and URL) instead of element references
letter_data = []

# First, let's examine the page structure around alphabet letters
try:
    # Look for elements that might contain Arabic letters
    # Try finding by text content (Arabic letters)
    all_links = driver.find_elements(By.TAG_NAME, "a")
    
    # Filter for Arabic letters and collect their data
    for link in all_links:
        text = link.text.strip()
        href = link.get_attribute('href') or ''
        
        # Check for both 'letter=' and 'q=' parameters (the site seems to use 'q=')
        if (len(text) == 1 and 
            ('letter=' in href or 'q=' in href) and 
            ord(text) >= 0x0600):  # Arabic Unicode range starts around 0x0600
            letter_data.append({'text': text, 'url': href})
        
        # Also check for common Arabic letters by name
        if text in ['ا', 'ب', 'ت', 'ث', 'ج', 'ح', 'خ', 'د', 'ذ', 'ر', 'ز', 'س', 'ش', 'ص', 'ض', 'ط', 'ظ', 'ع', 'غ', 'ف', 'ق', 'ك', 'ل', 'م', 'ن', 'ه', 'و', 'ي']:
            # Check if not already added
            if not any(item['text'] == text for item in letter_data):
                letter_data.append({'text': text, 'url': href})
    
    print(f"Found {len(letter_data)} Arabic letter links by content")
    
    # If no Arabic letters found, try alternative approaches
    if not letter_data:
        # Look for any links with 'letter=' or 'q=' in href
        for link in all_links:
            href = link.get_attribute('href') or ''
            if 'letter=' in href or 'q=' in href:
                text = link.text.strip()
                letter_data.append({'text': text, 'url': href})
        
        print(f"Found {len(letter_data)} links with 'letter=' or 'q=' parameter")

except Exception as e:
    print(f"Error finding Arabic letters: {e}")

if not letter_data:
    print("No alphabet letters found!")
    driver.quit()
    raise Exception("Could not find Arabic alphabet letters.")

# TESTING MODE: Limit letters if in testing mode
if TESTING_MODE:
    letter_data = letter_data[:MAX_LETTERS_FOR_TESTING]
    print(f"🧪 Testing mode: Limited to {len(letter_data)} letters")

print(f"\nFound {len(letter_data)} alphabet letters to process")

# Show the letters we found
for i, item in enumerate(letter_data[:10]):
    print(f"  Letter {i+1}: '{item['text']}' -> {item['url']}")

if len(letter_data) > 10:
    print(f"  ... and {len(letter_data) - 10} more letters")

# Now process each letter using URLs instead of element references
for letter_idx, letter_info in enumerate(letter_data):
    letter_text = letter_info['text']
    letter_url = letter_info['url']
    
    if not letter_text:  # Skip empty letters
        continue
        
    print(f"\n=== Processing letter {letter_idx + 1}/{len(letter_data)}: {letter_text} ===")

    try:
        # Navigate directly to the letter URL
        print(f"Navigating to: {letter_url}")
        driver.get(letter_url)
        time.sleep(4)
        
        # Debug: Check what page we landed on
        current_url = driver.current_url
        current_title = driver.title
        print(f"Current URL: {current_url}")
        print(f"Current title: {current_title}")

        # Step 2: Look for dropdown of words and collect all keyword texts first
        try:
            print("Looking for dropdown...")
            
            # Wait for dropdown to be present
            dropdown_element = WebDriverWait(driver, 15).until(
                EC.presence_of_element_located((By.NAME, "entryList"))
            )
            dropdown = Select(dropdown_element)
            
            print("Dropdown found!")
            
            # Collect all keyword texts first to avoid stale element issues
            keyword_texts = []
            for option in dropdown.options:
                keyword_text = option.text.strip()
                if keyword_text:
                    keyword_texts.append(keyword_text)
            
            options_count = len(keyword_texts)
            print(f"  Found {options_count} keywords for letter {letter_text}")
            
            # Show first few options for debugging
            for i, keyword in enumerate(keyword_texts[:5]):
                print(f"    Option {i}: '{keyword}'")
            
            if options_count > 5:
                print(f"    ... and {options_count - 5} more options")

            # Process each keyword using the collected texts
            for i, keyword in enumerate(keyword_texts):
                print(f"  → Processing keyword {i+1}/{options_count}: '{keyword}'")
                
                try:
                    # Navigate back to letter page first to ensure fresh dropdown
                    driver.get(letter_url)
                    time.sleep(2)
                    
                    # Re-find the dropdown and select the option by text
                    dropdown_element = WebDriverWait(driver, 10).until(
                        EC.presence_of_element_located((By.NAME, "entryList"))
                    )
                    dropdown = Select(dropdown_element)
                    dropdown.select_by_visible_text(keyword)
                    
                    # Find and click Go button
                    go_button = WebDriverWait(driver, 10).until(
                        EC.element_to_be_clickable((By.CSS_SELECTOR, "input[value='Go']"))
                    )
                    go_button.click()
                    time.sleep(3)
                    
                    print(f"    Clicked Go for keyword: {keyword}")

                    # Step 3: Scrape description - IMPROVED
                    description = ""
                    try:
                        # Look for the proper noun description
                        desc_selectors = [
                            "//h3[contains(text(), 'Proper noun')]/following-sibling::p",
                            "//td[contains(text(), 'Proper noun')]/parent::tr/following-sibling::tr//td",
                            "//div[contains(@class, 'content')]//p[not(contains(text(), 'use one or more keywords'))]",
                            "//p[contains(text(), 'occurs') and contains(text(), 'times')]"
                        ]
                        
                        for selector in desc_selectors:
                            try:
                                desc_elem = driver.find_element(By.XPATH, selector)
                                description = desc_elem.text.strip()
                                if description and not description.startswith("You can use"):
                                    break
                            except:
                                continue
                                
                        if description:
                            print(f"    Found description: {description[:100]}...")
                        else:
                            print("    No description found")
                            
                    except Exception as desc_error:
                        print(f"    Description error: {desc_error}")

                    # Step 4: Scrape ayahs - FIXED to get actual data rows
                    print("    Looking for ayah table...")
                    
                    # Try multiple selectors for the ayah data table
                    ayah_selectors = [
                        "//table[contains(@class, 'corpus')]//tr[td[contains(@class, 'ref')]]",
                        "//table//tr[td[1][contains(text(), ':')]]",  # Rows with verse references like "2:31"
                        "//table//tr[count(td) >= 4 and td[1][contains(text(), ':')]]",  # Rows with 4+ columns and verse refs
                        "//tr[td[1][contains(text(), ':')]]"  # Any rows with verse references
                    ]
                    
                    ayah_rows = []
                    for selector in ayah_selectors:
                        try:
                            rows = driver.find_elements(By.XPATH, selector)
                            if rows:
                                ayah_rows = rows
                                print(f"    Using selector: {selector}")
                                break
                        except:
                            continue
                    
                    if not ayah_rows:
                        # Fallback: get all table rows and filter manually
                        all_rows = driver.find_elements(By.XPATH, "//table//tr")
                        ayah_rows = []
                        for row in all_rows:
                            cols = row.find_elements(By.TAG_NAME, "td")
                            if len(cols) >= 3:
                                first_col_text = cols[0].text.strip()
                                # Check if first column looks like a verse reference (e.g., "2:31", "3:33")
                                if ':' in first_col_text and any(char.isdigit() for char in first_col_text):
                                    ayah_rows.append(row)
                    
                    print(f"    Found {len(ayah_rows)} ayah rows")
                    
                    occurrences = []
                    for row_idx, row in enumerate(ayah_rows):
                        cols = row.find_elements(By.TAG_NAME, "td")
                        if len(cols) >= 3:
                            try:
                                ref_text = cols[0].text.strip()
                                # Skip header rows or non-verse rows
                                if not ref_text or ':' not in ref_text or ref_text.lower() in ['reference', 'verse', 'ayah']:
                                    continue
                                    
                                occurrence = {
                                    "ref": ref_text,
                                    "variation": cols[1].text.strip() if len(cols) > 1 else "",
                                    "translation": cols[2].text.strip() if len(cols) > 2 else "",
                                    "ayah_arabic": cols[-1].text.strip() if len(cols) > 3 else cols[2].text.strip()
                                }
                                
                                # Only add if we have meaningful data
                                if occurrence["ref"] and occurrence["ayah_arabic"]:
                                    occurrences.append(occurrence)
                                    
                                # Debug: Show first few occurrences
                                if row_idx < 3:
                                    print(f"      Row {row_idx}: {ref_text} -> {occurrence['variation'][:20]}...")
                                    
                            except Exception as row_error:
                                print(f"      Error processing row {row_idx}: {row_error}")
                                continue

                    data.append({
                        "letter": letter_text,
                        "keyword": keyword,
                        "description": description,
                        "occurrences": occurrences
                    })

                    print(f"    ✓ Scraped {len(occurrences)} occurrences for '{keyword}'")
                    
                    # Save progress every 5 entries in testing mode
                    save_interval = 5 if TESTING_MODE else 10
                    if len(data) % save_interval == 0:
                        with open("quran_dictionary_progress.json", "w", encoding="utf-8") as f:
                            json.dump(data, f, ensure_ascii=False, indent=2)
                        print(f"    Progress saved: {len(data)} total entries")
                    
                except Exception as keyword_error:
                    print(f"    !! Error processing keyword '{keyword}': {keyword_error}")
                    continue

        except Exception as dropdown_error:
            print(f"  !! No dropdown found for letter {letter_text}: {dropdown_error}")

    except Exception as letter_error:
        print(f"  !! Error processing letter {letter_text}: {letter_error}")
        continue

print(f"\n🎉 Scraping completed! Total entries: {len(data)}")

# Save final results
filename = "quran_dictionary_test.json" if TESTING_MODE else "quran_dictionary.json"
with open(filename, "w", encoding="utf-8") as f:
    json.dump(data, f, ensure_ascii=False, indent=2)

print(f"✅ Data saved to {filename}")

# Show summary
if data:
    letters_processed = set(item['letter'] for item in data)
    print(f"📊 Summary:")
    print(f"   - Letters processed: {len(letters_processed)}")
    print(f"   - Total keywords: {len(data)}")
    print(f"   - Total ayah occurrences: {sum(len(item['occurrences']) for item in data)}")
    
    # Show some sample data
    if len(data) > 0:
        print(f"\n📝 Sample entries:")
        for i, entry in enumerate(data[:3]):
            print(f"   {i+1}. Letter '{entry['letter']}' - Keyword '{entry['keyword']}' - {len(entry['occurrences'])} occurrences")
else:
    print("❌ No data was scraped. Please check the debugging output above.")

if TESTING_MODE:
    print(f"\n🧪 Testing completed with {len(letter_data)} letters. Change TESTING_MODE to False to run on all letters.")

driver.quit()

ChromeDriver path: C:\Users\dell\.wdm\drivers\chromedriver\win64\139.0.7258.68\chromedriver-win32/THIRD_PARTY_NOTICES.chromedriver
Error starting Chrome: [WinError 193] %1 is not a valid Win32 application
Trying alternative approach...
Alternative approach failed: Message: session not created
from disconnected: unable to connect to renderer; For documentation on this error, please visit: https://www.selenium.dev/documentation/webdriver/troubleshooting/errors#sessionnotcreatedexception
Stacktrace:
	GetHandleVerifier [0x0x7ff660106b25+79621]
	GetHandleVerifier [0x0x7ff660106b80+79712]
	(No symbol) [0x0x7ff65fe9c0ea]
	(No symbol) [0x0x7ff65fee064b]
	(No symbol) [0x0x7ff65fedafd8]
	(No symbol) [0x0x7ff65fed5385]
	(No symbol) [0x0x7ff65ff28c5e]
	(No symbol) [0x0x7ff65ff283f0]
	(No symbol) [0x0x7ff65ff1af13]
	(No symbol) [0x0x7ff65fee4151]
	(No symbol) [0x0x7ff65fee4ee3]
	GetHandleVerifier [0x0x7ff6603c683d+2962461]
	GetHandleVerifier [0x0x7ff6603c0b5d+2938685]
	GetHandleVerifier [0x0x7ff660

SessionNotCreatedException: Message: session not created
from disconnected: unable to connect to renderer; For documentation on this error, please visit: https://www.selenium.dev/documentation/webdriver/troubleshooting/errors#sessionnotcreatedexception
Stacktrace:
	GetHandleVerifier [0x0x7ff660106b25+79621]
	GetHandleVerifier [0x0x7ff660106b80+79712]
	(No symbol) [0x0x7ff65fe9c0ea]
	(No symbol) [0x0x7ff65fee064b]
	(No symbol) [0x0x7ff65fedafd8]
	(No symbol) [0x0x7ff65fed5385]
	(No symbol) [0x0x7ff65ff28c5e]
	(No symbol) [0x0x7ff65ff283f0]
	(No symbol) [0x0x7ff65ff1af13]
	(No symbol) [0x0x7ff65fee4151]
	(No symbol) [0x0x7ff65fee4ee3]
	GetHandleVerifier [0x0x7ff6603c683d+2962461]
	GetHandleVerifier [0x0x7ff6603c0b5d+2938685]
	GetHandleVerifier [0x0x7ff6603df71d+3064573]
	GetHandleVerifier [0x0x7ff660120c6e+186446]
	GetHandleVerifier [0x0x7ff660128a3f+218655]
	GetHandleVerifier [0x0x7ff66010f914+115956]
	GetHandleVerifier [0x0x7ff66010fac9+116393]
	GetHandleVerifier [0x0x7ff6600f5ef8+10968]
	BaseThreadInitThunk [0x0x7ff96a1ae8d7+23]
	RtlUserThreadStart [0x0x7ff96bd1c34c+44]


In [17]:
# Quick test to examine page structure
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
import time

chrome_options = Options()
chrome_options.add_argument("--no-sandbox")
chrome_options.add_argument("--disable-dev-shm-usage")

try:
    driver = webdriver.Chrome(options=chrome_options)
    
    # Test with first letter
    test_url = "https://corpus.quran.com/qurandictionary.jsp?q=A"
    print(f"Testing URL: {test_url}")
    driver.get(test_url)
    time.sleep(3)
    
    print(f"Page title: {driver.title}")
    print(f"Current URL: {driver.current_url}")
    
    # Look for all form elements
    print("\n=== All form elements ===")
    forms = driver.find_elements(By.TAG_NAME, "form")
    print(f"Found {len(forms)} forms")
    
    for i, form in enumerate(forms):
        print(f"Form {i+1}:")
        inputs = form.find_elements(By.TAG_NAME, "input")
        selects = form.find_elements(By.TAG_NAME, "select")
        print(f"  - {len(inputs)} input elements")
        print(f"  - {len(selects)} select elements")
        
        for j, select in enumerate(selects):
            name = select.get_attribute('name') or 'no name'
            id_attr = select.get_attribute('id') or 'no id'
            print(f"    Select {j+1}: name='{name}', id='{id_attr}'")
    
    # Look for all select elements on page
    print("\n=== All select elements ===")
    all_selects = driver.find_elements(By.TAG_NAME, "select")
    print(f"Found {len(all_selects)} select elements total")
    
    for i, select in enumerate(all_selects):
        name = select.get_attribute('name') or 'no name'
        id_attr = select.get_attribute('id') or 'no id'
        options_count = len(select.find_elements(By.TAG_NAME, "option"))
        print(f"  Select {i+1}: name='{name}', id='{id_attr}', options={options_count}")
        
        # Show first few options
        if options_count > 0:
            options = select.find_elements(By.TAG_NAME, "option")
            for j, option in enumerate(options[:3]):
                text = option.text.strip()
                print(f"    Option {j+1}: '{text}'")
            if options_count > 3:
                print(f"    ... and {options_count - 3} more options")
    
    # Look for input elements
    print("\n=== Input elements ===")
    all_inputs = driver.find_elements(By.TAG_NAME, "input")
    for i, input_elem in enumerate(all_inputs):
        type_attr = input_elem.get_attribute('type') or 'no type'
        name = input_elem.get_attribute('name') or 'no name'
        value = input_elem.get_attribute('value') or 'no value'
        print(f"  Input {i+1}: type='{type_attr}', name='{name}', value='{value}'")
    
    # Show a larger snippet of page source
    print("\n=== Page source (first 2000 chars) ===")
    print(driver.page_source[:2000])
    
    driver.quit()
    
except Exception as e:
    print(f"Error in test: {e}")
    try:
        driver.quit()
    except:
        pass

Testing URL: https://corpus.quran.com/qurandictionary.jsp?q=A
Page title: The Quranic Arabic Corpus - Quran Dictionary
Current URL: https://corpus.quran.com/qurandictionary.jsp?q=A

=== All form elements ===
Found 0 forms

=== All select elements ===
Found 1 select elements total
  Select 1: name='entryList', id='entryList', options=90
    Option 1: 'آدَم'
    Option 2: 'آزَر'
    Option 3: 'أَبَارِيق'
    ... and 87 more options

=== Input elements ===
  Input 1: type='text', name='no name', value='no value'
  Input 2: type='button', name='no name', value='Go'

=== Page source (first 2000 chars) ===
<html><head>
<meta http-equiv="Content-Type" content="text/html;charset=UTF-8">
<title>The Quranic Arabic Corpus - Quran Dictionary</title>
<link href="/css/main.css" type="text/css" rel="stylesheet">
<script type="text/javascript">function search() { var e = document.getElementById("searchTextBox");var url = "/search.jsp?q=" + encodeURIComponent(e.value);document.location.href = url;}</

In [22]:
# Debug: Test what happens when we click a keyword
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import Select, WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time

chrome_options = Options()
chrome_options.add_argument("--no-sandbox")
chrome_options.add_argument("--disable-dev-shm-usage")

try:
    driver = webdriver.Chrome(options=chrome_options)
    
    # Test with first letter
    test_url = "https://corpus.quran.com/qurandictionary.jsp?q=A"
    print(f"Testing URL: {test_url}")
    driver.get(test_url)
    time.sleep(3)
    
    print(f"Page title: {driver.title}")
    print(f"Current URL: {driver.current_url}")
    
    # Find dropdown and select first keyword
    dropdown_element = driver.find_element(By.NAME, "entryList")
    dropdown = Select(dropdown_element)
    
    first_keyword = dropdown.options[0].text.strip()
    print(f"First keyword: '{first_keyword}'")
    
    # Select the keyword
    dropdown.select_by_visible_text(first_keyword)
    
    # Click Go
    go_button = driver.find_element(By.CSS_SELECTOR, "input[value='Go']")
    go_button.click()
    time.sleep(3)
    
    print(f"\nAfter clicking Go:")
    print(f"Current URL: {driver.current_url}")
    print(f"Page title: {driver.title}")
    
    # Look for all possible content on the page
    print(f"\n=== Page structure analysis ===")
    
    # Check for tables
    tables = driver.find_elements(By.TAG_NAME, "table")
    print(f"Found {len(tables)} tables")
    
    for i, table in enumerate(tables):
        rows = table.find_elements(By.TAG_NAME, "tr")
        print(f"  Table {i+1}: {len(rows)} rows")
        
        # Show first few rows
        for j, row in enumerate(rows[:3]):
            cols = row.find_elements(By.TAG_NAME, "td")
            if cols:
                row_text = " | ".join([col.text.strip()[:30] for col in cols])
                print(f"    Row {j+1}: {row_text}")
    
    # Check for any divs with content
    print(f"\n=== Content divs ===")
    content_divs = driver.find_elements(By.XPATH, "//div[contains(@class, 'content') or contains(@class, 'main') or contains(@class, 'body')]")
    for i, div in enumerate(content_divs):
        text = div.text.strip()[:200]
        print(f"  Div {i+1}: {text}...")
    
    # Check for any text mentioning the keyword
    print(f"\n=== Looking for keyword '{first_keyword}' on page ===")
    page_text = driver.page_source
    if first_keyword in page_text:
        print(f"✓ Keyword '{first_keyword}' found on page")
        
        # Find elements containing the keyword
        elements = driver.find_elements(By.XPATH, f"//*[contains(text(), '{first_keyword}')]")
        print(f"Found {len(elements)} elements containing the keyword")
        
        for i, elem in enumerate(elements[:3]):
            print(f"  Element {i+1}: {elem.text.strip()[:100]}...")
    else:
        print(f"✗ Keyword '{first_keyword}' NOT found on page")
    
    # Show larger page source snippet
    print(f"\n=== Page source (first 1000 chars) ===")
    print(driver.page_source[:1000])
    
    driver.quit()
    
except Exception as e:
    print(f"Error in debug: {e}")
    try:
        driver.quit()
    except:
        pass

Testing URL: https://corpus.quran.com/qurandictionary.jsp?q=A
Page title: The Quranic Arabic Corpus - Quran Dictionary
Current URL: https://corpus.quran.com/qurandictionary.jsp?q=A
First keyword: 'آدَم'
Page title: The Quranic Arabic Corpus - Quran Dictionary
Current URL: https://corpus.quran.com/qurandictionary.jsp?q=A
First keyword: 'آدَم'

After clicking Go:
Current URL: https://corpus.quran.com/searchhelp.jsp
Page title: The Quranic Arabic Corpus - Search Help

=== Page structure analysis ===
Found 3 tables
  Table 1: 1 rows
    Row 1:  | Qur'an | Word by Word | Audio 
  Table 2: 16 rows
    Row 1:  | __ | Sign In | Search
    Row 2:  | 
    Row 3:  | __ | Search Help
  Table 3: 11 rows
    Row 1: Word by Word
    Row 2: Quran Dictionary
    Row 3: English Translation

=== Content divs ===
  Div 1: You can use one or more keywords when searching the word by word translation, or you can use quotes to find an exact phrase.
Ibrahim - find all verses containing the word Ibrahim
they 

In [23]:
# FIXED VERSION: Correct form handling for keyword selection
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import Select, WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time
import json
import os

# TESTING MODE: Set to True to test with only first 2 letters
TESTING_MODE = True
MAX_LETTERS_FOR_TESTING = 2

chrome_options = Options()
chrome_options.add_argument("--no-sandbox")
chrome_options.add_argument("--disable-dev-shm-usage")

try:
    driver = webdriver.Chrome(options=chrome_options)
    print("Chrome browser started successfully!")
    
except Exception as e:
    print(f"Error starting Chrome: {e}")
    raise

driver.get("https://corpus.quran.com/qurandictionary.jsp")
print("Page loaded successfully!")

if TESTING_MODE:
    print(f"🧪 TESTING MODE: Will process only first {MAX_LETTERS_FOR_TESTING} letters")

time.sleep(3)
data = []

# Collect letter information
letter_data = []
all_links = driver.find_elements(By.TAG_NAME, "a")

for link in all_links:
    text = link.text.strip()
    href = link.get_attribute('href') or ''
    
    if (len(text) == 1 and 'q=' in href and ord(text) >= 0x0600):
        letter_data.append({'text': text, 'url': href})

if TESTING_MODE:
    letter_data = letter_data[:MAX_LETTERS_FOR_TESTING]

print(f"Found {len(letter_data)} alphabet letters to process")

# Process each letter
for letter_idx, letter_info in enumerate(letter_data):
    letter_text = letter_info['text']
    letter_url = letter_info['url']
    
    print(f"\n=== Processing letter {letter_idx + 1}/{len(letter_data)}: {letter_text} ===")

    try:
        driver.get(letter_url)
        time.sleep(3)
        
        print(f"Current URL: {driver.current_url}")

        # Find dropdown and collect keywords
        try:
            dropdown_element = WebDriverWait(driver, 15).until(
                EC.presence_of_element_located((By.NAME, "entryList"))
            )
            dropdown = Select(dropdown_element)
            
            # Collect all option values (not just text) - this is the key fix!
            keyword_data = []
            for option in dropdown.options:
                keyword_text = option.text.strip()
                keyword_value = option.get_attribute('value')  # This is crucial!
                if keyword_text and keyword_value:
                    keyword_data.append({'text': keyword_text, 'value': keyword_value})
            
            print(f"Found {len(keyword_data)} keywords for letter {letter_text}")
            
            # Show first few for debugging
            for i, kw in enumerate(keyword_data[:3]):
                print(f"  Keyword {i+1}: '{kw['text']}' (value: {kw['value']})")

            # Process each keyword using the option values
            for i, keyword_info in enumerate(keyword_data[:5]):  # Limit to 5 for testing
                keyword_text = keyword_info['text']
                keyword_value = keyword_info['value']
                
                print(f"  → Processing keyword {i+1}: '{keyword_text}'")
                
                try:
                    # Navigate back to letter page
                    driver.get(letter_url)
                    time.sleep(2)
                    
                    # Re-find dropdown and select by VALUE (not text)
                    dropdown_element = WebDriverWait(driver, 10).until(
                        EC.presence_of_element_located((By.NAME, "entryList"))
                    )
                    dropdown = Select(dropdown_element)
                    dropdown.select_by_value(keyword_value)  # Use value instead of text
                    
                    # Click Go button
                    go_button = WebDriverWait(driver, 10).until(
                        EC.element_to_be_clickable((By.CSS_SELECTOR, "input[value='Go']"))
                    )
                    go_button.click()
                    time.sleep(3)
                    
                    final_url = driver.current_url
                    print(f"    Final URL: {final_url}")
                    
                    # Check if we landed on the right page (not searchhelp.jsp)
                    if 'searchhelp.jsp' in final_url:
                        print(f"    ⚠️  Redirected to search help - skipping")
                        continue
                    
                    # Scrape description
                    description = ""
                    try:
                        # Look for keyword description in various locations
                        desc_selectors = [
                            f"//h3[contains(text(), 'Proper noun') or contains(text(), '{keyword_text}')]/following-sibling::p",
                            "//div[contains(@class, 'content')]//p[contains(text(), 'occurs') or contains(text(), 'times')]",
                            "//td[contains(text(), 'definition') or contains(text(), 'meaning')]/following-sibling::td",
                            "//p[not(contains(text(), 'search')) and not(contains(text(), 'help'))]"
                        ]
                        
                        for selector in desc_selectors:
                            try:
                                elements = driver.find_elements(By.XPATH, selector)
                                for elem in elements:
                                    text = elem.text.strip()
                                    if (text and 
                                        len(text) > 10 and 
                                        not text.startswith("You can use") and
                                        not text.startswith("If you know")):
                                        description = text
                                        break
                                if description:
                                    break
                            except:
                                continue
                                
                        print(f"    Description: {description[:100] if description else 'None found'}...")
                            
                    except Exception as desc_error:
                        print(f"    Description error: {desc_error}")

                    # Scrape ayahs with improved detection
                    print("    Looking for ayah data...")
                    
                    ayah_rows = []
                    try:
                        # First, try to find the data table specifically
                        table_selectors = [
                            "//table[.//td[contains(text(), ':')]]",  # Tables with verse references
                            "//table[contains(@class, 'corpus')]",
                            "//table[count(.//tr) > 3]"  # Tables with multiple rows
                        ]
                        
                        data_table = None
                        for selector in table_selectors:
                            try:
                                tables = driver.find_elements(By.XPATH, selector)
                                for table in tables:
                                    rows = table.find_elements(By.XPATH, ".//tr[td[contains(text(), ':')]]")
                                    if rows:
                                        data_table = table
                                        ayah_rows = rows
                                        print(f"    Found data table with {len(ayah_rows)} rows")
                                        break
                                if data_table:
                                    break
                            except:
                                continue
                        
                        # If no specific table found, look for any rows with verse references
                        if not ayah_rows:
                            ayah_rows = driver.find_elements(By.XPATH, "//tr[td[contains(text(), ':') and contains(text(), numbers)]]")
                            if not ayah_rows:
                                # More general approach
                                all_rows = driver.find_elements(By.XPATH, "//tr[count(td) >= 3]")
                                for row in all_rows:
                                    first_cell = row.find_elements(By.TAG_NAME, "td")[0]
                                    if ':' in first_cell.text and any(c.isdigit() for c in first_cell.text):
                                        ayah_rows.append(row)
                    
                    except Exception as table_error:
                        print(f"    Table search error: {table_error}")
                    
                    print(f"    Found {len(ayah_rows)} potential ayah rows")
                    
                    occurrences = []
                    for row_idx, row in enumerate(ayah_rows[:10]):  # Limit for testing
                        try:
                            cols = row.find_elements(By.TAG_NAME, "td")
                            if len(cols) >= 3:
                                ref_text = cols[0].text.strip()
                                
                                # Validate this looks like a verse reference
                                if (':' in ref_text and 
                                    any(c.isdigit() for c in ref_text) and 
                                    not ref_text.lower() in ['reference', 'verse', 'chapter']):
                                    
                                    occurrence = {
                                        "ref": ref_text,
                                        "variation": cols[1].text.strip() if len(cols) > 1 else "",
                                        "translation": cols[2].text.strip() if len(cols) > 2 else "",
                                        "arabic": cols[-1].text.strip() if len(cols) > 3 else ""
                                    }
                                    
                                    # Only add if we have meaningful content
                                    if occurrence["ref"] and (occurrence["translation"] or occurrence["arabic"]):
                                        occurrences.append(occurrence)
                                        
                                        if row_idx < 2:  # Show first couple for debugging
                                            print(f"      Row {row_idx+1}: {ref_text} -> {occurrence['translation'][:30]}...")
                                            
                        except Exception as row_error:
                            print(f"      Row {row_idx} error: {row_error}")
                            continue

                    # Store the data
                    entry = {
                        "letter": letter_text,
                        "keyword": keyword_text,
                        "description": description,
                        "occurrences": occurrences,
                        "final_url": final_url  # For debugging
                    }
                    data.append(entry)

                    print(f"    ✓ Scraped {len(occurrences)} occurrences for '{keyword_text}'")
                    
                except Exception as keyword_error:
                    print(f"    !! Error processing keyword '{keyword_text}': {keyword_error}")
                    continue

        except Exception as dropdown_error:
            print(f"  !! No dropdown found for letter {letter_text}: {dropdown_error}")

    except Exception as letter_error:
        print(f"  !! Error processing letter {letter_text}: {letter_error}")
        continue

print(f"\n🎉 Scraping completed! Total entries: {len(data)}")

# Save results
filename = "quran_dictionary_fixed_test.json"
with open(filename, "w", encoding="utf-8") as f:
    json.dump(data, f, ensure_ascii=False, indent=2)

print(f"✅ Data saved to {filename}")

# Show summary
if data:
    print(f"📊 Summary:")
    print(f"   - Total keywords processed: {len(data)}")
    print(f"   - Total ayah occurrences: {sum(len(item['occurrences']) for item in data)}")
    
    # Show sample entries
    for i, entry in enumerate(data[:3]):
        print(f"   {i+1}. Keyword '{entry['keyword']}' - {len(entry['occurrences'])} occurrences")
        if entry['description']:
            print(f"      Description: {entry['description'][:100]}...")
else:
    print("❌ No data was scraped.")

driver.quit()

Chrome browser started successfully!
Page loaded successfully!
🧪 TESTING MODE: Will process only first 2 letters
Found 2 alphabet letters to process

=== Processing letter 1/2: أ ===
Current URL: https://corpus.quran.com/qurandictionary.jsp?q=A
Found 90 keywords for letter أ
  Keyword 1: 'آدَم' (value: A%5Edam)
  Keyword 2: 'آزَر' (value: A%5Ezar)
  Keyword 3: 'أَبَارِيق' (value: %3EabaAriyq)
  → Processing keyword 1: 'آدَم'
    Final URL: https://corpus.quran.com/searchhelp.jsp
    ⚠️  Redirected to search help - skipping
  → Processing keyword 2: 'آزَر'
    Final URL: https://corpus.quran.com/searchhelp.jsp
    ⚠️  Redirected to search help - skipping
  → Processing keyword 3: 'أَبَارِيق'
    Final URL: https://corpus.quran.com/searchhelp.jsp
    ⚠️  Redirected to search help - skipping
  → Processing keyword 4: 'أ ب ب'
    Final URL: https://corpus.quran.com/searchhelp.jsp
    ⚠️  Redirected to search help - skipping
  → Processing keyword 5: 'أ ب د'
    Final URL: https://corpu

In [24]:
# DEEP FORM ANALYSIS: Let's examine the exact form structure and parameters
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import Select
import time

chrome_options = Options()
chrome_options.add_argument("--no-sandbox")
chrome_options.add_argument("--disable-dev-shm-usage")

try:
    driver = webdriver.Chrome(options=chrome_options)
    
    # Go to first letter page
    test_url = "https://corpus.quran.com/qurandictionary.jsp?q=A"
    driver.get(test_url)
    time.sleep(3)
    
    print(f"=== FORM ANALYSIS ===")
    print(f"Current URL: {driver.current_url}")
    
    # Find all forms on the page
    forms = driver.find_elements(By.TAG_NAME, "form")
    print(f"Found {len(forms)} forms")
    
    for i, form in enumerate(forms):
        print(f"\nForm {i+1}:")
        action = form.get_attribute('action') or 'no action'
        method = form.get_attribute('method') or 'no method'
        print(f"  Action: {action}")
        print(f"  Method: {method}")
        
        # Get all input and select elements in this form
        inputs = form.find_elements(By.TAG_NAME, "input")
        selects = form.find_elements(By.TAG_NAME, "select")
        
        print(f"  Inputs ({len(inputs)}):")
        for j, inp in enumerate(inputs):
            name = inp.get_attribute('name') or 'no name'
            type_attr = inp.get_attribute('type') or 'no type'
            value = inp.get_attribute('value') or 'no value'
            print(f"    Input {j+1}: name='{name}', type='{type_attr}', value='{value}'")
        
        print(f"  Selects ({len(selects)}):")
        for j, sel in enumerate(selects):
            name = sel.get_attribute('name') or 'no name'
            id_attr = sel.get_attribute('id') or 'no id'
            print(f"    Select {j+1}: name='{name}', id='{id_attr}'")
            
            # Get first few options
            options = sel.find_elements(By.TAG_NAME, "option")
            print(f"      Options ({len(options)}):")
            for k, opt in enumerate(options[:5]):
                text = opt.text.strip()
                value = opt.get_attribute('value')
                selected = opt.is_selected()
                print(f"        Option {k+1}: text='{text}', value='{value}', selected={selected}")
            if len(options) > 5:
                print(f"        ... and {len(options) - 5} more options")
    
    # Now let's try to understand the URL structure by examining different approaches
    print(f"\n=== URL PATTERN ANALYSIS ===")
    
    # Let's see what happens if we manually construct URLs
    # First, let's examine some option values in detail
    dropdown = driver.find_element(By.NAME, "entryList")
    select_obj = Select(dropdown)
    
    first_option = select_obj.options[1]  # Skip first empty option
    option_text = first_option.text.strip()
    option_value = first_option.get_attribute('value')
    
    print(f"First real option:")
    print(f"  Text: '{option_text}'")
    print(f"  Value: '{option_value}'")
    
    # Try to understand the URL encoding
    import urllib.parse
    decoded_value = urllib.parse.unquote(option_value)
    print(f"  Decoded value: '{decoded_value}'")
    
    # Let's try manually constructing a direct URL
    # Common patterns for dictionary sites:
    test_urls = [
        f"https://corpus.quran.com/qurandictionary.jsp?word={option_value}",
        f"https://corpus.quran.com/qurandictionary.jsp?entry={option_value}",
        f"https://corpus.quran.com/qurandictionary.jsp?q={option_value}",
        f"https://corpus.quran.com/wordbyword.jsp?q={option_value}",
        f"https://corpus.quran.com/search.jsp?q={option_value}",
    ]
    
    print(f"\n=== TESTING DIRECT URLs ===")
    for url in test_urls:
        try:
            print(f"Testing: {url}")
            driver.get(url)
            time.sleep(2)
            final_url = driver.current_url
            title = driver.title
            print(f"  Result: {final_url}")
            print(f"  Title: {title}")
            
            # Check if we got meaningful content (not search help)
            if 'searchhelp' not in final_url and 'search help' not in title.lower():
                print(f"  ✓ SUCCESS! This might be the right pattern")
                
                # Look for actual content
                tables = driver.find_elements(By.TAG_NAME, "table")
                content_text = driver.find_element(By.TAG_NAME, "body").text
                
                if option_text in content_text:
                    print(f"  ✓ Found keyword '{option_text}' in page content!")
                
                print(f"  Found {len(tables)} tables on page")
                
                # Show a snippet of the page content
                print(f"  Content preview: {content_text[:200]}...")
                break
            else:
                print(f"  ✗ Redirected to help page")
                
        except Exception as e:
            print(f"  Error: {e}")
    
    print(f"\n=== FORM SUBMISSION ANALYSIS ===")
    # Let's examine what happens when we submit the form differently
    
    # Go back to the letter page
    driver.get(test_url)
    time.sleep(2)
    
    # Try different ways to submit the form
    dropdown = driver.find_element(By.NAME, "entryList")
    select_obj = Select(dropdown)
    
    # Method 1: Select and get the form parameters manually
    select_obj.select_by_index(1)  # Select first real option
    
    # Get the form element
    form = dropdown.find_element(By.XPATH, ".//ancestor::form")
    action = form.get_attribute('action')
    method = form.get_attribute('method') or 'GET'
    
    print(f"Form action: {action}")
    print(f"Form method: {method}")
    
    # Get all form data
    form_data = {}
    all_inputs = form.find_elements(By.TAG_NAME, "input")
    all_selects = form.find_elements(By.TAG_NAME, "select")
    
    for inp in all_inputs:
        name = inp.get_attribute('name')
        value = inp.get_attribute('value')
        if name:
            form_data[name] = value
    
    for sel in all_selects:
        name = sel.get_attribute('name')
        if name:
            selected_option = Select(sel).first_selected_option
            value = selected_option.get_attribute('value') if selected_option else ''
            form_data[name] = value
    
    print(f"Form data: {form_data}")
    
    # Try to construct the final URL manually
    if action:
        if method.upper() == 'GET':
            # Construct GET URL
            params = '&'.join([f"{k}={v}" for k, v in form_data.items() if v])
            manual_url = f"{action}?{params}"
            print(f"Manual URL: {manual_url}")
            
            driver.get(manual_url)
            time.sleep(2)
            result_url = driver.current_url
            result_title = driver.title
            print(f"Manual submission result: {result_url}")
            print(f"Manual submission title: {result_title}")
            
            if 'searchhelp' not in result_url:
                print("✓ Manual URL construction worked!")
            else:
                print("✗ Manual URL construction failed")
    
    driver.quit()
    
except Exception as e:
    print(f"Error in form analysis: {e}")
    try:
        driver.quit()
    except:
        pass

=== FORM ANALYSIS ===
Current URL: https://corpus.quran.com/qurandictionary.jsp?q=A
Found 0 forms

=== URL PATTERN ANALYSIS ===
First real option:
  Text: 'آزَر'
  Value: 'A%5Ezar'
  Decoded value: 'A^zar'

=== TESTING DIRECT URLs ===
Testing: https://corpus.quran.com/qurandictionary.jsp?word=A%5Ezar
  Result: https://corpus.quran.com/qurandictionary.jsp?word=A%5Ezar
  Title: The Quranic Arabic Corpus - Quran Dictionary
  ✓ SUCCESS! This might be the right pattern
  ✓ Found keyword 'آزَر' in page content!
  Found 4 tables on page
  Content preview:   Qur'an | Word by Word | Audio | Prayer Times
__ Sign In Search
  __
Quran Dictionary - آدَم
Word by Word
Quran Dictionary
Verb Concordance
Lemma Frequency
Morphological Search
English Translation
S...

=== FORM SUBMISSION ANALYSIS ===
Error in form analysis: Message: no such element: Unable to locate element: {"method":"xpath","selector":".//ancestor::form"}
  (Session info: chrome=139.0.7258.128); For documentation on this error, pleas

In [25]:
# FINAL WORKING VERSION: Using correct URL pattern with word parameter
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import Select, WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time
import json
import os

# TESTING MODE: Set to True to test with only first 2 letters and 3 keywords each
TESTING_MODE = True
MAX_LETTERS_FOR_TESTING = 2
MAX_KEYWORDS_PER_LETTER = 3

chrome_options = Options()
chrome_options.add_argument("--no-sandbox")
chrome_options.add_argument("--disable-dev-shm-usage")

try:
    driver = webdriver.Chrome(options=chrome_options)
    print("✅ Chrome browser started successfully!")
    
except Exception as e:
    print(f"❌ Error starting Chrome: {e}")
    raise

driver.get("https://corpus.quran.com/qurandictionary.jsp")
print("✅ Page loaded successfully!")

if TESTING_MODE:
    print(f"🧪 TESTING MODE: Will process {MAX_LETTERS_FOR_TESTING} letters with {MAX_KEYWORDS_PER_LETTER} keywords each")

time.sleep(3)
data = []

# Collect letter information
letter_data = []
all_links = driver.find_elements(By.TAG_NAME, "a")

for link in all_links:
    text = link.text.strip()
    href = link.get_attribute('href') or ''
    
    if (len(text) == 1 and 'q=' in href and ord(text) >= 0x0600):
        letter_data.append({'text': text, 'url': href})

if TESTING_MODE:
    letter_data = letter_data[:MAX_LETTERS_FOR_TESTING]

print(f"📚 Found {len(letter_data)} alphabet letters to process")

# Process each letter
for letter_idx, letter_info in enumerate(letter_data):
    letter_text = letter_info['text']
    letter_url = letter_info['url']
    
    print(f"\n{'='*60}")
    print(f"📖 Processing letter {letter_idx + 1}/{len(letter_data)}: {letter_text}")
    print(f"{'='*60}")

    try:
        driver.get(letter_url)
        time.sleep(3)
        
        print(f"🔗 Current URL: {driver.current_url}")

        # Find dropdown and collect keywords
        try:
            dropdown_element = WebDriverWait(driver, 15).until(
                EC.presence_of_element_located((By.NAME, "entryList"))
            )
            dropdown = Select(dropdown_element)
            
            # Collect all option values and texts
            keyword_data = []
            for option in dropdown.options:
                keyword_text = option.text.strip()
                keyword_value = option.get_attribute('value')
                if keyword_text and keyword_value:
                    keyword_data.append({'text': keyword_text, 'value': keyword_value})
            
            # Remove empty first option if exists
            if keyword_data and not keyword_data[0]['text']:
                keyword_data = keyword_data[1:]
            
            print(f"📝 Found {len(keyword_data)} keywords for letter {letter_text}")
            
            # Limit keywords for testing
            if TESTING_MODE:
                keyword_data = keyword_data[:MAX_KEYWORDS_PER_LETTER]
                print(f"🧪 Testing mode: Limited to {len(keyword_data)} keywords")
            
            # Show keywords we'll process
            for i, kw in enumerate(keyword_data):
                print(f"   {i+1}. '{kw['text']}' (value: {kw['value']})")

            # Process each keyword using direct URL construction
            for i, keyword_info in enumerate(keyword_data):
                keyword_text = keyword_info['text']
                keyword_value = keyword_info['value']
                
                print(f"\n   → Processing keyword {i+1}/{len(keyword_data)}: '{keyword_text}'")
                
                try:
                    # Construct direct URL using the word parameter (this is the key!)
                    keyword_url = f"https://corpus.quran.com/qurandictionary.jsp?word={keyword_value}"
                    print(f"     🔗 URL: {keyword_url}")
                    
                    driver.get(keyword_url)
                    time.sleep(3)
                    
                    final_url = driver.current_url
                    page_title = driver.title
                    print(f"     📄 Final URL: {final_url}")
                    print(f"     📋 Title: {page_title}")
                    
                    # Check if we successfully reached the keyword page
                    if 'searchhelp.jsp' in final_url:
                        print(f"     ⚠️  Redirected to search help - skipping")
                        continue
                    
                    # Verify keyword is in page content
                    page_text = driver.page_source.lower()
                    if keyword_text.lower() not in page_text:
                        print(f"     ⚠️  Keyword not found in page - might be wrong page")
                    else:
                        print(f"     ✅ Keyword found in page content")
                    
                    # Scrape description
                    description = ""
                    try:
                        # Look for description in common locations
                        desc_selectors = [
                            f"//p[contains(text(), '{keyword_text}')]",
                            "//td[contains(text(), 'occurs') and contains(text(), 'time')]",
                            "//p[contains(text(), 'occurs') and contains(text(), 'time')]",
                            "//div[contains(@class, 'content')]//p[string-length(text()) > 20]",
                            "//h3[contains(text(), 'Proper noun')]/following-sibling::p"
                        ]
                        
                        for selector in desc_selectors:
                            try:
                                elements = driver.find_elements(By.XPATH, selector)
                                for elem in elements:
                                    text = elem.text.strip()
                                    if (text and 
                                        len(text) > 15 and 
                                        not text.startswith("You can") and
                                        not text.startswith("If you") and
                                        "search" not in text.lower() and
                                        "help" not in text.lower()):
                                        description = text
                                        break
                                if description:
                                    break
                            except:
                                continue
                                
                        if description:
                            print(f"     📝 Description: {description[:80]}...")
                        else:
                            print(f"     📝 Description: No meaningful description found")
                            
                    except Exception as desc_error:
                        print(f"     ❌ Description error: {desc_error}")

                    # Scrape ayahs with comprehensive approach
                    print(f"     🔍 Looking for ayah data...")
                    
                    occurrences = []
                    try:
                        # Find all tables on the page
                        tables = driver.find_elements(By.TAG_NAME, "table")
                        print(f"     📊 Found {len(tables)} tables")
                        
                        # Look for the data table (usually the largest one with verse references)
                        best_table = None
                        max_data_rows = 0
                        
                        for table_idx, table in enumerate(tables):
                            # Look for rows that contain verse references (format like "2:31")
                            data_rows = table.find_elements(By.XPATH, ".//tr[td[contains(text(), ':') and contains(translate(text(), '0123456789', ''), '')]]")
                            if len(data_rows) > max_data_rows:
                                max_data_rows = len(data_rows)
                                best_table = table
                        
                        if best_table:
                            print(f"     📋 Using table with {max_data_rows} data rows")
                            
                            # Extract data from the best table
                            data_rows = best_table.find_elements(By.XPATH, ".//tr[td[contains(text(), ':')]]")
                            
                            for row_idx, row in enumerate(data_rows):
                                try:
                                    cols = row.find_elements(By.TAG_NAME, "td")
                                    if len(cols) >= 3:
                                        ref_text = cols[0].text.strip()
                                        
                                        # Validate verse reference format (e.g., "2:31", "3:144")
                                        if (':' in ref_text and 
                                            any(c.isdigit() for c in ref_text) and 
                                            len(ref_text.split(':')) == 2):
                                            
                                            # Extract data based on number of columns
                                            if len(cols) >= 4:
                                                # 4+ columns: ref, variation, translation, arabic
                                                occurrence = {
                                                    "ref": ref_text,
                                                    "variation": cols[1].text.strip(),
                                                    "translation": cols[2].text.strip(),
                                                    "arabic": cols[3].text.strip()
                                                }
                                            else:
                                                # 3 columns: ref, translation, arabic
                                                occurrence = {
                                                    "ref": ref_text,
                                                    "variation": "",
                                                    "translation": cols[1].text.strip(),
                                                    "arabic": cols[2].text.strip()
                                                }
                                            
                                            # Only add if we have meaningful content
                                            if (occurrence["ref"] and 
                                                (occurrence["translation"] or occurrence["arabic"])):
                                                occurrences.append(occurrence)
                                                
                                                # Show first few for debugging
                                                if row_idx < 2:
                                                    print(f"       📜 Row {row_idx+1}: {ref_text} -> {occurrence['translation'][:40]}...")
                                                    
                                except Exception as row_error:
                                    print(f"       ❌ Row {row_idx} error: {row_error}")
                                    continue
                        else:
                            print(f"     ⚠️  No data table found")
                    
                    except Exception as table_error:
                        print(f"     ❌ Table search error: {table_error}")
                    
                    print(f"     📊 Found {len(occurrences)} ayah occurrences")
                    
                    # Store the data
                    entry = {
                        "letter": letter_text,
                        "keyword": keyword_text,
                        "description": description,
                        "occurrences": occurrences,
                        "url": keyword_url  # Store URL for reference
                    }
                    data.append(entry)

                    print(f"     ✅ Saved entry with {len(occurrences)} occurrences")
                    
                    # Save progress periodically
                    if len(data) % 5 == 0:
                        with open("quran_dictionary_progress.json", "w", encoding="utf-8") as f:
                            json.dump(data, f, ensure_ascii=False, indent=2)
                        print(f"     💾 Progress saved: {len(data)} total entries")
                    
                except Exception as keyword_error:
                    print(f"     ❌ Error processing keyword '{keyword_text}': {keyword_error}")
                    continue

        except Exception as dropdown_error:
            print(f"  ❌ No dropdown found for letter {letter_text}: {dropdown_error}")

    except Exception as letter_error:
        print(f"  ❌ Error processing letter {letter_text}: {letter_error}")
        continue

print(f"\n{'='*60}")
print(f"🎉 Scraping completed! Total entries: {len(data)}")
print(f"{'='*60}")

# Save final results
filename = "quran_dictionary_final.json"
with open(filename, "w", encoding="utf-8") as f:
    json.dump(data, f, ensure_ascii=False, indent=2)

print(f"✅ Data saved to {filename}")

# Show detailed summary
if data:
    letters_processed = list(set(item['letter'] for item in data))
    total_occurrences = sum(len(item['occurrences']) for item in data)
    
    print(f"\n📊 FINAL SUMMARY:")
    print(f"   📚 Letters processed: {len(letters_processed)} ({', '.join(letters_processed)})")
    print(f"   📝 Total keywords: {len(data)}")
    print(f"   📜 Total ayah occurrences: {total_occurrences}")
    print(f"   📈 Average occurrences per keyword: {total_occurrences/len(data):.1f}")
    
    # Show sample entries
    print(f"\n📋 SAMPLE ENTRIES:")
    for i, entry in enumerate(data[:5]):
        desc_preview = entry['description'][:50] + "..." if entry['description'] else "No description"
        print(f"   {i+1}. Letter '{entry['letter']}' - Keyword '{entry['keyword']}'")
        print(f"      Description: {desc_preview}")
        print(f"      Occurrences: {len(entry['occurrences'])}")
        if entry['occurrences']:
            first_occurrence = entry['occurrences'][0]
            print(f"      First ayah: {first_occurrence['ref']} - {first_occurrence['translation'][:50]}...")
        print()
        
    print(f"📁 Complete data saved to: {filename}")
    
    if TESTING_MODE:
        print(f"\n🧪 TESTING COMPLETE")
        print(f"   To scrape all letters and keywords, set TESTING_MODE = False")
        print(f"   Then change MAX_LETTERS_FOR_TESTING to cover all 28 Arabic letters")
else:
    print("❌ No data was scraped. Please check the debugging output above.")

driver.quit()
print("🔒 Browser closed successfully!")

✅ Chrome browser started successfully!
✅ Page loaded successfully!
🧪 TESTING MODE: Will process 2 letters with 3 keywords each
📚 Found 2 alphabet letters to process

📖 Processing letter 1/2: أ
🔗 Current URL: https://corpus.quran.com/qurandictionary.jsp?q=A
📝 Found 90 keywords for letter أ
🧪 Testing mode: Limited to 3 keywords
   1. 'آدَم' (value: A%5Edam)
   2. 'آزَر' (value: A%5Ezar)
   3. 'أَبَارِيق' (value: %3EabaAriyq)

   → Processing keyword 1/3: 'آدَم'
     🔗 URL: https://corpus.quran.com/qurandictionary.jsp?word=A%5Edam
     📄 Final URL: https://corpus.quran.com/qurandictionary.jsp?word=A%5Edam
     📋 Title: The Quranic Arabic Corpus - Quran Dictionary
     ✅ Keyword found in page content
     📝 Description: No meaningful description found
     🔍 Looking for ayah data...
     📊 Found 4 tables
     ⚠️  No data table found
     📊 Found 0 ayah occurrences
     ✅ Saved entry with 0 occurrences

   → Processing keyword 2/3: 'آزَر'
     🔗 URL: https://corpus.quran.com/qurandictio

In [26]:
# TABLE STRUCTURE ANALYZER: Examine the exact HTML structure of the ayah data
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
import time
import re

chrome_options = Options()
chrome_options.add_argument("--no-sandbox")
chrome_options.add_argument("--disable-dev-shm-usage")

try:
    driver = webdriver.Chrome(options=chrome_options)
    
    # Test with a specific keyword that should have data
    test_url = "https://corpus.quran.com/qurandictionary.jsp?word=A%5Edam"  # آدَم (Adam)
    print(f"🔍 Analyzing table structure for: {test_url}")
    driver.get(test_url)
    time.sleep(3)
    
    print(f"📄 Page title: {driver.title}")
    print(f"🔗 Final URL: {driver.current_url}")
    
    # Get all tables
    tables = driver.find_elements(By.TAG_NAME, "table")
    print(f"\n📊 Found {len(tables)} tables")
    
    for table_idx, table in enumerate(tables):
        print(f"\n--- TABLE {table_idx + 1} ---")
        
        # Get table attributes
        table_class = table.get_attribute('class') or 'no class'
        table_id = table.get_attribute('id') or 'no id'
        print(f"Class: {table_class}")
        print(f"ID: {table_id}")
        
        # Get all rows
        rows = table.find_elements(By.TAG_NAME, "tr")
        print(f"Rows: {len(rows)}")
        
        # Analyze first few rows in detail
        for row_idx, row in enumerate(rows[:5]):
            cols = row.find_elements(By.TAG_NAME, "td")
            ths = row.find_elements(By.TAG_NAME, "th")
            
            if cols:
                print(f"  Row {row_idx + 1} (TD): {len(cols)} columns")
                for col_idx, col in enumerate(cols):
                    text = col.text.strip()
                    col_class = col.get_attribute('class') or 'no class'
                    print(f"    Col {col_idx + 1}: class='{col_class}' text='{text[:50]}{'...' if len(text) > 50 else ''}'")
            elif ths:
                print(f"  Row {row_idx + 1} (TH): {len(ths)} header columns")
                for col_idx, th in enumerate(ths):
                    text = th.text.strip()
                    print(f"    Header {col_idx + 1}: '{text}'")
            else:
                print(f"  Row {row_idx + 1}: Empty row")
        
        if len(rows) > 5:
            print(f"  ... and {len(rows) - 5} more rows")
        
        # Look for verse references specifically
        verse_refs = table.find_elements(By.XPATH, ".//td[contains(text(), ':')]")
        if verse_refs:
            print(f"  🎯 Found {len(verse_refs)} cells with ':' (potential verse refs)")
            for i, ref_cell in enumerate(verse_refs[:3]):
                text = ref_cell.text.strip()
                print(f"    Ref {i + 1}: '{text}'")
    
    # Look for specific patterns that might indicate verse data
    print(f"\n🔍 PATTERN ANALYSIS")
    
    # Look for elements containing verse-like patterns (number:number)
    verse_pattern_elements = driver.find_elements(By.XPATH, "//*[contains(text(), ':') and contains(translate(text(), '0123456789', ''), '')]")
    print(f"📜 Found {len(verse_pattern_elements)} elements with verse-like patterns")
    
    for i, elem in enumerate(verse_pattern_elements[:10]):
        text = elem.text.strip()
        tag = elem.tag_name
        parent_tag = elem.find_element(By.XPATH, "..").tag_name if elem.find_element(By.XPATH, "..") else "unknown"
        
        # Check if this looks like a verse reference
        if re.match(r'^\d+:\d+', text):
            print(f"  ✅ Verse ref {i + 1}: {tag} -> '{text}' (parent: {parent_tag})")
        else:
            print(f"  ❓ Other {i + 1}: {tag} -> '{text[:30]}...' (parent: {parent_tag})")
    
    # Look for Arabic text specifically
    print(f"\n🕌 ARABIC TEXT ANALYSIS")
    arabic_elements = driver.find_elements(By.XPATH, "//*[contains(translate(text(), 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789', ''), '')]")
    arabic_count = 0
    for elem in arabic_elements:
        text = elem.text.strip()
        if text and len(text) > 5:  # Skip short elements
            # Check if contains Arabic characters
            if any(ord(char) >= 0x0600 and ord(char) <= 0x06FF for char in text):
                arabic_count += 1
                if arabic_count <= 5:  # Show first 5
                    tag = elem.tag_name
                    parent_tag = elem.find_element(By.XPATH, "..").tag_name if elem.find_element(By.XPATH, "..") else "unknown"
                    print(f"  🕌 Arabic {arabic_count}: {tag} -> '{text[:50]}...' (parent: {parent_tag})")
    
    print(f"📊 Total Arabic text elements: {arabic_count}")
    
    # Show a larger section of page source for manual inspection
    print(f"\n📄 PAGE SOURCE ANALYSIS (relevant section)")
    page_source = driver.page_source
    
    # Find the section that contains table data
    if 'table' in page_source.lower():
        # Extract the main content area
        import re
        
        # Look for tables in the source
        table_matches = re.findall(r'<table[^>]*>.*?</table>', page_source, re.DOTALL | re.IGNORECASE)
        
        for i, table_html in enumerate(table_matches):
            if len(table_html) > 200:  # Only show substantial tables
                print(f"\n--- TABLE {i + 1} HTML (first 500 chars) ---")
                print(table_html[:500] + "...")
                
                # Look for verse patterns in this table
                verse_matches = re.findall(r'\d+:\d+', table_html)
                if verse_matches:
                    print(f"Verse references found: {verse_matches[:5]}")
    
    driver.quit()
    
except Exception as e:
    print(f"❌ Error in table analysis: {e}")
    try:
        driver.quit()
    except:
        pass

🔍 Analyzing table structure for: https://corpus.quran.com/qurandictionary.jsp?word=A%5Edam
📄 Page title: The Quranic Arabic Corpus - Quran Dictionary
🔗 Final URL: https://corpus.quran.com/qurandictionary.jsp?word=A%5Edam

📊 Found 4 tables

--- TABLE 1 ---
Class: toolbar
ID: no id
Rows: 1
  Row 1 (TD): 2 columns
    Col 1: class='toolbarLeft' text=''
    Col 2: class='toolbarRight' text='Qur'an | Word by Word | Audio | Prayer Times'

--- TABLE 2 ---
Class: pageTemplate
ID: no id
Rows: 44
  Row 1 (TD): 4 columns
    Col 1: class='logo1' text=''
    Col 2: class='pad' text='__'
    Col 3: class='userBox' text='Sign In'
    Col 4: class='searchBox' text='Search'
  Row 2 (TD): 2 columns
    Col 1: class='no class' text=''
    Col 2: class='leedsLogo' text=''
  Row 3 (TD): 3 columns
    Col 1: class='logoPad' text=''
    Col 2: class='pad' text='__'
    Col 3: class='title' text='Quran Dictionary - آدَم'
  Row 4 (TD): 92 columns
    Col 1: class='menuContainer' text='Word by Word
Quran Dict

In [28]:
# 🏆 FINAL COMPREHENSIVE QURAN DICTIONARY SCRAPER 🏆
# This scraper extracts all alphabet letters, keywords, descriptions, and ayah occurrences
# from corpus.quran.com/qurandictionary.jsp

from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import Select, WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time
import json
import os
import re
from datetime import datetime

# CONFIGURATION
TESTING_MODE = True  # Set to False for full scraping
MAX_LETTERS_FOR_TESTING = 3
MAX_KEYWORDS_PER_LETTER = 5

# FULL SCRAPE SETTINGS (when TESTING_MODE = False)
SAVE_PROGRESS_INTERVAL = 10  # Save progress every N keywords
ADD_DELAYS = True  # Add delays to be respectful to the server

def setup_driver():
    """Setup Chrome driver with optimal settings"""
    chrome_options = Options()
    chrome_options.add_argument("--no-sandbox")
    chrome_options.add_argument("--disable-dev-shm-usage")
    chrome_options.add_argument("--disable-gpu")
    chrome_options.add_argument("--window-size=1920,1080")
    
    try:
        driver = webdriver.Chrome(options=chrome_options)
        print("✅ Chrome browser started successfully!")
        return driver
    except Exception as e:
        print(f"❌ Error starting Chrome: {e}")
        raise

def extract_description(driver, keyword_text):
    """Extract keyword description from various possible locations"""
    description = ""
    
    desc_selectors = [
        # Look for proper noun definition
        "//h3[contains(text(), 'Proper noun')]/following-sibling::p",
        # Look for occurrence count descriptions
        "//p[contains(text(), 'occurs') and contains(text(), 'time')]",
        "//td[contains(text(), 'occurs') and contains(text(), 'time')]",
        # Look for general paragraphs with meaningful content
        "//div[contains(@class, 'content')]//p[string-length(text()) > 30]",
        "//p[string-length(text()) > 30 and not(contains(text(), 'search')) and not(contains(text(), 'help'))]",
        # Look for any text mentioning the keyword
        f"//p[contains(text(), '{keyword_text}')]"
    ]
    
    for selector in desc_selectors:
        try:
            elements = driver.find_elements(By.XPATH, selector)
            for elem in elements:
                text = elem.text.strip()
                if (text and 
                    len(text) > 20 and 
                    not text.startswith("You can") and
                    not text.startswith("If you") and
                    "search" not in text.lower() and
                    "help" not in text.lower() and
                    "enter" not in text.lower()):
                    description = text
                    break
            if description:
                break
        except Exception:
            continue
    
    return description

def extract_ayah_occurrences(driver):
    """Extract ayah occurrences using improved detection logic"""
    occurrences = []
    
    try:
        # Strategy 1: Look for elements with verse reference patterns
        verse_elements = driver.find_elements(By.XPATH, "//*[text()[contains(., ':')]]")
        
        # Group verse references with their surrounding data
        verse_data = []
        for elem in verse_elements:
            text = elem.text.strip()
            # Check if this looks like a verse reference (e.g., "2:31", "3:144")
            if re.match(r'^\d+:\d+$', text):
                # Get the parent row or container
                try:
                    parent_row = elem.find_element(By.XPATH, "./ancestor::tr[1]")
                    if parent_row:
                        cols = parent_row.find_elements(By.TAG_NAME, "td")
                        if len(cols) >= 2:  # Need at least ref + content
                            verse_data.append(cols)
                except:
                    continue
        
        # Strategy 2: If no clear rows found, look for structured tables
        if not verse_data:
            tables = driver.find_elements(By.TAG_NAME, "table")
            for table in tables:
                rows = table.find_elements(By.TAG_NAME, "tr")
                for row in rows:
                    cols = row.find_elements(By.TAG_NAME, "td")
                    if len(cols) >= 2:
                        first_col_text = cols[0].text.strip()
                        if re.match(r'^\d+:\d+$', first_col_text):
                            verse_data.append(cols)
        
        # Extract data from found verse rows
        for cols in verse_data:
            try:
                if len(cols) >= 2:
                    ref = cols[0].text.strip()
                    
                    # Determine structure based on number of columns
                    if len(cols) == 2:
                        # ref, arabic
                        occurrence = {
                            "ref": ref,
                            "variation": "",
                            "translation": "",
                            "arabic": cols[1].text.strip()
                        }
                    elif len(cols) == 3:
                        # ref, translation, arabic
                        occurrence = {
                            "ref": ref,
                            "variation": "",
                            "translation": cols[1].text.strip(),
                            "arabic": cols[2].text.strip()
                        }
                    elif len(cols) >= 4:
                        # ref, variation, translation, arabic
                        occurrence = {
                            "ref": ref,
                            "variation": cols[1].text.strip(),
                            "translation": cols[2].text.strip(),
                            "arabic": cols[3].text.strip()
                        }
                    
                    # Only add if we have meaningful content
                    if (occurrence["ref"] and 
                        (occurrence["translation"] or occurrence["arabic"]) and
                        len(occurrence["ref"]) < 10):  # Reasonable ref length
                        occurrences.append(occurrence)
                        
            except Exception as row_error:
                continue
                
    except Exception as e:
        print(f"       ❌ Ayah extraction error: {e}")
    
    return occurrences

def main():
    """Main scraping function"""
    print("🚀 Starting Quran Dictionary Comprehensive Scraper")
    print("=" * 60)
    
    if TESTING_MODE:
        print(f"🧪 TESTING MODE: {MAX_LETTERS_FOR_TESTING} letters, {MAX_KEYWORDS_PER_LETTER} keywords each")
    else:
        print(f"🌍 FULL SCRAPE MODE: All letters and keywords")
    
    print("=" * 60)
    
    # Setup
    driver = setup_driver()
    start_time = datetime.now()
    data = []
    
    try:
        # Navigate to main page
        driver.get("https://corpus.quran.com/qurandictionary.jsp")
        time.sleep(3)
        
        # Collect all Arabic letters
        print("📚 Collecting Arabic alphabet letters...")
        letter_data = []
        all_links = driver.find_elements(By.TAG_NAME, "a")
        
        for link in all_links:
            text = link.text.strip()
            href = link.get_attribute('href') or ''
            
            if (len(text) == 1 and 'q=' in href and ord(text) >= 0x0600):
                letter_data.append({'text': text, 'url': href})
        
        if TESTING_MODE:
            letter_data = letter_data[:MAX_LETTERS_FOR_TESTING]
        
        print(f"📖 Found {len(letter_data)} Arabic letters: {[l['text'] for l in letter_data]}")
        
        # Process each letter
        total_keywords = 0
        for letter_idx, letter_info in enumerate(letter_data):
            letter_text = letter_info['text']
            letter_url = letter_info['url']
            
            print(f"\n{'='*60}")
            print(f"📖 Letter {letter_idx + 1}/{len(letter_data)}: {letter_text}")
            print(f"{'='*60}")
            
            try:
                # Navigate to letter page
                driver.get(letter_url)
                time.sleep(2)
                
                # Get keywords for this letter
                dropdown_element = WebDriverWait(driver, 15).until(
                    EC.presence_of_element_located((By.NAME, "entryList"))
                )
                dropdown = Select(dropdown_element)
                
                # Collect keyword data
                keyword_data = []
                for option in dropdown.options:
                    keyword_text = option.text.strip()
                    keyword_value = option.get_attribute('value')
                    if keyword_text and keyword_value:
                        keyword_data.append({'text': keyword_text, 'value': keyword_value})
                
                # Remove empty first option if exists
                if keyword_data and not keyword_data[0]['text']:
                    keyword_data = keyword_data[1:]
                
                if TESTING_MODE:
                    keyword_data = keyword_data[:MAX_KEYWORDS_PER_LETTER]
                
                print(f"📝 Processing {len(keyword_data)} keywords for letter {letter_text}")
                
                # Process each keyword
                for kw_idx, keyword_info in enumerate(keyword_data):
                    keyword_text = keyword_info['text']
                    keyword_value = keyword_info['value']
                    total_keywords += 1
                    
                    print(f"\\n   📜 Keyword {kw_idx + 1}/{len(keyword_data)}: '{keyword_text}'")
                    
                    try:
                        # Construct and navigate to keyword URL
                        keyword_url = f"https://corpus.quran.com/qurandictionary.jsp?word={keyword_value}"
                        driver.get(keyword_url)
                        time.sleep(2)
                        
                        # Verify we're on the right page
                        if 'searchhelp.jsp' in driver.current_url:
                            print(f"      ⚠️  Redirected to search help - skipping")
                            continue
                        
                        # Extract description
                        description = extract_description(driver, keyword_text)
                        if description:
                            print(f"      📝 Description: {description[:80]}...")
                        else:
                            print(f"      📝 Description: Not found")
                        
                        # Extract ayah occurrences
                        occurrences = extract_ayah_occurrences(driver)
                        print(f"      📊 Found {len(occurrences)} ayah occurrences")
                        
                        # Show sample occurrences
                        if occurrences:
                            for i, occ in enumerate(occurrences[:2]):
                                trans_preview = occ['translation'][:40] + "..." if occ['translation'] else "No translation"
                                print(f"         {i+1}. {occ['ref']}: {trans_preview}")
                        
                        # Store data
                        entry = {
                            "letter": letter_text,
                            "keyword": keyword_text,
                            "description": description,
                            "occurrences": occurrences,
                            "url": keyword_url,
                            "scraped_at": datetime.now().isoformat()
                        }
                        data.append(entry)
                        
                        print(f"      ✅ Saved entry #{len(data)}")
                        
                        # Save progress periodically
                        if not TESTING_MODE and len(data) % SAVE_PROGRESS_INTERVAL == 0:
                            progress_file = "quran_dictionary_progress.json"
                            with open(progress_file, "w", encoding="utf-8") as f:
                                json.dump(data, f, ensure_ascii=False, indent=2)
                            print(f"      💾 Progress saved: {len(data)} entries")
                        
                        # Add delay for server respect
                        if ADD_DELAYS and not TESTING_MODE:
                            time.sleep(1)
                            
                    except Exception as keyword_error:
                        print(f"      ❌ Error processing '{keyword_text}': {keyword_error}")
                        continue
                
            except Exception as letter_error:
                print(f"  ❌ Error processing letter {letter_text}: {letter_error}")
                continue
        
        # Final results
        elapsed_time = datetime.now() - start_time
        print(f"\n{'='*60}")
        print(f"🎉 SCRAPING COMPLETED!")
        print(f"{'='*60}")
        
        # Save final results
        filename = "quran_dictionary_complete.json" if not TESTING_MODE else "quran_dictionary_test.json"
        with open(filename, "w", encoding="utf-8") as f:
            json.dump(data, f, ensure_ascii=False, indent=2)
        
        # Statistics
        letters_processed = list(set(item['letter'] for item in data))
        total_occurrences = sum(len(item['occurrences']) for item in data)
        
        print(f"📊 FINAL STATISTICS:")
        print(f"   ⏱️  Time taken: {elapsed_time}")
        print(f"   📚 Letters processed: {len(letters_processed)} ({', '.join(letters_processed)})")
        print(f"   📝 Keywords processed: {len(data)}")
        print(f"   📜 Total ayah occurrences: {total_occurrences}")
        print(f"   📈 Average occurrences per keyword: {total_occurrences/len(data):.1f}" if data else "   📈 No data collected")
        print(f"   📁 Data saved to: {filename}")
        
        # Show sample entries
        if data:
            print(f"\\n📋 SAMPLE ENTRIES:")
            for i, entry in enumerate(data[:5]):
                print(f"   {i+1}. Letter '{entry['letter']}' - Keyword '{entry['keyword']}'")
                if entry['description']:
                    print(f"      📝 {entry['description'][:80]}...")
                print(f"      📊 {len(entry['occurrences'])} occurrences")
                if entry['occurrences']:
                    first_occ = entry['occurrences'][0]
                    print(f"      📜 Example: {first_occ['ref']} - {first_occ['translation'][:50]}...")
                print()
        
        success_rate = len(data) / max(total_keywords, 1) * 100
        print(f"✅ Success rate: {success_rate:.1f}% ({len(data)}/{total_keywords} keywords)")
        
        if TESTING_MODE:
            print(f"\\n🧪 TESTING COMPLETE")
            print(f"   To scrape ALL letters and keywords:")
            print(f"   1. Set TESTING_MODE = False")
            print(f"   2. Run this cell again")
            print(f"   3. Wait for full completion (may take 30+ minutes)")
        
    except Exception as main_error:
        print(f"❌ Main error: {main_error}")
    
    finally:
        driver.quit()
        print("🔒 Browser closed successfully!")

# Run the scraper
if __name__ == "__main__":
    main()

🚀 Starting Quran Dictionary Comprehensive Scraper
🧪 TESTING MODE: 3 letters, 5 keywords each
✅ Chrome browser started successfully!
📚 Collecting Arabic alphabet letters...
📖 Found 3 Arabic letters: ['أ', 'ب', 'ت']

📖 Letter 1/3: أ
📝 Processing 5 keywords for letter أ
\n   📜 Keyword 1/5: 'آدَم'
      📝 Description: Copyright © Kais Dukes, 2009-2017. Maintained by the quran.com team. This is an ...
      📊 Found 0 ayah occurrences
      ✅ Saved entry #1
\n   📜 Keyword 2/5: 'آزَر'
      📝 Description: Copyright © Kais Dukes, 2009-2017. Maintained by the quran.com team. This is an ...
      📊 Found 0 ayah occurrences
      ✅ Saved entry #2
\n   📜 Keyword 3/5: 'أَبَارِيق'
      📝 Description: Copyright © Kais Dukes, 2009-2017. Maintained by the quran.com team. This is an ...
      📊 Found 0 ayah occurrences
      ✅ Saved entry #3
\n   📜 Keyword 4/5: 'أ ب ب'
      📝 Description: Copyright © Kais Dukes, 2009-2017. Maintained by the quran.com team. This is an ...
      📊 Found 0 ayah occurre

# Complete Quranic Arabic Dictionary Scraper

This notebook scrapes the complete Quranic Arabic dictionary from corpus.quran.com including:
- All Arabic letters/alphabets
- All keywords for each letter from the dropdown
- Complete descriptions for each keyword
- All Quranic verses (ayahs) where each keyword appears
- Morphological variations of each keyword in Arabic and English

## Structure:
1. **Setup and Configuration**
2. **WebDriver Initialization** 
3. **Data Extraction Functions**
4. **Main Scraping Loop**
5. **Data Export and Analysis**