## Step 1: Install & Import Dependencies
**Documentation**: First, we need to ensure the necessary libraries are installed. We rely heavily on selenium for browser automation.
1. selenium: Controls the Chrome browser.
2. logging: Helps us track progress and errors.
3. pathlib: Creates cross-platform file paths (Windows/Mac/Linux).

In [None]:
# Install selenium if you haven't already
# !pip install selenium

import time
import re
import json
import logging
from pathlib import Path
from typing import Dict, List, Optional
from datetime import datetime

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.options import Options
from selenium.common.exceptions import TimeoutException, NoSuchElementException, WebDriverException

# Send log records to a stream handler, each prefixed with an HH:MM:SS time.
# force=True drops any handlers already attached to the root logger, so
# re-running this cell (or a Jupyter-installed handler) cannot leave a stale setup.
logging.basicConfig(
    force=True,
    handlers=[logging.StreamHandler()],
    datefmt="%H:%M:%S",
    format="%(asctime)s - %(message)s",
    level=logging.INFO,
)

print(" The needed Libraries have been imported successfully.")

## Step 2: Define the Scraper Class
**Documentation**:
This is the core engine. I have encapsulated all the logic into the StrataScratchScraper class.
Key Features included here: <br>
2.1. restart_driver(): This is called automatically to prevent the "Invalid Session ID" crash by clearing browser memory. <br>
2.2. scrape_question_details(): Contains the Accordion Fix. It finds the "More about this question" button, scrolls to it, and clicks it to reveal the hidden Obsidian properties (Companies, Job Titles, etc.).<br>
2.3. create_markdown(): Formats the data into a clean Markdown file with a YAML frontmatter block for Obsidian.

In [None]:
class StrataScratchScraper:
    """Scrape StrataScratch coding questions with Selenium and render them as Markdown.

    The scraper owns one Chrome WebDriver at a time. ``restart_driver`` replaces
    it with a fresh instance, which callers use to recover from dead sessions
    and to clear browser memory during long runs.
    """

    # Page count assumed when the pagination bar cannot be read.
    FALLBACK_PAGE_COUNT = 15
    # Maximum number of characters kept in a generated file-name slug.
    MAX_SLUG_LENGTH = 200

    def __init__(self, output_dir: str = "StrataScratch_Full_DB", headless: bool = True):
        """Create the output directory and start the first browser session.

        Args:
            output_dir: Folder that receives the generated Markdown files;
                created (with parents) if it does not exist.
            headless: Run Chrome without a visible window when True.
        """
        self.base_url = "https://platform.stratascratch.com"
        self.questions_url = f"{self.base_url}/coding?code_type=1"
        self.output_dir = Path(output_dir)
        self.output_dir.mkdir(parents=True, exist_ok=True)
        self.headless = headless

        # Both are populated by restart_driver().
        self.driver = None
        self.wait = None

        self.restart_driver()

    def restart_driver(self):
        """Quit the current driver (if any) and start a fresh Chrome session."""
        if self.driver:
            try:
                self.driver.quit()
            except Exception as exc:
                # Best-effort teardown: quitting an already-dead session may raise,
                # and that must not prevent starting the replacement.
                logging.debug("Ignoring error while quitting the old driver: %s", exc)
            logging.info(" ‚ö°Restarting Chrome Driver because there is more to life than getting a deadlock...")
            time.sleep(2)

        chrome_options = Options()
        if self.headless:
            chrome_options.add_argument("--headless")

        # Settings to ensure stability and visibility of elements
        for argument in (
            "--window-size=1920,1080",
            "--no-sandbox",
            "--disable-dev-shm-usage",
            "--disable-blink-features=AutomationControlled",
            "user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
        ):
            chrome_options.add_argument(argument)

        self.driver = webdriver.Chrome(options=chrome_options)
        self.wait = WebDriverWait(self.driver, 20)

    def slugify(self, text: str) -> str:
        """Turn a question title into a safe, lowercase file-name stem.

        Bracketed segments such as ``(Easy)`` or ``[SQL]`` are removed, every
        run of characters outside ``a-z0-9`` becomes a single hyphen, and the
        result is capped at ``MAX_SLUG_LENGTH`` characters.

        Returns:
            The slug, or ``question-<unix timestamp>`` when the input is empty
            or nothing usable remains after cleaning.
        """
        fallback = f"question-{int(time.time())}"
        if not text:
            return fallback
        text = re.sub(r'[\(\[].*?[\)\]]', '', text).lower()
        slug = re.sub(r'[^a-z0-9]+', '-', text).strip('-')
        # Truncation can leave a dangling hyphen, so trim again afterwards.
        slug = slug[:self.MAX_SLUG_LENGTH].rstrip('-')
        return slug or fallback

    def get_total_pages(self) -> int:
        """Return the highest page number shown in the pagination bar.

        Falls back to ``FALLBACK_PAGE_COUNT`` when no numeric button is found
        or when anything goes wrong while loading the listing page.
        """
        logging.info("üîé Detecting total number of pages...")
        try:
            self.driver.get(f"{self.questions_url}&page=1")
            self.wait.until(EC.presence_of_element_located((By.TAG_NAME, "body")))
            time.sleep(5)

            # Find generic pagination buttons (usually just numbers)
            buttons = self.driver.find_elements(By.XPATH, "//button[string-length(text()) < 3]")
            labels = [button.text.strip() for button in buttons]
            page_nums = [int(label) for label in labels if label.isdigit()]

            if page_nums:
                max_page = max(page_nums)
                logging.info(f"‚úÖ Detected {max_page} pages.")
                return max_page
            logging.debug("No numeric pagination buttons found; using the fallback page count.")
            return self.FALLBACK_PAGE_COUNT
        except Exception as e:
            logging.warning(f"‚ö†Ô∏è Error detecting pages: {e}. Defaulting to {self.FALLBACK_PAGE_COUNT}.")
            return self.FALLBACK_PAGE_COUNT

    def extract_question_links(self, page_num: int) -> List[str]:
        """Collect the unique question URLs on one listing page.

        The page is loaded up to two times. An attempt that yields no links, or
        that raises, is logged and followed by the next attempt.

        Returns:
            Sorted list of unique hrefs containing ``/coding/``; empty if both
            attempts fail.
        """
        url = f"{self.questions_url}&page={page_num}"
        for attempt in range(2):
            try:
                self.driver.get(url)
                try:
                    self.wait.until(EC.presence_of_element_located((By.XPATH, "//a[contains(@href, '/coding/')]")))
                except TimeoutException:
                    # Not fatal: the fixed pause below gives the page one more chance.
                    pass

                time.sleep(4)  # Wait for React render

                anchors = self.driver.find_elements(By.CSS_SELECTOR, "a[href*='/coding/']")
                # Read each href once; guard against a missing attribute value.
                hrefs = (anchor.get_attribute('href') for anchor in anchors)
                valid_links = sorted({href for href in hrefs if href and '/coding/' in href})

                if valid_links:
                    return valid_links

                logging.warning(f"   ‚ö†Ô∏è Page {page_num} returned 0 links (Attempt {attempt+1}). Refreshing...")
                time.sleep(2)
            except Exception as e:
                logging.error(f"   Error on page {page_num}: {e}")
        return []

    def scrape_question_details(self, url: str) -> Optional[Dict]:
        """Load a question page and extract its data, including hidden properties.

        The title is mandatory: if the ``h1`` never appears, the exception from
        the wait propagates to the caller. Every other field is best-effort and
        falls back to ``"Unknown"``, ``""`` or an empty list.

        Returns:
            Dict with keys ``title``, ``id``, ``difficulty``, ``question``,
            ``url``, ``companies``, ``job_positions``, ``topic_family``,
            ``topic_functions`` and ``tables``.
        """
        self.driver.get(url)
        self.wait.until(EC.presence_of_element_located((By.TAG_NAME, "h1")))

        # The metadata pills only exist in the DOM once the accordion is open.
        self._expand_details_accordion()

        title = self.driver.find_element(By.TAG_NAME, "h1").text.strip()

        raw_id = self._find_text(By.XPATH, "//span[contains(text(), 'ID')]")
        q_id = "Unknown" if raw_id is None else raw_id.replace("ID", "").strip()

        raw_difficulty = self._find_text(By.CSS_SELECTOR, "[class*='QuestionDifficulty--']")
        difficulty = "Unknown"
        if raw_difficulty is not None:
            for level in ("Easy", "Medium", "Hard"):
                if level in raw_difficulty:
                    difficulty = level
                    break

        raw_question = self._find_text(By.CLASS_NAME, "QuestionMetadata__question")
        question_text = "" if raw_question is None else raw_question.strip()

        return {
            'title': title, 'id': q_id, 'difficulty': difficulty,
            'question': question_text, 'url': url,
            'companies': self._get_pills("Companies"),
            'job_positions': self._get_pills("Job Positions"),
            'topic_family': self._get_pills("Topic Family"),
            'topic_functions': self._get_pills("Topic Functions"),
            'tables': self._read_tables(),
        }

    def _expand_details_accordion(self) -> None:
        """Scroll to and open the 'More about this question' accordion, if present."""
        try:
            self.driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
            time.sleep(0.5)
            accordion_btn = self.driver.find_element(By.XPATH, "//button[contains(., 'More about this question')]")
            if accordion_btn.get_attribute("aria-expanded") != "true":
                # Click through JavaScript rather than WebElement.click().
                self.driver.execute_script("arguments[0].click();", accordion_btn)
                time.sleep(1)
        except Exception as exc:
            # Some questions might not have the button; carry on without it.
            logging.debug("Accordion not expanded: %s", exc)

    def _find_text(self, by: str, value: str) -> Optional[str]:
        """Return the text of the first matching element, or None if the lookup fails."""
        try:
            return self.driver.find_element(by, value).text
        except Exception as exc:
            logging.debug("Optional element %r not read: %s", value, exc)
            return None

    def _get_pills(self, header: str) -> List[str]:
        """Return the non-empty pill labels under the h3 whose text contains *header* (case-insensitive)."""
        xpath = (
            "//h3[contains(translate(text(), 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'abcdefghijklmnopqrstuvwxyz'), "
            f"'{header.lower()}')]"
            "/following-sibling::div[1]//div[contains(@class, 'QuestionMetadata__pill')]"
        )
        try:
            labels = [pill.text.strip() for pill in self.driver.find_elements(By.XPATH, xpath)]
        except Exception as exc:
            logging.debug("Pills for %r not read: %s", header, exc)
            return []
        return [label for label in labels if label]

    def _read_tables(self) -> List[Dict]:
        """Return ``{'name', 'schema'}`` dicts for the tables shown on the page.

        Table names and schema containers are matched by position. Tables read
        before an error occurs are kept.
        """
        tables = []
        try:
            t_names = self.driver.find_elements(By.CSS_SELECTOR, ".QuestionPane__question-data--tables")
            t_schemas = self.driver.find_elements(By.CLASS_NAME, "DatasetTableTypes__container")
            for i, name_elem in enumerate(t_names):
                schema = {}
                if i < len(t_schemas):
                    spans = t_schemas[i].find_elements(By.TAG_NAME, "span")
                    # Spans alternate "column:" / "type"; an unpaired trailing span is ignored.
                    for col_span, type_span in zip(spans[::2], spans[1::2]):
                        schema[col_span.text.strip().rstrip(':')] = type_span.text.strip()
                tables.append({'name': name_elem.text.strip(), 'schema': schema})
        except Exception as exc:
            logging.debug("Table extraction stopped early: %s", exc)
        return tables

    @staticmethod
    def _yaml_escape(value: str) -> str:
        """Escape backslashes and double quotes for a YAML double-quoted scalar."""
        return value.replace('\\', '\\\\').replace('"', '\\"')

    def create_markdown(self, data: Dict) -> str:
        """Render scraped question data as Obsidian-friendly Markdown.

        The output starts with a YAML front-matter block (id, title,
        difficulty, url and any non-empty property lists), followed by the
        title heading, the question text and, when tables are present, one
        schema table per database table.

        Args:
            data: Dict shaped like the return value of ``scrape_question_details``.
        """
        # Escaping is done outside the f-strings: a backslash inside an f-string
        # expression is a SyntaxError before Python 3.12.
        # Double quotes are dropped from the front-matter title, not escaped.
        front_title = data['title'].replace('"', '').replace('\\', '\\\\')

        parts = [
            "---\n",
            f"id: \"{data['id']}\"\n",
            f"title: \"{front_title}\"\n",
            f"difficulty: {data['difficulty']}\n",
            f"url: {data['url']}\n",
        ]

        for key in ("companies", "job_positions", "topic_family", "topic_functions"):
            values = data[key]
            if not values:
                continue  # Empty properties are omitted from the front matter.
            parts.append(f"{key}:\n")
            for value in values:
                escaped = self._yaml_escape(value)
                parts.append(f"  - \"{escaped}\"\n")

        parts.append("---\n\n")
        parts.append(f"# {data['title']}\n\n")
        parts.append("## Question\n" + f"{data['question']}\n\n")

        if data['tables']:
            parts.append("## Database Schema\n")
            for table in data['tables']:
                parts.append(f"### {table['name']}\n")
                if table['schema']:
                    parts.append("| Column | Type |\n|---|---|\n")
                    for column, dtype in table['schema'].items():
                        parts.append(f"| {column} | {dtype} |\n")
                    parts.append("\n")
        return "".join(parts)

    def close(self):
        """Quit the browser and drop the driver reference so a second call is a no-op."""
        if self.driver:
            try:
                self.driver.quit()
            finally:
                self.driver = None
                self.wait = None

# Confirmation message printed once the class definition cell has run.
print(" The Class has been defined successfully.")

## Step 3: Initialize the Scraper
**Documentation**:
Here we set up the output directory.
Action: Change output_dir to your preferred folder (or your Obsidian vault path).
Action: Keep headless=True for speed, or False if you want to watch it work.

In [None]:
# Folder that receives the generated Markdown notes.
# Point this at any directory you like, e.g. a folder inside your Obsidian vault.
OUTPUT_PATH = "StrataScratch_Notebook_Export"

# Constructing the scraper creates the output folder and starts Chrome.
# headless=True runs the browser without a window; use False to watch it work.
scraper = StrataScratchScraper(output_dir=OUTPUT_PATH, headless=True)

print(f" The Scraper has been initialized. Saving to: {scraper.output_dir.absolute()}")

## Step 4: Phase 1 - Collect All Links
**Documentation**:
This step loops through the pages and collects the URLs.<br>
It automatically detects if there are 14, 15, or more pages.<br>
It iterates through them and collects all unique question links.<br>
It removes duplicates.<br>
*This takes about 1-2 minutes.*

In [None]:
# Phase 1: find out how many listing pages exist, then harvest the links on each.
total_pages = scraper.get_total_pages()
all_links = []

print(f"Collecting links from {total_pages} pages...")

for page_no in range(1, total_pages + 1):
    # The carriage return keeps the progress indicator on a single console line.
    print(f"   Scanning Page {page_no}/{total_pages}...", end="\r")

    page_links = scraper.extract_question_links(page_no)
    if page_links:
        all_links.extend(page_links)

    # A fresh browser after every fifth page keeps the session valid.
    if page_no % 5 == 0:
        scraper.restart_driver()

# The same question can be linked from more than one page; keep one copy of each URL.
all_links = list(set(all_links))

print(f"\n Good news, Collection Complete. We Found {len(all_links)} unique questions.")

## Step 5: Phase 2 - Scrape Data (The Heavy Lifting)
**Documentation**:
This is the main loop that visits every single question page.<br>
Safety Features Active:
1. Memory Management: Every 25 questions, it restarts the browser automatically.
2. Crash Recovery: If a specific question crashes the browser (Invalid Session ID), it catches the error, restarts the browser, and retries that question once before moving on.

*This step will take time (approx. 25-35 minutes for 666 questions).*

In [None]:
scraped_count = 0
failed_count = 0

# File names written during this run, used to flag silent overwrites.
_saved_names = set()


def _scrape_and_save(link):
    """Scrape one question and write its Markdown note.

    Returns the file name that was written, or None when the scraper returned
    no data. Exceptions from scraping or writing propagate to the caller.
    """
    data = scraper.scrape_question_details(link)
    if not data:
        return None

    fname = f"{scraper.slugify(data['title'])}.md"
    if fname in _saved_names:
        # Different links can map to the same slug; the later one replaces the earlier file.
        logging.warning(f"Overwriting {fname}: another link in this run produced the same file name.")
    with open(scraper.output_dir / fname, 'w', encoding='utf-8') as f:
        f.write(scraper.create_markdown(data))
    _saved_names.add(fname)
    return fname


print(" The Actual Scraping Starts now...")

for i, link in enumerate(all_links):

    # --- PREVENTIVE MAINTENANCE ---
    # Restart driver every 25 questions to clear RAM
    if i > 0 and i % 25 == 0:
        scraper.restart_driver()

    # Progress indicator
    print(f"[{i+1}/{len(all_links)}] Processing...", end="\r")

    try:
        # Attempt 1
        fname = _scrape_and_save(link)
        saved_label = "Saved"
    except WebDriverException:
        # CRASH HANDLER: start a fresh browser and give this question one more try.
        logging.error(f"[{i+1}] Oh no the Browser has crashed! Restarting and retrying...")
        scraper.restart_driver()
        try:
            # Attempt 2 (Retry)
            fname = _scrape_and_save(link)
            saved_label = "Saved (after retry)"
        except Exception as e:
            logging.error(f"[{i+1}] ‚ùå Retry failed: {e}")
            failed_count += 1
            continue
    except Exception as e:
        logging.error(f"[{i+1}] ‚ùå Error: {e}")
        failed_count += 1
        continue

    if fname:
        logging.info(f"[{i+1}] {saved_label}: {fname}")
        scraped_count += 1
    else:
        failed_count += 1

print("\n" + "=" * 60)
print(f" Damn brody we actually Scraped: {scraped_count} | Unfortunately we Failed on: {failed_count}")
print("=" * 60)

## Step 6: Cleanup
**Documentation**:
It is always good practice to close the browser driver when finished to free up system resources.<br>
Cleanup is for the weak-minded; real men consume resources and blame it on AWS.

In [None]:
# Quit the Chrome driver held by the scraper now that the run is finished.
scraper.close()
print("‚úÖ Driver closed.")