### Web Crawler for LeetCode Intuition

In [1]:
import csv
import time
import requests
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from webdriver_manager.chrome import ChromeDriverManager

In [2]:
def get_problem_slugs(limit=100):
    """Return ``(slug, url)`` pairs for the first ``limit`` LeetCode problems.

    Sends a single ``problemsetQuestionList`` GraphQL query to
    ``https://leetcode.com/graphql`` (skip 0, no filters) and builds the
    problem URL for each returned ``titleSlug``.

    Args:
        limit: Maximum number of problems to request.

    Returns:
        A list of ``(title_slug, problem_url)`` tuples in the order the API
        returned them. Each URL ends with a trailing slash.

    Raises:
        requests.RequestException: On connection failure, timeout, or a
            non-2xx HTTP status.
    """
    url = "https://leetcode.com/graphql"
    headers = {
        'Content-Type': 'application/json',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
    }
    query = {
        "query": "\
        query problemsetQuestionList($categorySlug: String, $limit: Int, $skip: Int, $filters: QuestionListFilterInput) { \
          problemsetQuestionList: questionList( \
            categorySlug: $categorySlug \
            limit: $limit \
            skip: $skip \
            filters: $filters \
          ) { \
            total: totalNum \
            questions: data { \
              titleSlug \
            } \
          } \
        }",
        "variables": {
            "categorySlug": "",
            "skip": 0,
            "limit": limit,
            "filters": {}
        }
    }
    # Without a timeout, requests waits indefinitely on a stalled connection.
    response = requests.post(url, headers=headers, json=query, timeout=30)
    # Fail loudly on HTTP errors instead of trying to parse an error page.
    response.raise_for_status()
    questions = response.json()['data']['problemsetQuestionList']['questions']
    return [
        (q['titleSlug'], f"https://leetcode.com/problems/{q['titleSlug']}/")
        for q in questions
    ]

In [3]:
def login(driver, username, password):
    """Open the LeetCode login page and pre-fill the credentials.

    The function types ``username`` and ``password`` into the login form and
    then blocks on ``input()`` so a human can solve the CAPTCHA in the
    browser window. It does not submit the form itself and does not verify
    that authentication succeeded.

    NOTE(review): presumably the operator also clicks "Sign In" manually
    before pressing Enter -- confirm this is the intended workflow.

    Args:
        driver: Selenium WebDriver controlling a visible browser window.
        username: Account name typed into the ``id_login`` field.
        password: Password typed into the ``id_password`` field.
    """
    login_url = "https://leetcode.com/accounts/login/"
    driver.get(login_url)

    # Waiting for the username field also implies the page body has loaded.
    username_field = WebDriverWait(driver, 30).until(
        EC.presence_of_element_located((By.ID, "id_login"))
    )
    password_field = driver.find_element(By.ID, "id_password")

    username_field.send_keys(username)
    password_field.send_keys(password)

    # Pause here and wait for the user to manually complete the CAPTCHA
    input("Complete the CAPTCHA and then press Enter to continue...")

    print("Login successful")

In [4]:
# Smoke test: fetch a handful of problems and print each (slug, url) pair
# followed by a blank line.
problem_sets = get_problem_slugs(10)
for p in problem_sets:
    print(p, '\n')

('two-sum', 'https://leetcode.com/problems/two-sum/') /n
('add-two-numbers', 'https://leetcode.com/problems/add-two-numbers/') /n
('longest-substring-without-repeating-characters', 'https://leetcode.com/problems/longest-substring-without-repeating-characters/') /n
('median-of-two-sorted-arrays', 'https://leetcode.com/problems/median-of-two-sorted-arrays/') /n
('longest-palindromic-substring', 'https://leetcode.com/problems/longest-palindromic-substring/') /n
('zigzag-conversion', 'https://leetcode.com/problems/zigzag-conversion/') /n
('reverse-integer', 'https://leetcode.com/problems/reverse-integer/') /n
('string-to-integer-atoi', 'https://leetcode.com/problems/string-to-integer-atoi/') /n
('palindrome-number', 'https://leetcode.com/problems/palindrome-number/') /n
('regular-expression-matching', 'https://leetcode.com/problems/regular-expression-matching/') /n


In [5]:
def _fetch_topics(slug, base_url, driver):
    """Return the topic names shown on the problem's description page.

    Best effort: any failure is printed and results in an empty list.
    """
    try:
        driver.get(base_url + "description/")
        # Wait for the page to load completely
        WebDriverWait(driver, 30).until(EC.presence_of_element_located((By.TAG_NAME, "body")))
        time.sleep(5)  # give client-side rendering time to finish
    except Exception as e:
        print(f"Error fetching description data for {slug}: {e}")
        return []

    try:
        # Click the 'Topics' label to expand the panel
        topics_button = WebDriverWait(driver, 10).until(
            EC.element_to_be_clickable((By.XPATH, "//div[contains(@class, 'inline-flex') and contains(text(), 'Topics')]"))
        )
        # Click through JavaScript rather than a native WebDriver click.
        driver.execute_script("arguments[0].click();", topics_button)
        time.sleep(2)  # wait for the panel to expand

        topic_div = WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.CSS_SELECTOR, "div[class*='flex flex-wrap gap-1']"))
        )
        return [topic.text for topic in topic_div.find_elements(By.TAG_NAME, 'a')]
    except Exception as e:
        print(f"Error fetching topics for {slug}: {e}")
        return []


def _fetch_slide_stats(slug, base_url, driver):
    """Return the slide-related fields found on the problem's editorial page.

    The result is a dict with any of ``has_slides``, ``slides_count`` and
    ``slides_pages``; it is empty when no slide container is found or the
    page could not be inspected.
    """
    try:
        driver.get(base_url + "editorial/")
        WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.TAG_NAME, "body")))
        time.sleep(5)  # give client-side rendering time to finish

        slides = driver.find_elements(By.CSS_SELECTOR, "div.relative.mx-auto.mb-6.flex.flex-col.overflow-hidden.rounded-lg")
    except Exception as e:
        print(f"Error checking slides for {slug}: {e}")
        return {}

    if not slides:
        return {}

    slides_pages = []
    for slide in slides:
        try:
            # The counter reads like "current/total"; keep only the total.
            page_text = slide.find_element(By.XPATH, ".//div[contains(@class, 'absolute right-0') and contains(text(), '/')]").text
            slides_pages.append(page_text.split('/')[-1].strip())
        except Exception as e:
            print(f"Error fetching slides pages for {slug}: {e}")
            slides_pages.append('n/a')

    return {
        "has_slides": True,
        "slides_count": len(slides),
        "slides_pages": ','.join(slides_pages),
    }


def fetch_problem_details(slug, link, driver):
    """Collect topic tags and editorial slide statistics for one problem.

    Visits ``<link>description/`` to read the topic tags, then
    ``<link>editorial/`` to count slide containers. Every step is best
    effort: a failure (including a failed page load) is printed and the
    corresponding fields keep their defaults, so one bad page does not abort
    a long crawl.

    Args:
        slug: Problem title slug; stored as ``name`` and used in messages.
        link: Problem URL, with or without a trailing slash.
        driver: Selenium WebDriver used to load the pages.

    Returns:
        A dict with keys ``name``, ``link``, ``topics`` (list of str),
        ``has_slides`` (bool), ``slides_count`` (int) and ``slides_pages``
        (comma-separated page totals per slide container, or ``"n/a"``).
    """
    problem_data = {
        "name": slug,
        "link": link,
        "topics": [],
        "has_slides": False,
        "slides_count": 0,
        "slides_pages": "n/a"
    }

    # Normalise so sub-page URLs are well formed even without a trailing slash.
    base_url = link.rstrip('/') + '/'

    problem_data['topics'] = _fetch_topics(slug, base_url, driver)
    problem_data.update(_fetch_slide_stats(slug, base_url, driver))

    return problem_data


In [6]:
def main(limit=2000, csv_file='leetcode_intuition_visualization_statistics.csv'):
    """Crawl LeetCode problems and write topic/slide statistics to a CSV file.

    Starts a visible Chrome session, logs in (the CAPTCHA is solved by
    hand), fetches up to ``limit`` problem slugs, and writes one CSV row per
    problem as soon as it has been scraped, so partial results survive an
    interrupted run. The browser is always closed, even on failure.

    Credentials are read from the ``LEETCODE_USERNAME`` and
    ``LEETCODE_PASSWORD`` environment variables; any that is missing is
    prompted for interactively rather than being stored in source.

    Args:
        limit: Maximum number of problems to crawl.
        csv_file: Path of the CSV file to (over)write.
    """
    import getpass
    import os

    username = os.environ.get("LEETCODE_USERNAME") or input("LeetCode username: ")
    password = os.environ.get("LEETCODE_PASSWORD") or getpass.getpass("LeetCode password: ")

    # The browser must stay visible (Chrome's default) so the CAPTCHA can be
    # solved manually; no headless flag is set.
    options = Options()
    options.add_argument("--no-sandbox")
    options.add_argument("--disable-dev-shm-usage")
    options.add_argument("--disable-gpu")
    options.add_argument("--disable-extensions")
    options.add_argument("--disable-popup-blocking")
    options.add_argument("--incognito")

    service = Service(ChromeDriverManager().install())
    driver = webdriver.Chrome(service=service, options=options)

    try:
        login(driver, username, password)

        problem_slugs = get_problem_slugs(limit=limit)

        # Open the file only after login and the slug fetch succeed, so an
        # early failure does not clobber results from a previous run.
        with open(csv_file, mode='w', newline='', encoding='utf-8') as file:
            writer = csv.writer(file)
            writer.writerow(["Number", "Problem Name", "Problem Link", "Topics", "Has Slides", "Slides Count", "Slides Pages"])

            for i, (slug, link) in enumerate(problem_slugs, start=1):
                print(f"Fetching data for {i}. {slug}...")
                row = fetch_problem_details(slug, link, driver)
                writer.writerow([i, row['name'], row['link'], ', '.join(row['topics']), row['has_slides'], row['slides_count'], row['slides_pages']])
                # Flush per row so progress is on disk if the crawl dies later.
                file.flush()
    finally:
        # Close the browser
        driver.quit()

    print(f"Data saved to {csv_file}")

In [7]:
# Start the crawl only when this code is executed directly (script or
# notebook cell), not when it is imported as a module.
if __name__ == "__main__":
    main()

Login successful
Fetching data for 1. two-sum...
Fetching data for 2. add-two-numbers...
Fetching data for 3. longest-substring-without-repeating-characters...
Fetching data for 4. median-of-two-sorted-arrays...
Fetching data for 5. longest-palindromic-substring...
Fetching data for 6. zigzag-conversion...
Fetching data for 7. reverse-integer...
Fetching data for 8. string-to-integer-atoi...
Fetching data for 9. palindrome-number...
Fetching data for 10. regular-expression-matching...
Fetching data for 11. container-with-most-water...
Fetching data for 12. integer-to-roman...
Fetching data for 13. roman-to-integer...
Fetching data for 14. longest-common-prefix...
Fetching data for 15. 3sum...
Fetching data for 16. 3sum-closest...
Fetching data for 17. letter-combinations-of-a-phone-number...
Fetching data for 18. 4sum...
Fetching data for 19. remove-nth-node-from-end-of-list...
Fetching data for 20. valid-parentheses...
Fetching data for 21. merge-two-sorted-lists...
Fetching data for 