In [94]:
import base64
import getpass
import os
import time
import urllib.request

import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from webdriver_manager.chrome import ChromeDriverManager

In [95]:
def read_csv_file(file_path):
    """
    Load a CSV file from disk into a pandas DataFrame.

    :param file_path: str, location of the CSV file to parse
    :return: pd.DataFrame, the parsed contents of the file
    """
    return pd.read_csv(file_path)

def filter_problems_with_slides(df):
    """
    Keep only the rows whose 'Has Slides' column equals True.

    :param df: pd.DataFrame, table with a 'Has Slides' column
    :return: pd.DataFrame, the matching rows (original index labels are kept)
    """
    has_slides_mask = df['Has Slides'].eq(True)
    return df.loc[has_slides_mask]

def extract_problem_info(df, num_problems=300):
    """
    Collect the name, link and slide details of the first rows of a DataFrame.

    :param df: pd.DataFrame, table with 'Problem Name', 'Problem Link',
        'Slides Count' and 'Slides Pages' columns
    :param num_problems: int, how many leading rows to take
    :return: list of tuples, each (problem name, problem link, slides count, slides pages)
    """
    wanted_columns = ('Problem Name', 'Problem Link', 'Slides Count', 'Slides Pages')
    return [
        tuple(record[column] for column in wanted_columns)
        for _, record in df.head(num_problems).iterrows()
    ]


In [96]:
# Load the statistics CSV and build the list of problems that have slides.
file_path = 'leetcode_intuition_visualization_statistics.csv'
df = read_csv_file(file_path)
problems_with_slides_df = filter_problems_with_slides(df)
# Passing the full length selects every filtered row instead of the default 300.
problem_info = extract_problem_info(problems_with_slides_df, len(problems_with_slides_df))
print(len(problem_info))
# The preview must be the cell's final expression: Jupyter only renders the
# last bare expression, so a mid-cell `.head(5)` is silently discarded.
problems_with_slides_df.head(5)

400


In [97]:
def get_file_content_chrome(driver, uri):
    """
    Download a resource through the browser session and return its raw bytes.

    The request is issued from inside the page with XMLHttpRequest, so it is
    sent by the browser itself. The response body is base64-encoded in
    JavaScript, handed back through the async-script callback, and decoded here.

    :param driver: WebDriver-like object exposing ``execute_async_script``
    :param uri: str, address of the resource to fetch
    :return: bytes, the decoded response body
    :raises Exception: when the script reports a numeric status instead of data
        (0 for a network-level failure, or an HTTP status of 400 and above)
    """
    result = driver.execute_async_script("""
    var uri = arguments[0];
    var callback = arguments[1];
    var toBase64 = function(buffer){for(var r,n=new Uint8Array(buffer),t=n.length,a=new Uint8Array(4*Math.ceil(t/3)),i=new Uint8Array(64),o=0,c=0;64>c;++c)i[c]="ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/".charCodeAt(c);for(c=0;t-t%3>c;c+=3,o+=4)r=n[c]<<16|n[c+1]<<8|n[c+2],a[o]=i[r>>18],a[o+1]=i[r>>12&63],a[o+2]=i[r>>6&63],a[o+3]=i[63&r];return t%3===1?(r=n[t-1],a[o]=i[r>>2],a[o+1]=i[r<<4&63],a[o+2]=61,a[o+3]=61):t%3===2&&(r=(n[t-2]<<8)+n[t-1],a[o]=i[r>>10],a[o+1]=i[r>>4&63],a[o+2]=i[r<<2&63],a[o+3]=61),new TextDecoder("ascii").decode(a)};
    var xhr = new XMLHttpRequest();
    xhr.responseType = 'arraybuffer';
    xhr.onload = function(){
      // onload also fires for HTTP error responses; report those as failures
      // instead of encoding the error page as if it were the file.
      if (xhr.status >= 400) { callback(xhr.status); }
      else { callback(toBase64(xhr.response)); }
    };
    xhr.onerror = function(){ callback(xhr.status) };
    xhr.open('GET', uri);
    xhr.send();
    """, uri)
    # A numeric result is a status code reported by the script, not file data.
    if isinstance(result, int):
        raise Exception("Request failed with status %s" % result)
    return base64.b64decode(result)

In [98]:
def _save_slide_image(driver, slide_elem, slide_dir, page_number):
    """
    Save the image currently shown inside a slideshow element as a PNG file.

    :param driver: selenium.webdriver, the WebDriver instance
    :param slide_elem: WebElement, the slideshow container to read the image from
    :param slide_dir: str, directory the file is written to (created on demand)
    :param page_number: int, 1-based page number used in the file name
    :return: str path of the written file, or None if the image exposes no URL
    """
    img_elem = slide_elem.find_element(By.CSS_SELECTOR, ".object-fit-contain")
    resource_url = img_elem.get_attribute('src') or img_elem.get_attribute('data-src')
    if not resource_url:
        return None

    bytes_content = get_file_content_chrome(driver, resource_url)

    # Create the directory only once there is something to write into it.
    os.makedirs(slide_dir, exist_ok=True)
    slide_filename = os.path.join(slide_dir, f'page_{page_number:02}.png')
    with open(slide_filename, 'wb') as file:
        file.write(bytes_content)
    return slide_filename


def download_slides(problem_name, problem_link, driver):
    """
    Download every page of the first slideshow on a problem's editorial page.

    Images are written to ``../slides/<problem_name>/<slideshow number>/page_NN.png``
    relative to the current working directory. Errors raised while locating or
    downloading slides are caught and printed, so the caller can continue with
    the next problem.

    :param problem_name: str, name of the problem (used as the folder name)
    :param problem_link: str, link to the problem, with or without a trailing slash
    :param driver: selenium.webdriver, the WebDriver instance
    """
    # Normalise the trailing slash so both link forms yield a valid editorial URL.
    editorial_link = problem_link.rstrip('/') + '/editorial/'
    driver.get(editorial_link)

    # Wait for the editorial page to load
    WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.TAG_NAME, "body")))
    time.sleep(10)  # Adjust sleep time as necessary

    try:
        slides = driver.find_elements(By.CSS_SELECTOR, "div.relative.mx-auto.mb-6.flex.flex-col.overflow-hidden.rounded-lg")
        if not slides:
            print(f"No slides found for problem: {problem_name}")
            return

        # Base directory for the problem slides
        base_dir = os.path.abspath(os.path.join(os.getcwd(), '..', 'slides'))
        problem_dir = os.path.join(base_dir, problem_name)

        # TODO only contain the first slides
        for i, elem in enumerate(slides[0:1]):
            # The page indicator reads "<current>/<total>"; keep the total.
            page_text = elem.find_element(By.XPATH, ".//div[contains(@class, 'absolute right-0') and contains(text(), '/')]").text
            pages = int(page_text.split('/')[-1].strip())

            # Compute the target folder up front so later pages can still be
            # saved when the first page has no image URL.
            slide_dir = os.path.join(problem_dir, str(i+1))
            _save_slide_image(driver, elem, slide_dir, 1)

            # Download the remaining pages by stepping through the slideshow
            print(f"downloading {problem_name} with pages {pages}")
            for j in range(1, pages):
                # CSS selector for the third svg in the slideshow control bar,
                # which is used here as the "next page" button.
                next_button = WebDriverWait(driver, 10).until(
                    EC.element_to_be_clickable((By.CSS_SELECTOR, "div.flex.items-center.space-x-7 > svg:nth-child(3)"))
                )
                next_button.click()
                time.sleep(1)  # Adjust sleep time as necessary

                slide_filename = _save_slide_image(driver, elem, slide_dir, j + 1)
                if slide_filename:
                    print(slide_filename)

    except Exception as e:
        # Best effort: report the failure and let the caller move on.
        print(f"Error downloading slides for problem: {problem_name}. Error: {e}")

In [99]:
def login(driver, username, password):
    """
    Open the LeetCode login page, fill in the credentials and wait for the user.

    The form is filled in automatically, but it is not submitted by this
    function: the user completes the CAPTCHA in the browser window and presses
    Enter in the notebook to continue.

    :param driver: selenium.webdriver, the WebDriver instance
    :param username: str, account name typed into the login field
    :param password: str, password typed into the password field
    """
    login_url = "https://leetcode.com/accounts/login/"
    driver.get(login_url)
    WebDriverWait(driver, 30).until(EC.presence_of_element_located((By.TAG_NAME, "body")))

    username_field = WebDriverWait(driver, 10).until(
        EC.presence_of_element_located((By.ID, "id_login"))
    )
    password_field = driver.find_element(By.ID, "id_password")

    username_field.send_keys(username)
    password_field.send_keys(password)

    # Pause here and wait for the user to manually complete the CAPTCHA
    input("Complete the CAPTCHA and then press Enter to continue...")

    # NOTE(review): assumes a successful sign-in navigates away from the login
    # URL - confirm against the live site. Only the message depends on this.
    if "/accounts/login" in driver.current_url:
        print("Login not confirmed: the browser is still on the login page")
    else:
        print("Login successful")

In [100]:
# Download the slides of every selected problem in one browser session.
# Chrome runs with a visible window (the default), which the manual CAPTCHA
# step in login() requires, so no headless option is set.
options = Options()
options.add_argument("--no-sandbox")
options.add_argument("--disable-dev-shm-usage")
options.add_argument("--disable-gpu")
options.add_argument("--disable-extensions")
options.add_argument("--disable-popup-blocking")
options.add_argument("--incognito")

# Credentials are never stored in the notebook: they are read from the
# environment, with an interactive prompt as the fallback.
username = os.environ.get("LEETCODE_USERNAME") or input("LeetCode username: ")
password = os.environ.get("LEETCODE_PASSWORD") or getpass.getpass("LeetCode password: ")

service = Service(ChromeDriverManager().install())
driver = webdriver.Chrome(service=service, options=options)

try:
    login(driver, username, password)

    for counter, (problem_name, problem_link, slides_count, slides_page) in enumerate(problem_info, start=1):
        download_slides(problem_name, problem_link, driver)
        print(f"Finish download slides from {problem_name}, {counter}/{len(problem_info)}")
finally:
    # Always close the browser, even when a download or the login step fails.
    driver.quit()

Login successful
downloading median-of-two-sorted-arrays with pages 12
/Users/wangshu/Desktop/ETHz/Thesis/intuition-visualisation/slides/median-of-two-sorted-arrays/1/page_02.png
/Users/wangshu/Desktop/ETHz/Thesis/intuition-visualisation/slides/median-of-two-sorted-arrays/1/page_03.png
/Users/wangshu/Desktop/ETHz/Thesis/intuition-visualisation/slides/median-of-two-sorted-arrays/1/page_04.png
/Users/wangshu/Desktop/ETHz/Thesis/intuition-visualisation/slides/median-of-two-sorted-arrays/1/page_05.png
/Users/wangshu/Desktop/ETHz/Thesis/intuition-visualisation/slides/median-of-two-sorted-arrays/1/page_06.png
/Users/wangshu/Desktop/ETHz/Thesis/intuition-visualisation/slides/median-of-two-sorted-arrays/1/page_07.png
/Users/wangshu/Desktop/ETHz/Thesis/intuition-visualisation/slides/median-of-two-sorted-arrays/1/page_08.png
/Users/wangshu/Desktop/ETHz/Thesis/intuition-visualisation/slides/median-of-two-sorted-arrays/1/page_09.png
/Users/wangshu/Desktop/ETHz/Thesis/intuition-visualisation/slides

In [101]:
# import time
# from selenium import webdriver
# from selenium.webdriver.common.by import By
# from selenium.webdriver.support.ui import WebDriverWait
# from selenium.webdriver.support import expected_conditions as EC

# options = Options()
# options.headless = False  # Set to False to handle CAPTCHA manually
# options.add_argument("--no-sandbox")
# options.add_argument("--disable-dev-shm-usage")
# options.add_argument("--disable-gpu")
# options.add_argument("--disable-extensions")
# options.add_argument("--disable-popup-blocking")
# options.add_argument("--incognito")

# service = Service(ChromeDriverManager().install())
# driver = webdriver.Chrome(service=service, options=options)

# # Open the webpage
# driver.get('https://leetcode.com/problems/median-of-two-sorted-arrays/editorial/')

# try:
#     # Wait for the next button to be clickable and then click it
#     next_button = WebDriverWait(driver, 10).until(
#         EC.element_to_be_clickable((By.CSS_SELECTOR, "div.flex.items-center.space-x-7 > svg:nth-child(3)"))
#     )

#     # Click the next button
#     for i in range(10):
#         next_button.click()
#         time.sleep(5)

# finally:
#     # Close the WebDriver
#     time.sleep(60)
#     driver.quit()
# Scratch check of the width-2 zero-padding format spec used for page file names.
padded_page = f'{12:02}'
print(padded_page)

12
