In [2]:
#Scrape the 'bio_urls' list to .txt file

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# Set up the Selenium web driver
driver = webdriver.Chrome()

# Create an empty list to store bio URLs
bio_urls = []

try:
    # Navigate to the faculty page
    url = "https://www.cs.stanford.edu/people/faculty"
    driver.get(url)

    # Find professor links and copy their hrefs out immediately: the WebElement
    # handles become stale as soon as the driver navigates to another page.
    professor_links = driver.find_elements(By.XPATH, "//a[contains(@href, 'https://www.cs.stanford.edu/people/')]")
    # dict.fromkeys drops repeated hrefs while keeping first-seen order, so a
    # link that appears several times on the page is only visited once.
    # NOTE(review): the XPath matches any anchor under /people/, which may
    # include non-profile navigation links - confirm against the live page.
    professor_urls = list(dict.fromkeys(link.get_attribute("href") for link in professor_links))

    # Visit each professor URL and record the address the browser ends up on
    # (driver.current_url reflects any redirect applied during navigation).
    for professor_url in professor_urls:
        driver.get(professor_url)
        bio_urls.append(driver.current_url)
finally:
    # Always release the browser process, even if a navigation fails.
    driver.quit()

# Different links can resolve to the same final URL; keep each one once.
bio_urls = list(dict.fromkeys(bio_urls))

# Write the bio URLs to a text file, one per line
with open("bio_urls.txt", "w", encoding="utf-8") as bio_urls_file:
    bio_urls_file.write("\n".join(bio_urls))

In [12]:
#Scrape the 'bios' list to .txt file

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException

# Load the bio URLs from the file before starting the browser, so a missing
# file does not leave an orphaned Chrome process behind. Blank lines are
# skipped because driver.get("") is not a valid navigation.
with open("bio_urls.txt", "r", encoding="utf-8") as bio_urls_file:
    bio_urls = [line.strip() for line in bio_urls_file.read().splitlines() if line.strip()]

# Create an empty list to store bios
bios = []

# Set up the Selenium web driver
driver = webdriver.Chrome()

try:
    # Iterate through each bio URL and scrape bio information
    for bio_url in bio_urls:
        driver.get(bio_url)

        try:
            # Wait up to 10 seconds for the bio container to be present in the DOM.
            # until() either returns the element or raises TimeoutException.
            bio_element = WebDriverWait(driver, 10).until(
                EC.presence_of_element_located((By.XPATH, "//div[@class='su-person-body su-wysiwyg-text node stanford-person body text-with-summary label-hidden']"))
            )
        except TimeoutException:
            # Handle the case where the element is not found
            print(f"Bio element not found on the page: {bio_url}")
            bios.append(None)  # Append None to indicate no bio found
            continue

        # Append the visible text of the bio element to the 'bios' list
        bios.append(bio_element.text.strip())
finally:
    # Always release the browser process, even if a page load fails.
    driver.quit()

# Filter out None values and write the bios to a text file.
# Note: after filtering, entries no longer line up one-to-one with bio_urls.
bios = [bio for bio in bios if bio is not None]
with open("bios.txt", "w", encoding="utf-8") as bios_file:
    bios_file.write("\n".join(bios))


Bio element not found on the page: https://www.cs.stanford.edu/people/sara-achour
Bio element not found on the page: https://www.cs.stanford.edu/people/nima-anari
Bio element not found on the page: https://www.cs.stanford.edu/people/emma-brunskill
Bio element not found on the page: https://www.cs.stanford.edu/people/noah-goodman
Bio element not found on the page: https://www.cs.stanford.edu/people/tatsunori-hashimoto
Bio element not found on the page: https://www.cs.stanford.edu/people/sanmi-koyejo


In [4]:
#Scrape the 'courses_taught' list to .txt file

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# Start a Chrome session driven by Selenium
driver = webdriver.Chrome()

# Read back the saved bio URLs (one per line) into a list
with open("bio_urls.txt", "r") as saved_urls:
    url_listing = saved_urls.read()
bio_urls = url_listing.splitlines()

# Create a function to scrape teaching information
def scrape_teaching_info(url, academic_year="2022-23"):
    """Scrape course listings for one faculty member using the shared `driver`.

    Loads `url`, follows the "View Full Stanford Profile" link, opens the
    teaching tab and reads the course listings from the resulting page.

    Args:
        url: Address of the faculty bio page to start from.
        academic_year: Year label used in the "<year> Courses" heading that
            is looked up on the profile page. Defaults to "2022-23".

    Returns:
        A 3-tuple ``(courses_text, prior_year_courses_text, courses_content_text)``.
        ``prior_year_courses_text`` is None when the prior-year tab or label
        cannot be found. If any other step fails, ``(None, None, None)`` is
        returned and a one-line message is printed; no exception is raised.
    """
    # Imported locally so the function does not depend on another cell's imports.
    from selenium.common.exceptions import WebDriverException

    try:
        driver.get(url)
        # Find and click the "View Full Stanford Profile" button. Waiting for
        # clickability (visible and enabled) avoids clicking an element that is
        # in the DOM but not yet interactable.
        view_profile_button = WebDriverWait(driver, 10).until(
            EC.element_to_be_clickable((By.PARTIAL_LINK_TEXT, "View Full Stanford Profile"))
        )
        view_profile_button.click()

        # Wait for the "Teaching" tab to be clickable and click it.
        # NOTE(review): this absolute XPath is tied to the exact page layout.
        teaching_tab_xpath = "/html/body/div/main/div/div/section[2]/div/div[1]/div[2]/ul/li[2]/a"
        teaching_tab_element = WebDriverWait(driver, 10).until(
            EC.element_to_be_clickable((By.XPATH, teaching_tab_xpath))
        )
        teaching_tab_element.click()

        # Wait for the "<academic_year> Courses" listing to be visible
        courses_xpath = (
            f"//h3[text()='{academic_year} Courses']"
            "/following-sibling::ul[@class='section-listing courses']"
        )
        courses_element = WebDriverWait(driver, 10).until(
            EC.visibility_of_element_located((By.XPATH, courses_xpath))
        )

        # until() returns the element or raises, so no None check is needed.
        courses_text = courses_element.text.strip()

        # Check if there is a "Prior Year Courses" tab; its absence is expected
        # for some profiles, so a failure here only clears this one field.
        try:
            prior_year_courses_tab_xpath = "//*[@id='coursesContent']/div/ul/li[5]/a"
            prior_year_courses_tab_element = WebDriverWait(driver, 5).until(
                EC.element_to_be_clickable((By.XPATH, prior_year_courses_tab_xpath))
            )
            prior_year_courses_tab_element.click()

            # Wait for the "Prior Year Courses" text to be visible
            prior_year_courses_text_element = WebDriverWait(driver, 10).until(
                EC.visibility_of_element_located((By.CLASS_NAME, "prior-year-courses-label"))
            )
            prior_year_courses_text = prior_year_courses_text_element.text.strip()
        except WebDriverException:
            # Covers timeouts and click failures without also swallowing
            # KeyboardInterrupt/SystemExit the way a bare except would.
            prior_year_courses_text = None

        # Read the text of the first list inside the courses content container
        # (as it stands after the optional prior-year tab click above).
        courses_content_xpath = "//*[@id='coursesContent']/div/ul"
        courses_content_element = WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.XPATH, courses_content_xpath))
        )
        courses_content_text = courses_content_element.text.strip()

        return courses_text, prior_year_courses_text, courses_content_text
    except Exception as e:
        # Best-effort scrape: report which page failed and why, then carry on.
        print(f"Could not scrape teaching info from {url}: {type(e).__name__}")
        return None, None, None

# Iterate through each bio URL, scrape teaching information and append it to
# courses_taught.txt. The file is opened in append mode, so re-running this
# cell adds to whatever an earlier run already wrote.
try:
    for bio_url in bio_urls:
        courses_text, prior_year_courses_text, courses_content_text = scrape_teaching_info(bio_url)

        # Keep only the pieces that were actually found, in their original
        # order: current courses, prior-year label, full courses content.
        scraped_sections = [
            text
            for text in (courses_text, prior_year_courses_text, courses_content_text)
            if text
        ]
        if not scraped_sections:
            continue

        # Open the output once per professor instead of once per section.
        with open("courses_taught.txt", "a", encoding="utf-8") as f:
            for section_text in scraped_sections:
                f.write(section_text + "\n\n")
finally:
    # Close the web driver even if the loop is interrupted
    driver.quit()
