In [619]:
import requests
from bs4 import BeautifulSoup
import re
import json

In [648]:
def save_to_file(text, file_path):
    try:
        with open(file_path, 'w', encoding='utf-8') as file:
            file.write("\n\n".join(text))
        print("Text has been written to the file successfully.")
    except Exception as e:
        print(f"Error occurred while writing to the file: {e}")

In [649]:
def save_dictionary_to_file(data, file_path):
    try:
        with open(file_path, 'w') as json_file:
            json.dump(data, json_file, indent=4)
            print("Text has been written to the file successfully.")
    except Exception as e:
        print(f"Error occurred while writing to the file: {e}")

In [748]:
def scrape_data_from(url):
    page = requests.get(url)
    soup = BeautifulSoup(page.content, 'html.parser')

    bio_urls = {}
    courses_taught = {}
    bios = {}
    names = []
    
    # Find all elements with the specified class
    all_entries = soup.find_all("div", class_="col-sm-10")

    for entry in all_entries:
        # Find the anchor element within the div with the class "directory-name"
        anchor_element = entry.find("div", class_="directory-name").find("a")

        # Extract the bio URL from the 'href' attribute of the anchor element
        if anchor_element:
            bio_url = anchor_element.get("href")
            
            # Check if the URL contains the desired substrings
            if "http://homes.cs.washington.edu/" in bio_url or "https://www.cs.washington.edu/people/faculty" in bio_url:
                # Now, let's scrape data from the bio URL
                bio_page = requests.get(bio_url)
                bio_soup = BeautifulSoup(bio_page.content, 'html.parser')
                name = bio_soup.title.text.strip().split("|")[0] if bio_soup.title else "No Name"
                bio_urls[name] = bio_url
 
                # Extract the content under the "Teaching" section (if available)
                teaching_header = bio_soup.find("h3", text="Teaching")
                if teaching_header:
                    courses_names = []
                    teaching_content_element = teaching_header.find_next_sibling("table", class_="table teaching")
                    if teaching_content_element:
                        # Find all rows in the table body
                        rows = teaching_content_element.find_all("tr")
                        course_quarters = []
                        for row in rows[1:]:
                            # Get the course name and quarters
                            cols = row.find_all("th")
                            course_name = cols[0].text.strip()[:]
                            print(course_name)
                            courses_names.append(course_name)
                    else:
                        courses_names.append("Teaching content not found.")
                        
                    courses_taught[name] = courses_names

                else:
                    courses_taught[name] = []

                # Extract the biography content from the biography URL
                biography_header = bio_soup.find('div', class_='field-item')
                if biography_header:
                    # Find the next sibling after the <h3>Biography</h3> header
                    biography_content_element = biography_header.find_next_sibling("p")
                  
                    if biography_content_element:
                        # Get the biography text
                        biography_text = biography_content_element.text.strip()
                        bios[name] = biography_text
                    else:
                        bios[name] = "Biography content not found."
                else:
                    bios[name] = "Biography section not found."
            
    
    return bio_urls, courses_taught, bios

In [749]:
url = "https://www.cs.washington.edu/people/faculty"
bio_urls, courses_taught, bios = scrape_data_from(url)
print(bio_urls)

CSE 332: Data Structures & Parallelism
CSE 391: System and Software Tools
CSE 160: Data Programming
CSE 351: The Hardware/Software Interface
CSE General TA Training
CSE 590E: Computer Science Education Seminar
{'Richard Anderson ': 'https://www.cs.washington.edu/people/faculty/anderson/', 'Ruth Anderson': 'http://homes.cs.washington.edu/~rea', 'Yejin Choi': 'http://homes.cs.washington.edu/~yejin/', 'Dieter Fox ': 'http://homes.cs.washington.edu/~fox', 'Dan Grossman': 'http://homes.cs.washington.edu/~djg/', 'Jeffrey Heer': 'http://homes.cs.washington.edu/~jheer/', 'Justin Hsia': 'http://homes.cs.washington.edu/~jhsia/', 'Tadayoshi Kohno (aka Yoshi Kohno)': 'http://homes.cs.washington.edu/~yoshi/', "Shayan Oveis Gharan's homepage": 'http://homes.cs.washington.edu/~shayan/', 'Anup Rao ': 'https://www.cs.washington.edu/people/faculty/anuprao/', 'No Name': 'http://homes.cs.washington.edu/~seitz/', 'Linda Shapiro': 'http://homes.cs.washington.edu/~shapiro/', 'Joshua Smith ': 'https://www.cs.

In [750]:
#save the result into the files
save_dictionary_to_file(bio_urls, 'bio_urls.txt')
save_dictionary_to_file(courses_taught, 'courses_taught.txt')
save_dictionary_to_file(bios, 'bios.txt')


Text has been written to the file successfully.
Text has been written to the file successfully.
Text has been written to the file successfully.
