In [1]:
from bs4 import BeautifulSoup
import requests

# Make a GET request to the website
response = requests.get('https://cs.illinois.edu/about/people/all-faculty/jeffe')

# Parse the HTML content with BeautifulSoup
soup = BeautifulSoup(response.content, 'html.parser')

# Create an empty set to store the headers and class divs
dom_set = set()

# Find all headers and class divs and add them to the set
for header in soup.find_all(['h1', 'h2', 'h3', 'h4', 'h5', 'h6']):
    dom_set.add(header.name)

for div in soup.find_all('div', {'class': True}):
    dom_set.add(div['class'][0])

# Print the set to check the results
print(dom_set)


{'input-group', 'roles', 'tile-list', 'title', 'col-12', 'w-100', 'hidden', 'blocki', 'office', 'extProfileAREA', 'site_name', 'flex', 'lower', 'd-none', 'collapse', 'white-box', 'h1', 'site_identification', 'col-lg-7', 'email', 'row', 'input-group-append', 'menucol', 'p-3', 'description', 'directory-profile', 'campus_wordmark', 'recent-post-photo', 'col-lg-5', 'h3', 'container-fluid', 'role', 'parent_name', 'col', 'col-md', 'col-md-auto', 'dropdown-menu', 'contact', 'phone', 'h2'}


In [101]:
from bs4 import BeautifulSoup, NavigableString
import requests

# Make a GET request to the website
response = requests.get('https://cs.illinois.edu/about/people/all-faculty/jeffe')

# Parse the HTML content with BeautifulSoup
soup = BeautifulSoup(response.content, 'html.parser')

# Create an empty dictionary to store the headers and class divs and their descendants
dom_dict = {}

def get_descendants(elem):
    descendants = set()
    for child in elem.children:
        if isinstance(child, NavigableString):
            descendants.add(str(child))
        elif child.name and child.name.startswith('h') or child.name == 'div' and 'class' in child.attrs:
            break
        else:
            descendants.add(str(child))
            descendants |= get_descendants(child)
    return descendants


# Find all headers and class divs and their descendants and store them in the dictionary
for header in soup.find_all(['h2']):
    dom_dict[str(header)] = get_descendants(header)

for div in soup.find_all('div', {'class': True}):
    dom_dict[div['class'][0]] = get_descendants(div)

# Print the dictionary to check the results
#print(dom_dict)


In [8]:
from bs4 import BeautifulSoup, NavigableString
import requests

# Make a GET request to the website
response = requests.get('https://cs.illinois.edu/about/people/all-faculty/jeffe')

# Parse the HTML content with BeautifulSoup
soup = BeautifulSoup(response.content, 'html.parser')

# Create a dictionary to store the headers and class divs and their descendants
dom_dict = {}

def get_descendants(elem):
    descendants = set()
    for child in elem.children:
        if isinstance(child, NavigableString):
            descendants.add(str(child))
        elif child.name and child.name.startswith('h') or child.name == 'div' and 'class' in child.attrs:
            break
        else:
            descendants.add(str(child))
            descendants |= get_descendants(child)
    return descendants

# Find all headers and class divs containing the word "research" and their descendants and group them together
for header in soup.find_all(['h1', 'h2', 'h3', 'h4', 'h5', 'h6']):
    header_text = header.get_text().strip()
    if 'research' in header_text or ('class' in header.attrs and any('research' in x.lower() for x in header['class'])):
        dom_dict.setdefault('Research', []).append(get_descendants(header))

for div in soup.find_all('div', {'class': True}):
    if 'research' in div['class'][0].lower():
        dom_dict.setdefault('Research', []).append(get_descendants(div))


# Print the dictionary to check the results
for key, value in dom_dict.items():
    print(key, ':')
    for elem in value:
        print(elem)


Research :
{'Explore visionary research conducted by world-renowned faculty.'}
{'Help ensure that Illinois continues to set a global standard for CS research and education.'}


In [10]:
import requests
import re
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.cluster import KMeans
from bs4 import BeautifulSoup

# Send GET request to the webpage
url = 'https://cs.illinois.edu/about/people/all-faculty/jeffe'
response = requests.get(url)

# Parse HTML content using BeautifulSoup
html_content = response.content
soup = BeautifulSoup(html_content, 'html.parser')

# Extract the header text using regular expressions
header_text = []
for header in soup.find_all(re.compile('^h[1-6]$')):
    header_text.append(header.text.strip())

# Convert the header text to a document-term matrix
vectorizer = CountVectorizer(stop_words='english')
X = vectorizer.fit_transform(header_text)

# Apply K-means clustering
k = 10
km = KMeans(n_clusters=k, init='k-means++', max_iter=100, n_init=1)
km.fit(X)

# Print the cluster labels
labels = km.labels_
for i in range(k):
    print("Cluster {}: {}".format(i, [header_text[j] for j in np.where(labels == i)[0]]))


Cluster 0: ['About', 'Admissions', 'Academics', 'Research', 'News', 'Broadening Participation', 'Help ensure that Illinois continues to set a global standard for CS research and education.', 'Give', 'Jeff Erickson', 'For More Information', 'Education', 'Biography', 'Teaching Statement', 'Research Statement', 'Research Interests', 'Research Areas', 'Selected Articles in Journals', 'Conferences Organized or Chaired', 'Teaching Honors', 'Research Honors', 'Recent Courses Taught', 'Related News', 'Software: It Is All In The Details']
Cluster 1: ['Grainger Engineering Investitures Honor Seven Distinguished CS Faculty']
Cluster 2: ['Explore visionary research conducted by world-renowned faculty.']
Cluster 3: ['Ready to apply? Your path to CS at Illinois begins here.', 'Ready to apply? Your path to CS at Illinois begins here.']
Cluster 4: ['Surprise Computer Science Proof Stuns Mathematicians']
Cluster 5: ['Illinois CS Hosts First-Of-Its-Kind NSF Workshop on Departmental Plans for Broadening 

In [23]:
from bs4 import BeautifulSoup

# assume that the html string is stored in a variable called 'html'
url = 'https://cs.illinois.edu/about/people/all-faculty/jeffe'
response = requests.get(url)
html = response.content
soup = BeautifulSoup(html, 'html.parser')

# find the education section
education_section = soup.find('h2', text='Education')
if education_section:
    # get the list of education items
    education_list = education_section.find_next('ul')
    if education_list:
        # extract each education item as a string
        education_items = [item.text for item in education_list.find_all('li')]
        print('Education:')
        for item in education_items:
            print('- ' + item)

# find the academic positions section
positions_section = soup.find('h2', text='Academic Positions')
if positions_section:
    # get the list of academic positions
    positions_list = positions_section.find_next('ul')
    if positions_list:
        # extract each academic position as a string
        positions_items = [item.text for item in positions_list.find_all('li')]
        print('\nAcademic Positions:')
        for item in positions_items:
            print('- ' + item)


Education:
- Ph.D., Computer Science, University of California, Berkeley, July 1996
- M.S., Information and Computer Science, University of California, Irvine, June 1992
- B.A., Computer Science and Mathematical Sciences (double major), Rice University, May 1987

Academic Positions:
- Sohaib and Sara Abbasi Professor, University of Illinois at Urbana-Champaign, 2020-present
- Professor, University of Illinois at Urbana-Champaign, 2010-present
- Associate Professor (tenured), University of Illinois at Urbana-Champaign, 2004-2010
- Assistant Professor, University of Illinois at Urbana-Champaign, 1998-2004


In [24]:
from bs4 import BeautifulSoup

def extract_section(html, section_title):
    soup = BeautifulSoup(html, 'html.parser')
    section = soup.find('h2', text=section_title)
    if section is None:
        return None
    data = section.find_next_sibling()
    if data is None:
        return None
    if data.name == 'ul':
        results = []
        for li in data.find_all('li'):
            results.append(li.get_text())
        return results
    else:
        return data.get_text()


In [26]:
# for the 'Education' section
education_section = extract_section(html, 'Education')

# for the 'Research Interests' section
research_interests_section = extract_section(html, 'Research Interests')

# for the 'Contact Information' section
contact_information_section = extract_section(html, 'Contact Information')

print(education_section)
print(research_interests_section)
print(contact_information_section)

['Ph.D., Computer Science, University of California, Berkeley, July 1996', 'M.S., Information and Computer Science, University of California, Irvine, June 1992', 'B.A., Computer Science and Mathematical Sciences (double major), Rice University, May 1987']
['Algorithms, data structures, and lower bounds', 'Computational and discrete geometry and topology']
None


In [30]:
import re
#url = 'https://cs.illinois.edu/about/people/all-faculty/jeffe'
#url = 'https://chemistry.illinois.edu/mikaelb'
response = requests.get(url)
html = response.content
# assuming you have the html stored in the variable 'html'

soup = BeautifulSoup(html, 'html.parser')

# define a list of regex patterns to match the headers you're interested in
patterns = ['Education', 'Research Interests', 'Contact Information']

# loop through all h2 tags and check if they match any of the patterns
for h2 in soup.find_all('h2'):
    for pattern in patterns:
        if re.search(pattern, h2.text):
            print(f"Found {pattern}: {h2.next_sibling}")


Found Education: <ul>
<li>Ph.D., Computer Science, University of California, Berkeley, July 1996</li>
<li>M.S., Information and Computer Science, University of California, Irvine, June 1992</li>
<li>B.A., Computer Science and Mathematical Sciences (double major), Rice University, May 1987</li>
</ul>
Found Research Interests: <ul>
<li>Algorithms, data structures, and lower bounds</li>
<li>Computational and discrete geometry and topology</li>
</ul>


In [6]:
import requests
from bs4 import BeautifulSoup

# specify the URL of the webpage to be scraped
for prof in profs:
    url = f"https://cs.illinois.edu/about/people/all-faculty/{prof}"

    try:
        # send a GET request to the webpage
        response = requests.get(url)
    except:
        print('Did not find:', url)
        continue

    # parse the HTML content of the webpage using Beautiful Soup
    soup = BeautifulSoup(response.content, 'html.parser')

    # extract the name, department, and university from the webpage title
    title = soup.head.find('title')
    if title:
        title_parts = title.text.strip().split(' | ')
        name = title_parts[0]
        department = title_parts[1]
        university = title_parts[2]
    else:
        print('No title tag found')
        name = ''
        department = ''
        university = ''

    # extract the email, phone number, and office location from the webpage
    contact_info = soup.find('div', class_='contact-info')
    if contact_info:
        email = contact_info.find('a', href='mailto:').text if contact_info.find('a', href='mailto:') else ''
        phone = contact_info.find('a', href=lambda href: href and href.startswith('tel:')).text.strip() if contact_info.find('a', href=lambda href: href and href.startswith('tel:')) else ''
        office = contact_info.find('a', href=lambda href: href and 'maps.google.com' in href).text.strip() if contact_info.find('a', href=lambda href: href and 'maps.google.com' in href) else ''
    else:
        print('No contact info found')
        email = ''
        phone = ''
        office = ''

    # create a string to hold the information of the new person
    person_info = f"Name: {name}\nDepartment: {department}\nUniversity: {university}\nEmail: {email}\nPhone: {phone}\nOffice: {office}\n\n"

    # find all the headers (h2 tags) on the page
    headers = soup.find_all('h2')

    # loop through each header and look for the corresponding information
    for header in headers:
        header_text = header.text.strip()
        if header_text == 'Education':
            # find the next sibling element and add it to the person_info string
            education_info = header.find_next_sibling()
            if education_info:
                person_info += f"Education: {education_info.text.strip()}\n"
            else:
                print('No education info found')
                person_info += f"Education: Not Found\n"
        elif header_text == 'Academic Positions':
            # find the next sibling element and add it to the person_info string
            position_info = header.find_next_sibling()
            if position_info:
                person_info += f"Academic Positions: {position_info.text.strip()}\n"
            else:
                print('No academic positions found')
                person_info += f"Academic Positions: Not Found\n"
        elif header_text == 'Research Interests':
            # find the next sibling element and add it to the person_info string
            interest_info = header.find_next_sibling()
            if interest_info:
                person_info += f"Research Interests: {interest_info.text.strip()}\n"
            else:
                print('No research interests found')
                person_info += f"Research Interests: Not Found\n"
        else:
            # ignore all other headers
            pass

    # write the person_info string
    with open('peopletest.txt', 'a') as file:
        file.write(person_info + "\n<----------------------------------------------------------------->\n")


No contact info found


In [39]:
import requests
from bs4 import BeautifulSoup
profs = ['jeffe']


def extract_title(soup):
    title = soup.head.find('title')
    if title:
        title_parts = title.text.strip().split(' | ')
        return title_parts[0], title_parts[1], title_parts[2]
    else:
        return None, None, None

def extract_contact_info(soup):
    email_div = soup.find('div', class_='email')
    email = email_div.a.text if email_div else ''
    phone_div = soup.find('div', class_='phone')
    phone = phone_div.text.strip() if phone_div else ''
    office_div = soup.find('div', class_='office')
    office = office_div.text.strip() if office_div else ''
    return email, phone, office

def extract_list_info(soup, header_text, label):
    h2 = soup.find('h2', text=header_text)
    if h2:
        ul = h2.find_next('ul')
        if ul:
            return [f"{label}: {li.text.strip()}" for li in ul.find_all('li')]
        else:
            return [f"{label}: Not Found"]
    else:
        return [f"{label}: Not Found"]

def extract_person_info(url):
    try:
        response = requests.get(url)
        response.raise_for_status()
    except requests.exceptions.RequestException as e:
        print(f"Error fetching {url}: {e}")
        return None

    soup = BeautifulSoup(response.content, 'html.parser')

    name, department, university = extract_title(soup)
    if name is None:
        print(f"Error: Could not extract name from {url}")
        return None

    email, phone, office = extract_contact_info(soup)

    headers = soup.find_all('h2')
    person_info = [f"Name: {name}", f"Department: {department}", f"University: {university}", f"Email: {email}", f"Phone: {phone}", f"Office: {office}"]
    current_header = None

    for header in headers:
        header_text = header.text.strip()
        ul = header.find_next('ul')
        if ul:
            items = [li.text.strip() for li in ul.find_all('li')]
            if header_text != current_header:
                person_info.append(header_text)
                current_header = header_text
            for item in items:
                person_info.append(f"{header_text}: {item}")

    return person_info


def write_person_info(person_info):
    with open('people.txt', 'a') as file:
        file.write('\n'.join(person_info) + "\n<----------------------------------------------------------------->\n")


for prof in profs:
    url =  'https://chemistry.illinois.edu/mikaelb'
    #url = f"https://cs.illinois.edu/about/people/all-faculty/{prof}"
    person_info = extract_person_info(url)
    if person_info is not None:
         write_person_info(person_info)

IndexError: list index out of range

In [5]:
import requests
from bs4 import BeautifulSoup
profs = ['jeffe']

def extract_title(soup):
    title = soup.head.find('title')
    if title:
        title_parts = title.text.strip().split(' | ')
        if len(title_parts) >= 3:
            return title_parts[0], title_parts[1], title_parts[2]
    return None, None, None

def extract_contact_info(soup):
    email_div = soup.find('div', class_='email')
    email = email_div.a.text if email_div else ''
    phone_div = soup.find('div', class_='phone')
    phone = phone_div.text.strip() if phone_div else ''
    office_div = soup.find('div', class_='office')
    office = office_div.text.strip() if office_div else ''
    return email, phone, office

def extract_list_info(soup, headers):
    info = {}
    for header_text, label in headers:
        h2 = soup.find('h2', text=header_text)
        if h2:
            ul = h2.find_next('ul')
            if ul:
                info[label] = [li.text.strip() for li in ul.find_all('li')]
            else:
                info[label] = None
        else:
            info[label] = None
    return info

def extract_person_info(url, headers):
    try:
        response = requests.get(url)
        response.raise_for_status()
    except requests.exceptions.RequestException as e:
        print(f"Error fetching {url}: {e}")
        return None

    soup = BeautifulSoup(response.content, 'html.parser')

    name, department, university = extract_title(soup)
    if name is None:
        print(f"Error: Could not extract name from {url}")
        return None

    email, phone, office = extract_contact_info(soup)

    info = extract_list_info(soup, headers)

    person_info = [f"Name: {name}", f"Department: {department}", f"University: {university}", f"Email: {email}", f"Phone: {phone}", f"Office: {office}"]

    for label, value in info.items():
        if value is not None:
            person_info.append(f"{label}:")
            person_info.extend([f"\t- {v}" for v in value])

    return person_info

def write_person_info(person_info):
    with open('peopletest.txt', 'a') as file:
        file.write('\n'.join(person_info) + "\n<----------------------------------------------------------------->\n")


headers = [
    ("Education", "Education"),
    ("Academic Positions", "Academic Positions"),
    ("Research Interests", "Research Interests"),
    ("Selected Publications", "Selected Publications"),
    ("Honors", "Honors and Awards"),
    ("Service", "Professional Service"),
]


for prof in profs:
    url =  'https://chemistry.illinois.edu/mikaelb'
    #url = f"https://cs.illinois.edu/about/people/all-faculty/{prof}"
    person_info = extract_person_info(url, headers)
    if person_info is not None:
        write_person_info(person_info)


Error: Could not extract name from https://chemistry.illinois.edu/mikaelb


In [None]:
import requests
from bs4 import BeautifulSoup
import re

# cs_prof
#profs = ['jeffe', 'hanj', 'zaher']

# chem_prof
profs = ['mikaelb', 'mdburke', 'agewirth']
def extract_title(soup):
    title = soup.head.find('title')
    if title:
        title_parts = title.text.strip().split(' | ')
        if len(title_parts) >= 2:
            name = title_parts[0]
            department = title_parts[1].split(' - ')[0]
            university = 'University of Illinois at Urbana-Champaign'
            return name, department, university
    h1 = soup.find('h1', class_='page-title')
    if h1:
        name = h1.text.strip()
        department_div = soup.find('div', class_='field--name-field-primary-department')
        department = department_div.text.strip() if department_div else None
        university = 'University of Illinois at Urbana-Champaign'
        return name, department, university
    return None, None, None


def extract_contact_info(soup):
    email = ''
    email_div = soup.find('div', class_='email')
    if email_div:
        email = email_div.a.text.strip()
    else:
        email_match = re.search(r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b', str(soup))
        if email_match:
            email = email_match.group(0)
    
    phone_div = soup.find('div', class_='phone')
    phone = phone_div.text.strip() if phone_div else ''
    if not phone:
        phone_tags = soup.find_all('a', href=re.compile(r'^tel:'))
        if phone_tags:
            phone = phone_tags[0].text.strip()
    
    office_div = soup.find('div', class_='office')
    office = office_div.text.strip() if office_div else ''

    # If office is not found, get all <br> tags below 'Contact Information'
    if not office:
        contact_info = soup.find('h2', text='Contact Information').find_next('div')
        office_tags = contact_info.find_all('br')
        office = ' '.join([tag.next_sibling.strip() for tag in office_tags if tag.next_sibling])
    
    return email, phone, office

def extract_list_info(soup, headers):
    info = {}
    for header_text, label in headers:
        h2 = soup.find('h2', text=header_text)
        if h2:
            ul = h2.find_next('ul')
            if ul:
                items = [li.text.strip() for li in ul.find_all('li') if 'Additional resources' not in li.text]
                info[label] = items if items else None
            else:
                info[label] = None
        else:
            h2 = soup.find('h2', class_='profile-label', text='Research Interests')
            if h2:
                p = h2.find_next_sibling('div').find('p')
                if p:
                    info['Research Areas'] = [p.text.strip()]
                else:
                    p = h2.find_next_sibling('p')
                    if p:
                        info['Research Areas'] = [p.text.strip()]
                    else:
                        info['Research Areas'] = None
            else:
                info[label] = None
    return info

def extract_person_info(url, headers):
    try:
        response = requests.get(url)
        response.raise_for_status()
    except requests.exceptions.RequestException as e:
        print(f"Error fetching {url}: {e}")
        return []

    soup = BeautifulSoup(response.content, 'html.parser')

    name, department, university = extract_title(soup)
    if name is None:
        print(f"Error: Could not extract name from {url}")
        return []

    email, phone, office = extract_contact_info(soup)

    info = extract_list_info(soup, headers)

    person_info = [f"Name: {name}", f"Department: {department}", f"University: {university}", f"Email: {email}", f"Phone: {phone}", f"Office: {office}"]

    for label, value in info.items():
        if value is not None:
            person_info.append(f"{label}:")
            person_info.extend([f"\t- {v}" for v in value])

    return person_info

def write_person_info(person_info):
    if person_info == "invalid_url":
        with open('people.txt', 'a') as file:
            file.write("invalid_url\n")
    else:
        with open('people.txt', 'a') as file:
            file.write('\n'.join(person_info) + "\n<----------------------------------------------------------------->\n")


headers = [
("Education", "Education"),
("Research Areas", "Research Areas"),
("Selected Publications", "Selected Publications"),
("Selected Talks", "Selected Talks")
]

for prof in profs:
    url = f"https://chemistry.illinois.edu/{prof}"
    #url = f"https://cs.illinois.edu/about/people/all-faculty/{prof}"
    person_info = extract_person_info(url, headers)
    if person_info:
        write_person_info(person_info)
    else:
        print(f"Error: Could not extract person info from {url}")
