Part 1


In [1]:
import time
from urllib.parse import urljoin

import pandas as pd
import requests
from bs4 import BeautifulSoup

# Directory page that lists the faculty, and the site root used to resolve the
# relative profile links found on that page
main_url = "https://physics.gatech.edu/people/professors"
base_url = "https://physics.gatech.edu"

# Fetch the directory page and parse the HTML.
# The timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() stops an HTTP error page from being parsed as if it were
# the directory.
response = requests.get(main_url, timeout=30)
response.raise_for_status()
soup = BeautifulSoup(response.content, 'html.parser')

# Save a prettified copy of the parsed page so its structure can be inspected.
# prettify('utf-8') is written as bytes, hence the binary write mode.
with open('main_url_html_parser.html', 'wb') as file:
    file.write(soup.prettify('utf-8'))

In [2]:
# Extract links to each faculty member's profile page.
# urljoin resolves every href against the site root, so relative paths
# ("/user/...") and already-absolute URLs both produce a valid link, whereas
# plain string concatenation would corrupt the latter.
faculty_links = [
    urljoin(base_url, a['href']) for a in soup.select('h3.p-name a[href]')
]

# print faculty links
print(faculty_links)

['https://physics.gatech.edu/user/david-ballantyne', 'https://physics.gatech.edu/user/shiladitya-banerjee', 'https://physics.gatech.edu/user/tamara-bogdanovi%C4%87', 'https://physics.gatech.edu/user/laura-cadonati', 'https://physics.gatech.edu/user/michael-chapman', 'https://physics.gatech.edu/user/jennifer-curtis', 'https://physics.gatech.edu/user/predrag-cvitanovi%C4%87', 'https://physics.gatech.edu/user/dragomir-davidovic', 'https://physics.gatech.edu/user/walter-de-heer', 'https://physics.gatech.edu/user/chunhui-du', 'https://physics.gatech.edu/user/flavio-fenton', 'https://physics.gatech.edu/user/phillip-first', 'https://physics.gatech.edu/user/daniel-goldman', 'https://physics.gatech.edu/user/roman-grigoriev', 'https://physics.gatech.edu/user/jc-gumbart', 'https://physics.gatech.edu/user/zhigang-jiang', 'https://physics.gatech.edu/user/brian-kennedy', 'https://physics.gatech.edu/user/harold-kim', 'https://physics.gatech.edu/user/itamar-kimchi', 'https://physics.gatech.edu/user/it

In [3]:
# Extract the names of the faculty members.
# Only anchors that carry an href are used -- the same selector the profile
# links are built from -- so names stays aligned one-to-one with faculty_links
# even if the page contains a name anchor without a link.
names = [a.get_text(strip=True) for a in soup.select('h3.p-name a[href]')]

# print names
print(names)

['David Ballantyne', 'Shiladitya Banerjee', 'Tamara Bogdanović', 'Laura Cadonati', 'Michael Chapman', 'Jennifer Curtis', 'Predrag Cvitanović', 'Dragomir Davidovic', 'Walter de Heer', 'Chunhui Rita Du', 'Flavio Fenton', 'Phillip First', 'Daniel Goldman', 'Roman Grigoriev', 'JC Gumbart', 'Zhigang Jiang', 'Brian Kennedy', 'Harold Kim', 'Itamar Kimchi', 'Itamar Kolvin', 'Uzi Landman', 'Gongjie Li', 'Zhu-Xi Luo', 'Martin Maldovan', 'Elisabetta Matsumoto', 'Martin Mourigal', 'A. Nepomuk Otte', 'Feryal Özel', 'Colin V Parker', 'Dimitrios Psaltis', 'Michael Pustilnik', 'Chandra Raman', 'D.  Zeb Rocklin', 'Carlos Sa de Melo', 'Surabhi Sachdev', 'Michael Schatz', 'Audrey Sederberg', 'Simon Sponberg', 'Ignacio Taboada', 'Rick Trebino', 'Hailong Wang', 'Xueda Wen', 'Kurt Wiesenfeld', 'John Wise', 'Peter Yunker', 'Andrew Zangwill']


In [4]:
# faculty_response = requests.get("https://physics.gatech.edu/user/david-ballantyne")
# faculty_soup = BeautifulSoup(faculty_response.content, 'html.parser')

# ## Save soup to inspect
# with open('faculty_url_html_parser.html', 'wb') as file:  # create the output file with this name
#   file.write(faculty_soup.prettify('utf-8')) # write a prettified, UTF-8 encoded copy of the soup to the file

In [5]:
import time

# Define a function to retrieve the research information of each faculty member given their URL
def get_research(faculty_link, timeout=10, delay=1):
    """Return the research description found on a faculty profile page.

    The page is fetched and parsed, the ``<h3>`` whose text is exactly
    "Research" is located, and the stripped text of every ``<p>`` sibling
    that follows it is joined with single spaces.

    Parameters
    ----------
    faculty_link : str
        URL of the faculty member's profile page.
    timeout : float, optional
        Seconds to wait for the server before giving up, so one stalled
        request cannot hang the whole scrape.
    delay : float, optional
        Seconds to pause after the request (successful or not) to avoid
        overwhelming the server. A falsy value disables the pause.

    Returns
    -------
    str
        The joined research text; "Research information not found" when the
        page has no "Research" header; or "Error retrieving research
        information" when the request fails or the server answers with an
        HTTP error status.
    """
    try:
        # Make a GET request to the faculty profile page
        faculty_response = requests.get(faculty_link, timeout=timeout)
        # Treat 4xx/5xx answers as failures instead of parsing the error page
        faculty_response.raise_for_status()
        faculty_soup = BeautifulSoup(faculty_response.content, 'html.parser')

        # Find the research header and retrieve the text in the following <p> tags
        research_section = faculty_soup.find('h3', string="Research")
        if research_section is None:
            return "Research information not found"

        paragraphs = research_section.find_next_siblings('p')
        return ' '.join(p.get_text(strip=True) for p in paragraphs)

    except requests.exceptions.RequestException:
        return "Error retrieving research information"

    finally:
        # Slow down on every path, including failures, so repeated errors
        # do not turn into rapid-fire requests against the server
        if delay:
            time.sleep(delay)

# Fetch the research text for every profile link, preserving the link order
research_texts = list(map(get_research, faculty_links))

print(research_texts)

['high-energy astrophysics; accretion disks; galaxy and black hole evolution My research concentrates on topics in high-energy astrophysics with an emphasis on interpreting existing data and making predictions for future observations. My main interests are: (i) the evolution of galaxies and their central supermassive black holes and (ii) the physics of accretion disks around both black holes and neutron stars. My work often involves comparing computer based models with published data from X-ray, radio and/or infrared telescopes. All of these directions are guided by my underlying research philosophy of using any necessary tool or technique, unlimited by wavelength range or physical process, to make progress in understanding the relevant physics. Students working with me will be exposed to a wide range of astrophysical processes and theories, but will also learn to respect and be guided by observational constraints. Papers Since Arriving at Georgia Tech  ', 'Physics of Living Systems: C

In [6]:
# Assemble one table: a row per faculty member, a column per scraped field
profile_columns = {
    "Name": names,
    "Profile URL": faculty_links,
    "Research": research_texts,
}
d1 = pd.DataFrame(profile_columns)

# Preview the first rows of the table
d1.head()

Unnamed: 0,Name,Profile URL,Research
0,David Ballantyne,https://physics.gatech.edu/user/david-ballantyne,high-energy astrophysics; accretion disks; gal...
1,Shiladitya Banerjee,https://physics.gatech.edu/user/shiladitya-ban...,Physics of Living Systems: Cell and Tissue mec...
2,Tamara Bogdanović,https://physics.gatech.edu/user/tamara-bogdano...,My research interests are in astrophysics of s...
3,Laura Cadonati,https://physics.gatech.edu/user/laura-cadonati,I joined the Center for Relativistic Astrophys...
4,Michael Chapman,https://physics.gatech.edu/user/michael-chapman,"Contemporary quantum mechanics, manipulating t..."
