In [75]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

# URL of the Purdue ECE Faculty directory
url = "https://engineering.purdue.edu/ECE/People/Faculty"

# Lists to store extracted data. They are created before the request so that
# `df` can always be built at the end of the cell, even when the page cannot
# be retrieved (otherwise later cells fail with NameError or pick up a stale
# `df` left over from a previous run).
names = []
emails = []
titles = []
profile_urls = []

# Send a request to fetch the page content. requests applies no timeout by
# default, so without one an unresponsive server would hang the cell forever.
response = requests.get(url, timeout=30)
if response.status_code == 200:
    soup = BeautifulSoup(response.content, 'html.parser')

    # Find all faculty entries
    faculty_entries = soup.find_all("div", class_="list-info")

    for entry in faculty_entries:
        # Extract name and profile URL from the first link inside the entry
        name_tag = entry.find("a")
        if name_tag:
            # Replace non-breaking spaces with regular spaces
            name = name_tag.text.strip().replace("\u00A0", " ")
            profile_url = name_tag.get("href")
            names.append(name)
            profile_urls.append(profile_url)
        else:
            names.append("N/A")
            profile_urls.append("N/A")

        # Extract title ("N/A" when the entry has no short-title element)
        title_tag = entry.find("div", class_="short-title")
        title = title_tag.get_text(strip=True) if title_tag else "N/A"
        titles.append(title)

        # Extract email ("N/A" when the entry has no email element)
        email_tag = entry.find("div", class_="email")
        email = email_tag.get_text(strip=True) if email_tag else "N/A"
        emails.append(email)
else:
    print("Failed to retrieve the page.")

# Create DataFrame. On a failed request the lists are empty, which yields an
# empty frame that still has the expected columns.
df = pd.DataFrame({
    "Name": names,
    "Email": emails,
    "Title": titles,
    "Profile URL": profile_urls
})


In [76]:
def clean_text(text):
    """Return `text` with double quotes and the " (area chair)" marker removed.

    Removal is case-sensitive and happens in order: quote characters first,
    then the literal substring " (area chair)" (including its leading space).
    """
    cleaned = text
    for unwanted in ('"', " (area chair)"):
        cleaned = cleaned.replace(unwanted, "")
    return cleaned

# Normalise every faculty name element-wise with clean_text.
df['Name'] = df['Name'].map(clean_text)

In [77]:
# Display the faculty directory DataFrame as the cell output.
df

Unnamed: 0,Name,Email,Title,Profile URL
0,Hadiseh Alaeian,halaeian@purdue.edu,Assistant Professor of ECE and Physics and Ast...,https://engineering.purdue.edu/ECE/People/ptPr...
1,Muhammad Ashraful Alam,alam@purdue.edu,Jai N. Gupta Distinguished Professor of Electr...,https://engineering.purdue.edu/ECE/People/ptPr...
2,Dionysios Aliprantis,dionysios@purdue.edu,Professor of Electrical and Computer Engineering,https://engineering.purdue.edu/ECE/People/ptPr...
3,Jan P. Allebach,allebach@ecn.purdue.edu,Distinguished Professor Emeritus of Electrical...,https://engineering.purdue.edu/ECE/People/ptPr...
4,Joerg Appenzeller,appenzeller@purdue.edu,Barry M. and Patricia L. Epstein Professor of ...,https://engineering.purdue.edu/ECE/People/ptPr...
...,...,...,...,...
134,Peide Peter Ye,yep@purdue.edu,Richard J. and Mary Jo Schwartz Professor of E...,https://engineering.purdue.edu/ECE/People/ptPr...
135,Stanislaw H. Zak,zak@ecn.purdue.edu,Professor of Electrical and Computer Engineering,https://engineering.purdue.edu/ECE/People/ptPr...
136,Fengqing Maggie Zhu,zhu0@purdue.edu,Associate Professor of Electrical and Computer...,https://engineering.purdue.edu/ECE/People/ptPr...
137,Carla Zoltowski,cbz@purdue.edu,Associate Professor of Engineering Practice,https://engineering.purdue.edu/ECE/People/ptPr...


In [78]:
# Show the profile URL stored in the first row of the directory table.
df['Profile URL'].iloc[0]

'https://engineering.purdue.edu/ECE/People/ptProfile?resource_id=242740'

In [79]:
# import requests
# from bs4 import BeautifulSoup
# import pandas as pd
# data = {
#     "Name": ["Hadiseh Alaeian", "Muhammad Ashraful Alam", "Dionysios Aliprantis"],
#     "Email": ["halaeian@purdue.edu", "alam@purdue.edu", "dionysios@purdue.edu"],
#     "Title": [
#         "Assistant Professor of ECE and Physics and Astronomy",
#         "Jai N. Gupta Distinguished Professor of Electrical and Computer Engineering",
#         "Professor of Electrical and Computer Engineering"
#     ],
#     "Profile URL": [
#         "https://engineering.purdue.edu/ECE/People/ptProfile?resource_id=242740",
#         "https://engineering.purdue.edu/ECE/People/ptProfile?resource_id=3171",
#         "https://engineering.purdue.edu/ECE/People/ptProfile?resource_id=123456"
#     ]
# }
# df = pd.DataFrame(data)[:1]
# for index, row in df.iterrows():
#     url = row["Profile URL"]
#     response = requests.get(url)
#     if response.status_code == 200:
#         soup = BeautifulSoup(response.content, 'html.parser')
#         bio_text=soup.find("div")
#         print(bio_text)
#     else:
#         print(f"Failed to retrieve page for {row['Name']}")


In [81]:
import tqdm as tqdm


def _extract_profile_fields(soup):
    """Pull degrees, research text and areas of interest out of a profile page.

    Parameters
    ----------
    soup : BeautifulSoup
        Parsed HTML of a single faculty profile page.

    Returns
    -------
    dict
        Keys "Degrees", "Research" and "Areas of Interest". "Degrees" and
        "Areas of Interest" are lists of strings, or the string "N/A" when
        nothing was found; "Research" is a string ("N/A" when missing).
    """
    # Degrees: every <li> inside the profile-degrees container
    degrees = []
    degrees_section = soup.find("div", class_="profile-degrees")
    if degrees_section:
        degrees = [li.get_text(strip=True) for li in degrees_section.find_all("li")]

    # Research: first profile-research paragraph after the "Research" heading
    research = "N/A"
    research_heading = soup.find("h2", string="Research")
    if research_heading:
        research_paragraph = research_heading.find_next("p", class_="profile-research")
        if research_paragraph:
            research = research_paragraph.get_text(strip=True)

    # Areas of Interest: items of the first <ul> after the matching heading
    areas = []
    areas_heading = soup.find("h2", string="Areas of Interest")
    if areas_heading:
        areas_list = areas_heading.find_next("ul")
        if areas_list:
            areas = [li.get_text(strip=True) for li in areas_list.find_all("li")]

    return {
        "Degrees": degrees if degrees else "N/A",
        "Research": research,
        "Areas of Interest": areas if areas else "N/A",
    }


# Column order of the final table; passed explicitly so the frame keeps its
# columns even if every single profile request fails.
profile_columns = [
    "Name", "Email", "Title", "Degrees",
    "Research", "Areas of Interest", "Profile URL",
]
extracted_data = []

# One Session for all requests so the underlying connection is reused instead
# of being re-established for every profile page.
with requests.Session() as session:
    for i, person in tqdm.tqdm(df.iterrows(), total=df.shape[0]):
        url = person["Profile URL"]
        try:
            # requests has no default timeout; without one a stalled server
            # would block the whole loop indefinitely.
            response = session.get(url, timeout=30)
            response.raise_for_status()  # Raise an HTTPError for bad responses

            soup = BeautifulSoup(response.content, 'html.parser')
            fields = _extract_profile_fields(soup)

            # Append extracted information to the list
            extracted_data.append({
                "Name": person["Name"],
                "Email": person["Email"],
                "Title": person["Title"],
                "Degrees": fields["Degrees"],
                "Research": fields["Research"],
                "Areas of Interest": fields["Areas of Interest"],
                "Profile URL": person["Profile URL"]
            })

        # A failed profile is reported and skipped; it does not appear in
        # extracted_data. ConnectionError is checked before Timeout so that
        # connection failures keep their original message.
        except requests.ConnectionError:
            print(f"Connection error occurred for {person['Name']} at {url}.")
        except requests.Timeout:
            print(f"Request timed out for {person['Name']} at {url}.")
        except requests.HTTPError as http_err:
            print(f"HTTP error occurred for {person['Name']}: {http_err}")
        except Exception as err:
            print(f"An error occurred for {person['Name']}: {err}")

# Create a DataFrame from the extracted data
extracted_df = pd.DataFrame(extracted_data, columns=profile_columns)

extracted_df


100%|██████████| 139/139 [04:22<00:00,  1.89s/it]


Unnamed: 0,Name,Email,Title,Degrees,Research,Areas of Interest,Profile URL
0,Hadiseh Alaeian,halaeian@purdue.edu,Assistant Professor of ECE and Physics and Ast...,"[BS, University of Tehran, Iran, 2007, MS, Uni...","My research focuses on hybrid, scalable, and i...","[Fields and Optics, Microelectronics and Nanot...",https://engineering.purdue.edu/ECE/People/ptPr...
1,Muhammad Ashraful Alam,alam@purdue.edu,Jai N. Gupta Distinguished Professor of Electr...,"[BSEE, Bangladesh University of Engineering an...","Physics of electronic, optoelectronic, and bio...",[Microelectronics and Nanotechnology],https://engineering.purdue.edu/ECE/People/ptPr...
2,Dionysios Aliprantis,dionysios@purdue.edu,Professor of Electrical and Computer Engineering,"[Diploma, ECE, National Technical University o...","electric machines and drives, power systems, i...",[Power and Energy Systems(Area Chair)],https://engineering.purdue.edu/ECE/People/ptPr...
3,Jan P. Allebach,allebach@ecn.purdue.edu,Distinguished Professor Emeritus of Electrical...,"[BS, University of Delaware, 1972, MS, Princet...","Electronic imaging systems, image capture and ...","[Communications, Networking, Signal & Image Pr...",https://engineering.purdue.edu/ECE/People/ptPr...
4,Joerg Appenzeller,appenzeller@purdue.edu,Barry M. and Patricia L. Epstein Professor of ...,"[BS, Technical University Aachen, Germany, 198...",Device and transport physics of low-dimensiona...,[Microelectronics and Nanotechnology],https://engineering.purdue.edu/ECE/People/ptPr...
...,...,...,...,...,...,...,...
134,Peide Peter Ye,yep@purdue.edu,Richard J. and Mary Jo Schwartz Professor of E...,"[BS, Fudan University, Shanghai, China, 1988, ...","Semiconductor physics and devices, Nano-struct...",[Microelectronics and Nanotechnology],https://engineering.purdue.edu/ECE/People/ptPr...
135,Stanislaw H. Zak,zak@ecn.purdue.edu,Professor of Electrical and Computer Engineering,"[BEE, Warsaw University of Technology, 1974, M...","Control, optimization, nonlinear systems, neur...","[Automatic Controls, Communications, Networkin...",https://engineering.purdue.edu/ECE/People/ptPr...
136,Fengqing Maggie Zhu,zhu0@purdue.edu,Associate Professor of Electrical and Computer...,"[BS in Electrical Engineering, Purdue Universi...","Image processing and analysis, video compressi...","[Communications, Networking, Signal & Image Pr...",https://engineering.purdue.edu/ECE/People/ptPr...
137,Carla Zoltowski,cbz@purdue.edu,Associate Professor of Engineering Practice,,,,https://engineering.purdue.edu/ECE/People/ptPr...


In [88]:
# Persist the scraped profile table to a pickle file in the working directory.
pd.to_pickle(extracted_df, "Purdue_ECE_Profs.pkl")