In [None]:
import json
import requests
from bs4 import BeautifulSoup, Comment
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from collections import Counter
from gensim.test.utils import common_texts
from concurrent.futures import ThreadPoolExecutor

In [36]:
data = []
base_url = "https://researchops.web.illinois.edu"

# Scape Main Page
def scrape_main_page(page_num):
    url = base_url + "/?page=" + str(page_num)
    response = requests.get(url)
    if response.status_code == 200:
        soup = BeautifulSoup(response.content, 'html.parser')
        table_rows = soup.find_all('tr')
        for row in table_rows:
            first_td = row.find('td', headers="view-title-table-column")
            if first_td:
                topic_element = first_td.find('a')
                topic = topic_element.text.strip() if topic_element else None
                detail_url = base_url + topic_element['href'] if topic_element else None  # 詳細頁面 URL
                description = first_td.text.strip().replace(topic, "").strip() if topic else None

                columns = row.find_all('td')[1:]
                other_data = [column.text.strip() for column in columns]
                
                row_data = [topic, description, detail_url] + other_data
                data.append(row_data)


for i in range(0, 9):  
    scrape_main_page(i)

columns = ["Topic", "Description", "Detail URL", "Research Area", "Opportunity Timing", "Deadline Date"]
research_opps = pd.DataFrame(data, columns=columns)

full_data = []

# sraping webpage
def scrape_detail_page(row):
    detail_url = row['Detail URL']
    detail_response = requests.get(detail_url)
    if detail_response.status_code == 200:
        detail_soup = BeautifulSoup(detail_response.content, 'html.parser')

        # extract details like sponsoring_institution, location, etc.
        def extract_field(label):
            """根據標籤名稱提取對應數據"""
            label_div = detail_soup.find('div', class_="field__label", string=label)
            if label_div:
                value_div = label_div.find_next_sibling('div', class_="field__item")
                return value_div.text.strip() if value_div else None
            return None

        sponsoring_institution = extract_field("Sponsoring Institution")
        location = extract_field("Location")
        duration = extract_field("Duration")
        compensation = extract_field("Compensation")
        citizenship_requirement = extract_field("Citizenship Requirement")

        # Combine details of research opportunities
        full_data.append({
            "Topic": row["Topic"],
            "Description": row["Description"],
            "Detail URL": detail_url,
            "Research Area": row["Research Area"],
            "Opportunity Timing": row["Opportunity Timing"],
            "Deadline Date": row["Deadline Date"],
            "Sponsoring Institution": sponsoring_institution,
            "Location": location,
            "Duration": duration,
            "Compensation": compensation,
            "Citizenship Requirement": citizenship_requirement,
        })

with ThreadPoolExecutor(max_workers=5) as executor:
    executor.map(scrape_detail_page, [row for _, row in research_opps.iterrows()])

# to DataFrame
final_df = pd.DataFrame(full_data)
final_df.to_csv("research_opportunities_full.csv", index=False)


In [37]:
filter_data = pd.read_csv("research_opportunities_full.csv")
filter_data

Unnamed: 0,Topic,Description,Detail URL,Research Area,Opportunity Timing,Deadline Date,Sponsoring Institution,Location,Duration,Compensation,Citizenship Requirement
0,Watershed Management Research and Outreach Und...,This program will provide students with broade...,https://researchops.web.illinois.edu/opportuni...,"Agriculture & Food Sciences, Natural Sciences,...",Summer,Anticipated 12/8/24,North Central Region Water Network,Various,10 weeks,"> $10,000","US Citizen, National, or Permanent Resident re..."
1,Dryland Sustainability Research (IRES),This International Research Experiences for St...,https://researchops.web.illinois.edu/opportuni...,"Agriculture & Food Sciences, Humanities & Arts...",Summer,12/15/24,"Cornell University, Princeton University, Notr...","Nanyuki, Kenya",7 weeks,,"US Citizen, National, or Permanent Resident re..."
2,MindCORE Summer Fellowship Program at UPenn,This fellowship provides students with a 10-we...,https://researchops.web.illinois.edu/opportuni...,"Medicine & Health, Social Sciences & Behavior",Summer,Anticipated 12/31/24,University of Pensylvania,"Philadelphia, PA",10 weeks,"$6,000","US Citizen, National, or Permanent Resident re..."
3,PINNACLE - Renewables and Cybersecurity for th...,"In this eight-week program, students will gain...",https://researchops.web.illinois.edu/opportuni...,Science & Technology,Summer,12/15/24,Virginia Polytechnic Institute and State Unive...,"Grenoble, France",8 weeks,"$4,400","US Citizen, National, or Permanent Resident re..."
4,Injury Prevention Center Summer Internships,The internships are designed for students inte...,https://researchops.web.illinois.edu/opportuni...,Medicine & Health,Summer,Anticipated 12/17/24,University of Michigan,"Ann Arbor, MI",10 weeks,"$5,700",No Citizenship Requirements
...,...,...,...,...,...,...,...,...,...,...,...
220,Congressional Hispanic Caucus Insititute (CHCI),CIP provides Latino undergraduates of all majo...,https://researchops.web.illinois.edu/opportuni...,Social Sciences & Behavior,"Fall, Summer",Anticipated 12/1/25,Congressional Hispanic Caucus Institute,"Washington, DC",10 weeks,"$3,125","US Citizen, National, or Permanent Resident re..."
221,Northwestern University Minority Health and He...,The T37 provides health disparities research ...,https://researchops.web.illinois.edu/opportuni...,Medicine & Health,Summer,Rolling,Northwestern University,"Evanston, IL",12 weeks,"$3,400","US Citizen, National, or Permanent Resident re..."
222,Molecular Biology and Genetics of Cell Signali...,The Summer Research Experience for Undergradua...,https://researchops.web.illinois.edu/opportuni...,Medicine & Health,Summer,Anticipated 1/31/26,Cornell University,"Ithaca, NY",10 weeks,"$7,000","US Citizen, National, or Permanent Resident re..."
223,Illinois Computer Science Summer Research Prog...,"In the , students work with Illinois faculty m...",https://researchops.web.illinois.edu/opportuni...,Science & Technology,Spring,Rolling,University of Illinois at Urbana-Champaign,"Urbana, IL",10 weeks,,No Citizenship Requirements
