In [9]:
import requests
from bs4 import BeautifulSoup, Comment
import os
import openai
import re
import ast
import pandas as pd
import time
from tqdm import tqdm
from transformers import GPT2Tokenizer

tokenizer = GPT2Tokenizer.from_pretrained('gpt2')  # Initialize tokenizer

In [10]:
def extract_text_and_urls(url):
    """
    Function to extract the text and URLs from a webpage
    """
    output_string = ""
    try:
        response = requests.get(url)
        response.raise_for_status()  
        soup = BeautifulSoup(response.text, 'html.parser')
        for script in soup(["script", "style"]):
            script.extract()
        for comments in soup.findAll(text=lambda text: isinstance(text, Comment)):
            comments.extract()

        last_string = None
        for tag in soup.recursiveChildGenerator():
            if tag.name == 'a' and tag.get('href'):  
                output_string += tag.get('href') + '\n'
            elif tag.string:
                text = tag.string.strip()  
                if text and text != last_string: 
                    output_string += repr(text) + '\n'
                    last_string = text  

        tokens = tokenizer.encode(output_string)  # Encode the string into tokens
        token_chunks = [tokens[i:i + 3000] for i in range(0, len(tokens), 3000)]  # Group tokens into sets of 4000
        web_strings = [tokenizer.decode(chunk) for chunk in token_chunks]  # Decode each chunk back into a string



    except (requests.RequestException, ValueError):
        print('Invalid URL or unable to make the request')
        return None

    return web_strings  # Return a list of web strings instead of a single string



In [11]:
def openai_chat_gpt(prompt, text, api_key, model):
    """
    Function to interact with ChatGPT
    """
    openai.api_key = api_key
    message = [
        {"role": "system", "content": text},
        {"role": "user", "content": prompt}
    ]
    response = openai.ChatCompletion.create(
        model=model,
        messages=message,
        organization="org-TTgWchV9zhTLH5bQKcFyPZpF"
    )
    time.sleep(5)
    return response['choices'][0]['message']['content']


In [12]:
def fetch_institution_name(web_string, api_key, model):
    """
    Function to fetch institution name from given string using OpenAI model
    """
    prompt1 = "Please provide the primary institution name from the following information in a single-item Python list format, without explanation."
    return openai_chat_gpt(prompt1, web_string, api_key, model)


In [13]:
def fetch_faculty_data(web_strings, api_key, model):
    """
    Function to fetch faculty data from given string using OpenAI model
    """
    prompt2 = "Construct a list named 'faculty_list'. Use the information given to form tuples of four elements: faculty member's name, academic title, research area(s), and URL. If no information is available, return an empty list. For any unknown items, use 'N/A'."
    faculty_string_final = ""
    for text in web_strings:
        faculty_string = openai_chat_gpt(prompt2, text, api_key, model)
        if faculty_string:
            faculty_string_final += faculty_string + " "  
    return faculty_string_final


In [14]:
def extract_python_lists_from_string(input_string):
    """
    Function to extract python lists from a string
    """
    pattern = re.compile(r'\w+\s*=\s*\[.*?\]', re.DOTALL)
    matches = pattern.findall(input_string)
    combined_list = []
    for match in matches:
        # split each match into separate list assignments
        assignments = match.split('faculty_list = [')
        for assignment in assignments:
            if not assignment:
                continue
            assignment = '[' + assignment  # add the opening bracket back
            try:
                python_list = ast.literal_eval(assignment)
                if isinstance(python_list, list):
                    combined_list.extend(python_list)
            except (ValueError, SyntaxError):
                print(f"Error converting string to list: {assignment}")
    return combined_list


In [15]:
OPENAI_API_KEY = "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"
MODEL = "gpt-3.5-turbo"
urls = ['https://www.soest.hawaii.edu/oceanography/ocn-people/ocn-faculty/', 
       'https://ceoas.oregonstate.edu/oceanography-faculty']
dfs = [] 

In [16]:
#I should also select another windows of tokens to prevent missing any information. 
for url in urls:
    print(f"Processing {url}")
    web_strings = extract_text_and_urls(url)  # Now this returns a list of strings
    institution_name = fetch_institution_name(web_strings[0], OPENAI_API_KEY, MODEL)
    faculty_string_final = fetch_faculty_data(web_strings, OPENAI_API_KEY, MODEL)
    combined_list = extract_python_lists_from_string(faculty_string_final)
    df = pd.DataFrame(combined_list, columns=['PI_Name', 'Academic_title', 'Research_area', 'URL'])
    df['Institution_Name'] = institution_name
    dfs.append(df)


Processing https://www.soest.hawaii.edu/oceanography/ocn-people/ocn-faculty/


  for comments in soup.findAll(text=lambda text: isinstance(text, Comment)):
Token indices sequence length is longer than the specified maximum sequence length for this model (4679 > 1024). Running this sequence through the model will result in indexing errors


Processing https://ceoas.oregonstate.edu/oceanography-faculty


  for comments in soup.findAll(text=lambda text: isinstance(text, Comment)):


In [17]:
final_df = pd.concat(dfs, ignore_index=True)
final_df.to_csv('final_dataframe.csv', index=False)
print(final_df)

                  PI_Name                                     Academic_title  \
0               Ho, David                                          Professor   
1           Howell, Steve                               Assistant Researcher   
2             Karl, David                                          Professor   
3         Kealoha, Andrea                                Assistant Professor   
4         Luther, Douglas                                          Professor   
5       McManus, Margaret                                          Professor   
6           Nelson, Craig                                          Professor   
7           Powell, Brian                                          Professor   
8                 Qiu, Bo                                          Professor   
9        Richards, Kelvin                                          Professor   
10   Ruttenberg, Kathleen                                          Professor   
11    Sabine, Christopher               