In [1]:
import requests
from bs4 import BeautifulSoup
import re

In [6]:
def fetch_and_clean_text(url):
    """
    Fetches and cleans text from the given URL.
    :param url: The URL to fetch text from.
    :return: Cleaned text or an error message.
    """
    try:
        # Make an HTTP GET request
        response = requests.get(url, timeout=5)
        response.raise_for_status()  # Raise an HTTPError for bad responses (4xx and 5xx)
        
        # Parse the HTML content
        soup = BeautifulSoup(response.text, "html.parser")
        
        # Extract the main text content
        # We can focus on specific tags (e.g., <p>, <div>) or use the whole text
        text_elements = soup.find_all(["p", "div"])
        text = " ".join(element.get_text() for element in text_elements)
        
        # Clean the text
        text = re.sub(r'\s+', ' ', text)  # Remove extra whitespace
        text = text.strip()  # Remove leading/trailing whitespace
        
        # Handle empty text scenario
        if not text:
            return f"Error: No extractable text found at {url}"
        return text
    
    except requests.exceptions.RequestException as e:
        # Handle HTTP and connection errors
        return f"Error: Unable to fetch content from {url}. Exception: {e}"
    except Exception as e:
        # Handle other unexpected errors
        return f"Error: Unexpected error while processing {url}. Exception: {e}"

In [7]:
def process_links(links):
    """
    Processes a list of links, extracting and cleaning text content.
    :param links: List of URLs.
    :return: Dictionary with URLs as keys and cleaned text (or error messages) as values.
    """
    results = {}
    for url in links:
        print(f"Processing: {url}")
        text = fetch_and_clean_text(url)
        results[url] = text
    return results

In [4]:
links = [
    'https://ai.stanford.edu/blog/longer-sequences-next-leap-ai/',
    # Add more URLs as needed
]

# Process the links
extracted_data = process_links(links)

Processing: https://ai.stanford.edu/blog/longer-sequences-next-leap-ai/


In [5]:
extracted_data

{'https://ai.stanford.edu/blog/longer-sequences-next-leap-ai/': 'The Stanford AI Lab Blog About Posts All Conferences Computer Vision Robotics NLP Machine Learning Reinforcement Learning Subscribe SAIL The Stanford AI Lab Blog About Posts All Conferences Computer Vision Robotics NLP Machine Learning Reinforcement Learning Subscribe SAIL All Conferences Computer Vision Robotics NLP Machine Learning Reinforcement Learning Can Longer Sequences Help Take the Next Leap in AI? Chris Ré, Tri Dao, Dan Fu, Karan Goel June 9, 2022 Deep learning has revolutionized machine learning. To a first approximation, deeper has been better. However, there is another dimension to scale these models: the size of the input. Even the world’s most impressive models can only process long-form content by dismembering it into isolated, disconnected chunks of a few hundred words to fit their length requirements. There is a good reason: the ubiquitous Transformer model is an absolute wonder, but it is difficult to s

<h1>Prompting a QA dataset</h1>

In [None]:
all_messages = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": f"I am trying to create a dataset of quiz questions and answers I can use to fine-tune a model. I want you to create that set of 5 quiz questions and answers using the data I give you below"},
    {"role": "user", "content": f"Here is the data I want you to make quiz questions and answers from: {text}."},
    {"role": "user", "content": "Please format the output as a list of python dictionaries where each dictionary represents one question answer pair. Here is an example of the structure [{'question':extracted question, 'answer':extracted answer}]"},
    {"role": "user", "content": f"Please return the format in a way that I can easily parse it into a python dictionary"}
]