In [2]:
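# News sources scraped below, with their political-leaning label:
#   center: Politico Europe (politico.eu)
#   right:  Daily Herald (dailyherald.com)
# (both outlets are used in the original NewB dataset)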

In [13]:
import requests
from bs4 import BeautifulSoup
import re
import urllib.parse

def search_politico_articles(query, max_results=300, timeout=10):
    """Collect Politico Europe article URLs from Google search result pages.

    Google is queried with ``"<query> site:politico.eu"`` in pages of up to
    100 results. Every anchor on each result page is inspected and the ones
    pointing at ``https://www.politico.eu/article...`` are kept.

    Args:
        query: Search terms placed in front of the ``site:`` filter.
        max_results: Upper bound on the number of URLs returned.
        timeout: Seconds to wait for each Google response before giving up.

    Returns:
        A list of unique article URLs in first-seen order, at most
        ``max_results`` long. If a request fails, the error is printed and
        whatever was collected so far is returned.
    """
    search_url = "https://www.google.com/search"
    article_prefix = "https://www.politico.eu/article"
    page_size = 100
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
    }

    links = []
    for start in range(0, max_results, page_size):
        params = {
            "q": f"{query} site:politico.eu",
            # Never ask for more than is still needed to reach max_results.
            "num": min(page_size, max_results - start),
            "start": start
        }
        try:
            # Without a timeout a stalled connection would block forever.
            response = requests.get(search_url, params=params, headers=headers, timeout=timeout)
            response.raise_for_status()
        except requests.exceptions.RequestException as e:
            print(f"Error during Google search: {e}")
            break

        soup = BeautifulSoup(response.text, 'html.parser')
        for link in soup.find_all('a'):
            href = link.get('href')
            if not href:
                continue
            # Google may wrap result links as "/url?q=<target>&sa=...";
            # unwrap those so the real article URL is stored.
            if href.startswith("/url?") or href.startswith("https://www.google.com/url?"):
                wrapped = urllib.parse.parse_qs(urllib.parse.urlsplit(href).query)
                target = wrapped.get("q") or wrapped.get("url")
                if target:
                    href = target[0]
            # Prefix match (not substring) so links that merely embed an
            # article URL as a parameter are not mistaken for articles.
            if href.startswith(article_prefix):
                links.append(href)

    # dict.fromkeys removes duplicates while keeping a reproducible order.
    return list(dict.fromkeys(links))[:max_results]


def fetch_article(url, timeout=10):
    """Download a page and return its body as text.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a single unresponsive host would stall the whole scrape.

    Returns:
        The response body as a string, or ``None`` if the request failed
        (connection error, timeout, or an HTTP error status). Failures are
        reported with a printed message rather than raised.
    """
    try:
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()
        return response.text
    except requests.exceptions.RequestException as e:
        print(f"Error fetching {url}: {e}")
        return None

def extract_sentences(html_content, keyword='Biden'):
    """Return the sentences inside ``<p>`` elements that mention ``keyword``.

    Args:
        html_content: HTML of a page. ``None`` or an empty string yields an
            empty list.
        keyword: Case-sensitive substring a sentence must contain to be kept.

    Returns:
        A list of matching sentences, in document order, each stripped of
        surrounding whitespace and free of embedded line breaks.
    """
    if not html_content:
        return []

    soup = BeautifulSoup(html_content, 'html.parser')

    # Split on whitespace that follows '.', '?' or '!', except after
    # dotted abbreviations such as "U.S." or short titles such as "Mr.".
    boundary = re.compile(r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=[.?!])\s')

    sentences = []
    for paragraph in soup.find_all('p'):
        # Collapse runs of whitespace (including newlines) to single spaces
        # so each sentence stays on one line when written out line by line.
        text = " ".join(paragraph.get_text().split())
        for sentence in boundary.split(text):
            if keyword in sentence:
                sentences.append(sentence.strip())
    return sentences

def scrape_politico_biden_sentences(query):
    """Search for Politico articles matching ``query`` and gather sentences.

    Each URL returned by ``search_politico_articles`` is downloaded with
    ``fetch_article``; pages that could not be fetched are skipped, and the
    remaining ones are passed to ``extract_sentences``.

    Returns:
        One flat list with the extracted sentences of every fetched page, in
        the order the URLs were processed. Empty when the search yields no
        URLs.
    """
    print("Searching for articles...")
    article_urls = search_politico_articles(query)

    collected = []
    if article_urls:
        for article_url in article_urls:
            print(f"Scraping {article_url}...")
            page = fetch_article(article_url)
            if not page:
                # Download failed or returned nothing; move on.
                continue
            collected += extract_sentences(page)
    else:
        print("No articles found")

    return collected

if __name__ == "__main__":
    # Scrape Politico for sentences mentioning the query term, print them
    # numbered from 1, and save them one per line.
    query = "Biden"
    sentences_about_biden = scrape_politico_biden_sentences(query)

    for i, sentence in enumerate(sentences_about_biden, 1):
        print(f"{i}: {sentence}")

    # Explicit UTF-8: scraped text may contain characters the platform's
    # default encoding cannot represent, which would abort the write.
    with open("biden_sentences.txt", "w", encoding="utf-8") as file:
        file.write("\n".join(sentences_about_biden))


Searching for articles...
Scraping https://www.politico.eu/article/us-biden-warns-red-line-israel-rafah-cyprus-aid-ship-gets-ready-open-humanitarian-sea-corridor-gaza/...
Scraping https://www.politico.eu/article/europe-gives-us-president-joe-biden-a-one-finger-salute/...
Scraping https://www.politico.eu/article/petro-poroshenko-joe-biden-burisma/...
Scraping https://www.politico.eu/article/china-us-joe-biden-defend-taiwan-invasion/...
Scraping https://www.politico.eu/article/joe-biden-doubts-donald-trump-even-understands-nato-obligations-republican-candidate/...
Scraping https://www.politico.eu/article/joe-biden-summit-of-democracies-can-rally-allies-against-autocracies/...
Scraping https://www.politico.eu/article/revealed-the-us-uk-trade-talks-joe-biden-inherits/...
Scraping https://www.politico.eu/article/joe-biden-viktor-orban-hungary-democracy-summit-snub/...
Scraping https://www.politico.eu/article/joe-biden-germany-donald-trump-russia-ukraine-olaf-scholz/...
Scraping https://www.

In [14]:
import requests
from bs4 import BeautifulSoup
import re
import urllib.parse

def search_daily_herald_articles(query, max_results=300, timeout=10):
    """Collect Daily Herald page URLs from Google search result pages.

    Google is queried with ``"<query> site:dailyherald.com"`` in pages of up
    to 100 results. Every anchor on each result page is inspected and the
    ones pointing at ``https://www.dailyherald.com/`` are kept.

    Args:
        query: Search terms placed in front of the ``site:`` filter.
        max_results: Upper bound on the number of URLs returned.
        timeout: Seconds to wait for each Google response before giving up.

    Returns:
        A list of unique URLs in first-seen order, at most ``max_results``
        long. If a request fails, the error is printed and whatever was
        collected so far is returned.
    """
    search_url = "https://www.google.com/search"
    site_prefix = "https://www.dailyherald.com/"
    page_size = 100
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
    }

    links = []
    for start in range(0, max_results, page_size):
        params = {
            "q": f"{query} site:dailyherald.com",
            # Never ask for more than is still needed to reach max_results.
            "num": min(page_size, max_results - start),
            "start": start
        }
        try:
            # Without a timeout a stalled connection would block forever.
            response = requests.get(search_url, params=params, headers=headers, timeout=timeout)
            response.raise_for_status()
        except requests.exceptions.RequestException as e:
            print(f"Error during Google search: {e}")
            break

        soup = BeautifulSoup(response.text, 'html.parser')
        # Find all article links within search results
        for link in soup.find_all('a'):
            href = link.get('href')
            if not href:
                continue
            # Google may wrap result links as "/url?q=<target>&sa=...";
            # unwrap those so the real page URL is stored.
            if href.startswith("/url?") or href.startswith("https://www.google.com/url?"):
                wrapped = urllib.parse.parse_qs(urllib.parse.urlsplit(href).query)
                target = wrapped.get("q") or wrapped.get("url")
                if target:
                    href = target[0]
            # Prefix match (not substring) so links that merely embed a
            # Daily Herald URL as a parameter are not collected.
            if href.startswith(site_prefix):
                links.append(href)

    # dict.fromkeys removes duplicates while keeping a reproducible order.
    return list(dict.fromkeys(links))[:max_results]


def fetch_article(url, timeout=10):
    """Download a page and return its body as text.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a single unresponsive host would stall the whole scrape.

    Returns:
        The response body as a string, or ``None`` if the request failed
        (connection error, timeout, or an HTTP error status). Failures are
        reported with a printed message rather than raised.
    """
    try:
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()
        return response.text
    except requests.exceptions.RequestException as e:
        print(f"Error fetching {url}: {e}")
        return None

def extract_sentences(html_content, keyword='Biden'):
    """Return the sentences inside ``<p>`` elements that mention ``keyword``.

    Args:
        html_content: HTML of a page. ``None`` or an empty string yields an
            empty list.
        keyword: Case-sensitive substring a sentence must contain to be kept.

    Returns:
        A list of matching sentences, in document order, each stripped of
        surrounding whitespace and free of embedded line breaks.
    """
    if not html_content:
        return []

    soup = BeautifulSoup(html_content, 'html.parser')

    # Split on whitespace that follows '.', '?' or '!', except after
    # dotted abbreviations such as "U.S." or short titles such as "Mr.".
    boundary = re.compile(r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=[.?!])\s')

    sentences = []
    for paragraph in soup.find_all('p'):
        # Collapse runs of whitespace (including newlines) to single spaces
        # so each sentence stays on one line when written out line by line.
        text = " ".join(paragraph.get_text().split())
        for sentence in boundary.split(text):
            if keyword in sentence:
                sentences.append(sentence.strip())
    return sentences

def scrape_daily_herald_biden_sentences(query):
    """Search for Daily Herald pages matching ``query`` and gather sentences.

    Each URL returned by ``search_daily_herald_articles`` is downloaded with
    ``fetch_article``; pages that could not be fetched are skipped, and the
    remaining ones are passed to ``extract_sentences``.

    Returns:
        One flat list with the extracted sentences of every fetched page, in
        the order the URLs were processed. Empty when the search yields no
        URLs.
    """
    print("Searching for articles...")
    article_urls = search_daily_herald_articles(query)

    collected = []
    if article_urls:
        for article_url in article_urls:
            print(f"Scraping {article_url}...")
            page = fetch_article(article_url)
            if not page:
                # Download failed or returned nothing; move on.
                continue
            collected += extract_sentences(page)
    else:
        print("No articles found")

    return collected

if __name__ == "__main__":
    # Scrape the Daily Herald for sentences mentioning the query term, print
    # them numbered from 1, and save them one per line.
    query = "Biden"
    sentences_about_biden = scrape_daily_herald_biden_sentences(query)

    for i, sentence in enumerate(sentences_about_biden, 1):
        print(f"{i}: {sentence}")

    # Explicit UTF-8: scraped text may contain characters the platform's
    # default encoding cannot represent, which would abort the write.
    with open("biden_sentences_daily_herald.txt", "w", encoding="utf-8") as file:
        file.write("\n".join(sentences_about_biden))


Searching for articles...
Scraping https://www.dailyherald.com/20240907/syndicated-columnists/president-biden-served-36-years-as-a-senator-from-delaware-the-most-corporate-friendly-state-in-the/...
Scraping https://www.dailyherald.com/20240726/economy/harris-will-carry-bidens-economic-record-into-the-election-she-hopes-to-turn-it-into-an-asset/...
Scraping https://www.dailyherald.com/20240619/syndicated-columnists/biden-points-the-bill-and-the-blame-elsewhere/...
Scraping https://www.dailyherald.com/20240724/syndicated-columnists/its-too-late-joe/...
Scraping https://www.dailyherald.com/20240411/nation-and-world-politics/biden-will-meet-with-philippine-and-japanese-leaders-as-worry-grows-over-chinas-indo-pacific-action/...
Scraping https://www.dailyherald.com/20211010/news/facts-matter-biden-not-stand-in-received-covid-19-booster-shot/...
Scraping https://www.dailyherald.com/20241126/health-care/biden-proposes-medicare-and-medicaid-cover-costly-weight-loss-drugs-for-millions-of-obese-a