In [1]:
# npr: lean left
# justthenews: lean right

In [12]:
import random
import requests
from bs4 import BeautifulSoup
import re
from googlesearch import search

def fetch_npr_article_fragments(article_url):
    print(f"Fetching article: {article_url}")

    try:
        response = requests.get(article_url)
        if response.status_code != 200:
            print(f"Failed to fetch article: {article_url}")
            return None

        soup = BeautifulSoup(response.content, "html.parser")

        content_divs = soup.find_all("p")
        content = " ".join(p.get_text(strip=True) for p in content_divs)

        # remove fragments looking like this: "Brendan Smialowski/AFP via Getty Images"
        content = re.sub(r"\b[A-Za-z]+(?: [A-Za-z]+)+/AFP via Getty Images\b", "", content)

        # we want fragments containing "Trump" or "Biden"
        fragments = []
        sentences = re.split(r'(?<=[.!?])\s+', content)

        seen_fragments = set()
        for i, sentence in enumerate(sentences):
            if re.search(r'\b(Trump|Biden)\b', sentence, re.IGNORECASE):
                num_sentences = random.choice([1, 2, 3])
                start_idx = max(0, i - (num_sentences - 1) // 2)
                end_idx = min(len(sentences), start_idx + num_sentences)
                fragment = " ".join(sentences[start_idx:end_idx])

                if fragment not in seen_fragments:
                    seen_fragments.add(fragment)
                    fragments.append(fragment)

        print(f"Relevant fragments: {fragments}")
        return fragments
    except Exception as e:
        print(f"Error fetching article content from {article_url}: {e}")
        return None

def get_npr_articles_via_google(query, max_articles=10):
    fragments = []
    search_query = f"{query} site:npr.org"
    article_links = []

    for link in search(search_query, num=max_articles, tbs="cdr:1,cd_min:1/1/2018,cd_max:12/31/2021"):
        if len(article_links) >= max_articles:
            break
        article_links.append(link)
    article_links = list(dict.fromkeys(article_links)) # remove duplicates

    print(f"Found {len(article_links)} articles for query: {query}")

    for link in article_links:
        article_fragments = fetch_npr_article_fragments(link)
        if article_fragments:
            fragments.extend([(frag, "left") for frag in article_fragments])

    return fragments

queries = [
    "donald trump 2020 elections"
    "donald trump presidential candidate 2020",
    "joe biden 2020 elections",
    "joe biden 2020 presidential rally"
]

all_data = {}
for query in queries:
    print(f"Searching for: {query}")
    query_data = get_npr_articles_via_google(query, max_articles=100)
    all_data[query] = query_data

for query, fragments in all_data.items():
    print(f"\nResults for query '{query}':")
    for fragment, label in fragments:
        print(f"Fragment: {fragment}\nLabel: {label}\n---")


Searching for: donald trump 2020 electionsdonald trump presidential candidate 2020


KeyboardInterrupt: 

In [5]:
for key in all_data.keys():
  print(len(all_data[key]))

1929
2269
2415


In [2]:
import csv

def save_to_csv(data, filename="npr_articles.csv"):
    with open(filename, mode="w", newline="", encoding="utf-8") as file:
        writer = csv.writer(file)
        writer.writerow(["Fragment", "Label", "Year"])

        for query, fragments in data.items():
            for fragment, label in fragments:
                year = "2020"
                writer.writerow([fragment, label, year])

# save_to_csv(all_data)


In [3]:
# zerohedge: lean right

from bs4 import BeautifulSoup
import requests
import re
import random
from googlesearch import search

def fetch_zerohedge_article_fragments(article_url):
    print(f"Fetching article: {article_url}")
    try:
        response = requests.get(article_url)
        if response.status_code != 200:
            print(f"Failed to fetch article: {article_url}. response: {response}")
            return None

        soup = BeautifulSoup(response.content, "html.parser")

        content_div = soup.find("div", {"class": "NodeContent_mainContent__2jyAd"})
        if not content_div:
            print(f"No content found at {article_url}")
            return None

        content = " ".join(p.get_text(strip=True) for p in content_div.find_all("p"))
        content = re.sub(r"\[.*?\]|(Photo|Image):.*", "", content)
        print(f"Article content extracted: {content[:200]}...")

        fragments = []
        sentences = re.split(r'(?<=[.!?])\s+', content)
        for i, sentence in enumerate(sentences):
            if re.search(r'\b(Trump|Biden)\b', sentence, re.IGNORECASE):
                num_sentences = random.choice([1, 2, 3])
                start_idx = max(0, i - (num_sentences - 1) // 2)
                end_idx = min(len(sentences), start_idx + num_sentences)
                fragment = " ".join(sentences[start_idx:end_idx])
                fragments.append(fragment)

        print(f"Relevant fragments: {fragments}")
        return fragments

    except Exception as e:
        print(f"Error fetching article content from {article_url}: {e}")
        return None

def get_zerohedge_articles_via_google(query, max_articles=10):
    search_query = f"{query} site:zerohedge.com/news/"
    print(f"Searching Google for: {search_query}")

    article_links = []
    fragments = []

    for url in search(search_query, stop=max_articles): #, tbs="cdr:1,cd_min:1/1/2018,cd_max:12/31/2021"):
        if len(article_links) >= max_articles:
            break
        article_links.append(url)

    print(f"Found {len(article_links)} articles for query {search_query}")

    for link in article_links:
        article_fragments = fetch_zerohedge_article_fragments(link)
        if article_fragments:
            fragments.extend([(frag, "right") for frag in article_fragments])

    return fragments

queries = [
    "donald trump 2020 elections"
    "donald trump presidential candidate 2020",
    "joe biden 2020 elections",
    "joe biden 2020 presidential rally"
]

all_data = {}
for query in queries:
    print(f"Searching for: {query}")
    query_data = get_zerohedge_articles_via_google(query, max_articles=30)
    all_data[query] = query_data

for key, data in all_data.items():
    print(f"Data for {key}: {data}")


Searching for: donald trump 2020 electionsdonald trump presidential candidate 2020
Searching Google for: donald trump 2020 electionsdonald trump presidential candidate 2020 site:zerohedge.com/news/
Found 30 articles for query donald trump 2020 electionsdonald trump presidential candidate 2020 site:zerohedge.com/news/
Fetching article: https://www.zerohedge.com/news/2024-10-30/abc-mistakenly-airs-election-result-showing-harris-winning-pa
Article content extracted: blueapples on XOne of the most confounding responses to the belief that the 2020 Presidential Election was rigged to keep Donald Trump from being re-elected has been the Republicans' belief that votin...
Relevant fragments: ["blueapples on XOne of the most confounding responses to the belief that the 2020 Presidential Election was rigged to keep Donald Trump from being re-elected has been the Republicans' belief that voting in 2024 will somehow get him back into office. In the build up to this year's presidential election,ther

In [4]:
for key in all_data.keys():
  print(len(all_data[key]))

253
226
165


In [5]:
save_to_csv(all_data, "zerohedge.csv")