In [2]:
# npr: lean left
# foxbusiness: lean right

In [7]:
import random
import requests
from bs4 import BeautifulSoup
import re
from googlesearch import search

def fetch_npr_article_fragments(article_url):
    print(f"Fetching article: {article_url}")

    try:
        response = requests.get(article_url)
        if response.status_code != 200:
            print(f"Failed to fetch article: {article_url}")
            return None

        soup = BeautifulSoup(response.content, "html.parser")

        content_divs = soup.find_all("p")
        content = " ".join(p.get_text(strip=True) for p in content_divs)

        # remove fragments looking like this: "Brendan Smialowski/AFP via Getty Images"
        content = re.sub(r"\b[A-Za-z]+(?: [A-Za-z]+)+/AFP via Getty Images\b", "", content)

        # we want fragments containing "Obama" or "Romney"
        fragments = []
        sentences = re.split(r'(?<=[.!?])\s+', content)

        seen_fragments = set()
        for i, sentence in enumerate(sentences):
            if re.search(r'\b(Obama|Romney)\b', sentence, re.IGNORECASE):
                num_sentences = random.choice([1, 2, 3])
                start_idx = max(0, i - (num_sentences - 1) // 2)
                end_idx = min(len(sentences), start_idx + num_sentences)
                fragment = " ".join(sentences[start_idx:end_idx])

                if fragment not in seen_fragments:
                    seen_fragments.add(fragment)
                    fragments.append(fragment)

        print(f"Relevant fragments: {fragments}")
        return fragments
    except Exception as e:
        print(f"Error fetching article content from {article_url}: {e}")
        return None

def get_npr_articles_via_google(query, max_articles=10):
    fragments = []
    search_query = f"{query} site:npr.org"
    article_links = []

    for link in search(search_query, num=max_articles, tbs="cdr:1,cd_min:1/1/2010,cd_max:12/31/2012"):
        if len(article_links) >= max_articles:
            break
        article_links.append(link)
    article_links = list(dict.fromkeys(article_links)) # remove duplicates

    print(f"Found {len(article_links)} articles for query: {query}")

    for link in article_links:
        article_fragments = fetch_npr_article_fragments(link)
        if article_fragments:
            fragments.extend([(frag, "left") for frag in article_fragments])

    return fragments

queries = [
    "barack obama 2012 elections",
    "barack obama 2012 presidential rally",
    "mitt romney 2012 elections",
    "mitt romney presidential rally"
]

all_data = {}
for query in queries:
    print(f"Searching for: {query}")
    query_data = get_npr_articles_via_google(query, max_articles=100)
    all_data[query] = query_data

for query, fragments in all_data.items():
    print(f"\nResults for query '{query}':")
    for fragment, label in fragments:
        print(f"Fragment: {fragment}\nLabel: {label}\n---")


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Label: left
---
Fragment: Transcript of Mitt Romney's concession speech in the presidential race in Boston.
Label: left
---
Fragment: (Cheers, applause.) MITT ROMNEY:Thank you. Thank you.
Label: left
---
Fragment: Thank you. I have just called President Obama to congratulate him on his victory. His supporters and his campaign also deserve congratulations.
Label: left
---
Fragment: Mitt Romney delivers his acceptance speech Thursday at the Republican National Convention.Chip Somodevilla/Getty Imageshide caption Mitt Romney delivers his acceptance speech Thursday at the Republican National Convention. Transcript of Republican presidential nominee Mitt Romney's acceptance speech as prepared for delivery at the Republican National Convention: Mr. Chairman, delegates.
Label: left
---
Fragment: They came not just in pursuit of the riches of this world but for the richness of this life. Video: Mitt Romney's speech, from PBS News

In [8]:
for key in all_data.keys():
  print(len(all_data[key]))

1990
2211
2256
1691


In [2]:
import csv

def save_to_csv(data, filename="npr_articles.csv"):
    with open(filename, mode="w", newline="", encoding="utf-8") as file:
        writer = csv.writer(file)
        writer.writerow(["Fragment", "Label", "Year"])

        for query, fragments in data.items():
            for fragment, label in fragments:
                year = "2012"
                writer.writerow([fragment, label, year])

save_to_csv(all_data)


In [4]:
# foxbusiness: lean right

from bs4 import BeautifulSoup
import requests
import re
import random
from googlesearch import search

def fetch_foxbusiness_article_fragments(article_url):
    print(f"Fetching article: {article_url}")
    try:
        response = requests.get(article_url)
        if response.status_code != 200:
            print(f"Failed to fetch article: {article_url}. response: {response}")
            return None

        soup = BeautifulSoup(response.content, "html.parser")

        content_div = soup.find("div", {"class": "article-body"})
        if not content_div:
            print(f"No content found at {article_url}")
            return None

        content = " ".join(p.get_text(strip=True) for p in content_div.find_all("p"))
        content = re.sub(r"\[.*?\]|(Photo|Image):.*", "", content)
        print(f"Article content extracted: {content[:200]}...")

        fragments = []
        sentences = re.split(r'(?<=[.!?])\s+', content)
        for i, sentence in enumerate(sentences):
            if re.search(r'\b(Obama|Romney)\b', sentence, re.IGNORECASE):
                num_sentences = random.choice([1, 2, 3])
                start_idx = max(0, i - (num_sentences - 1) // 2)
                end_idx = min(len(sentences), start_idx + num_sentences)
                fragment = " ".join(sentences[start_idx:end_idx])
                fragments.append(fragment)

        print(f"Relevant fragments: {fragments}")
        return fragments

    except Exception as e:
        print(f"Error fetching article content from {article_url}: {e}")
        return None

def get_foxbusiness_articles_via_google(query, max_articles=10):
    search_query = f"{query} site:foxbusiness.com/"
    print(f"Searching Google for: {search_query}")

    article_links = []
    fragments = []

    for url in search(search_query, stop=max_articles): #, tbs="cdr:1,cd_min:1/1/2010,cd_max:12/31/2013"):
        if len(article_links) >= max_articles:
            break
        article_links.append(url)

    print(f"Found {len(article_links)} articles for query {search_query}")

    for link in article_links:
        article_fragments = fetch_foxbusiness_article_fragments(link)
        if article_fragments:
            fragments.extend([(frag, "right") for frag in article_fragments])

    return fragments


queries = [
    "barack obama 2012 elections",
    "barack obama presidential candidate 2012",
    "mitt romney elections",
    "mitt romney presidential rally"
]

all_data = {}
for query in queries:
    print(f"Searching for: {query}")
    query_data = get_foxbusiness_articles_via_google(query, max_articles=30)
    all_data[query] = query_data

for key, data in all_data.items():
    print(f"Data for {key}: {data}")


Searching for: barack obama 2012 elections
Searching Google for: barack obama 2012 elections site:foxbusiness.com/
Found 30 articles for query barack obama 2012 elections site:foxbusiness.com/
Fetching article: https://www.foxbusiness.com/markets/factbox-quotes-from-the-2012-u-s-presidential-election
Article content extracted: Americans went to the polls to vote for president on Tuesday after a tightly contested race between incumbent Democrat Barack Obama and Republican Mitt Romney. Below are some comments made by the cand...
Relevant fragments: ['Americans went to the polls to vote for president on Tuesday after a tightly contested race between incumbent Democrat Barack Obama and Republican Mitt Romney.', 'Americans went to the polls to vote for president on Tuesday after a tightly contested race between incumbent Democrat Barack Obama and Republican Mitt Romney. Below are some comments made by the candidates, observers and voters: OBAMA, tweeting after MSNBC projected his victory: "

In [5]:
for key in all_data.keys():
  print(len(all_data[key]))


259
288
316
206


In [6]:
save_to_csv(all_data, "fox_articles.csv")