<a href="https://colab.research.google.com/github/ExCaLBBR/ExCaLBBR_Projects/blob/main/PartisanBiasDetection/NewsRSS_Scraper.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
#@title Install Dependancies
!pip install requests beautifulsoup4 --quiet

#Import libraries
import requests
from bs4 import BeautifulSoup
import time

In [2]:
# @title Define Utility Functions

#Article Scraper
def scrape_article_text(url):
    # Send a GET request to the URL
    response = requests.get(url)
    # Check if the request was successful
    if response.status_code == 200:
        # Parse the HTML content
        soup = BeautifulSoup(response.text, 'html.parser')
        # Find elements containing text - this part may need customization
        # For example, assuming article text is within <p> tags
        article_text = ' '.join(p.get_text() for p in soup.find_all('p'))
        return article_text
    else:
        return "Error: Unable to fetch the webpage."

#Article Scraper with User Agent
def scrape_article_text_useragent(url,ua):
    # Send a GET request to the URL
    response = requests.get(url, headers=ua)
    # Check if the request was successful
    if response.status_code == 200:
        # Parse the HTML content
        soup = BeautifulSoup(response.text, 'html.parser')
        # Find elements containing text - this part may need customization
        # For example, assuming article text is within <p> tags
        article_text = ' '.join(p.get_text() for p in soup.find_all('p'))
        return article_text
    else:
        return "Error: Unable to fetch the webpage."

#Query for LLM
def query(payload, headers):
    response = requests.post(API_URL, headers=headers, json=payload)
    if response.status_code == 200:
        return response.json()  # Expecting this to always be a list for simplicity
    else:
        return [{"error": response.text}]

#Political Bias Classification using LLM
def reduce_and_query(initial_text, llm_instructions, reduction_fraction, headers):
    text = initial_text
    while True:
        payload = {"inputs": llm_instructions + text}
        response = query(payload, headers)
        # Since we expect a list, check the first item for an 'error' key
        if "error" in response[0]:
            error_message = response[0]["error"]
            if "token" in error_message.lower():
                # Reduce the text by the specified fraction if it's a token limit error
                new_length = int(len(text) * (1 - reduction_fraction))
                text = text[:new_length]
            else:
                # Return the error message directly if it's a different kind of error
                return error_message
        else:
            # If there's no 'error' key, assume the response is successful
            return response

In [None]:
# @title CNN [completed: links, articleTxt, headers, and pubdate]
# CNN Article Link Scraper
page = requests.get('https://www.cnn.com/politics')
soup = BeautifulSoup(page.text, 'html.parser')

links = []
for l in soup.find_all('a'):
    links.append(l.get('href'))
links = filter(None, links)
linkEnds = [i for i in links if i.startswith('/2') and "/video/" not in i]
result = ['https://www.cnn.com' + i for i in linkEnds]
links = set(result)
links = list(links)

#CNN Article Text Scraper
artTxt = []
for art in links:
  article_text = scrape_article_text(art)
  artTxt.append(article_text)
CNN_artTxt = artTxt

#combined headers + pubdate:
pubds = []
headers = []
for h in links:
  hp = requests.get(h)
  souph = BeautifulSoup(hp.text, 'html.parser')
  span = souph.find_all('h1')
  header= str(span)
  i = header.find(">")
  j = header.find("</h1>")
  x = header[i+8:j]
  y = x.split('\n', 1)[0]
  headers.append(y)
  span = souph.find_all('div', class_='timestamp vossi-timestamp')
  pubd = str(span)
  i = pubd.find("Published")
  j = pubd.find(">]")
  if i == -1:
    i = pubd.find("Updated")
  x = pubd[i+10:j].lstrip()
  y = x.split('\n', 1)[0]
  pubds.append(y)
CNN_pubds = pubds
CNN_headers = headers

# #CNN Article Header Scraper
# headers = []
# for h in links:
#   hp = requests.get(h)
#   souph = BeautifulSoup(hp.text, 'html.parser')
#   span = souph.find_all('h1')
#   header= str(span)
#   i = header.find(">")
#   j = header.find("</h1>")
#   x = header[i+8:j]
#   y = x.split('\n', 1)[0]
#   headers.append(y)
# CNN_headers = headers

# #CNN News Article PubDate Scraper
# pubds = []
# for h in links:
#   hp = requests.get(h)
#   souph = BeautifulSoup(hp.text, 'html.parser')
#   span = souph.find_all('div', class_='timestamp vossi-timestamp')
#   pubd = str(span)
#   i = pubd.find("Published")
#   j = pubd.find(">]")
#   if i == -1:
#     i = pubd.find("Updated")
#   x = pubd[i+10:j].lstrip()
#   y = x.split('\n', 1)[0]
#   pubds.append(y)
# CNN_pubdate = pubds

In [None]:
# @title Fox News [completed: links, articleTxt, headers, and pubdate]
#Fox News Article Text Link Scraper
url = 'https://moxie.foxnews.com/google-publisher/politics.xml'
html_page = requests.get(url)
soup = BeautifulSoup(html_page.text, "lxml")
links = []
artTxt = []
for item in soup.find_all("item"):
  link= str(item)
  i = link.find("<link/>")
  j = link.find("<guid")
  x = link[i+7:j]
  y = x.split('\n', 1)[0]
  links.append(y)
  article_text = scrape_article_text(y)
  artTxt.append(article_text)
FoxNews_artTxt = artTxt

#Fox News Article Header Scraper
headers = []
for item in soup.find_all("item"):
  header= str(item)
  i = header.find("<title>")
  j = header.find("</title>")
  x = header[i+7:j]
  y = x.split('\n', 1)[0]
  headers.append(y)
FoxNews_headers = headers

#Fox News Article PubDate Scraper
pubds = []
for item in soup.find_all("item"):
  pubd= str(item)
  i = pubd.find("<pubdate>")
  j = pubd.find("</pubdate>")
  x = pubd[i+9:j]
  y = x.split('\n', 1)[0]
  pubds.append(y)
FoxNews_pubdate = pubds

  soup = BeautifulSoup(html_page.text, "lxml")


In [None]:
#NYT Article Link Scraper
url = 'https://rss.nytimes.com/services/xml/rss/nyt/Politics.xml'
html_page = requests.get(url)
soup = BeautifulSoup(html_page.text, "lxml")
links = []
for item in soup.find_all("item"):
  link= str(item)
  i = link.find("<link/>")
  j = link.find("<guid")
  x = link[i+7:j]
  y = x.split('\n', 1)[0]
  links.append(y)
NYT_links = links

  soup = BeautifulSoup(html_page.text, "lxml")


In [None]:
NYT_links

['https://www.nytimes.com/2024/10/21/us/politics/biden-harris-birth-control-rules.html',
 'https://www.nytimes.com/2024/10/21/us/politics/trump-harris-undecided-voters.html',
 'https://www.nytimes.com/2024/10/20/us/politics/trump-scandals.html',
 'https://www.nytimes.com/2024/10/20/us/politics/jill-stein-harris-trump.html',
 'https://www.nytimes.com/2024/10/21/upshot/trump-harris-polls-election.html',
 'https://www.nytimes.com/2024/10/21/us/politics/secret-service-trump-butler-house-report.html',
 'https://www.nytimes.com/2024/10/21/us/politics/supreme-court-public-corruption.html',
 'https://www.nytimes.com/2024/10/21/health/abortion-pill-mifepristone-lawsuit.html',
 'https://www.nytimes.com/video/us/politics/100000009762791/where-harris-and-trump-stand-on-abortion.html',
 'https://www.nytimes.com/2024/10/21/books/review/economics-business-books.html',
 'https://www.nytimes.com/2024/10/21/us/politics/9-11-defendants-trial.html',
 'https://www.nytimes.com/2024/10/21/business/media/trum

In [None]:
# @title NYT [not completed: articleTxt]
#completed: links, headers, pubdate
#NYT Article Text Link Scraper
url = 'https://rss.nytimes.com/services/xml/rss/nyt/Politics.xml'
html_page = requests.get(NYT_links[0])
soup = BeautifulSoup(html_page.text, "html.parser")
print(soup)
links = []
artTxt = []
for item in soup.find_all("item"):
  link= str(item)
  i = link.find("<link/>")
  j = link.find("<guid")
  x = link[i+7:j]
  y = x.split('\n', 1)[0]
  links.append(y)
  article_text = scrape_article_text(y)
  artTxt.append(article_text)
NYT_artTxt = artTxt

#NYT Article Header Scraper
headers = []
for item in soup.find_all("item"):
  header= str(item)
  i = header.find("<title>")
  j = header.find("</title>")
  x = header[i+7:j]
  y = x.split('\n', 1)[0]
  headers.append(y)
NYT_headers = headers

#NYT Article PubDate Scraper
pubds = []
for item in soup.find_all("item"):
  pubd= str(item)
  i = pubd.find("<pubdate>")
  j = pubd.find("</pubdate>")
  x = pubd[i+9:j]
  y = x.split('\n', 1)[0]
  pubds.append(y)
NYT_pubds = pubds

<html><head><title>nytimes.com</title><style>#cmsg{animation: A 1.5s;}@keyframes A{0%{opacity:0;}99%{opacity:0;}100%{opacity:1;}}</style></head><body style="margin:0"><p id="cmsg">Please enable JS and disable any ad blocker</p><script data-cfasync="false">var dd={'rt':'c','cid':'AHrlqAAAAAMAMoSxbSIbc8cAImo5kg==','hsh':'499AE34129FA4E4FABC31582C3075D','t':'bv','s':17439,'e':'4045dbb42d70ac0c4371124f8504f1ecd4a1504c229f49ffe1b1045285dc4229','host':'geo.captcha-delivery.com','cookie':'cXxe92Ptu_AqNGRGxu2uvOsM_NCyV4XzMRBFniltyBrPmsa7OkPTil2778dwwZt2aPFKzgfs3yCo8YAV_dXsDsImugfwSiISQZVdgXTqD2gJUQ8PlM1Bc1oDzekki8rn'}</script><script data-cfasync="false" src="https://ct.captcha-delivery.com/c.js"></script></body></html>


In [None]:
NYT_artTxt
#'Error: Unable to fetch the webpage.'
# paywall or maybe not being on campus is the issue

[]

In [None]:
# @title USA Today [completed: links, articleTxt, headers, pubdate]
# issue: published and updated dates included
# being on campus may un-break it though?

#USA Today Article Link Scraper
page = requests.get('https://www.usatoday.com/news/politics/')
soup = BeautifulSoup(page.text, 'html.parser')
allLinks = []
link = "https://www.usatoday.com/news/politics/"
for link in soup.find_all('a'):
    allLinks.append(link.get('href'))
allLinks = filter(None, allLinks)
'''
   /politics removes livestream & corrections & world politics
   also removes other articles e.g. /investigation (ryan routh article)
'''
linkEnds = [i for i in allLinks if i.startswith('/story/news/politics')]
result = ['https://www.usatoday.com' + i for i in linkEnds]
USAToday_links = set(result)
USAToday_links = list(USAToday_links)

#USA Today Article Text Scraper
artTxt = []
for art in USAToday_links:
  article_text = scrape_article_text(art)
  artTxt.append(article_text)
USAToday_artTxt = artTxt

#combined headers + pubdate:
pubds = []
headers = []
for h in USAToday_links:
  hp = requests.get(h)
  souph = BeautifulSoup(hp.text, 'html.parser')
  span = souph.find_all('h1')
  header= str(span)
  i = header.find(">")
  j = header.find("</h1>")
  x = header[i+1:j]
  y = x.split('\n', 1)[0]
  headers.append(y)
  span = souph.find_all('div', class_='gnt_ar_dt')
  pubd = str(span)
  i = pubd.find("Published")
  j = pubd.find("class")
  if i == -1:
    i = pubd.find("Updated")
  x = pubd[i:j].rstrip()
  y = x.split('\n', 1)[0]
  pubds.append(y)
USAToday_pubds = pubds
USAToday_headers = headers

In [None]:
USAToday_pubds

['Published: 2:45 p.m. ET Oct. 18, 2024 Updated: 5:12 p.m. ET Oct. 18, 2024"',
 'Published: 5:14 a.m. ET Oct. 20, 2024 Updated: 3:59 p.m. ET Oct. 20, 2024"',
 'Published: 2:08 p.m. ET Oct. 20, 2024 Updated: 2:08 p.m. ET Oct. 20, 2024"',
 'Published: 5:02 a.m. ET Oct. 20, 2024 Updated: 9:03 a.m. ET Oct. 20, 2024"',
 'Published: 8:50 p.m. ET Oct. 20, 2024 Updated: 8:56 p.m. ET Oct. 20, 2024"',
 'Published: 12:16 p.m. ET Oct. 18, 2024 Updated: 12:17 p.m. ET Oct. 18, 2024"',
 'Published: 12:11 p.m. ET Oct. 18, 2024 Updated: 12:11 p.m. ET Oct. 18, 2024"',
 'Published: 9:01 p.m. ET Oct. 19, 2024 Updated: 10:43 p.m. ET Oct. 19, 2024"',
 'Published: 11:08 p.m. ET Oct. 19, 2024 Updated: 2:19 p.m. ET Oct. 20, 2024"',
 'Published: 5:06 a.m. ET Oct. 19, 2024 Updated: 6:37 a.m. ET Oct. 19, 2024"',
 'Published 12:12 p.m. ET Oct. 18, 2024"',
 'Published: 9:19 a.m. ET Oct. 20, 2024 Updated: 11:32 a.m. ET Oct. 20, 2024"',
 'Published: 3:11 p.m. ET Oct. 20, 2024 Updated: 7:59 p.m. ET Oct. 20, 2024"',
 '

In [None]:
#NY Post Article Link Scraper
#check lxml thing later
headers = {"user-agent": "Mozilla/5.0"}
url = 'https://nypost.com/politics/feed/'
html_page = requests.get(url, headers=headers)
soup = BeautifulSoup(html_page.text, "lxml")
links = []
for item in soup.find_all("item"):
  link = str(item)
  i = link.find("<link/>")
  j = link.find("<guid")
  x = link[i+7:j]
  y = x.split('\n', 1)[0]
  #remove videos
  if not (y.startswith('https://nypost.com/video')):
    links.append(y)
NYPost_links = links

  soup = BeautifulSoup(html_page.text, "lxml")


In [None]:
# @title NYPost [completed: links, articleTxt, headers, and pubdate]
#NYPost Article Text Scraper
url = 'https://nypost.com/politics/feed/'
headers = {"user-agent": "Mozilla/5.0"}
html_page = requests.get(url,headers=headers)
soup = BeautifulSoup(html_page.text, "lxml")
links = []
artTxt = []
for item in soup.find_all("item"):
  link= str(item)
  i = link.find("<link/>")
  j = link.find("<guid")
  x = link[i+7:j]
  y = x.split('\n', 1)[0]
  links.append(y)
  article_text = scrape_article_text_useragent(y,headers)
  artTxt.append(article_text)
NYPost_artTxt = artTxt

#NYPost Article Header Scraper
headers = []
for item in soup.find_all("item"):
  header= str(item)
  i = header.find("<title>")
  j = header.find("</title>")
  x = header[i+7:j]
  y = x.split('\n', 1)[0]
  headers.append(y)
NYPost_headers = headers

#NYPost Article PubDate Scraper
pubds = []
for item in soup.find_all("item"):
  pubd = str(item)
  i = pubd.find("<pubdate>")
  j = pubd.find("</pubdate>")
  x = pubd[i+9:j]
  y = x.split('\n', 1)[0]
  pubds.append(y)
NYPost_pubds = pubds

  soup = BeautifulSoup(html_page.text, "lxml")


In [None]:
#People Article Link Scraper
#without user-agent
page = requests.get('https://people.com/politics/')
soup = BeautifulSoup(page.text, 'html.parser')

allLinks = []
link = "https://people.com/politics/"
for link in soup.find_all('a'):
    allLinks.append(link.get('href'))
allLinks = filter(None, allLinks)
#identifier for political articles seems to be a 7 digit # that starts w 87 at the end of the link
#check to confirm, at least i got astrology out of there
linkEnds = [i for i in allLinks if (len(i) > 8 and i[-7] == '8' and i[-6] == '7')]
# result = ['https://people.com' + i for i in linkEnds]
People_links = set(linkEnds)
People_links = list(People_links)

In [None]:
People_links

['https://people.com/jimmy-carter-fulfills-wish-votes-kamala-harris-8729118',
 'https://people.com/eminem-introduces-barack-obama-at-detroit-rally-for-kamala-harris-8732134',
 'https://people.com/rufus-wainwright-mortified-his-version-of-hallelujah-played-at-trump-rally-8728716',
 'https://people.com/several-ballots-destroyed-in-arizona-mailbox-arson-8733824',
 'https://people.com/olivia-nuzzi-departs-ny-mag-amid-rfk-jr-sexting-scandal-8731645',
 'https://people.com/ethel-kennedy-spicy-valentines-clinton-obama-biden-pelosi-8729729',
 'https://people.com/rfk-jr-wanted-to-possess-and-impregnate-journalist-olivia-nuzzi-her-ex-alleges-in-court-filing-8728707',
 'https://people.com/kentucky-lawmaker-dies-after-plunging-into-empty-swimming-pool-on-lawnmower-8733394',
 'https://people.com/barack-obama-says-he-ran-late-for-rally-because-plane-started-leaking-in-the-air-8732461',
 'https://people.com/kamala-harris-calls-out-fox-news-trimmed-clip-trump-8729804',
 'https://people.com/melania-dona

In [None]:
# @title people [not completed: headers, pubdate]

#People Article Text Scraper
artTxt = []
for art in People_links:
  article_text = scrape_article_text(art)
  artTxt.append(article_text)
People_artTxt = artTxt

#combined headers + pubdate:
pubds = []
headers = []
for h in People_links:
  hp = requests.get(h)
  souph = BeautifulSoup(hp.text, 'html.parser')
  span = souph.find_all('head', class_="loc head")
  header= str(span)
  i = header.find("<title>")
  j = header.find("<\title")
  x = header[i+7:j]
  y = x.split('\n', 1)[0]
  headers.append(y)
  span = souph.find_all('div', class_='mntl-attribution__item-date')
  pubd = str(span)
  i = pubd.find("Published on")
  j = pubd.find("</div>")
  if i == -1:
    i = pubd.find("Updated on")
  x = pubd[i+10:j].lstrip("on ")
  y = x.split('\n', 1)[0]
  pubds.append(y)
People_pubds = pubds
People_headers = headers

In [None]:
People_artTxt

['The 39th U.S. president, who turned 100 on Oct. 1, filled out a ballot which was placed in a dropbox at the Sumter County Courthouse in Georgia ALEX BRANDON/POOL/AFP via Getty; Hannah Beier/Bloomberg via Getty  Jimmy Carter voiced one remaining wish as he neared 20 months in hospice care, and he just fulfilled it.\n  The former president, who turned 100 on Oct. 1, previously revealed to family that he had one last goal before he dies: "I’m only trying to make it to vote for Kamala Harris," Jimmy said, according to grandson Jason Carter.\n  On Wednesday, Oct. 16, his wish came true. The longest-living U.S. president filled out a ballot for Harris, 59, which was placed in a dropbox at the Sumter County Courthouse near his hometown of Plains, Ga., Jason and Chip Carter confirmed to The Atlanta Journal-Constitution.\n  “I think he feels good,” Chip, 74, said about his dad — who has been in hospice care since February 2023 — filling out the ballot. “It was a good morning for him and good 

In [None]:
# @title [Potenitally discontinuted] Ver 2: People Article Link Scraper
#with user-agent
#try if the other one breaks
'''
#Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)
#reference: https://www.stanventures.com/blog/googlebot-user-agent-string/
#Mozilla/5.0 AppleWebKit/537.36 (KHTML, like Gecko; compatible; Googlebot/2.1; +http://www.google.com/bot.html) Chrome/W.X.Y.Z Safari/537.36
headers = {"user-agent": "Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)"}
#Mozilla/5.0 AppleWebKit/537.36 (KHTML, like Gecko; compatible; Googlebot/2.1; +http://www.google.com/bot.html) Chrome/W.X.Y.Z Safari/537.36
#Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/113.0.0.0 Safari/537.36 Edg/113.0.1774.35
page = requests.get('https://people.com/politics/',headers=headers)
soup = BeautifulSoup(page.text, 'html.parser')
# print(soup)
allLinks = []
link = "https://people.com/politics/"
for link in soup.find_all('a'):
    allLinks.append(link.get('href'))
allLinks = filter(None, allLinks)
#identifier for political articles seems to be a 7 digit # that starts w 87 at the end of the link
#check to confirm, at least i got astrology out of there
linkEnds = [i for i in allLinks if (len(i) > 8 and i[-7] == '8' and i[-6] == '7')]
# result = ['https://people.com' + i for i in linkEnds]
People_links = set(linkEnds)
People_links = list(People_links)
'''

In [None]:
#Daily Mail Article Link Scraper
#use this one
page = requests.get('https://www.dailymail.co.uk/news/us-politics/index.html')
soup = BeautifulSoup(page.text, 'html.parser')
allLinks = []
link = "https://www.dailymail.co.uk/news/us-politics/index.html"
for link in soup.find_all('a'):
    allLinks.append(link.get('href'))
allLinks = filter(None, allLinks)
linkEnds = [i for i in allLinks if i.startswith('/news/article')]
result = ['https://www.dailymail.co.uk/' + i for i in linkEnds]
DailyMail_links = set(result)
DailyMail_links = list(DailyMail_links)

In [None]:
len(DailyMail_links)

379

In [None]:
#Daily Mail
#41 seconds for scraping text from 50 articles, should i pull random ones ?
#7*41 = 257/60 = 4.3 mins lord

In [None]:
#Daily Mail Article Text Scraper

# import random
# DailyMail_links_short = random.sample(DailyMail_links, 5)
# DailyMail_links = DailyMail_links_short

artTxt = []
for art in DailyMail_links:
  article_text = scrape_article_text(art)
  artTxt.append(article_text)
DailyMail_artTxt = artTxt

#combined headers + pubdate:
pubds = []
headers = []
for h in DailyMail_links:
  hp = requests.get(h)
  souph = BeautifulSoup(hp.text, 'html.parser')
  span = souph.find_all('h1')
  header= str(span)
  i = header.find(">")
  j = header.find("</h1>")
  x = header[i+1:j]
  y = x.split('\n', 1)[0]
  headers.append(y)
  span = souph.find_all('time')
  pubd = str(span)
  i = pubd.find("> ")
  j = pubd.find("</time>")
  # if i == -1:
  #   i = pubd.find("Updated")
  x = pubd[i+1:j].lstrip()
  y = x.split('\n', 1)[0]
  pubds.append(y)
DailyMail_pubds = pubds
DailyMail_headers = headers

In [None]:
# @title [Potenitally discontinuted] Daily Mail RSS Article Link Scraper - articles not as relevant
#don't use this one
'''
url = 'https://www.dailymail.co.uk/ushome/index.rss'
html_page = requests.get(url)
soup = BeautifulSoup(html_page.text, "lxml")
links = []
for item in soup.find_all("item"):
  link= str(item)
  i = link.find("<link/>")
  j = link.find("<guid")
  x = link[i+7:j]
  y = x.split('\n', 1)[0]
  links.append(y)
dm_links = links
list(set(DailyMail_links).intersection(dm_links))
'''

'url = \'https://www.dailymail.co.uk/ushome/index.rss\'\nhtml_page = requests.get(url)\nsoup = BeautifulSoup(html_page.text, "lxml")\nlinks = []\nfor item in soup.find_all("item"):\n  link= str(item)\n  i = link.find("<link/>")\n  j = link.find("<guid")\n  x = link[i+7:j]\n  y = x.split(\'\n\', 1)[0]\n  links.append(y)\ndm_links = links\nlist(set(DailyMail_links).intersection(dm_links))\n'

In [None]:
#Newsweek Article Link Scraper
headers = {"user-agent": "Mozilla/5.0"}
#Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/113.0.0.0 Safari/537.36 Edg/113.0.1774.35
page = requests.get('https://www.newsweek.com/politics',headers=headers)
soup = BeautifulSoup(page.text, 'html.parser')
allLinks = []
link = "https://www.newsweek.com/politics"
for link in soup.find_all('a'):
    allLinks.append(link.get('href'))
allLinks = filter(None, allLinks)
linkEnds = [i for i in allLinks if (len(i) > 8 and i[-7] == '1')]
# linkEnds = [i for i in allLinks]
result = ['https://www.newsweek.com' + i for i in linkEnds]
Newsweek_links = set(result)
Newsweek_links = list(Newsweek_links)

# initial error
# print(soup.find_all('body'))
# [<body>
# <center><h1>403 Forbidden</h1></center>
# </body>]

# reference: https://stackoverflow.com/questions/40255128/how-to-parse-the-website-using-beautifulsoup
# reference: https://www.useragentlist.net/

In [None]:
#Newsweek Article Text Scraper
ua = {"user-agent": "Mozilla/5.0"}
links = Newsweek_links
artTxt = []
for art in links:
  article_text = scrape_article_text_useragent(art,ua)
  artTxt.append(article_text)
Newsweek_artTxt = artTxt

#combined headers + pubdate:
pubds = []
headers = []
for h in links:
  hp = requests.get(h,headers = ua)
  souph = BeautifulSoup(hp.text, 'html.parser')
  span = souph.find_all('h1')
  header= str(span)
  i = header.find(">")
  j = header.find("</h1>")
  x = header[i+1:j]
  y = x.split('\n', 1)[0]
  headers.append(y)
  span = souph.find_all('time')
  pubd = str(span)
  i = pubd.find("Published")
  j = pubd.find("</time>")
  if i == -1:
    i = pubd.find("Updated")
  x = pubd[i+13:j].lstrip()
  y = x.split('\n', 1)[0]
  pubds.append(y)
Newsweek_pubds = pubds
Newsweek_headers = headers

In [None]:
#NBC News Article Link Scraper
url = 'https://feeds.nbcnews.com/nbcnews/public/politics'
html_page = requests.get(url)
soup = BeautifulSoup(html_page.text, "lxml")
links = []
for item in soup.find_all("item"):
  link = str(item)
  i = link.find("<guid>")
  j = link.find("</guid")
  x = link[i+6:j]
  y = x.split('\n', 1)[0]
  if y[0:4] == 'http':
    links.append(y)
# probably unnecessary, try adding in if it breaks
links = [x for x in links if x != "https://www.nbcnews.com/"]
# if "https://www.nbcnews.com/"  in links:
#   links.remove("https://www.nbcnews.com/")
NBCNews_links = links

artTxt = []
for y in NBCNews_links:
  article_text = scrape_article_text(y)
  artTxt.append(article_text)
NBCNews_artTxt = artTxt

#NBC Article Header Scraper
headers = []
pubds = []
for item in soup.find_all("item"):
  header= str(item)
  i = header.find("<title>")
  j = header.find("</title>")
  x = header[i+7:j]
  y = x.split('\n', 1)[0]
  headers.append(y)
  pubd= str(item)
  i = pubd.find("<pubdate>")
  j = pubd.find("</pubdate>")
  x = pubd[i+9:j]
  y = x.split('\n', 1)[0]
  pubds.append(y)
NBCNews_headers = headers
NBCNews_pubds = pubds

In [None]:
#Forbes Article Link Scraper
page = requests.get('https://www.forbes.com/topics/politics/')
soup = BeautifulSoup(page.text, 'html.parser')
allLinks = []
link = "https://www.forbes.com/topics/politics/"
for link in soup.find_all('a'):
    allLinks.append(link.get('href'))
allLinks = filter(None, allLinks)
#all articles contain date (e.g. /2024/09/18)
relevantLinks = [i for i in allLinks if '/2' in i]
Forbes_links = set(relevantLinks)
Forbes_links = list(Forbes_links)

In [None]:
# @title forbes [not completed: pubdate]
#Forbes Article Text Scraper
artTxt = []
for art in Forbes_links:
  article_text = scrape_article_text(art)
  artTxt.append(article_text)
Forbes_artTxt = artTxt

#combined headers + pubdate:
pubds = []
headers = []
times = []
for h in Forbes_links:
  hp = requests.get(h)
  souph = BeautifulSoup(hp.text, 'html.parser')
  span = souph.find_all('h1')
  header= str(span)
  i = header.find(">")
  j = header.find("</h1>")
  x = header[i+1:j]
  y = x.split('\n', 1)[0]
  headers.append(y)
  span = souph.find_all('div')
  pubd = str(span)
  i = pubd.find("<time>")
  j = pubd.find("</time>")
  # if i == -1:
  #   span = souph.find_all('div', class_ = "top-stories__date")
  #   pubd = str(span)
  #   i = pubd.find("<time>")
  #   j = pubd.find("</time>")
  x = pubd[i:j]
  y = x.split('\n', 1)[0]

  span = souph.find_all('span', class_ = 'time')
  pubd = str(span)
  k = pubd.find("<time>")
  l = pubd.find("</time>")
  z = pubd[k:l]
  t = z.split('\n', 1)[0]
  times.append(t)

  pubds.append(y)

# if len(times) == len(pubds):
#   Forbes_pubds = [pubds[i] + times[i] for i in range(len(times))]
#   Forbes_pubds = [i.replace("time>", "").replace("<", "") for i in Forbes_pubds]
Forbes_headers = headers

In [None]:
Forbes_links

['http://www.forbes.com/sites/corinnepost/2024/10/27/what-the-us-elections-mean-for-women-business-leaders/',
 'http://www.forbes.com/sites/mollybohannon/2024/10/27/melania-trump-makes-rare-campaign-appearance-at-trumps-madison-square-garden-rally/',
 'http://www.forbes.com/sites/ianpalmer/2024/10/27/fracking-in-pennsylvania-the-truth-behind-the-politics/',
 'http://www.forbes.com/sites/walterpavlo/2024/10/27/steve-bannon-to-be-released-from-prison-this-week-right-on-time/',
 'http://www.forbes.com/sites/maryroeloffs/2024/10/27/harris-and-trumps-biggest-celebrity-endorsements-bad-bunny-shows-support-for-harris/',
 'http://www.forbes.com/sites/saradorn/2024/10/27/trump-vs-harris-2024-polls-trump-up-in-3-new-surveys-harris-leads-in-2-others/',
 'http://www.forbes.com/sites/mollybohannon/2024/10/27/trump-rally-speaker-calls-puerto-rico-floating-island-of-garbage-campaign-says-joke-doesnt-reflect-trumps-views/',
 'http://www.forbes.com/sites/marshallshepherd/2024/10/27/a-look-at-the-us-ele

In [None]:
Forbes_pubds

['time>Oct 22, 2024,<time>06:30am EDT',
 'time>Oct 27, 2024,<time>07:18pm EDT',
 'time>Oct 26, 2024,<time>06:30am EDT',
 'time>Oct 25, 2024,<time>06:00am EDT',
 'time>Oct 27, 2024,<time>07:18pm EDT',
 'time>Oct 27, 2024,<time>10:00am EDT',
 'time>Oct 27, 2024,<time>09:00pm EDT',
 '[<div id="__next"><script>',
 'time>Oct 27, 2024,<time>06:30am EDT',
 '[<div id="__next"><script>']

In [None]:
# len(Forbes_artTxt)
Forbes_pubds
# len(Forbes_links)

['time>Oct 22, 2024,<time>06:30am EDT',
 'time>Oct 27, 2024,<time>07:18pm EDT',
 'time>Oct 26, 2024,<time>06:30am EDT',
 'time>Oct 25, 2024,<time>06:00am EDT',
 'time>Oct 27, 2024,<time>07:18pm EDT',
 'time>Oct 27, 2024,<time>10:00am EDT',
 'time>Oct 27, 2024,<time>09:00pm EDT',
 '[<div id="__next"><script>',
 'time>Oct 27, 2024,<time>06:30am EDT',
 '[<div id="__next"><script>']

In [None]:
#AP News Article Link Scraper
page = requests.get('https://apnews.com/politics')
soup = BeautifulSoup(page.text, 'html.parser')
allLinks = []
link = "https://apnews.com/politics"
for link in soup.find_all('a'):
    allLinks.append(link.get('href'))
allLinks = filter(None, allLinks)
relevantLinks = [i for i in allLinks if '/article' in i]
APNews_links = set(relevantLinks)
APNews_links = list(APNews_links)

In [None]:
# @title APNews [not completed: pubdate]

#AP News Article Text Scraper
# artTxt = []
# for art in APNews_links:
#   article_text = scrape_article_text(art)
#   artTxt.append(article_text)
# APNews_artTxt = artTxt

#combined headers + pubdate:
pubds = []
headers = []
for h in APNews_links:
  hp = requests.get(h)
  souph = BeautifulSoup(hp.text, 'html.parser')
  span = souph.find_all('h1')
  header= str(span)
  i = header.find(">")
  j = header.find("</h1>")
  x = header[i+1:j]
  y = x.split('\n', 1)[0]
  headers.append(y)
  span = souph.find_all('div', class_ = "page-datemodified")
  pubd = str(span)
  i = pubd.find("Updated")
  j = pubd.find("<")
  # if i == -1:
  #   i = pubd.find("Updated")
  x = pubd[i:j].lstrip()
  y = x.split('\n', 1)[0]
  pubds.append(y)
APNews_pubds = pubds
APNews_headers = headers

In [None]:
APNews_pubds

['',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '']

In [None]:
# @title BBC [not completed: pubdate -> seems to be in "days ago" rather than numeric date, check for real date"]

#BBC Article Link Scraper
#via U.S. Politics page
page = requests.get('https://www.bbc.com/news/topics/cwnpxwzd269t')
soup = BeautifulSoup(page.text, 'html.parser')
allLinks = []
link = "https://www.bbc.com/news/topics/cwnpxwzd269t"
for link in soup.find_all('a'):
    allLinks.append(link.get('href'))
allLinks = filter(None, allLinks)
linkEnds = [i for i in allLinks if '/article' in i]
result = ['https://www.bbc.com' + i for i in linkEnds]
BBC_links = set(result)
BBC_links = list(BBC_links)

In [None]:
#BBC Article Text Scraper
artTxt = []
for art in BBC_links:
  article_text = scrape_article_text(art)
  artTxt.append(article_text)
BBC_artTxt = artTxt

#combined headers + pubdate:
pubds = []
headers = []
for h in BBC_links:
  hp = requests.get(h)
  souph = BeautifulSoup(hp.text, 'html.parser')
  span = souph.find_all('h1')
  header= str(span)
  i = header.find(">")
  j = header.find("<!")
  x = header[i+1:j]
  y = x.split('\n', 1)[0]
  headers.append(y)
  span = souph.find_all('div')
  pubd = str(span)
  i = pubd.find("<time")
  j = pubd.find("</time>")
  # if i == -1:
  #   i = pubd.find("Updated")
  x = pubd[i+10:j].lstrip()
  y = x.split('\n', 1)[0]
  pubds.append(y)
BBC_pubds = pubds
BBC_headers = headers

In [3]:
#Washington Post Article Link Scraper - in progress
headers = {'User-Agent':'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_2) AppleWebKit/601.3.9 (KHTML, like Gecko) Version/9.0.2 Safari/601.3.9'}
#headers = {"user-agent": "Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)"}
#headers = {"user-agent": "Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)"}
#User-Agent":'Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36
#Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/113.0.0.0 Safari/537.36 Edg/113.0.1774.35

page = requests.get('https://www.washingtonpost.com/politics/',headers=headers)
# print(BS(page.content, 'lxml'))
soup = BeautifulSoup(page.text, 'html.parser')
# print(soup)

allLinks = []
link = "https://www.washingtonpost.com/politics/"
for link in soup.find_all('a'):
    allLinks.append(link.get('href'))
allLinks = filter(None, allLinks)
linkEnds = [i for i in allLinks if "/politics/" and "/2" in i]
links = [i if i.startswith("https") else "https://www.washingtonpost.com/" + i for i in linkEnds]

WP_links = set(links)
WP_links = list(WP_links)
WP_links

#RSS: https://feeds.washingtonpost.com/rss/rss_the-fix?itid=lk_inline_manual_5

['https://www.washingtonpost.com//politics/2024/10/30/mike-johnson-trump-aca-obamacare/?itid=mr_politics_5',
 'https://www.washingtonpost.com//elections/interactive/2024/presidential-polling-averages/?itid=mr_politics_1',
 'https://www.washingtonpost.com//politics/2024/10/30/democracy-trump-pennsylvania-voter-fraud/?itid=mr_politics_4',
 'https://www.washingtonpost.com/politics/2024/10/30/nicky-jam-reggaeton-singer-pulls-endorsement-trump-after-puerto-rico-insult/',
 'https://www.washingtonpost.com/elections/2024/10/30/trump-harris-election-live-updates/',
 'https://www.washingtonpost.com/politics/2024/10/30/kamala-harris-biden-garbage-comment/',
 'https://www.washingtonpost.com/discussions/2021/01/01/rss-terms-service/',
 'https://www.washingtonpost.com/politics/2024/10/28/can-independent-dan-osborn-win-nebraska-would-it-matter/',
 'https://www.washingtonpost.com/politics/2024/10/30/harris-trump-ellipse-speeches/',
 'https://www.washingtonpost.com/national-security/2024/10/30/us-weapo

In [79]:
# @title WP [not completed: pubds]

#WP Article Text Scraper
ua = {'User-Agent':'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_2) AppleWebKit/601.3.9 (KHTML, like Gecko) Version/9.0.2 Safari/601.3.9'}
artTxt = []
for art in WP_links:
  article_text = scrape_article_text_useragent(art,ua)
  artTxt.append(article_text)
WP_artTxt = artTxt

#combined headers + pubdate:
pubds = []
headers = []
months = ["January", "February", "March", "April", "May", "June", "July", "August", "September", "October", "November", "December"]

for h in WP_links:
  hp = requests.get(h,headers = ua)
  souph = BeautifulSoup(hp.text, 'html.parser')
  span = souph.find_all('head')
  header= str(span)
  i = header.find("<title>")
  j = header.find("</title>")
  x = header[i:j]
  y = x.split('\n', 1)[0].lstrip("<title>")
  headers.append(y)
  span = souph.find_all("div")
  pubd = str(span)
  # print(span)
  for m in months:
    if m in pubd:
      i = pubd.find(m)
  j = pubd.find("</time>")
  if i == -1:
    i = pubd.find("Updated")
  x = pubd[i:j].lstrip()
  y = x.split('\n', 1)[0]
  sep = '</span>'
  dateOnly = y.split(sep, 1)[0]
  pubds.append(dateOnly)

WP_pubds = pubds
# WP_headers = headers

In [77]:
WP_pubds

['October 30, 2024 at 5:00 a.m. EDT',
 'October 30, 2024',
 'October 30, 2024 at 5:21 p.m. EDT',
 'October 30, 2024 at 3:44 p.m. EDT',
 'October 30, 2024 at 10:18 p.m. EDT',
 'October 30, 2024 at 6:26 p.m. EDT',
 'January 1, 2021 at 11:14 a.m. EST',
 'October 28, 2024 at 7:16 p.m. EDT',
 'October 30, 2024 at 10:43 a.m. EDT',
 'October 30, 2024 at 4:30 p.m. EDT',
 'October 30, 2024 at 5:21 p.m. EDT',
 'November 20, 2020 at 11:09 a.m. EST',
 'October 30, 2024 at 6:14 p.m. EDT',
 'October 30, 2024 at 5:22 p.m. EDT',
 'October 30, 2024 at 2:01 p.m. EDT',
 'October 30, 2024 at 6:03 p.m. EDT',
 'October 30, 2024 at 12:41 p.m. EDT',
 'September 11, 2024 at 4:23 p.m. EDT',
 'October 30, 2024',
 'October 30, 2024 at 2:41 p.m. EDT',
 'May 13, 2021 at 4:51 p.m. EDT',
 'October 28, 2024 at 1:18 p.m. EDT',
 '',
 'October 30, 2024 at 12:33 p.m. EDT',
 'October 30, 2024 at 3:34 p.m. EDT',
 '',
 'October 21, 2024 at 2:58 p.m. EDT',
 '',
 'October 30, 2024 at 7:42 p.m. EDT',
 'October 30, 2024 at 12:04

In [8]:
len(WP_artTxt)

33

In [80]:
#CNBC Article Link Scraper
url = 'https://search.cnbc.com/rs/search/combinedcms/view.xml?partnerId=wrss01&id=10000113'
html_page = requests.get(url)
soup = BeautifulSoup(html_page.text, "lxml")
links = []
for item in soup.find_all("item"):
  link= str(item)
  i = link.find("<link/>")
  j = link.find("<guid")
  x = link[i+7:j]
  #removes white space at end of links
  y = x.split('\n', 1)[0].strip()
  links.append(y)
CNBC_links = links

  soup = BeautifulSoup(html_page.text, "lxml")


In [89]:
# @title CNBC [not completed: artTxt]
url = 'https://search.cnbc.com/rs/search/combinedcms/view.xml?partnerId=wrss01&id=10000113'
# ua = {'User-Agent':'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_2) AppleWebKit/601.3.9 (KHTML, like Gecko) Version/9.0.2 Safari/601.3.9'}

html_page = requests.get(url)
soup = BeautifulSoup(html_page.text, "lxml")
links = []
artTxt = []

for item in soup.find_all("item"):
  link = str(item)
  i = link.find("<link/>")
  j = link.find("<guid")
  x = link[i+7:j]
  y = x.split('\n', 1)[0]
  links.append(y)
  article_text = scrape_article_text(y)
  artTxt.append(article_text)
CNBC_artTxt = artTxt

# CNBC Header Scraper
headers = []
for item in soup.find_all("item"):
  header= str(item)
  i = header.find("<title>")
  j = header.find("</title>")
  x = header[i+7:j]
  y = x.split('\n', 1)[0]
  headers.append(y)
CNBC_headers = headers

#CNBC PubDate Scraper
pubds = []
for item in soup.find_all("item"):
  pubd= str(item)
  i = pubd.find("<pubdate>")
  j = pubd.find("</pubdate>")
  x = pubd[i+9:j]
  y = x.split('\n', 1)[0]
  pubds.append(y)
CNBC_pubds = pubds

  soup = BeautifulSoup(html_page.text, "lxml")


In [90]:
CNBC_artTxt

['Error: Unable to fetch the webpage.',
 'Error: Unable to fetch the webpage.',
 'Error: Unable to fetch the webpage.',
 'Error: Unable to fetch the webpage.',
 'Error: Unable to fetch the webpage.',
 'Error: Unable to fetch the webpage.',
 'Error: Unable to fetch the webpage.',
 'Error: Unable to fetch the webpage.',
 'Error: Unable to fetch the webpage.',
 'Error: Unable to fetch the webpage.',
 'Error: Unable to fetch the webpage.',
 'Error: Unable to fetch the webpage.',
 'Error: Unable to fetch the webpage.',
 'Error: Unable to fetch the webpage.',
 'Error: Unable to fetch the webpage.',
 'Error: Unable to fetch the webpage.',
 'Error: Unable to fetch the webpage.',
 'Error: Unable to fetch the webpage.',
 'Error: Unable to fetch the webpage.',
 'Error: Unable to fetch the webpage.',
 'Error: Unable to fetch the webpage.',
 'Error: Unable to fetch the webpage.',
 'Error: Unable to fetch the webpage.',
 'Error: Unable to fetch the webpage.',
 'Error: Unable to fetch the webpage.',


In [91]:
#CBS News Article Link Scraper
url = 'https://www.cbsnews.com/latest/rss/politics'
html_page = requests.get(url)
soup = BeautifulSoup(html_page.text, "lxml")
links = []
for item in soup.find_all("item"):
  link= str(item)
  i = link.find("<link/>")
  j = link.find("<guid")
  x = link[i+7:j]
  y = x.split('\n', 1)[0]
  if "/news/" in y:
    links.append(y)
CBS_links = links
CBS_links

  soup = BeautifulSoup(html_page.text, "lxml")


['https://www.cbsnews.com/news/bidens-response-to-garbage-joke-about-puerto-rico/',
 'https://www.cbsnews.com/news/elon-musk-philadelphia-hearing-larry-krasner/',
 'https://www.cbsnews.com/news/harris-responds-biden-garbage-comment/',
 'https://www.cbsnews.com/news/nicky-jam-trump-endorsement-puerto-rico-madison-square-garden/',
 'https://www.cbsnews.com/news/north-korean-troops-russian-uniforms-moving-towards-ukraine/',
 'https://www.cbsnews.com/news/trump-rally-puerto-rico-garbage-comment-congress-races/',
 'https://www.cbsnews.com/news/american-airstrikes-kill-up-to-35-isis-operatives-us-military-says/',
 'https://www.cbsnews.com/news/trump-noncitizen-voter-fraud-fact-check/',
 'https://www.cbsnews.com/news/republican-candidate-larry-savage-charged-stealing-ballots/',
 'https://www.cbsnews.com/news/supreme-court-virginia-voter-rolls/',
 'https://www.cbsnews.com/news/early-voting-2024-election-55-million-30-10-2024/',
 'https://www.cbsnews.com/news/ballot-drop-boxes-misinformation-th

In [92]:
url = 'https://www.cbsnews.com/latest/rss/politics'
# ua = {'User-Agent':'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_2) AppleWebKit/601.3.9 (KHTML, like Gecko) Version/9.0.2 Safari/601.3.9'}

html_page = requests.get(url)
soup = BeautifulSoup(html_page.text, "lxml")
links = []
artTxt = []

for item in soup.find_all("item"):
  link = str(item)
  i = link.find("<link/>")
  j = link.find("<guid")
  x = link[i+7:j]
  y = x.split('\n', 1)[0]
  links.append(y)
  article_text = scrape_article_text(y)
  artTxt.append(article_text)
CBS_artTxt = artTxt

# CNBC Header Scraper
headers = []
for item in soup.find_all("item"):
  header= str(item)
  i = header.find("<title>")
  j = header.find("</title>")
  x = header[i+7:j]
  y = x.split('\n', 1)[0]
  headers.append(y)
CBS_headers = headers

#CNBC PubDate Scraper
pubds = []
for item in soup.find_all("item"):
  pubd= str(item)
  i = pubd.find("<pubdate>")
  j = pubd.find("</pubdate>")
  x = pubd[i+9:j]
  y = x.split('\n', 1)[0]
  pubds.append(y)
CBS_pubds = pubds

  soup = BeautifulSoup(html_page.text, "lxml")


In [None]:
# @title [Potenitally discontinuted] ABC News Article Link Scraper - in progress

# headers = {"user-agent": "Mozilla/5.0"}
#Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/113.0.0.0 Safari/537.36 Edg/113.0.1774.35
html_page = requests.get('https://abcnews.go.com/abcnews/politicsheadlines')
#lxml-xml needed so it can parse the CDATA (where the links are) ? jk
soup = BeautifulSoup(html_page.text, "lxml")
links = []
for item in soup.find_all("item"):
  link= str(item)
  i = link.find("CDATA")
  j = link.find("<guid")
  x = link[i+7:j]
  y = x.split('\n', 1)[0]
  links.append(y)

ABC_links = links
ABC_links

In [96]:
#ABC Article Link Scraper html
page = requests.get('https://abcnews.go.com/Politics')
soup = BeautifulSoup(page.text, 'html.parser')
allLinks = []
link = "https://abcnews.go.com/Politics"
for link in soup.find_all('a'):
    allLinks.append(link.get('href'))
allLinks = filter(None, allLinks)
links = [i for i in allLinks if "/Politics/" in i and "/video/" not in i]
ABC_links = set(links)
ABC_links = list(ABC_links)

In [100]:
ABC_links

['https://abcnews.go.com/Politics/slavery-involuntary-servitude-ballot-states/story?id=115270058',
 'https://abcnews.go.com/Politics/white-house-clean-bidens-garbage-comment-trump-seizes/story?id=115315459',
 'https://abcnews.go.com/Politics/trump-campaign-distances-house-speakers-plan-massive-reform/story?id=115313086',
 'https://abcnews.go.com/Politics/week-election-day-harris-trump-campaigns-meet-transition/story?id=115284077',
 'https://abcnews.go.com/Politics/rfk-jr-trump-promised-control-public-health-agencies/story?id=115303649',
 'https://abcnews.go.com/Politics/harris-deliver-closing-argument-remarks-dcs-ellipse/story?id=115222481',
 'https://abcnews.go.com/Politics/mail-voting-millions-voters-opt-paper-ballots/story?id=115271747',
 'https://abcnews.go.com/Politics/latino-political-strategist-responds-puerto-rico-jokes-trump/story?id=115215759',
 'https://abcnews.go.com/Politics/harris-trump-capable-interfering-election-results/story?id=115323208',
 'https://abcnews.go.com/Pol

In [105]:
# @title ABC [not completed: headers,pubds]

page = requests.get('https://abcnews.go.com/Politics')
soup = BeautifulSoup(page.text, 'html.parser')

#CNN Article Text Scraper
# artTxt = []
# for art in ABC_links:
#   article_text = scrape_article_text(art)
#   artTxt.append(article_text)
# ABC_artTxt = artTxt

#combined headers + pubdate:
pubds = []
headers = []
for h in ABC_links:
  hp = requests.get(h)
  souph = BeautifulSoup(hp.text, 'html.parser')
  span = souph.find_all('div')
  header= str(span)
  i = header.find("xAPpq JQYD ZdbeE  jTKbV zIIsP xpuOU pCRh")
  print(i)
  #uhh
  j = header.find("</div>")
  x = header[i+8:j]
  y = x.split('\n', 1)[0]
  headers.append(y)
  span = souph.find_all('div')
  pubd = str(span)
  i = pubd.find("Published")
  j = pubd.find(">]")
  if i == -1:
    i = pubd.find("Updated")
  x = pubd[i+10:j].lstrip()
  y = x.split('\n', 1)[0]
  pubds.append(y)
ABC_pubds = pubds
ABC_headers = headers

-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1


In [None]:
#Extract article text from links for all news sources
#Use this to text article link efficacy?

#CNBC_links includes ad text in the beginning(?)
url = NYT_links[0]
article_text = scrape_article_text(url)
print(article_text)

Error: Unable to fetch the webpage.


In [None]:
#Extract article text from links for news sources requiring user agent
url = Newsweek_links[0]
headers = {"user-agent": "Mozilla/5.0"}
#Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/113.0.0.0 Safari/537.36 Edg/113.0.1774.35
article_text = scrape_article_text_useragent(url, headers)
print(article_text)

While New York is seen as a solid blue state, the Republican party still enjoys widespread support in its rural and suburban areas, and this year the gap between the GOP and the Democrat Party could be narrowing. In the 2024 presidential election, Donald Trump has the potential to gain the largest share of the vote in the Empire State than any other Republican candidate since 1988, some polls have suggested. Trump's very unlikely to win the state and get its 28 electoral votes, but a good performance in the Democratic stronghold gives Republicans a new message to bring to voters and enthusiasm about Trump could help the GOP in down ballot races. Before his rally on Long Island Wednesday night, Trump told supporters in a Manhattan bar that he was going to "win New York!" – a message he repeated to the crowd later on. "When I told some people in Washington that I'm going up to New York, we're doing a campaign speech, they said, 'What do you mean New York? You can't ever ... Republicans c

In [None]:
#List of RSS feeds

#Fox News: Politics
foxPolRSS = f"https://moxie.foxnews.com/google-publisher/politics.xml"
#CNN News: Politics
CNNPolRSS = f"http://rss.cnn.com/rss/cnn_allpolitics.rss"
#NYT News: Politics
NYTPolRSS = f"https://rss.nytimes.com/services/xml/rss/nyt/Politics.xml"
#https://github.com/susannapaoli/web-scraper-nyt
#NY Post: Politics
NYPostPolRSS = f"https://nypost.com/politics/feed/"
#Daily Mail: U.S.
DailyMailUSRSS = f"https://www.dailymail.co.uk/ushome/index.rss"
#NBC News: Politics
NBCPolRSS = f"https://feeds.nbcnews.com/nbcnews/public/politics"
#CNBC: Politics
CNBCPolRSS = f"https://search.cnbc.com/rs/search/combinedcms/view.xml?partnerId=wrss01&id=10000113"
#CBS News: Politics
CBSPolRSS = f"https://www.cbsnews.com/latest/rss/politics"
#ABC News: Politics
ABCPolRSS = f"https://abcnews.go.com/abcnews/politicsheadlines"

In [None]:
# @title [likly discontinuted]


# Following URL is the URL of the LLM being utilized from HuggingFace
API_URL = "https://api-inference.huggingface.co/models/google/flan-t5-xxl"
headers = {"Authorization": f"Bearer {api_token_hug}"}

# Instructions sent to the LLM in front of article text
system_input = "Analyze the text content and assign a label from {left, right, center, uncertain}. In this context, “left” indicates a left-leaning article, “right” signifies a right-leaning article, “center” implies no obvious political leaning, and “uncertain” denotes that the political orientation could not be determined. Please provide your analysis and output a new single line containing only the assigned label."

#reduce by 10% until it fits query
reduction_fraction = 0.10



result = reduce_and_query(article_text, system_input, reduction_fraction, headers)
print(result)

second_article = "https://www.cnn.com/2024/03/19/politics/texas-immigration-law-blocked-appeals/index.html"
second_result = reduce_and_query(second_article, system_input, reduction_fraction, headers)
print(second_result)

third_article = "https://www.cnn.com/videos/world/2024/03/20/israel-gaza-west-bank-settler-movement-clarissa-ward-pkg-intl-ldn-vpx.cnn"
third_result = reduce_and_query(third_article, system_input, reduction_fraction, headers)
print(third_result)

fourth_article = "https://www.cnn.com/2024/03/19/politics/trump-bond-deadline-panic/index.html"
fourth_result = reduce_and_query(fourth_article, system_input, reduction_fraction, headers)
print(fourth_result)

fifth_article = "https://www.foxnews.com/live-news/joe-biden-gop-impeachment-inquiry-hearing-hunter-biden-business-dealings"
fifth_result = reduce_and_query(fifth_article, system_input, reduction_fraction, headers)
print(fifth_result)

sixth_article = "https://www.msnbc.com/deadline-white-house/deadline-legal-blog/trump-supreme-court-immunity-appeal-delay-rcna144155"
sixth_result = reduce_and_query(sixth_article, system_input, reduction_fraction, headers)
print(sixth_result)

seventh_article = "https://www.msnbc.com/rachel-maddow-show/maddowblog/biden-white-house-reason-celebrate-falling-crime-rates-rcna144215"
seventh_result = reduce_and_query(seventh_article, system_input, reduction_fraction, headers)

[{'generated_text': 'left'}]
[{'generated_text': 'uncertain'}]
[{'generated_text': 'center'}]
[{'generated_text': 'center'}]
[{'generated_text': 'center'}]
[{'generated_text': 'center'}]
