<a href="https://colab.research.google.com/github/ExCaLBBR/ExCaLBBR_Projects/blob/main/PartisanBiasDetection/LocalScraperTemplate.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
#@title Install Dependancies
!pip install requests beautifulsoup4 --quiet

#Import libraries
import requests
from bs4 import BeautifulSoup
import time

In [None]:
# @title Define Utility Functions

#Article Scrapper
def scrape_article_text(url):
    # Send a GET request to the URL
    response = requests.get(url)
    # Check if the request was successful
    if response.status_code == 200:
        # Parse the HTML content
        soup = BeautifulSoup(response.text, 'html.parser')
        # Find elements containing text - this part may need customization
        # For example, assuming article text is within <p> tags
        article_text = ' '.join(p.get_text() for p in soup.find_all('p'))
        return article_text
    else:
        return "Error: Unable to fetch the webpage."

#Query for LLM
def query(payload, headers):
    response = requests.post(API_URL, headers=headers, json=payload)
    if response.status_code == 200:
        return response.json()  # Expecting this to always be a list for simplicity
    else:
        return [{"error": response.text}]

#Political Bias Classification using LLM
def reduce_and_query(initial_text, llm_instructions, reduction_fraction, headers):
    text = initial_text
    while True:
        payload = {"inputs": llm_instructions + text}
        response = query(payload, headers)
        # Since we expect a list, check the first item for an 'error' key
        if "error" in response[0]:
            error_message = response[0]["error"]
            if "token" in error_message.lower():
                # Reduce the text by the specified fraction if it's a token limit error
                new_length = int(len(text) * (1 - reduction_fraction))
                text = text[:new_length]
            else:
                # Return the error message directly if it's a different kind of error
                return error_message
        else:
            # If there's no 'error' key, assume the response is successful
            return response

In [None]:
# CNN Article Link Scraper
page = requests.get('https://www.cnn.com/politics')
soup = BeautifulSoup(page.text, 'html.parser')
allLinks = []
link = "https://www.cnn.com/politics"
for link in soup.find_all('a'):
    allLinks.append(link.get('href'))
allLinks = filter(None, allLinks)
linkEnds = [i for i in allLinks if i.startswith('/2')]
result = ['https://www.cnn.com' + i for i in linkEnds]
CNN_links = set(result)
CNN_links = list(CNN_links)

In [None]:
#Fox News Article Text Link Scraper
url = 'https://moxie.foxnews.com/google-publisher/politics.xml'
html_page = requests.get(url)
soup = BeautifulSoup(html_page.text, "lxml")
links = []
for item in soup.find_all("item"):
  link= str(item)
  i = link.find("<link/>")
  j = link.find("<guid")
  x = link[i+7:j]
  y = x.split('\n', 1)[0]
  links.append(y)
FoxNews_links = links

#Fox News Article Header Scraper
headers = []
for item in soup.find_all("item"):
  header= str(item)
  i = header.find("<title>")
  j = header.find("</title>")
  x = header[i+7:j]
  y = x.split('\n', 1)[0]
  headers.append(y)
FoxNews_headers = headers

#Fox News Article PubDate Scraper
pubds = []
for item in soup.find_all("item"):
  pubd= str(item)
  i = pubd.find("<pubdate>")
  j = pubd.find("</pubdate>")
  x = pubd[i+9:j]
  y = x.split('\n', 1)[0]
  pubds.append(y)
FoxNews_pubdate = pubds

  soup = BeautifulSoup(html_page.text, "lxml")


In [None]:
#NYT Article Link Scraper
url = 'https://rss.nytimes.com/services/xml/rss/nyt/Politics.xml'
html_page = requests.get(url)
soup = BeautifulSoup(html_page.text, "lxml")
links = []
for item in soup.find_all("item"):
  link= str(item)
  i = link.find("<link/>")
  j = link.find("<guid")
  x = link[i+7:j]
  y = x.split('\n', 1)[0]
  links.append(y)
NYT_links = links

  soup = BeautifulSoup(html_page.text, "lxml")


In [None]:
NYT_links

['https://www.nytimes.com/2024/09/16/us/politics/senate-democrats-black-women.html',
 'https://www.nytimes.com/2024/09/11/us/politics/undecided-voters-react-debate.html',
 'https://www.nytimes.com/2024/09/11/us/politics/trump-debate-emotions.html',
 'https://www.nytimes.com/2024/09/11/us/politics/trump-russia-ukraine-war.html',
 'https://www.nytimes.com/2024/09/16/us/politics/musk-deleted-tweet-biden-kamala.html',
 'https://www.nytimes.com/2024/09/16/business/energy-environment/pennsylvania-fracking-natural-gas-trump-harris.html',
 'https://www.nytimes.com/2024/09/15/us/politics/trump-routh-ukraine-interview.html',
 'https://www.nytimes.com/2024/09/15/us/politics/trump-shooting-suspect-routh.html',
 'https://www.nytimes.com/2024/09/15/us/politics/trump-reaction-golf-shooting.html',
 'https://www.nytimes.com/2024/09/15/us/politics/secret-service-trump-shooting.html',
 'https://www.nytimes.com/2024/09/15/us/politics/trump-shooting-golf-course.html',
 'https://www.nytimes.com/2024/09/15/u

In [None]:
#USA Today Article Link Scraper
page = requests.get('https://www.usatoday.com/news/politics/')
soup = BeautifulSoup(page.text, 'html.parser')
allLinks = []
link = "https://www.usatoday.com/news/politics/"
for link in soup.find_all('a'):
    allLinks.append(link.get('href'))
allLinks = filter(None, allLinks)
'''
   /politics removes livestream & corrections & world politics
   also removes other articles e.g. /investigation (ryan routh article)
'''
linkEnds = [i for i in allLinks if i.startswith('/story/news/politics')]
result = ['https://www.usatoday.com' + i for i in linkEnds]
USAToday_links = set(result)
USAToday_links = list(USAToday_links)

In [None]:
USAToday_links

['https://www.usatoday.com/story/news/politics/elections/2024/09/19/donald-trump-crypto-burgers-video/75293759007/',
 'https://www.usatoday.com/story/news/politics/elections/2024/09/19/trump-melania-memoir/75290013007/',
 'https://www.usatoday.com/story/news/politics/2024/09/18/donald-trumps-latest-campaign-focus-two-assassination-attempts/75268821007/',
 'https://www.usatoday.com/story/news/politics/elections/2024/09/18/teamsters-not-endorse-harris-trump-2024-election/75279971007/',
 'https://www.usatoday.com/story/news/politics/elections/2024/09/19/how-springfield-pet-eating-claim-reached-trump-debate/75289774007/',
 'https://www.usatoday.com/story/news/politics/2024/09/19/threats-supreme-court-justices-alaska-kill/75291365007/',
 'https://www.usatoday.com/story/news/politics/elections/2024/09/18/russian-election-interference-microsoft-report/75277862007/',
 'https://www.usatoday.com/story/news/politics/elections/2024/09/18/joe-rogan-podcast-kamala-trump/75276328007/',
 'https://www.

In [None]:
#NY Post Article Link Scraper
#check lxml thing later
url = 'https://nypost.com/politics/feed/'
html_page = requests.get(url)
soup = BeautifulSoup(html_page.text, "lxml")
links = []
for item in soup.find_all("item"):
  link= str(item)
  i = link.find("<link/>")
  j = link.find("<guid")
  x = link[i+7:j]
  y = x.split('\n', 1)[0]
  #remove videos
  if not (y.startswith('https://nypost.com/video')):
    links.append(y)
NYPost_links = links

In [None]:
NYPost_links

[]

In [None]:
#People Article Link Scraper
page = requests.get('https://people.com/politics/')
soup = BeautifulSoup(page.text, 'html.parser')
allLinks = []
link = "https://people.com/politics/"
for link in soup.find_all('a'):
    allLinks.append(link.get('href'))
allLinks = filter(None, allLinks)
#identifier for political articles seems to be a 7 digit # that starts w 87 at the end of the link
#check to confirm, at least i got astrology out of there
linkEnds = [i for i in allLinks if (len(i) > 8 and i[-7] == '8' and i[-6] == '7')]
# result = ['https://people.com' + i for i in linkEnds]
People_links = set(linkEnds)
People_links = list(People_links)

In [None]:
People_links

['https://people.com/harris-walz-campaign-selling-20-dollar-friendship-bracelets-nod-taylor-swift-8710089',
 'https://people.com/tim-walz-grateful-for-taylor-swift-endorsement-in-2024-election-as-fellow-cat-owner-8710072',
 'https://people.com/springfield-ohio-schools-buildings-evacuated-over-bomb-threats-8712392',
 'https://people.com/kamala-harris-exits-debate-watch-party-taylor-swift-song-the-man-8710086',
 'https://people.com/caitlin-clark-kylie-kelce-like-taylor-swift-kamala-harris-endorsement-8710307',
 'https://people.com/presidential-debate-full-recap-kamala-harris-donald-trump-8709955',
 'https://people.com/taylor-swift-inspired-billboards-for-kamala-harris-appear-in-times-square-and-las-vegas-ready-for-it-8711886',
 'https://people.com/fred-trump-iii-says-donald-trump-campaigning-will-get-nasty-exclusive-interview-8709522',
 'https://people.com/tim-walz-family-tree-8706831',
 'https://people.com/kamala-harris-donald-trump-handshake-presidential-debate-viral-moment-8710071',
 

In [None]:
#Daily Mail Article Link Scraper
#this one
page = requests.get('https://www.dailymail.co.uk/news/us-politics/index.html')
soup = BeautifulSoup(page.text, 'html.parser')
allLinks = []
link = "https://www.dailymail.co.uk/news/us-politics/index.html"
for link in soup.find_all('a'):
    allLinks.append(link.get('href'))
allLinks = filter(None, allLinks)
linkEnds = [i for i in allLinks if i.startswith('/news/article')]
result = ['https://www.dailymail.co.uk/' + i for i in linkEnds]
DailyMail_links = set(result)
DailyMail_links = list(DailyMail_links)

In [None]:
#Daily Mail RSS Article Link Scraper - articles not as relevant
'''url = 'https://www.dailymail.co.uk/ushome/index.rss'
html_page = requests.get(url)
soup = BeautifulSoup(html_page.text, "lxml")
links = []
for item in soup.find_all("item"):
  link= str(item)
  i = link.find("<link/>")
  j = link.find("<guid")
  x = link[i+7:j]
  y = x.split('\n', 1)[0]
  links.append(y)
dm_links = links
list(set(DailyMail_links).intersection(dm_links))
'''

  soup = BeautifulSoup(html_page.text, "lxml")


In [None]:
DailyMail_links

['https://www.dailymail.co.uk//news/article-13758613/donald-trump-fitness-age-biden.html',
 'https://www.dailymail.co.uk//news/article-13814919/Donald-Trump-Tim-Walz-family-endorsement.html',
 'https://www.dailymail.co.uk//news/article-13848321/pope-francis-attack-donald-trump-kamala-harris-voters-election.html',
 'https://www.dailymail.co.uk//news/article-13769887/donald-trump-afghanistan-national-security-policy.html',
 'https://www.dailymail.co.uk//news/article-13812383/trump-border-policy-surge-voter-support-2024-election.html',
 'https://www.dailymail.co.uk//news/article-13794527/Donald-Trump-responds-Arlington-Cemetery.html',
 'https://www.dailymail.co.uk//news/article-13864615/george-clooney-offer-donald-trump-political-feud.html',
 'https://www.dailymail.co.uk//news/article-13720397/Tim-Walz-military-service-Kyle-Miller-mother-Minnesota-Kamala-Harris-pick.html',
 'https://www.dailymail.co.uk//news/article-13781693/polls-campaign-donald-trump-kamala-harris-presidential-election.

In [None]:
#Newsweek Article Link Scraper
headers = {"user-agent": "Mozilla/5.0"}
#Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/113.0.0.0 Safari/537.36 Edg/113.0.1774.35
page = requests.get('https://www.newsweek.com/politics',headers=headers)
soup = BeautifulSoup(page.text, 'html.parser')
allLinks = []
link = "https://www.newsweek.com/politics"
for link in soup.find_all('a'):
    allLinks.append(link.get('href'))
allLinks = filter(None, allLinks)
linkEnds = [i for i in allLinks if (len(i) > 8 and i[-7] == '1')]
# linkEnds = [i for i in allLinks]
result = ['https://www.newsweek.com' + i for i in linkEnds]
Newsweek_links = set(result)
Newsweek_links = list(Newsweek_links)

# initial error
# print(soup.find_all('body'))
# [<body>
# <center><h1>403 Forbidden</h1></center>
# </body>]

# reference: https://stackoverflow.com/questions/40255128/how-to-parse-the-website-using-beautifulsoup
# reference: https://www.useragentlist.net/

In [None]:
Newsweek_links

['https://www.newsweek.com/donald-trump-new-york-voters-polling-republican-gains-1956321',
 'https://www.newsweek.com/donald-trump-bitcoin-cryptocurrency-price-increase-1956371',
 'https://www.newsweek.com/jd-vance-haitian-immigrants-stories-keep-getting-shot-down-1956452',
 'https://www.newsweek.com/republican-group-launches-taylor-swift-themed-ad-attacking-donald-trump-1956382',
 'https://www.newsweek.com/georgia-election-integrity-coalition-emails-donald-trump-1956151',
 'https://www.newsweek.com/pennsylvania-polling-independnet-voters-kamala-harris-1956190',
 'https://www.newsweek.com/trump-harris-unions-teamsters-endorsement-1956277',
 'https://www.newsweek.com/what-does-project-2025-mean-lgbtq-people-1956315',
 'https://www.newsweek.com/robert-kennedy-jr-investigated-collecting-dead-whale-1956501',
 'https://www.newsweek.com/teamsters-union-endorsement-kamala-harris-joe-biden-pension-1956536',
 'https://www.newsweek.com/supreme-court-assassination-threat-alaska-man-panos-anastasi

In [None]:
#NBC News Article Link Scraper
url = 'https://feeds.nbcnews.com/nbcnews/public/politics'
html_page = requests.get(url)
soup = BeautifulSoup(html_page.text, "lxml")
links = []
for item in soup.find_all("item"):
  link = str(item)
  i = link.find("<guid>")
  j = link.find("</guid")
  x = link[i+6:j]
  y = x.split('\n', 1)[0]
  if y[0:4] == 'http':
    links.append(y)
if "https://www.nbcnews.com/"  in links:
  links.remove("https://www.nbcnews.com/")
NBCNews_links = links

In [None]:
NBCNews_links

['https://www.nbcnews.com/politics/2024-election/pro-palestinian-uncommitted-movement-refuses-endorse-harris-rcna171756',
 'https://www.nbcnews.com/news/us-news/judge-extends-temporary-restraining-order-bidens-student-debt-forgiven-rcna171805',
 'https://www.nbcnews.com/nbc-out/out-politics-and-policy/biden-makes-history-12th-senate-confirmed-lgbtq-judge-rcna171868',
 'https://www.nbcnews.com/politics/supreme-court/alaska-man-charged-making-death-threats-supreme-court-justices-rcna171836',
 'https://www.nbcnews.com/politics/white-house/biden-host-quad-leaders-home-delaware-rcna171458',
 'https://www.nbcnews.com/politics/2024-election/pennsylvania-ballots-look-different-voter-errors-rcna171441',
 'https://www.nbcnews.com/news/us-news/baseless-rumors-haitian-immigrants-threaten-unravel-springfield-ohio-rcna170513',
 'https://www.nbcnews.com/news/us-news/first-graders-survived-sandy-hook-will-vote-first-presidential-electio-rcna170471',
 'https://www.nbcnews.com/politics/2024-election/far

In [None]:
#Forbes Article Link Scraper
page = requests.get('https://www.forbes.com/topics/politics/')
soup = BeautifulSoup(page.text, 'html.parser')
allLinks = []
link = "https://www.forbes.com/topics/politics/"
for link in soup.find_all('a'):
    allLinks.append(link.get('href'))
allLinks = filter(None, allLinks)
#all articles contain date (e.g. /2024/09/18)
relevantLinks = [i for i in allLinks if '/2' in i]
Forbes_links = set(relevantLinks)
Forbes_links = list(Forbes_links)

In [None]:
Forbes_links

['http://www.forbes.com/sites/saradorn/2024/09/19/trump-vs-harris-2024-polls-harris-leads-by-2-points-in-latest-survey/',
 'http://www.forbes.com/sites/saradorn/2024/09/19/election-2024-swing-state-polls-pennsylvanias-a-dead-heat-as-harris-leads-michigan-trump-takes-arizona/',
 'http://www.forbes.com/sites/johngoodman/2024/09/19/should-the-government-guarantee-everyone-a-minimum-income/',
 'http://www.forbes.com/sites/howardgleckman/2024/09/19/exempting-tips-from-federal-income-tax-would-benefit-very-few-workers/',
 'http://www.forbes.com/sites/saradorn/2024/09/19/government-shutdown-looming-next-month-heres-what-that-means/',
 'http://www.forbes.com/sites/johannacostigan/2024/09/19/china-week-tar-chinese-firms-fails-to-boost-americas/',
 'http://www.forbes.com/sites/tylerroush/2024/09/19/secret-service-investigating-elon-musks-x-post-about-assassination-threats-against-biden-and-harris-report-says/',
 'http://www.forbes.com/sites/joshuastein/2024/09/19/some-suggestions-to-solve-the-ho

In [None]:
#AP News Article Link Scraper
page = requests.get('https://apnews.com/politics')
soup = BeautifulSoup(page.text, 'html.parser')
allLinks = []
link = "https://apnews.com/politics"
for link in soup.find_all('a'):
    allLinks.append(link.get('href'))
allLinks = filter(None, allLinks)
relevantLinks = [i for i in allLinks if '/article' in i]
APNews_links = set(relevantLinks)
APNews_links = list(APNews_links)

In [None]:
APNews_links

['https://apnews.com/article/iran-fbi-hacking-trump-election-interference-2020-3631e1832a8edb549d53126585503f32',
 'https://apnews.com/article/biden-electric-vehicles-tax-credits-charging-stations-76606327bb1317cf819c5257336b3eb9',
 'https://apnews.com/article/evangelicals-harris-trump-christians-vote-e1f1c5a0806cf3d88e95ff4bdc1fc4db',
 'https://apnews.com/article/jimmy-carter-dayton-peace-prize-holbrooke-e4d9bd1d81d1ecdac9b5ea20eec59e33',
 'https://apnews.com/article/ai-safety-summit-san-francisco-biden-raimondo-d52c31fb1e37508a1d2e78b5cfa5a8e0',
 'https://apnews.com/article/oregon-semiconductor-industry-governor-land-89181a47be64e084c86d6d106229f373',
 'https://apnews.com/article/evangelicals-harris-trump-christians-vote-9d5cb379dc3c2fdb3f4954c556a29ec5',
 'https://apnews.com/article/south-dakota-attorney-general-crash-c70eda77d3e41aae0c6608f5c5b28264',
 'https://apnews.com/article/homelessness-vermont-motels-31da28d4e53a4157005056894a34c07d',
 'https://apnews.com/article/election-20

In [None]:
#BBC Article Link Scraper
#via U.S. Politics page
page = requests.get('https://www.bbc.com/news/topics/cwnpxwzd269t')
soup = BeautifulSoup(page.text, 'html.parser')
allLinks = []
link = "https://www.bbc.com/news/topics/cwnpxwzd269t"
for link in soup.find_all('a'):
    allLinks.append(link.get('href'))
allLinks = filter(None, allLinks)
linkEnds = [i for i in allLinks if '/article' in i]
result = ['https://www.bbc.com' + i for i in linkEnds]
BBC_links = set(result)
BBC_links = list(BBC_links)

In [None]:
BBC_links

['https://www.bbc.com/news/articles/c77l28myezko',
 'https://www.bbc.com/news/articles/cj4x71znwxdo',
 'https://www.bbc.com/news/articles/cm2yyr7283vo',
 'https://www.bbc.com/news/articles/cd6qvg78z8no',
 'https://www.bbc.com/news/articles/c4gqgg4zdzlo',
 'https://www.bbc.com/news/articles/c62r2zzgz4jo',
 'https://www.bbc.com/news/articles/cn8yyy0r7epo',
 'https://www.bbc.com/news/articles/c05j5ezy6nro']

In [None]:
#Washington Post Article Link Scraper - in progress
headers = {"user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/113.0.0.0 Safari/537.36 Edg/113.0.1774.35"}
#Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/113.0.0.0 Safari/537.36 Edg/113.0.1774.35
page = requests.get('https://www.washingtonpost.com/politics/',headers=headers)
# soup = BeautifulSoup(page.text, 'html.parser')


# allLinks = []
# link = "https://www.washingtonpost.com/politics/"
# for link in soup.find_all('a'):
#     allLinks.append(link.get('href'))
# allLinks = filter(None, allLinks)
# linkEnds = [i for i in allLinks]
# print(linkEnds)

# linkEnds = [i for i in allLinks if i.startswith('/2')]
# result = ['https://www.cnn.com' + i for i in linkEnds]
# WP_links = set(result)
# CNN_links = list(CNN_links)

#RSS: https://feeds.washingtonpost.com/rss/rss_the-fix?itid=lk_inline_manual_5

KeyboardInterrupt: 

In [None]:
#CNBC Article Link Scraper
url = 'https://search.cnbc.com/rs/search/combinedcms/view.xml?partnerId=wrss01&id=10000113'
html_page = requests.get(url)
soup = BeautifulSoup(html_page.text, "lxml")
links = []
for item in soup.find_all("item"):
  link= str(item)
  i = link.find("<link/>")
  j = link.find("<guid")
  x = link[i+7:j]
  #removes white space at end of links
  y = x.split('\n', 1)[0].strip()
  links.append(y)
CNBC_links = links
CNBC_links

  soup = BeautifulSoup(html_page.text, "lxml")


['https://www.cnbc.com/2024/09/19/watch-biden-speaks-at-the-economic-club-of-washington-dc.html',
 'https://www.cnbc.com/2024/09/19/lebanese-ministers-warn-of-a-dangerous-next-48-hours-after-pager-attacks.html',
 'https://www.cnbc.com/2024/09/19/biden-student-debt-forgiveness-ruling.html',
 'https://www.cnbc.com/2024/09/19/djt-trump-media-stock-lockup.html',
 'https://www.cnbc.com/2024/09/19/5-things-to-know-before-the-stock-market-opens-thursday-september-19.html',
 'https://www.cnbc.com/2024/09/19/ray-dalio-calls-upcoming-us-election-the-most-consequential-of-his-lifetime.html',
 'https://www.cnbc.com/2024/09/18/djt-trump-media-slips-merger-lockup.html',
 'https://www.cnbc.com/2024/09/18/harvey-weinstein-hit-with-new-sex-crime-charge-in-new-york.html',
 'https://www.cnbc.com/2024/09/18/fed-chair-powell-downplays-rate-cut-trump-harris-election-impact.html',
 'https://www.cnbc.com/2024/09/18/teamsters-no-endorsement-harris-trump-presidential-election.html',
 'https://www.cnbc.com/2024/

In [None]:
#CBS News Article Link Scraper
url = 'https://www.cbsnews.com/latest/rss/politics'
html_page = requests.get(url)
soup = BeautifulSoup(html_page.text, "lxml")
links = []
for item in soup.find_all("item"):
  link= str(item)
  i = link.find("<link/>")
  j = link.find("<guid")
  x = link[i+7:j]
  y = x.split('\n', 1)[0]
  links.append(y)
CBS_links = links
CBS_links

  soup = BeautifulSoup(html_page.text, "lxml")


['https://www.cbsnews.com/news/lindsey-graham-nebraska-electoral-college-vote-kamala-harris/',
 'https://www.cbsnews.com/news/amazon-facebook-youtube-federal-trade-commission-privacy-children/',
 'https://www.cbsnews.com/news/zelenskyy-white-house-meeting-biden-harris/',
 'https://www.cbsnews.com/news/uncommitted-movement-pro-palestinian-kamala-harris/',
 'https://www.cbsnews.com/news/supreme-court-threats-panos-anastasiou-alaska/',
 'https://www.cbsnews.com/news/teamsters-2024-no-endorsement/',
 'https://www.cbsnews.com/news/lamonica-mciver-wins-new-jersey-house-election/',
 'https://www.cbsnews.com/sanfrancisco/news/newly-naturalized-latina-us-citizen-ready-vote-november-election/',
 'https://www.cbsnews.com/news/nasas-hidden-figures-congressional-gold-medal-ceremony/',
 'https://www.cbsnews.com/news/iran-hackers-stolen-trump-campaign-info-biden-campaign-fbi-2024-election/',
 'https://www.cbsnews.com/video/trump-security-bolstered-following-apparent-assassination-attempt/',
 'https:/

In [None]:
#ABC News Article Link Scraper - in progress

headers = {"user-agent": "Mozilla/5.0"}
#Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/113.0.0.0 Safari/537.36 Edg/113.0.1774.35
html_page = requests.get('https://abcnews.go.com/abcnews/politicsheadlines',headers=headers)
#lxml-xml needed so it can parse the CDATA (where the links are) ? jk
soup = BeautifulSoup(html_page.text, "xml")
links = []
for item in soup.find_all("item"):
  link= str(item)
  i = link.find("CDATA")
  j = link.find("<guid")
  x = link[i+7:j]
  y = x.split('\n', 1)[0]
  links.append(y)

ABC_links = links
ABC_links

['<media:thumbnail height="288" url="https://s.abcnews.com/images/Politics/wirestory_2d29668ace1ce239b7b6b209409bb6f5_4x3_384.jpg" width="384"/><media:thumbnail height="108" url="https://s.abcnews.com/images/Politics/wirestory_2d29668ace1ce239b7b6b209409bb6f5_4x3_144.jpg" width="144"/><media:thumbnail height="288" url="https://s.abcnews.com/images/Politics/wirestory_2d29668ace1ce239b7b6b209409bb6f5_4x3_384.jpg" width="384"/><media:thumbnail height="135" url="https://s.abcnews.com/images/Politics/wirestory_2d29668ace1ce239b7b6b209409bb6f5_16x9_240.jpg" width="240"/><media:thumbnail height="456" url="https://s.abcnews.com/images/Politics/wirestory_2d29668ace1ce239b7b6b209409bb6f5_4x3_608.jpg" width="608"/><media:thumbnail height="558" url="https://s.abcnews.com/images/Politics/wirestory_2d29668ace1ce239b7b6b209409bb6f5_16x9_992.jpg" width="992"/><media:thumbnail height="900" url="https://s.abcnews.com/images/Politics/wirestory_2d29668ace1ce239b7b6b209409bb6f5_16x9_1600.jpg" width="1600"/

In [None]:
#Extract article text from links for all news sources
#Use this to text article link efficacy?

#CNBC_links includes ad text in the beginning(?)
url = CBS_links[0]
article_text = scrape_article_text(url)
print(article_text)

Watch CBS News 
Updated on:  September 18, 2024 / 10:18 PM EDT
          / CBS/AP
         Iranian hackers sought to interest President Biden's campaign in information stolen from the rival campaign of former President Donald Trump, sending unsolicited emails to people connected to the Democratic president in an effort to interfere in the 2024 election, the FBI and other federal agencies said Wednesday. There's no evidence that any of the recipients responded, officials said, preventing the hacked information from surfacing in the final months of the closely contested election. The hackers sent emails in late June and early July to people who were associated with Mr. Biden's campaign before he dropped out. The emails "contained an excerpt taken from stolen, non-public material from former President Trump's campaign as text in the emails," according to a U.S. government statement. In late July, officials with the FBI, the Office of the Director of National Intelligence and the Departmen

In [None]:
#Article Scraper with User Agent - add to utility functions
def scrape_article_text_useragent(url,ua):
    # Send a GET request to the URL
    response = requests.get(url, headers=ua)
    # Check if the request was successful
    if response.status_code == 200:
        # Parse the HTML content
        soup = BeautifulSoup(response.text, 'html.parser')
        # Find elements containing text - this part may need customization
        # For example, assuming article text is within <p> tags
        article_text = ' '.join(p.get_text() for p in soup.find_all('p'))
        return article_text
    else:
        return "Error: Unable to fetch the webpage."

In [None]:
#Extract article text from links for news sources requiring user agent
url = Newsweek_links[0]
headers = {"user-agent": "Mozilla/5.0"}
#Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/113.0.0.0 Safari/537.36 Edg/113.0.1774.35
article_text = scrape_article_text_useragent(url, headers)
print(article_text)

While New York is seen as a solid blue state, the Republican party still enjoys widespread support in its rural and suburban areas, and this year the gap between the GOP and the Democrat Party could be narrowing. In the 2024 presidential election, Donald Trump has the potential to gain the largest share of the vote in the Empire State than any other Republican candidate since 1988, some polls have suggested. Trump's very unlikely to win the state and get its 28 electoral votes, but a good performance in the Democratic stronghold gives Republicans a new message to bring to voters and enthusiasm about Trump could help the GOP in down ballot races. Before his rally on Long Island Wednesday night, Trump told supporters in a Manhattan bar that he was going to "win New York!" – a message he repeated to the crowd later on. "When I told some people in Washington that I'm going up to New York, we're doing a campaign speech, they said, 'What do you mean New York? You can't ever ... Republicans c

In [None]:
#List of RSS feeds

#Fox News: Politics
foxPolRSS = f"https://moxie.foxnews.com/google-publisher/politics.xml"
#CNN News: Politics
CNNPolRSS = f"http://rss.cnn.com/rss/cnn_allpolitics.rss"
#NYT News: Politics
NYTPolRSS = f"https://rss.nytimes.com/services/xml/rss/nyt/Politics.xml"
#https://github.com/susannapaoli/web-scraper-nyt
#NY Post: Politics
NYPostPolRSS = f"https://nypost.com/politics/feed/"
#Daily Mail: U.S.
DailyMailUSRSS = f"https://www.dailymail.co.uk/ushome/index.rss"
#NBC News: Politics
NBCPolRSS = f"https://feeds.nbcnews.com/nbcnews/public/politics"
#CNBC: Politics
CNBCPolRSS = f"https://search.cnbc.com/rs/search/combinedcms/view.xml?partnerId=wrss01&id=10000113"
#CBS News: Politics
CBSPolRSS = f"https://www.cbsnews.com/latest/rss/politics"
#ABC News: Politics
ABCPolRSS = f"https://abcnews.go.com/abcnews/politicsheadlines"

In [None]:
api_token_hug =


# Following URL is the URL of the LLM being utilized from HuggingFace
API_URL = "https://api-inference.huggingface.co/models/google/flan-t5-xxl"
headers = {"Authorization": f"Bearer {api_token_hug}"}

# Instructions sent to the LLM in front of article text
system_input = "Analyze the text content and assign a label from {left, right, center, uncertain}. In this context, “left” indicates a left-leaning article, “right” signifies a right-leaning article, “center” implies no obvious political leaning, and “uncertain” denotes that the political orientation could not be determined. Please provide your analysis and output a new single line containing only the assigned label."

#reduce by 10% until it fits query
reduction_fraction = 0.10



result = reduce_and_query(article_text, system_input, reduction_fraction, headers)
print(result)

second_article = "https://www.cnn.com/2024/03/19/politics/texas-immigration-law-blocked-appeals/index.html"
second_result = reduce_and_query(second_article, system_input, reduction_fraction, headers)
print(second_result)

third_article = "https://www.cnn.com/videos/world/2024/03/20/israel-gaza-west-bank-settler-movement-clarissa-ward-pkg-intl-ldn-vpx.cnn"
third_result = reduce_and_query(third_article, system_input, reduction_fraction, headers)
print(third_result)

fourth_article = "https://www.cnn.com/2024/03/19/politics/trump-bond-deadline-panic/index.html"
fourth_result = reduce_and_query(fourth_article, system_input, reduction_fraction, headers)
print(fourth_result)

fifth_article = "https://www.foxnews.com/live-news/joe-biden-gop-impeachment-inquiry-hearing-hunter-biden-business-dealings"
fifth_result = reduce_and_query(fifth_article, system_input, reduction_fraction, headers)
print(fifth_result)

sixth_article = "https://www.msnbc.com/deadline-white-house/deadline-legal-blog/trump-supreme-court-immunity-appeal-delay-rcna144155"
sixth_result = reduce_and_query(sixth_article, system_input, reduction_fraction, headers)
print(sixth_result)

seventh_article = "https://www.msnbc.com/rachel-maddow-show/maddowblog/biden-white-house-reason-celebrate-falling-crime-rates-rcna144215"
seventh_result = reduce_and_query(seventh_article, system_input, reduction_fraction, headers)

[{'generated_text': 'left'}]
[{'generated_text': 'uncertain'}]
[{'generated_text': 'center'}]
[{'generated_text': 'center'}]
[{'generated_text': 'center'}]
[{'generated_text': 'center'}]
