# ADS 509 Politics Project: APIs and Web Scraping

# Importing Libraries

In [21]:
import os
import datetime
import re

# for the lyrics scrape section
import requests
import time
from bs4 import BeautifulSoup
from collections import defaultdict, Counter
import random

import shutil
from urllib.parse import urljoin
import nbconvert

---

# Politics Scrape

In [22]:
news_urls = {'CNN':"https://www.cnn.com/politics",
             'FOX':"https://www.foxnews.com/politics"} 
# we'll use this dictionary to hold both the organization name and the link to their news

## Part 1: Finding Links to Songs Lyrics

In [23]:
# Let's set up a dictionary of lists to hold our links
news_pages = defaultdict(list)

for org, news_page in news_urls.items() :
    # request the page and sleep
    r = requests.get(news_page)
    time.sleep(5 + 10*random.random())

    if r.status_code == 200:
        soup = BeautifulSoup(r.text, 'html.parser')
        for link in soup.find_all(href=re.compile("/politics/")): 
            # now extract the links to news pages from this page
            news_link = link.get('href')
            news_pages[org].append(news_link)

# References used in this section:
# https://www.crummy.com/software/BeautifulSoup/bs4/doc/
# https://www.geeksforgeeks.org/beautifulsoup-scraping-link-from-html/ 

Let's make sure we have enough lyrics pages to scrape. 

In [24]:
for org, lp in news_pages.items() :
    assert(len(set(lp)) > 20) 

In [25]:
# Let's see how long it's going to take to pull these lyrics if we're waiting `5 + 10*random.random()` seconds 
for org, links in news_pages.items() : 
    print(f"For {org}, we have {len(links)} articles.")
    print(f"The full pull for this news organization will take roughly {round(len(links)*10/3600,2)} hours.")

For CNN, we have 86 articles.
The full pull for this news organization will take roughly 0.24 hours.
For FOX, we have 72 articles.
The full pull for this news organization will take roughly 0.2 hours.


## Part 2: Pulling Articles

Now that we have the links to our article pages, let's go scrape them! Here are the steps for this part. 

1. Create an empty folder in our repo called "politics". 
1. Iterate over the organizations in `news_pages`. 
1. Create a subfolder in politics with the site's name. For instance, if the site was CNN you'd have `politics/CNN/` in your repo.
1. Iterate over the pages. 
1. Request the page and extract the articles from the returned HTML file using BeautifulSoup.
1. Use the function below, `generate_filename_from_url`, to create a filename based on the article page, then write the article to a text file with that name. 


In [26]:
def generate_filename_from_link(link) :
    
    if not link :
        return None
    
    # drop the http or https and the html
    name = link.replace("https","").replace("http","")
    name = link.replace(".html","")

    name = name.replace("/politics/","")
    
    # Replace useless chareacters with UNDERSCORE
    name = name.replace("://","").replace(".","_").replace("/","_")
    
    # tack on .txt
    name = name + ".txt"
    
    return(name)


In [27]:
# Make the politics folder here, deleting the old folder if one already exists.

if os.path.isdir("politics") : 
    shutil.rmtree("politics/")

os.mkdir("politics")

In [28]:
cnn_stub = "https://www.cnn.com"
fox_stub = "https://www.foxnews.com"

In [47]:
start = time.time()
total_pages = 0 

for org, links in news_pages.items() :

    # Build a subfolder for the artist
    site_folder = os.path.join("politics", org)
    os.makedirs(site_folder, exist_ok=True)

    # Iterate over the lyrics pages
    for link in links:
        total_pages += 1

        if org == "CNN" :
            article_url = urljoin(cnn_stub, link)

            if "/2024/" in article_url :
                r_news = requests.get(article_url)
                time.sleep(5 + 10 * random.random())

                if r_news.status_code == 200:
                    soup_news = BeautifulSoup(r_news.text, 'html.parser')
                    #title = soup_news.find('div', class_= "headline_wrapper").get_text()
                    title_element = soup_news.find('h1', class_="headline__text inline-placeholder", id="maincontent")
                    if title_element:
                        title = title_element.get_text()
                    else:
                        print(f"Warning: Title not found for {article_url}")
                        title = "Untitled"
                    news_paragraphs = soup_news.find_all('p', class_="paragraph inline-placeholder")
                    news = '\n'.join([p.get_text(separator='\n') for p in news_paragraphs])
                    #news = soup_news.find_all('div', class_=None, id=None).get_text(separator='\n')

                    # Write out the title, two returns ('\n'), and the lyrics. Use `generate_filename_from_url` to generate the filename. 
                    filename = generate_filename_from_link(link)
                    filepath = os.path.join(site_folder, filename)

                    with open(filepath, 'w', encoding='utf-8') as file:
                        file.write(title + '\n\n' + news)
        
        elif org == "FOX" :
            article_url = urljoin(fox_stub, link)
            if "/category/" not in article_url :
                r_news = requests.get(article_url)
                time.sleep(5 + 10 * random.random())

                if r_news.status_code == 200:
                    soup_news = BeautifulSoup(r_news.text, 'html.parser')
                    #title = soup_news.find('h1', class_= "headline speakable").get_text()
                    title_element = soup_news.find('h1', class_="headline speakable")
                    if title_element:
                        title = title_element.get_text()
                    else:
                        print(f"Warning: Title not found for {article_url}")
                        title = "Untitled"
                    #news = return_text_if_not_none(soup_news.find('div', class_= "article-content"))
                    #news = soup_news.find_all('div', class_= "article-content")
                    news_paragraphs = soup_news.find_all('p')
                    news = '\n'.join([p.get_text(separator='\n') for p in news_paragraphs])
                    
                    # Write out the title, two returns ('\n'), and the lyrics. Use `generate_filename_from_url` to generate the filename. 
                    filename = generate_filename_from_link(link)
                    filepath = os.path.join(site_folder, filename)
                    
                    with open(filepath, 'w', encoding='utf-8') as file:
                        file.write(title + '\n\n' + news)

In [48]:
# Find the total run time.
print(f"Total run time was {round((time.time() - start)/3600,2)} hours.")

Total run time was 0.32 hours.


---

# Evaluation

This assignment asks you to pull data by scraping www.AZLyrics.com.  After you have finished the above sections , run all the cells in this notebook. Print this to PDF and submit it, per the instructions.

In [49]:
# Simple word extractor from Peter Norvig: https://norvig.com/spell-correct.html
def words(text): 
    return re.findall(r'\w+', text.lower())

## Checking Lyrics 

The output from your lyrics scrape should be stored in files located in this path from the directory:
`/lyrics/[Artist Name]/[filename from URL]`. This code summarizes the information at a high level to help the instructor evaluate your work. 

In [50]:
site_folders = os.listdir("politics/")
site_folders = [f for f in site_folders if os.path.isdir("politics/" + f)]

for org in site_folders : 
    site_files = os.listdir("politics/" + org)
    site_files = [f for f in site_files if 'txt' in f or 'csv' in f or 'tsv' in f]

    print(f"For {org} we have {len(site_files)} files.")

    org_words = []

    for f_name in site_files : 
        with open("politics/" + org + "/" + f_name) as infile : 
            org_words.extend(words(infile.read()))

            
    print(f"For {org} we have roughly {len(org_words)} words, {len(set(org_words))} are unique.")


For FOX we have 27 files.
For FOX we have roughly 23169 words, 3388 are unique.
For CNN we have 52 files.
For CNN we have roughly 50605 words, 5826 are unique.
