# Demo 12

In [None]:
import re
from pathlib import Path
from urllib.parse import urljoin

import numpy as np
import pandas as pd

## Common Crawl

[comcrawl](https://github.com/michaelharms/comcrawl) is a Python utility for downloading Common Crawl data.

In [None]:
!pip install comcrawl

In [None]:
from comcrawl import IndexClient

# Build a client for the "2019-47" index only
# (presumably one Common Crawl snapshot — confirm against the comcrawl docs).
client = IndexClient(["2019-47"])

# Look up the URL pattern, then download what was found;
# the records end up in client.results.
client.search("reddit.com/r/MachineLearning/*")
client.download()

# Keep the "html" field of the first result for inspection.
first_page_html = client.results[0]["html"]

In [None]:
# How many result records the client holds.
len(client.results)

In [None]:
# Keys available on the first result record.
client.results[0].keys()

## Requests & Responses

We can make requests in Python using the [requests module](https://docs.python-requests.org/en/master/).

In [None]:
import requests

### Downloading text or csv files

In [None]:
# Fetch a plain-text file; displaying the Response object shows its status code.
response = requests.get("https://www.gutenberg.org/cache/epub/1404/pg1404.txt")
response

In [None]:
# Fetch a CSV file; this rebinds `response`, replacing the previous download.
# NOTE(review): the URL contains "eu//tools" (double slash) — presumably harmless, confirm.
response = requests.get("https://manifesto-project.wzb.eu//tools/documents/2020-2/coded/61320_199211.csv")
response

In [None]:
# Raw body of the response, as bytes.
response.content

**Other aspects of a response**

In [None]:
# URL the response was retrieved from.
response.url

In [None]:
# The `connection` attribute of the response object.
response.connection

In [None]:
# Cookies attached to the response.
response.cookies

**Bad Response**

In [None]:
# Request a URL that is expected to fail, to see what an unsuccessful response looks like.
bad_response = requests.get("http://www.scifiscripts.com/scripts/Ghostboogers.txt")
bad_response

### Downloading HTML

Oftentimes we can't just download a text file directly; instead, we need to extract the data from a website. Let's look at 
these [Weekly Presidential addresses](https://www.presidency.ucsb.edu/documents/app-categories/spoken-addresses-and-remarks/presidential/saturday-weekly-addresses-radio) as an example.

(back to slides)

## Weekly Saturday Presidential Addresses

### Inspecting elements

Let's find the HTML element that contains the text from Reagan's address in https://www.presidency.ucsb.edu/documents/radio-address-the-nation-solidarity-and-united-states-relations-with-poland

**Question:** What type of tag is the element that contains the speech?
    

**Question:** What is the id or class name of the tag?

**Question:** Is this the same for another speech? Look at https://www.presidency.ucsb.edu/documents/the-presidents-weekly-address-175 for example

### BeautifulSoup

In [None]:

from bs4 import BeautifulSoup

Home URL:
    <br>
https://www.presidency.ucsb.edu/documents/app-categories/spoken-addresses-and-remarks/presidential/saturday-weekly-addresses-radio

#### Download home page of Saturday Weekly Addresses

In [None]:
# Root of the site; links scraped from the listing page are combined with it later on.
domain_url = "https://www.presidency.ucsb.edu/"

In [None]:
# Listing page of the Saturday weekly addresses.
home_url = "https://www.presidency.ucsb.edu/documents/app-categories/spoken-addresses-and-remarks/presidential/saturday-weekly-addresses-radio"

# Download the listing page; displaying the Response shows its status code.
response = requests.get(home_url)
response

In [None]:
# Every attribute and method name of the response object, joined into one string.
" ".join(dir(response))

In [None]:
# Raw HTML of the listing page, as bytes.
response.content

#### Parse page with BeautifulSoup

In [None]:
# Parse the downloaded HTML with Python's built-in "html.parser" backend.
soup = BeautifulSoup(response.content, 'html.parser')
type(soup)

In [None]:
# A string passed as the second argument is matched against the CSS class,
# so this finds every <div class="view-content">.
view_content = soup.find_all("div", "view-content")
view_content

**Question:** What type does `find_all` return?

**Question:** What is the class for each row with links to the weekly address?
    
<details>
<summary>Solution</summary>
   "views-row"

</details>

In [None]:
# Inspect a single listing row (index 4, i.e. the fifth "views-row" div).
view_content[0].find_all("div", "views-row")[4]

**Question:** Let's look at a few of the `views-row` divs. Do we notice any patterns?

In [None]:
# The first listing row, for comparison.
view_content[0].find_all("div", "views-row")[0]

In [None]:
# The pattern: each row holds an <a> tag whose href contains "documents/" (shown in the cells below).

In [None]:
# Within the fifth row, keep only the <a> tags whose href matches the regex "documents/".
view_content[0].find_all("div", "views-row")[4].find_all('a', {'href': re.compile(r'documents/')})

In [None]:
# find_all returns a list-like result; [0] picks the first matching <a> tag.
view_content[0].find_all("div", "views-row")[4].find_all('a', {'href': re.compile(r'documents/')})[0]

In [None]:
# Indexing a tag with ['href'] reads that attribute, i.e. the link target.
view_content[0].find_all("div", "views-row")[4].find_all('a', {'href': re.compile(r'documents/')})[0]['href']

Extract the urls from the "Saturday Weekly Address Page"

In [None]:
# Anchors whose href contains "documents/" point at an individual address.
documents_href = re.compile(r'documents/')

address_urls = []
# Deliberately a plain loop: `row` stays defined afterwards and a later cell reuses it.
for row in view_content[0].find_all("div", "views-row"):
    matching_links = row.find_all('a', {'href': documents_href})
    address_urls.append(matching_links[0]['href'])
address_urls

In [None]:
# `row` is the loop variable left over from the previous cell, so this is the link of the last row.
row.find_all('a', {'href': re.compile(r'documents/')})[0]['href']

#### Parse Individual Address

In [None]:
# Download the first address. urljoin resolves the scraped href against the
# site root, so an href that already starts with "/" does not yield "//".
curr_response = requests.get(urljoin(domain_url, address_urls[0]))
curr_response

In [None]:
# Name the parser explicitly (same one used for the listing page) so the
# result does not depend on which parsers happen to be installed.
curr_soup = BeautifulSoup(curr_response.content, 'html.parser')
curr_soup

**Question:** What information should we extract from each address?

- Speech
- President
- Date

##### Extract Speech

**Question:** What class name is the div for the text of the speech?

(Answer below)

In [None]:
# Every <div> with class "field-docs-content".
curr_soup.find_all("div", "field-docs-content")

In [None]:
# .text of the first match gives its text content without the markup.
curr_soup.find_all("div", "field-docs-content")[0].text

##### Extract President Name

**Question:** What class name is the div for the name of the president?

(Answer below)

In [None]:
# Text of the first <div> with class "field-docs-person".
curr_soup.find_all("div", "field-docs-person")[0].text

In [None]:
# Narrow down to the <h3 class="diet-title"> inside that div to get just the name.
curr_soup.find_all("div", "field-docs-person")[0].find("h3", "diet-title").text

##### Extract Date

**Question:** What class name is the div for the date of the speech?

(Answer below)

In [None]:
# Text of the first <div> with class "field-docs-start-date-time".
curr_soup.find("div", "field-docs-start-date-time").text

In [None]:
# Same text with leading and trailing whitespace removed.
curr_soup.find("div", "field-docs-start-date-time").text.strip()

#### Loop through all addresses on the first page

In [None]:
%%time 

speeches, presidents, dates = [], [], []

for address_url in address_urls:
    print(address_url)
    curr_response = requests.get(domain_url + address_url)
    
    curr_soup = BeautifulSoup(curr_response.content)
    
    speeches.append(curr_soup.find_all("div", "field-docs-content")[0].text)
    presidents.append(curr_soup.find_all("div", "field-docs-person")[0].find("h3", "diet-title").text)
    dates.append(curr_soup.find("div", "field-docs-start-date-time").text.strip())

In [None]:
# Combine the three parallel lists into one table, one row per address.
first_page_columns = {
    "speech": speeches,
    "president": presidents,
    "date": dates,
}
pd.DataFrame(first_page_columns)

#### Loop through all addresses

**Question:** What on the original page can we use to get all other addresses?

https://www.presidency.ucsb.edu/documents/app-categories/spoken-addresses-and-remarks/presidential/saturday-weekly-addresses-radio

**Question:** How can we leverage this?

**Question:** What is another approach we can take via some hard coding?

Run the next cell and let's see if we can spot a pattern.

In [None]:
# Every <ul> with class "pagination" found in the parsed page.
soup.find_all("ul", "pagination")

**Question:** What pattern do we notice here?

Let's bring it all together

In [None]:
from tqdm import tqdm

# Throwaway workload whose only purpose is to show a tqdm progress bar advancing.
demo_steps = range(100000)
for idx in tqdm(demo_steps):
    np.arange(idx) ** 2

*Note: Run code and then discuss it. The code will take about 3 minutes to run*

In [None]:
%%time

speeches, presidents, dates = [], [], []

response = requests.get(home_url)
for idx in tqdm(range(163, 133, -1)): # for time sake let's change only do this for 30 pages
    home_url = "https://www.presidency.ucsb.edu/documents/app-categories/spoken-addresses-and-remarks/presidential/saturday-weekly-addresses-radio"
    response = requests.get(home_url + f"?page={idx}")
    
    soup = BeautifulSoup(response.content)
    
    view_content = soup.find_all("div", "view-content")
    address_urls = []
    for row in view_content[0].find_all("div", "views-row"):
        address_urls.append(row.find_all('a', {'href': re.compile(r'documents/')})[0]['href'])

        
    for address_url in address_urls:
        curr_response = requests.get(domain_url + address_url)
        
        curr_soup = BeautifulSoup(curr_response.content)


        speeches.append(curr_soup.find_all("div", "field-docs-content")[0].text)
        presidents.append(curr_soup.find_all("div", "field-docs-person")[0].find("h3", "diet-title").text)
        dates.append(curr_soup.find("div", "field-docs-start-date-time").text.strip())
    
    

In [None]:
# One row per scraped address; the columns come from the three parallel lists.
address_columns = {"speech": speeches, "president": presidents, "date": dates}
weekly_addresses_df = pd.DataFrame(address_columns)

weekly_addresses_df

Don't run the next cell: it downloads all the speeches and takes about 20 minutes to run.

In [None]:
%%time

speeches, presidents, dates = [], [], []

response = requests.get(home_url)
for idx in tqdm(range(163, 0, -1)): # for time sake let's change only do this for 30 pages
    home_url = "https://www.presidency.ucsb.edu/documents/app-categories/spoken-addresses-and-remarks/presidential/saturday-weekly-addresses-radio"
    response = requests.get(home_url + f"?page={idx}")
    
    soup = BeautifulSoup(response.content)
    
    view_content = soup.find_all("div", "view-content")
    address_urls = []
    for row in view_content[0].find_all("div", "views-row"):
        address_urls.append(row.find_all('a', {'href': re.compile(r'documents/')})[0]['href'])

        
    for address_url in address_urls:
        curr_response = requests.get(domain_url + address_url)
        
        curr_soup = BeautifulSoup(curr_response.content)


        speeches.append(curr_soup.find_all("div", "field-docs-content")[0].text)
        presidents.append(curr_soup.find_all("div", "field-docs-person")[0].find("h3", "diet-title").text)
        dates.append(curr_soup.find("div", "field-docs-start-date-time").text.strip())
    
weekly_addresses_df = pd.DataFrame(
              {"speech": speeches,
              "president": presidents,
              "date": dates})

weekly_addresses_df.to_csv("data/weekly_addresses.csv")  

In [None]:
# Distinct values in the "date" column.
weekly_addresses_df['date'].unique()

In [None]:
# (number of rows, number of columns)
weekly_addresses_df.shape

In [None]:
# Make sure the output folder exists so the export also works on a fresh checkout.
Path("data").mkdir(parents=True, exist_ok=True)
# index=False keeps the DataFrame's row index out of the CSV.
weekly_addresses_df.to_csv("data/weekly_addresses.csv", index=False)