In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from bs4 import BeautifulSoup
import requests

## Chicago Tribune Covid Timeline Scraping (Illinois, World, and US)

In [74]:
# Chicago Tribune timeline "blurb" page; the query string carries the parent
# article's title and URL (parentTitle / parentUrl).
url = 'https://graphics.chicagotribune.com/coronavirus-pandemic-timeline/blurb.html?initialWidth=331&childId=timeline-blurb&parentTitle=COVID-19%20timeline%3A%20How%20the%20coronavirus%20pandemic%20unfolded%20-%20Chicago%20Tribune&parentUrl=https%3A%2F%2Fwww.chicagotribune.com%2Fcoronavirus%2Fct-viz-coronavirus-timeline-20200507-uvrzs32nljabrpn6vkzq7m2fpq-story.html'

In [75]:
# Fetch the Chicago Tribune timeline page.
# The timeout keeps a stalled connection from hanging the kernel indefinitely,
# and raise_for_status() stops the run on a 4xx/5xx response instead of letting
# an error page be parsed as if it were the timeline.
res = requests.get(url, timeout=30)
res.raise_for_status()

In [76]:
# Show the HTTP status code of the response as the cell output.
res.status_code

200

In [77]:
# Parse the raw response bytes with the lxml parser.
soup = BeautifulSoup(res.content, 'lxml')

In [81]:
# Empty frame with the four columns every per-source log uses; the columns are
# filled one at a time from the lists built by the scraping loop.
event_log = pd.DataFrame(columns=['region', 'event', 'date', 'links'])

In [276]:
# Build four parallel lists (date, region, event text, first link) with one
# entry per item <div> found inside each timeline "event" block.

# `div_event` used to come from a cell that is no longer in the notebook, so a
# fresh "Restart & Run All" stopped here with a NameError. The selector follows
# the loop's original note ("All the div class='event'").
# NOTE(review): confirm the class name against the live page markup.
div_event = soup.find_all('div', {'class': 'event'})

date_list = []
event_list = []
region_list = []
links_list = []
for event_block in div_event: # All the div class='event'
    # Every item div inside one block shares that block's date, so look the
    # date up once and repeat it instead of re-searching per div.
    item_divs = event_block.find_all('div') # Number of divs in each event
    if item_divs:
        date_text = event_block.find('p', {'class' : "event__time"}).text
        date_list.extend([date_text] * len(item_divs))

    # Thank you John Clements on SO: https://stackoverflow.com/questions/
    # 14257717/python-beautifulsoup-wildcard-attribute-id-search
    for region in event_block.find_all('p', {'class' : lambda l: l and l.startswith('subcategory')}):
        region_list.append(region.text)

    for event in item_divs:
        # Thanks Cyrbil on SO: https://stackoverflow.com/questions/34111426/
        # how-do-i-pull-p-tags-without-attributes-using-beautiful-soup
        # Some divs hold several attribute-less <p> tags that are not
        # formatted the same way, so their texts are joined into one string.
        paragraphs = event.find_all(lambda tag: tag.name == 'p' and not tag.attrs)
        event_list.append(''.join(paragraph.text for paragraph in paragraphs))

        # Keep the href of the first link in the item, or NaN when there is none.
        # From https://pythonspot.com/extract-links-from-webpage-beautifulsoup/
        first_link = event.find('a')
        links_list.append(first_link.get('href') if first_link is not None else np.nan)

# Dates/events/links are counted per item div, regions per "subcategory" <p>.
# If those counts ever differ the columns would be misaligned, so fail here
# with a clear message rather than later during column assignment.
if not (len(date_list) == len(region_list) == len(event_list) == len(links_list)):
    raise ValueError(
        'Scraped lists are misaligned: '
        f'{len(date_list)} dates, {len(region_list)} regions, '
        f'{len(event_list)} events, {len(links_list)} links'
    )
        

    
    

In [277]:
# Fill the 'date' column from the scraped list. event_log starts out empty, so
# this first assignment also determines how many rows the frame has.
event_log['date'] = date_list

In [278]:
# Fill the 'region' column; the list must have as many entries as there are
# rows, otherwise pandas raises a length-mismatch ValueError.
event_log['region'] = region_list

In [279]:
# Fill the 'event' column with the joined paragraph text of each item.
event_log['event'] = event_list

In [280]:
# Fill the 'links' column: the first href per item, or NaN where none was found.
event_log['links'] = links_list

## Texas Covid Timeline Scraping

In [244]:
# Texas Tribune coronavirus timeline page. This rebinds `url`, replacing the
# Chicago Tribune address used earlier in the notebook.
url = 'https://graphics.texastribune.org/graphics/coronavirus-timeline-2020-06/'

In [245]:
# Fetch the Texas Tribune timeline page.
# The timeout keeps a stalled connection from hanging the kernel indefinitely,
# and raise_for_status() stops the run on a 4xx/5xx response instead of letting
# an error page be parsed as if it were the timeline.
res = requests.get(url, timeout=30)
res.raise_for_status()

In [246]:
# Show the HTTP status code of the response as the cell output.
res.status_code

200

In [247]:
# Parse the raw response bytes with the lxml parser. This rebinds `soup`, so
# the previously parsed page is no longer reachable under that name.
soup = BeautifulSoup(res.content, 'lxml')

In [250]:
# Every <div class="story-item"> in the page; the loop further down turns each
# one into a single row.
div_stories = soup.find_all('div', {'class': 'story-item'})

In [273]:
# Inspect the prose paragraphs of the first story to see the markup the
# extraction loop has to handle (the output shows an empty leading <p> and a
# paragraph with an inline link).
div_stories[0].find_all('p', {'class': 'story-prose'})

[<p class="story-prose"></p>,
 <p class="story-prose">The novel coronavirus made news in Texas just two months into 2020. In February, San Antonio’s Lackland Air Force Base began housing people who had been overseas and exposed to the new coronavirus.</p>,
 <p class="story-prose">Weeks later, San Antonio Mayor Ron Nirenberg declares a public health emergency over COVID-19 and demands that 120 people who were expected to be released from a two-week quarantine at the base be held longer for additional medical testing. He also <a href="https://www.texastribune.org/2020/03/02/texas-gov-greg-abbott-asks-cdc-fix-coronavirus-quarantine-protocols/">bans quarantine evacuees</a> from entering his city.</p>]

In [301]:
# Start from empty accumulators so values from an earlier section are not
# carried over. region_list is reset as well, although this loop never fills it.
date_list, event_list, region_list, links_list = [], [], [], []

for story in div_stories:
    # The story's date is the text of its <span class="story-time">.
    date_list.append(story.find('span', {'class': 'story-time'}).text)

    prose_tags = story.find_all('p', {'class': 'story-prose'})

    # All prose paragraphs of one story become a single space-joined string.
    event_list.append(' '.join(tag.text for tag in prose_tags))

    # One list of hrefs per story: the first link of every paragraph that
    # contains a link; paragraphs without one contribute nothing.
    links_list.append(
        [tag.find('a').get('href') for tag in prose_tags if tag.find('a')]
    )
    

In [262]:
# Empty frame for the Texas rows, using the same four columns as event_log.
tx_event_log = pd.DataFrame(columns=['region', 'event', 'date', 'links'])

In [263]:
# Fill the 'date' column from the story dates. The frame starts out empty, so
# this first assignment also determines how many rows it has.
tx_event_log['date'] = date_list

In [283]:
# Every row from this source gets the same constant region label.
tx_event_log['region'] = 'TEXAS'

In [298]:
# Fill the 'event' column with the space-joined prose of each story.
tx_event_log['event'] = event_list

In [302]:
# Fill the 'links' column. Each entry here is a list of hrefs (possibly empty),
# unlike event_log['links'], which holds a single href or NaN per row.
tx_event_log['links'] = links_list

## Atlanta Covid Response Timeline

In [304]:
# City of Atlanta COVID-19 response page. This rebinds `url` again for the
# third source.
url = 'https://www.atlantaga.gov/government/mayor-s-office/city-of-atlanta-covid-19-response'

In [305]:
# Fetch the City of Atlanta response page.
# The timeout keeps a stalled connection from hanging the kernel indefinitely,
# and raise_for_status() stops the run on a 4xx/5xx response instead of letting
# an error page be parsed as if it were the timeline.
res = requests.get(url, timeout=30)
res.raise_for_status()

In [306]:
# Show the HTTP status code of the response as the cell output.
res.status_code

200

In [307]:
# Parse the raw response bytes with the lxml parser (rebinds `soup` once more).
soup = BeautifulSoup(res.content, 'lxml')

In [310]:
# Select the single <div> with this exact id; find() returns None when no such
# element exists.
# NOTE(review): the id looks machine-generated and may change if the page is
# rebuilt -- confirm before relying on it.
div = soup.find('div', {'id': 'widget_4_9612_10151'})

In [332]:
# All <li> elements under the first <ul> inside that div; each one is handled
# as one timeline entry by the loop that follows.
li = div.find('ul').find_all('li')

In [374]:
# Fresh accumulators for the Atlanta entries.
date_list, event_list = [], []

for item in li:
    bold_tag = item.find('b')
    if bold_tag:
        # Most entries keep their date inside a <b> tag.
        date_text = bold_tag.text
    else:
        # The page's markup changes partway through: other entries use
        # <strong>, whose text is trimmed of whitespace and surrounding dashes.
        date_text = item.find('strong').text.strip().strip('-').strip()
    date_list.append(date_text)

    # The first three whitespace-separated words of the entry are the date,
    # so the event text is everything after them.
    words = item.text.split()
    event_list.append(' '.join(words[3:]))

In [355]:
# Empty frame for the Atlanta rows, using the same four columns as the other
# logs. No visible cell assigns ga_event_log['links'], so that column is left
# unfilled for this source.
ga_event_log = pd.DataFrame(columns=['region', 'event', 'date', 'links'])

In [356]:
# Fill the 'date' column from the list entries. The frame starts out empty, so
# this first assignment also determines how many rows it has.
ga_event_log['date'] = date_list

In [358]:
# Every row from this source gets the same constant region label.
ga_event_log['region'] = 'GEORGIA'

In [378]:
# Fill the 'event' column with each entry's text minus its leading date words.
ga_event_log['event'] = event_list

## Joining all the dataframes

In [381]:
# Stack the Chicago Tribune, Texas and Atlanta logs row-wise into one frame.
# ignore_index=True gives the result a fresh 0..n-1 index; without it each
# source keeps its own labels, so the same label would appear up to three times
# and label-based lookups would hit several rows at once.
master_event_log = pd.concat(
    [event_log, tx_event_log, ga_event_log],
    axis=0,
    ignore_index=True,
)

In [387]:
# Some sources give dates without a year. Any date string that does not already
# end in '2020' gets ' 2020' appended; rows that already end in it are left
# untouched, so re-running the cell changes nothing.
needs_year = ~master_event_log['date'].str.endswith('2020')
master_event_log.loc[needs_year, 'date'] = (
    master_event_log.loc[needs_year, 'date'] + ' 2020'
)

In [1]:
# Convert the date strings to datetimes.
# NOTE(review): the stored output under this cell is a NameError because it was
# last executed as In [1], before the import cell had run in that kernel; the
# statement itself only needs `pd` and `master_event_log` to be in scope.
# NOTE(review): the three sources may format dates differently -- confirm that
# to_datetime parses all of them with the installed pandas version.
master_event_log['date'] = pd.to_datetime(master_event_log['date'])

NameError: name 'pd' is not defined

In [3]:
# Reload the log from disk, replacing the in-memory frame assembled above.
# NOTE(review): no earlier cell in this notebook writes this file (the only
# to_csv call comes after this read), so on a fresh run this loads whatever a
# previous session saved and fails if the file does not exist yet.
# NOTE(review): presumably this round trip is what turns empty event strings
# into missing values for the filter that follows -- confirm before removing it.
master_event_log = pd.read_csv('./data/master_event_log.csv')

In [10]:
# Keep only the rows that actually have event text; rows whose 'event' value is
# missing are discarded and the surviving rows keep their index labels.
master_event_log = master_event_log.dropna(subset=['event'])

In [12]:
# Write the filtered log back to the same CSV path it was read from, omitting
# the index column.
master_event_log.to_csv('./data/master_event_log.csv', index=False)