In [1]:
#import libraries to parse the html and upload the results into a dataframe

from bs4 import BeautifulSoup
import requests
import pandas as pd

In [2]:
#note the starting url and set it to a variable so it can be used later

start_url = "https://www.fema.gov/disaster-reporter-data"

#request the page and keep the response text in a variable
#the timeout stops a stalled connection from hanging the notebook, and
#raise_for_status fails loudly on an HTTP error instead of parsing an error page

response = requests.get(start_url, timeout=30)
response.raise_for_status()
page_source = response.text

In [3]:
#parse the text from the request into a BeautifulSoup tree (using the lxml parser) so it can be searched

soup = BeautifulSoup(page_source, 'lxml')

In [4]:
#navigate to the next link at the bottom of each FEMA page and set it to a variable
#make sure the navigation works
#next_link ends up as None when the page has no element with class "next"
#or that element holds no link, instead of raising an AttributeError

next_item = soup.find(attrs={'class': 'next'})
next_anchor = next_item.find('a') if next_item is not None else None
next_link = next_anchor.get('href') if next_anchor is not None else None
next_link

'/disaster-reporter-data?page=1'

In [5]:
#make a list of all the columns in the table.
#we'll use this later to head the data frame

header = ['Disaster Reports', 'Headlines', 'City', 'State',
          'Disaster Type', 'Photo Date', 'Report Date']

#collect every row in the body of the first table on the page
#(find_all is the current name of the legacy findAll alias)

rows = soup.find('table').find('tbody').find_all('tr')

#print the cells of every row to check that the table was located correctly

for row in rows:
    print(row.find_all('td'))

[<td><img alt="Photo: 1615 hwy 9" src="https://www.fema.gov/media-library-data/20180924-2005-19716-1696/disaster_reporter_20180924160539_search_preview.jpg"/><br/><a href="https://www.fema.gov/media-library-data/20180924-2005-19716-1696/disaster_reporter_20180924160539.jpg" title="View full size of - 1615 hwy 9">View Full Size</a><br/><a href="mailto:FEMA-New-Media@fema.dhs.gov?subject=Disaster%20Reporter:%20Review%20Image%20170920" title="Report this image - 1615 hwy 9">Report this Image</a></td>, <td>1615 hwy 9</td>, <td>Longs</td>, <td>SC</td>, <td>Floods</td>, <td>Sat, 09/22/2018 - 13:54</td>, <td class="active">Mon, 09/24/2018 - 16:05</td>]
[<td><img alt="Photo: Southport bc" src="https://www.fema.gov/media-library-data/20180924-1838-18167-5375/disaster_reporter_20180924143810_search_preview.jpg"/><br/><a href="https://www.fema.gov/media-library-data/20180924-1838-18167-5375/disaster_reporter_20180924143810.jpg" title="View full size of - Southport bc">View Full Size</a><br/><a hr

In [6]:
#define a function that does everything described above and add the returned items to empty lists along the way. 

def parse_page(source):
    """Extract the cell text of every body row of the first table in a page.

    Parameters
    ----------
    source
        HTML markup of one page, as accepted by ``BeautifulSoup``.

    Returns
    -------
    list of list of str
        One inner list per ``<tr>`` found under the table's ``<tbody>``,
        holding the ``.text`` of each ``<td>`` of that row. An empty list is
        returned when the page has no ``<table>`` or the table has no
        ``<tbody>``, so a page without results does not raise.
    """
    soup = BeautifulSoup(source, "lxml")

    # find() returns None when nothing matches; guard before chaining
    table = soup.find("table")
    if table is None:
        return []

    body = table.find("tbody")
    if body is None:
        return []

    return [
        [cell.text for cell in row.find_all("td")]
        for row in body.find_all("tr")
    ]

#again, use the code from above in a function to go to the next page 

def fetch_page(url, timeout=30):
    """Download a page and return its body as text.

    Parameters
    ----------
    url : str
        URL to request.
    timeout : float or tuple, optional
        Seconds to wait for the server, passed straight to ``requests.get``.
        Without a timeout a stalled connection would block indefinitely.

    Returns
    -------
    str
        The ``text`` attribute of the response.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status (via ``raise_for_status``).
    """
    response = requests.get(url, timeout=timeout)
    # fail here instead of handing an error page to the parser
    response.raise_for_status()
    return response.text

In [7]:
#put all the functions together to make a semi-automatic scraping tool

def find_next_link(source):
    """Return the href of the link inside the element with class "next", or None.

    None is returned when `source` has no element with class "next" or that
    element contains no link, which is used below as the signal to stop paging.
    """
    next_item = BeautifulSoup(source, 'lxml').find(attrs={'class': 'next'})
    next_anchor = next_item.find('a') if next_item is not None else None
    return next_anchor.get('href') if next_anchor is not None else None

start_url = "https://www.fema.gov/disaster-reporter-data"
page_source = fetch_page(start_url)

#get the requested data, parse through it
all_data = parse_page(page_source)

#read the next link from the page that was just fetched
#(not from a soup object left over from an earlier cell)
next_link = find_next_link(page_source)

#using a while loop to scrape - so we need to set a limit otherwise all the scraping happens at once.
#set maximum number of requests and responses
max_fetches = 50
current_page = 0

#stop when the fetch limit is reached or the current page has no next link
while next_link and current_page < max_fetches:
    #provide a verbose update of scraping progress
    print("Fetching next page...{}".format(current_page))
    #create the url to move forward to the next page
    next_url = 'https://www.fema.gov{}'.format(next_link)
    #make a request for the next page
    next_page_source = fetch_page(next_url)
    #parse the next page
    page_data = parse_page(next_page_source)
    #add the parsed data to the running list
    all_data.extend(page_data)
    #advance: without this every iteration would request the same page again
    next_link = find_next_link(next_page_source)
    #counter to keep track of num of fetches
    current_page += 1

Fetching next page...0
Fetching next page...1
Fetching next page...2
Fetching next page...3
Fetching next page...4
Fetching next page...5
Fetching next page...6
Fetching next page...7
Fetching next page...8
Fetching next page...9
Fetching next page...10
Fetching next page...11
Fetching next page...12
Fetching next page...13
Fetching next page...14
Fetching next page...15
Fetching next page...16
Fetching next page...17
Fetching next page...18
Fetching next page...19
Fetching next page...20
Fetching next page...21
Fetching next page...22
Fetching next page...23
Fetching next page...24
Fetching next page...25
Fetching next page...26
Fetching next page...27
Fetching next page...28
Fetching next page...29
Fetching next page...30
Fetching next page...31
Fetching next page...32
Fetching next page...33
Fetching next page...34
Fetching next page...35
Fetching next page...36
Fetching next page...37
Fetching next page...38
Fetching next page...39
Fetching next page...40
Fetching next page...41
Fe

In [11]:
#put the list into a data frame and use the column names established at the beginning.

df = pd.DataFrame(all_data, columns = header)

#preview only the first rows; displaying hundreds of rows floods the notebook output
df.head(10)

Unnamed: 0,Disaster Reports,Headlines,City,State,Disaster Type,Photo Date,Report Date
0,View Full SizeReport this Image,1615 hwy 9,Longs,SC,Floods,"Sat, 09/22/2018 - 13:54","Mon, 09/24/2018 - 16:05"
1,View Full SizeReport this Image,Southport bc,,,Hurricanes,"Wed, 09/19/2018 - 16:19","Mon, 09/24/2018 - 14:38"
2,View Full SizeReport this Image,Kathy Tripp's crushed barn,Newport,NC,Hurricanes,"Sat, 09/15/2018 - 05:14","Mon, 09/24/2018 - 12:17"
3,View Full SizeReport this Image,Conway sc,,,Hurricanes,"Sat, 09/15/2018 - 10:34","Sun, 09/23/2018 - 08:40"
4,View Full SizeReport this Image,My yard,,,Hurricanes,"Thu, 09/20/2018 - 04:53","Sat, 09/22/2018 - 17:38"
5,View Full SizeReport this Image,CAP PHOTO HMW 8’,,,Hurricanes,"Sat, 09/22/2018 - 16:32","Sat, 09/22/2018 - 16:47"
6,View Full SizeReport this Image,"Target 14, Odell ventures landing, Johnsonvill...",,,Floods,"Sat, 09/22/2018 - 15:43","Sat, 09/22/2018 - 15:44"
7,View Full SizeReport this Image,Our house,,,Floods,"Mon, 09/17/2018 - 05:47","Sat, 09/22/2018 - 15:41"
8,View Full SizeReport this Image,Drive way,,,Floods,"Thu, 09/20/2018 - 09:00","Sat, 09/22/2018 - 15:39"
9,View Full SizeReport this Image,Shelter,Lumberton,NC,Hurricanes,"Sun, 09/16/2018 - 15:51","Tue, 09/18/2018 - 19:41"


In [9]:
#export the data frame to a csv file named fema.csv (relative path, so it lands in the working directory)
#no index argument is passed, so to_csv also writes the row index as a first column

df.to_csv('fema.csv', encoding = 'UTF8')

In [10]:
#check the size of the data frame as (rows, columns)
df.shape

(1020, 7)