## Extracting the earthquake data using Beautiful Soup and exporting it to CSV
### Data cleaning tasks include:
1. Replace empty strings with NaN
2. Remove the footnotes from the 'Other Source Deaths' column
3. Convert Magnitude to a numeric type
4. Create a new column ('deaths') that evaluates the four total-death columns ('PDE Total Deaths', 'Utsu Total Deaths', 'EM-DAT Total Deaths', and 'Other Source Deaths') and populates the new column with the highest value.
5. Explore the data in terms of when and where earthquakes occurred and how severe they were (magnitude, deaths, secondary effects).

In [15]:
import pandas as pd
import requests
from bs4 import BeautifulSoup as BS
from datetime import datetime
import numpy as np
import re
import matplotlib
import matplotlib.pyplot as plt
import plotly.plotly as py
from shapely.geometry import Point
import geopandas as gpd
import matplotlib.pyplot as plt
import folium
from folium.plugins import MarkerCluster
from matplotlib import style
import seaborn as sns
# Activate Seaborn
sns.set()
%matplotlib inline
# Treemap
import squarify 

#### The [Requests](http://docs.python-requests.org/en/master/user/quickstart/) package makes working with HTTP easier

In [16]:
# Fetch the Wikipedia page listing deadly earthquakes since 1900.
# The timeout keeps the cell from hanging indefinitely on a stalled connection,
# and raise_for_status() fails loudly on an HTTP error (4xx/5xx) so that an
# error page is never parsed as if it were the earthquake table.
response = requests.get(
    'https://en.wikipedia.org/wiki/List_of_deadly_earthquakes_since_1900',
    timeout=30,
)
response.raise_for_status()

# Decoded HTML body of the response, kept as a str for the parser
result_text = response.text

In [17]:
# Confirm what came back: the Response object and its decoded text body
print(type(response), type(result_text), sep='\n')

<class 'requests.models.Response'>
<class 'str'>


In [18]:
#result_text

#### Using Beautiful Soup package to pull data from HTML files

In [19]:
# Build the parse tree from the downloaded HTML with the 'html.parser' backend
soup = BS(markup=result_text, features='html.parser')

In [20]:
# Collect the text of every <th> element in the document to use as column names
column_headers = [header_cell.get_text() for header_cell in soup.find_all('th')]

In [21]:
# Gather every <tr> element in the document, dropping the first six
# (presumably header/preamble rows that hold no earthquake record -- confirm against the page)
table_rows = soup.find_all('tr')[6:]


In [22]:
# Turn each table row into a list holding the text of its <td> cells
earthquake = [
    [cell.get_text() for cell in row.find_all('td')]
    for row in table_rows
]

In [23]:
#table_rows

In [24]:
# Assemble the scraped rows into a DataFrame labelled with the scraped headers
df_earthquake = pd.DataFrame(data=earthquake, columns=column_headers)
# Preview the first five records
df_earthquake.head(n=5)

Unnamed: 0,Origin (UTC),Present-day country and link to Wikipedia article,Lat,Long,Depth (km),Magnitude,Secondary Effects,PDE Shaking Deaths,PDE Total Deaths,Utsu Total Deaths,EM-DAT Total Deaths,Other Source Deaths
0,1900-07-12 06:25,Turkey,40.3,43.1,,5.9 Muk,,,,140.0,,\n
1,1900-10-29 09:11,Venezuela,11.0,-66.0,0.0,7.7 Mw,,,,,,\n
2,1901-02-15 00:00,China,26.0,100.1,0.0,6.5 Ms,,,,,,\n
3,1901-03-31 07:11,Bulgaria,43.4,28.7,,6.4 Muk,,,,4.0,,\n
4,1901-08-09 09:23,Japan,40.5,142.5,35.0,7.2 Mw,T,,,,,\n


In [25]:
# Inspect the scraped header names (the last one carries a trailing newline)
column_headers

['Origin (UTC)',
 'Present-day country and link to Wikipedia article',
 'Lat',
 'Long',
 'Depth (km)',
 'Magnitude',
 'Secondary Effects',
 'PDE Shaking Deaths',
 'PDE Total Deaths',
 'Utsu Total Deaths',
 'EM-DAT Total Deaths',
 'Other Source Deaths\n']

In [26]:
# Show the column labels currently attached to the DataFrame
df_earthquake.columns

Index(['Origin (UTC)', 'Present-day country and link to Wikipedia article',
       'Lat', 'Long', 'Depth (km)', 'Magnitude', 'Secondary Effects',
       'PDE Shaking Deaths', 'PDE Total Deaths', 'Utsu Total Deaths',
       'EM-DAT Total Deaths', 'Other Source Deaths\n'],
      dtype='object')

In [27]:
# Swap the long scraped headers for short snake_case names.
# Assignment is positional, so the order must match the twelve scraped columns.
df_earthquake.columns = [
    'origin',
    'country',
    'lat',
    'lng',
    'depth',
    'magnitude',
    'secondary_effects',
    'pde_shaking_deaths',
    'pde_total_deaths',
    'utsu_total_deaths',
    'emdat_total_deaths',
    'other_deaths',
]

## Exporting the data as CSV to be used for analysis

In [28]:
# Persist the scraped table (row index included, per the to_csv default) for later analysis
df_earthquake.to_csv(path_or_buf='earthquake_data.csv')

## Please refer to the other notebook for the analysis: "DQ3-Earthquake_AS"