## Data-question-3
### Earthquakes 
- Nate Silver discusses the difficulty of predicting earthquakes in **The Signal and the Noise**, but are there factors that make an earthquake more likely? 
- Are there factors that make an earthquake more deadly? 
- Where would you live if you wanted to eliminate the risk of experiencing an earthquake? 
- Where would you avoid living?

## Team Name- High Voltage
### Team Members:
- Rachael Abram
- Bernes
- Ashutosh Singhal


#### Data cleaning tasks include:
1. Replace empty strings with NaN
2. Remove the footnotes from the 'Other Source Deaths' column
3. Convert Magnitude to a numeric type
4. Create a new column ('deaths'; named `total_deaths` in this notebook) that evaluates the four total-death columns ('PDE Total Deaths', 'Utsu Total Deaths', 'EM-DAT Total Deaths', and 'Other Source Deaths') and populates the new column with the highest value.
5. Explore the data in terms of when and where earthquakes occurred and how severe they were (magnitude, deaths, secondary effects).

In [18]:
import pandas as pd
import requests
from bs4 import BeautifulSoup as BS
from datetime import datetime
import numpy as np
import re
import matplotlib
import matplotlib.pyplot as plt
import plotly.plotly as py
from shapely.geometry import Point
import geopandas as gpd
import matplotlib.pyplot as plt
import folium
from folium.plugins import MarkerCluster
from matplotlib import style
import seaborn as sns
# Activate Seaborn
sns.set()
%matplotlib inline
# Treemap
import squarify 

#### The [Requests](http://docs.python-requests.org/en/master/user/quickstart/) package makes working with HTTP easier

In [19]:
# Fetch the Wikipedia page listing deadly earthquakes since 1900.
# The timeout keeps the notebook from hanging on a stalled connection, and
# raise_for_status() stops here on an HTTP error instead of letting the
# later cells parse an error page as if it were the earthquake table.
response = requests.get(
    'https://en.wikipedia.org/wiki/List_of_deadly_earthquakes_since_1900',
    timeout=30,
)
response.raise_for_status()

# Keep the decoded HTML body as a string in result_text
result_text = response.text

In [20]:
# Show the type of the response object and of the extracted HTML text
for fetched in (response, result_text):
    print(type(fetched))

<class 'requests.models.Response'>
<class 'str'>


In [21]:
#result_text

#### Using Beautiful Soup package to pull data from HTML files

In [22]:
# Parse the downloaded HTML into a BeautifulSoup tree using Python's built-in parser
soup = BS(markup=result_text, features='html.parser')

In [23]:
# Collect the text of every <th> element on the page as the column headers
column_headers = []
for header_cell in soup.find_all('th'):
    column_headers.append(header_cell.get_text())

In [24]:
# All <tr> elements on the page, skipping the first six
# (presumably header/non-data rows — TODO confirm against the page layout)
all_rows = soup.find_all('tr')
table_rows = all_rows[6:]


In [25]:
# Extract the text of each <td> cell, producing one inner list per table row
earthquake = []
for table_row in table_rows:
    earthquake.append([cell.get_text() for cell in table_row.find_all('td')])

In [26]:
#table_rows

In [27]:
# Assemble the scraped rows into a DataFrame labelled with the scraped headers,
# then preview the first five rows
df_earthquake = pd.DataFrame(data=earthquake, columns=column_headers)
df_earthquake.head(5)

Unnamed: 0,Origin (UTC),Present-day country and link to Wikipedia article,Lat,Long,Depth (km),Magnitude,Secondary Effects,PDE Shaking Deaths,PDE Total Deaths,Utsu Total Deaths,EM-DAT Total Deaths,Other Source Deaths
0,1900-05-11 17:23,Japan,38.7,141.1,5.0,7.0 MJMA,,,,,,\n
1,1900-07-12 06:25,Turkey,40.3,43.1,,5.9 Muk,,,,140.0,,\n
2,1900-10-29 09:11,Venezuela,11.0,-66.0,0.0,7.7 Mw,,,,,,\n
3,1901-02-15 00:00,China,26.0,100.1,0.0,6.5 Ms,,,,,,\n
4,1901-03-31 07:11,Bulgaria,43.4,28.7,,6.4 Muk,,,,4.0,,\n


In [28]:
# Display the scraped header texts
column_headers

['Origin (UTC)',
 'Present-day country and link to Wikipedia article',
 'Lat',
 'Long',
 'Depth (km)',
 'Magnitude',
 'Secondary Effects',
 'PDE Shaking Deaths',
 'PDE Total Deaths',
 'Utsu Total Deaths',
 'EM-DAT Total Deaths',
 'Other Source Deaths\n']

In [29]:
# Display the DataFrame's column labels
df_earthquake.columns

Index(['Origin (UTC)', 'Present-day country and link to Wikipedia article',
       'Lat', 'Long', 'Depth (km)', 'Magnitude', 'Secondary Effects',
       'PDE Shaking Deaths', 'PDE Total Deaths', 'Utsu Total Deaths',
       'EM-DAT Total Deaths', 'Other Source Deaths\n'],
      dtype='object')

In [30]:
# Swap the scraped headers for short snake_case names (same column order)
short_names = [
    'origin', 'country', 'lat', 'lng', 'depth', 'magnitude',
    'secondary_effects', 'pde_shaking_deaths', 'pde_total_deaths',
    'utsu_total_deaths', 'emdat_total_deaths', 'other_deaths',
]
df_earthquake.columns = short_names

In [31]:
# Write the frame, as it stands at this point in the notebook, to
# earthquake_data.csv in the working directory
df_earthquake.to_csv('earthquake_data.csv')

In [32]:
# Display the dtype of every column
df_earthquake.dtypes

origin                object
country               object
lat                   object
lng                   object
depth                 object
magnitude             object
secondary_effects     object
pde_shaking_deaths    object
pde_total_deaths      object
utsu_total_deaths     object
emdat_total_deaths    object
other_deaths          object
dtype: object

In [33]:
# Remove the trailing newline carried by the scraped last cell of each row.
# rstrip('\n') strips only newline characters, so re-running this cell (or a
# value without a trailing newline) cannot chop off a real character the way
# slicing off the last character would.
df_earthquake['other_deaths'] = df_earthquake['other_deaths'].str.rstrip('\n')

In [34]:
# Replace empty-string cells with NaN.
# np.nan is the canonical spelling; the np.NaN alias was removed in NumPy 2.0.
df_earthquake = df_earthquake.replace('', np.nan)

In [35]:
#df_earthquake = df_earthquake.drop([413])

In [36]:
# Parse the origin strings into pandas datetime values
df_earthquake['origin'] = pd.to_datetime(df_earthquake['origin'])

In [37]:
# Manual overrides for cells that hold a comma, a footnote, or other formatting
# that prevents numeric conversion. Keys are row labels; values are written as-is.
other_deaths_fixes = {
    43: 8000, 66: 164, 99: 1500, 1078: 45000, 1082: 2489, 1206: 26000,
    1238: 283000, 1339: 601, 1338: 111, 1336: 150, 1335: 15894, 1329: 521,
    1328: 221517, 1318: 60, 958: '33', 1327: '1115', 1332: '2698',
}
emdat_deaths_fixes = {439: '10', 413: ''}

for row_label, fixed_value in other_deaths_fixes.items():
    df_earthquake.at[row_label, 'other_deaths'] = fixed_value
for row_label, fixed_value in emdat_deaths_fixes.items():
    df_earthquake.at[row_label, 'emdat_total_deaths'] = fixed_value

In [38]:
#df_earthquake = df_earthquake.drop([438])

In [39]:
#df_earthquake.iloc[(1330)]

In [40]:
# Row labels excluded from the cleaned frame; drop() returns a new DataFrame
# and leaves df_earthquake itself untouched
rows_to_drop = [413, 437, 439, 1319, 1320, 1323, 1324, 1325, 1326, 1327,
                1330, 1331, 1332, 1333, 1334, 1336]
df_earthquake1 = df_earthquake.drop(index=rows_to_drop)

In [41]:
# Display the column labels of the cleaned frame
df_earthquake1.columns

Index(['origin', 'country', 'lat', 'lng', 'depth', 'magnitude',
       'secondary_effects', 'pde_shaking_deaths', 'pde_total_deaths',
       'utsu_total_deaths', 'emdat_total_deaths', 'other_deaths'],
      dtype='object')

In [42]:
# Display the Python type of df_earthquake1
type(df_earthquake1)

pandas.core.frame.DataFrame

In [43]:
# Convert the two coordinate columns to numeric dtype
for coord_col in ('lat', 'lng'):
    df_earthquake1[coord_col] = pd.to_numeric(df_earthquake1[coord_col])


In [44]:
# Keep only the first three characters of each magnitude string
# (e.g. "7.0 MJMA" becomes "7.0"), which drops the trailing scale label
df_earthquake1['magnitude'] = df_earthquake1['magnitude'].str.slice(0, 3)

In [45]:
#df_earthquake.at[438, 'magnitude'] = "5.0"

In [46]:
# Convert the trimmed magnitude strings to numbers on the cleaned frame.
# Previously the result was assigned only to df_earthquake, so
# df_earthquake1.magnitude remained strings for every later cell that reads it.
df_earthquake1['magnitude'] = pd.to_numeric(df_earthquake1['magnitude'])
# Keep df_earthquake in sync as before, because later cells read magnitude
# from it too; rows dropped from df_earthquake1 become NaN via index alignment.
df_earthquake['magnitude'] = df_earthquake1['magnitude']

In [47]:
# Convert every death-count column to numeric dtype
death_columns = ['pde_shaking_deaths', 'pde_total_deaths', 'utsu_total_deaths',
                 'emdat_total_deaths', 'other_deaths']
for death_col in death_columns:
    df_earthquake1[death_col] = pd.to_numeric(df_earthquake1[death_col])


In [48]:
# Create total_deaths as the row-wise maximum of the death-count columns,
# with 0 for rows where none of the sources reports a value.
# The columns are selected by name instead of by position (iloc[:, -5:]):
# once total_deaths has been appended, "the last five columns" is a different
# set, so re-running the cell silently changed what was being compared.
# The same five columns that the positional slice covered on a first run are
# kept, so first-run results are unchanged.
death_source_columns = ['pde_shaking_deaths', 'pde_total_deaths',
                        'utsu_total_deaths', 'emdat_total_deaths',
                        'other_deaths']
df_earthquake1['total_deaths'] = (
    df_earthquake1[death_source_columns].max(axis=1).fillna(0)
)

In [49]:
# Clean the country column by removing parenthesised text.
# regex=True is passed explicitly: since pandas 2.0 str.replace treats the
# pattern as a literal string by default, which would leave the values unchanged.
# Step 1 removes everything from the first "(" through the last ")".
df_earthquake1['country'] = df_earthquake1['country'].str.replace(r"\(.*\)", "", regex=True)
# Step 2 removes an unclosed "(" and everything after it.
df_earthquake1['country'] = df_earthquake1['country'].str.replace(r"\(.*", "", regex=True)

**Note:** Magnitude is an estimate of the relative "size" or strength of an earthquake,
and thus of its potential for causing ground-shaking. It is approximately related to the released seismic energy.

In [50]:
# Preview the first rows of the cleaned frame
df_earthquake1.head()

Unnamed: 0,origin,country,lat,lng,depth,magnitude,secondary_effects,pde_shaking_deaths,pde_total_deaths,utsu_total_deaths,emdat_total_deaths,other_deaths,total_deaths
0,1900-05-11 17:23:00,Japan,38.7,141.1,5.0,7.0,,,,,,,0.0
1,1900-07-12 06:25:00,Turkey,40.3,43.1,,5.9,,,,140.0,,,140.0
2,1900-10-29 09:11:00,Venezuela,11.0,-66.0,0.0,7.7,,,,,,,0.0
3,1901-02-15 00:00:00,China,26.0,100.1,0.0,6.5,,,,,,,0.0
4,1901-03-31 07:11:00,Bulgaria,43.4,28.7,,6.4,,,,4.0,,,4.0


In [51]:
#df_earthquake['Year'] = df_earthquake["origin"].str[:4]
#df_earthquake.head()


In [52]:
#df_earthquake = df_earthquake[['Year'] + df_earthquake.columns[:-2].tolist()]
#df_earthquake = df_earthquake[['country'] + df_earthquake.columns[:-1].tolist()]

In [53]:
#df_earthquake.columns

# Which magnitudes occur most frequently in the data?

In [None]:
# Histogram of earthquake magnitudes in ten bins, with bold title and axis labels
magnitude_ax = df_earthquake1['magnitude'].hist(bins=10)
magnitude_ax.set_title("Frequency of Magnitude", fontweight='bold', size=25)
magnitude_ax.set_xlabel('Magnitude', fontweight='bold', size=20)
magnitude_ax.set_ylabel('Frequency', fontweight='bold', size=20);

# How magnitude of EQ affects death toll

In [None]:
# Display the column labels of df_earthquake
df_earthquake.columns

In [None]:
# Build the analysis frame from the cleaned data (df_earthquake1).
# total_deaths and the numeric conversions exist only on df_earthquake1; the
# raw df_earthquake has neither, so every later total_deaths lookup on eqdata
# failed when it was derived from df_earthquake.
eqdata = df_earthquake1.drop(
    columns=['depth', 'pde_shaking_deaths', 'pde_total_deaths',
             'utsu_total_deaths', 'emdat_total_deaths', 'other_deaths']
)
# Make sure magnitude is numeric for the range filters in the following cells
# (a no-op when the column has already been converted)
eqdata['magnitude'] = pd.to_numeric(eqdata['magnitude'])

In [None]:
# Preview the first rows of the analysis frame
eqdata.head()

In [None]:
# Earthquakes with magnitude strictly below 5.0
eq_below5 = eqdata.loc[eqdata['magnitude'].lt(5.0)]

In [None]:
# Earthquakes with magnitude from 5.0 to 5.9, both ends included
eq_5 = eqdata.loc[eqdata['magnitude'].between(5.0, 5.9)]

In [None]:
# Earthquakes with magnitude from 6.0 to 6.9, both ends included
eq_6 = eqdata.loc[eqdata['magnitude'].between(6.0, 6.9)]

In [None]:
# Earthquakes with magnitude from 7.0 to 7.9, both ends included
eq_7 = eqdata.loc[eqdata['magnitude'].between(7.0, 7.9)]

In [None]:
# Earthquakes with magnitude from 8.0 to 8.9, both ends included
eq_8 = eqdata.loc[eqdata['magnitude'].between(8.0, 8.9)]

In [None]:
# Earthquakes with magnitude from 9.0 to 9.9, both ends included
eq_9 = eqdata.loc[eqdata['magnitude'].between(9.0, 9.9)]

In [None]:
# Earthquakes of magnitude 6.0 and above
mag_death = eqdata.loc[eqdata['magnitude'].ge(6.0)]

In [None]:
# Sum of total_deaths for each distinct magnitude value, as a one-column DataFrame
deaths_by_magnitude = mag_death.groupby('magnitude')['total_deaths'].sum()
mag_vs_death = deaths_by_magnitude.to_frame()

In [None]:
# Order by magnitude, then turn the magnitude index back into a regular column
mag_vs_death = mag_vs_death.sort_values(by='magnitude')
mag_vs_death = mag_vs_death.reset_index()

In [None]:
# Six-bin histograms of the columns of mag_vs_death; the trailing ';'
# suppresses the returned axes repr in the cell output
mag_vs_death.hist(bins= 6);

In [None]:
# Bar chart of the summed deaths for each magnitude value.
# mag_vs_death holds per-magnitude SUMS over all earthquakes of magnitude
# >= 6.0, so the title and y-label now say "total" instead of the previous
# "Mean ... (>1000) per Earthquake", which did not match the plotted data.
bar_mag_vs_death = mag_vs_death.plot.bar(
    x='magnitude', y='total_deaths', rot=0,
    title='Total Number of Deaths by Magnitude (Magnitude >= 6.0)',
    legend=False, figsize=(20, 10))
bar_mag_vs_death.set_xlabel('Magnitude', fontweight='bold')
bar_mag_vs_death.set_ylabel('Total Number of Deaths', fontweight='bold');

In [None]:
#import plotly.plotly as py

In [None]:
#eq_9.total_deaths.sum()

In [None]:
def plot_magnitude_band(ax, band, fontsize=10):
    """Scatter deaths (in thousands) against magnitude for one magnitude band.

    Parameters
    ----------
    ax : matplotlib Axes to draw on.
    band : DataFrame with 'magnitude' and 'total_deaths' columns.
    fontsize : font size applied to the tick labels of both axes.
    """
    ax.scatter(band['magnitude'], band['total_deaths'] / 1000, alpha=0.5)
    ax.set_xlabel("Magnitude", fontweight='bold')
    ax.set_ylabel("Deaths (x1000)", fontweight='bold')
    # The total is computed from the plotted data instead of being typed in by
    # hand, so the title cannot drift out of date when the scraped table changes.
    ax.set_title('Total Deaths = {:,.0f}'.format(band['total_deaths'].sum()))
    ax.tick_params(axis='both', labelsize=fontsize)
    for tick_label in ax.get_xticklabels() + ax.get_yticklabels():
        tick_label.set_fontweight('bold')


fig = plt.figure(figsize=(15, 10))
plt.style.use('ggplot')
fontsize = 10

# One panel per magnitude band, laid out on a 2 x 3 grid in ascending order
magnitude_bands = [eq_below5, eq_5, eq_6, eq_7, eq_8, eq_9]
axes = [fig.add_subplot(2, 3, position) for position in range(1, 7)]
# Keep the individual axis names available for any later cell that uses them
ax1, ax2, ax3, ax4, ax5, ax6 = axes

for band_ax, band in zip(axes, magnitude_bands):
    plot_magnitude_band(band_ax, band, fontsize)


# Most deadly earthquake on TreeMAP:

In [None]:
# Keep only the earthquakes with at least 10,000 deaths
eqdata_deaths = eqdata.loc[eqdata['total_deaths'].ge(10000)]

In [None]:
# Drop the columns that the treemap does not use, then preview the result
treemap_unused = ['origin', 'lat', 'lng']
eqdata_sqarify = eqdata_deaths.drop(columns=treemap_unused)
eqdata_sqarify.head()

In [None]:
# Replace missing secondary effects with the string 'None'.
# fillna is used instead of replace(np.NaN, ...): the np.NaN alias was removed
# in NumPy 2.0, so the old spelling raises AttributeError there.
eqdata_sqarify['secondary_effects'] = eqdata_sqarify['secondary_effects'].fillna('None')

In [None]:
import matplotlib

In [None]:
# Order rows by total deaths, then country, both descending
treemap_sort_keys = ['total_deaths', 'country']
eqdata_sqarify = eqdata_sqarify.sort_values(by=treemap_sort_keys, ascending=False)

In [None]:
# Build the treemap tile labels: "<country> (<magnitude> )<secondary effects>"
magnitude_text = eqdata_sqarify["magnitude"].astype("str")
eqdata_sqarify["Label"] = (
    eqdata_sqarify["country"] + " (" + magnitude_text + " )"
    + eqdata_sqarify["secondary_effects"]
)

In [None]:
# Treemap figure defaults: 20 x 20 figure and 30 pt font, then the ggplot style sheet
plt.rc('figure', figsize=(20, 20))
plt.rcParams['font.size'] = 30
style.use('ggplot')

In [None]:
# Colormap used to shade each tile by its death toll
cmap = matplotlib.cm.YlOrRd
tile_deaths = eqdata_sqarify["total_deaths"]
# The smallest and largest death tolls bound the colour scale
mini = tile_deaths.min()
maxi = tile_deaths.max()
norm = matplotlib.colors.Normalize(vmin=mini, vmax=maxi)
# One RGBA colour per tile, in row order
colors = [cmap(norm(toll)) for toll in tile_deaths]
squarify.plot(sizes=tile_deaths, label=eqdata_sqarify['Label'], alpha=0.8, color=colors)
plt.axis('off')
# Hidden image whose only purpose is to give the colorbar something to map
img = plt.imshow([tile_deaths], cmap=cmap)
img.set_visible(False)
plt.colorbar(img, orientation="vertical", shrink=.5)
plt.title("Most deadly earthquakes", fontsize=40)

plt.show()


# Effect of secondary effects on death toll

In [None]:
# Replace missing secondary effects with the string 'None'.
# fillna avoids the np.NaN alias, which was removed in NumPy 2.0.
eqdata_sqarify['secondary_effects'] = eqdata_sqarify['secondary_effects'].fillna('None')

In [None]:
# Order rows by total deaths, then secondary effects, both descending
effects_sort_keys = ['total_deaths', 'secondary_effects']
eqdata_sqarify = eqdata_sqarify.sort_values(by=effects_sort_keys, ascending=False)

In [None]:
# Second label set: "<secondary effects> (<magnitude> )"
magnitude_as_text = eqdata_sqarify["magnitude"].astype("str")
eqdata_sqarify["Label2"] = (
    eqdata_sqarify["secondary_effects"] + " (" + magnitude_as_text + " )"
)

In [None]:
# Figure defaults for this treemap: 20 x 15 figure and 30 pt font, then ggplot style
plt.rc('figure', figsize=(20, 15))
plt.rcParams['font.size'] = 30
style.use('ggplot')
# Colormap used to shade each tile by its death toll
cmap = matplotlib.cm.YlOrRd
effect_deaths = eqdata_sqarify["total_deaths"]
# The smallest and largest death tolls bound the colour scale
mini = effect_deaths.min()
maxi = effect_deaths.max()
norm = matplotlib.colors.Normalize(vmin=mini, vmax=maxi)
# One RGBA colour per tile, in row order
colors = [cmap(norm(toll)) for toll in effect_deaths]
squarify.plot(sizes=effect_deaths, label=eqdata_sqarify['Label2'], alpha=0.8,
              color=colors, padding=1)
plt.axis('off')
# Hidden image whose only purpose is to give the colorbar something to map
img = plt.imshow([effect_deaths], cmap=cmap)
img.set_visible(False)
plt.colorbar(img, orientation="vertical", shrink=.5)
plt.title("Secondary effects, magnitude and deaths", fontsize=30)
plt.show()


# Earthquake on the world map

In [None]:
from folium import plugins
from folium.plugins import HeatMap
# NOTE: this rebinds the name `datetime` from the class imported at the top of
# the notebook (from datetime import datetime) to the module itself.
import datetime
from folium.plugins import MarkerCluster
# `warnings` was used below without ever being imported, which raised NameError.
import warnings

# Silence all warnings for the remaining cells
warnings.filterwarnings("ignore")

In [None]:
# Build a shapely Point(lng, lat) for every row from the coordinate columns
df_earthquake['geometry'] = [
    Point((float(lng_value), float(lat_value)))
    for lng_value, lat_value in zip(df_earthquake['lng'], df_earthquake['lat'])
]

In [None]:
# Centre the world map at latitude 0 / longitude 0 with a zoomed-out view.
# NOTE: the names `long` and `map` are reused by later cells, so they are kept
# even though `map` hides the Python builtin of the same name.
lat, long = 0, 0

map = folium.Map(location=[lat, long], zoom_start=2)

In [None]:
def color(magnitude):
    """Return the marker colour name for an earthquake magnitude.

    Below 5 is 'green', from 5 up to (not including) 6 is 'orange', and 6 or
    above is 'red'.

    The previous middle test was written as `elif [ ... ]`, a non-empty list
    that is always truthy, so 'red' could never be returned. 'orange' replaces
    'yellow' because 'yellow' is not among the colours folium.Icon supports.
    """
    if magnitude < 5:
        return 'green'
    if magnitude < 6:
        return 'orange'
    return 'red'


# One circle per earthquake, drawn directly on the map
for quake in eqdata_deaths.itertuples():
    folium.CircleMarker(location=[quake.lat, quake.lng], radius=5).add_to(map)

# Clustered markers with the magnitude as popup text and a magnitude-coloured icon.
# popups is passed as a plain list (MarkerCluster indexes it by position and
# tests its truthiness, which a pandas Series does not support), and each icon
# is a folium.Icon object instead of a bare colour string.
map.add_child(MarkerCluster(
    locations=list(zip(eqdata_deaths['lat'], eqdata_deaths['lng'])),
    popups=eqdata_deaths['magnitude'].astype(str).tolist(),
    icons=[folium.Icon(color=color(m)) for m in eqdata_deaths['magnitude']],
))


map

In [None]:
def color(deaths):
    """Return the marker colour name for an earthquake's death toll.

    Below 50,000 is 'green', from 50,000 up to (not including) 200,000 is
    'orange', and 200,000 or more is 'red'.

    NOTE: this redefines the `color` helper from the magnitude map cell.
    The previous middle test was written as `elif [ ... ]`, a non-empty list
    that is always truthy, so 'red' could never be returned. 'orange' replaces
    'yellow' because 'yellow' is not among the colours folium.Icon supports.
    """
    if deaths < 50000:
        return 'green'
    if deaths < 200000:
        return 'orange'
    return 'red'


map1 = folium.Map(location=[lat, long], zoom_start=2)

# One circle per earthquake, drawn directly on the map
for quake in eqdata_deaths.itertuples():
    folium.CircleMarker(location=[quake.lat, quake.lng], radius=10).add_to(map1)

# Clustered markers with the death toll as popup text and a toll-coloured icon.
# popups is passed as a plain list (MarkerCluster indexes it by position and
# tests its truthiness, which a pandas Series does not support), and each icon
# is a folium.Icon object instead of a bare colour string.
map1.add_child(MarkerCluster(
    locations=list(zip(eqdata_deaths['lat'], eqdata_deaths['lng'])),
    popups=eqdata_deaths['total_deaths'].astype(str).tolist(),
    icons=[folium.Icon(color=color(m)) for m in eqdata_deaths['total_deaths']],
))


map1

In [None]:
import altair as alt
from altair.expr import datum
from vega_datasets import data

In [None]:
# Select Altair's 'notebook' renderer for chart display
# NOTE(review): whether a renderer named 'notebook' is available depends on the
# installed Altair version — confirm against the environment
alt.renderers.enable('notebook')

In [None]:
# URL of the disasters dataset provided by vega_datasets
source = data.disasters.url

# Encoding channels: year on x, disaster entity on y, bubble size by deaths,
# and one colour per entity without a colour legend
year_axis = alt.X('Year:O', axis=alt.Axis(labelAngle=0))
entity_axis = alt.Y('Entity:N')
bubble_size = alt.Size(
    'Deaths:Q',
    scale=alt.Scale(range=[0, 5000]),
    legend=alt.Legend(title='Annual Global Deaths'),
)
entity_color = alt.Color('Entity:N', legend=None)

# Bubble chart with the aggregate 'All natural disasters' entity filtered out
(
    alt.Chart(source)
    .mark_circle(opacity=0.8, stroke='black', strokeWidth=1)
    .encode(year_axis, entity_axis, bubble_size, entity_color)
    .properties(width=480, height=350)
    .transform_filter(datum.Entity != 'All natural disasters')
)

In [None]:
# Less busy map: one marker cluster per country, with red asterisk markers.
# The previous version referenced names that do not exist in this notebook
# (map_1, folium.MarkerCluster, an `earthquakes` frame and REGION / NAME /
# LATITUDE / LONGITUDE / EQ_PRIMARY columns), so it could not run. It now uses
# the notebook's own eqdata frame and groups by country, because this dataset
# has no region column.
map1 = folium.Map(location=[30, 30], zoom_start=1)

# Coerce coordinates to numbers and skip rows that cannot be placed on the map
quakes_to_plot = eqdata.assign(
    lat=pd.to_numeric(eqdata['lat'], errors='coerce'),
    lng=pd.to_numeric(eqdata['lng'], errors='coerce'),
).dropna(subset=['lat', 'lng', 'country'])

# One MarkerCluster layer per country, keyed as "country_<name>"
cluster_dic = {
    "country_{}".format(country): MarkerCluster(name=country).add_to(map1)
    for country in pd.unique(quakes_to_plot["country"])
}
for quake in quakes_to_plot.itertuples():
    folium.Marker(
        location=[quake.lat, quake.lng],
        popup='{} - Earthquake of {} Mg.'.format(quake.origin, quake.magnitude),
        icon=folium.Icon(color='red', icon='asterisk'),
    ).add_to(cluster_dic["country_{}".format(quake.country)])

map1

In [None]:
# Sample city points shown as dots on the map in the next cell.
# NOTE(review): the 'lat' column contains 145 and the next cell passes
# [lon, lat] as the location, so the two coordinate columns look swapped — confirm.
# NOTE: this rebinds `data`, which earlier referred to the vega_datasets import.
city_rows = [
    (-58, -34, 'Buenos Aires', 10),
    (2, 49, 'Paris', 12),
    (145, -38, 'melbourne', 40),
    (30.32, 59.93, 'St Petersbourg', 70),
    (-4.03, 5.33, 'Abidjan', 23),
    (-73.57, 45.52, 'Montreal', 43),
    (36.82, -1.29, 'Nairobi', 100),
    (-38.5, -12.97, 'Salvador', 43),
]
data = pd.DataFrame(city_rows, columns=['lat', 'lon', 'name', 'value'])
data



In [None]:
# Make an empty map.
# The "Mapbox Bright" tile set is no longer built into folium, so the map uses
# the built-in "OpenStreetMap" tiles instead.
m = folium.Map(location=[20, 0], tiles="OpenStreetMap", zoom_start=2)

# Add one crimson circle per row of `data`, with radius proportional to `value`.
# The location is passed as [lon, lat] exactly as before, matching how the
# coordinate columns of `data` are filled.
for point in data.itertuples():
    folium.Circle(
        location=[point.lon, point.lat],
        popup=point.name,
        radius=point.value * 10000,
        color='crimson',
        fill=True,
        fill_color='crimson'
    ).add_to(m)

# Display the finished map as the cell output
m
