In [48]:
#Tech Task: Task 2

# Steps: Write the code to perform an ETL process to extract a data set from the supplied source
# Persist outputs and Visualise the data in an accessible format. Source: http://www.planecrashinfo.com/database.htm

# Output:
# Total fatalities between period 1920-2016 period
# Top 3 airlines with the highest rate of incidents
# Year with the highest incidents

import pandas as pd
import numpy as np
import requests
import sqlite3
from bs4 import BeautifulSoup
from time import sleep
import re


In [137]:
#Check the maximum number of crashes across all the years - max year is 1972 with 105 crashes
crashNum = {}

for i in range(1920, 2018):
    crashNum[i] = 0
    maxCrashNum = 105
    url = "http://www.planecrashinfo.com/{}/{}-{}.htm".format(i, i, maxCrashNum)
        
    try:
        r = requests.get(url)
        if r.ok:
            print("{} has >= maxCrashNum".format(i))
        else:
            print(".", end="")
            
    #Insert delay so to avoid connection being refused (Max retries exceeded with URL)
    except requests.exceptions.ConnectionError:
            
            print("Connection refused")
            sleep(5)
        

....................................................1972 has >= maxCrashNum
.............................................

In [4]:
#Get data from crash website and put in crashPages list

crashPages = list()

for year in range(1920, 2018):
    for i in range(1,110):
        url = "http://www.planecrashinfo.com/{}/{}-{}.htm".format(year, year, i)
        try:
            r = requests.get(url)
            if not r.ok:
                break
            else:
                crashPages.append(r)
                print(".", end="")
    
        #Insert delay so to avoid connection being refused (Max retries exceeded with URL)
        except requests.exceptions.ConnectionError:
            
            print("Connection refused")
            sleep(5)
    

........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................

In [103]:
#Parse data in crashPages list

recordList = list()
del recordList[:] #clear list

for page in crashPages:
    soup = BeautifulSoup(page.text, 'html.parser')

    table = soup.find('table')
    rows = table.find_all('tr')
    rows = rows[1:]
    
    dataDict = {}

    for row in rows:
        td_items = row.find_all('td')
        td_label = td_items[0].text.strip()
        td_data = td_items[1].text.strip()
        dataDict[td_label] = td_data
    
    #Clean data
    try:
        aboardNum = dataDict['Aboard:'].split(" ")[0] #Just take the first number, don't worry about the passenger/crew breakup
        dataDict['Aboard:'] = int(aboardNum)
        fatalityNum = dataDict['Fatalities:'].split(" ")[0] #Just take the first number, don't worry about the passenger/crew breakup
#         print("{}, ".format(fatalityNum), end="")
        dataDict['Fatalities:'] = int(fatalityNum) 
        year = dataDict['Date:'].split(" ")[2] #Take last portion of date
        dataDict['Year:'] = year    
    except:
#         print('x', end="") #corrupted data
        continue
    
    recordList.append(dataDict)

In [104]:
#Insert into pandas dataframe (crashFrame)
crashFrame.drop(crashFrame.index, inplace=True) #clear dataframe
crashFrame = pd.DataFrame(recordList)

In [148]:
# Total fatalities between 1920-2016
total_fatality_dict = {}
total_fatality_dict['Total fatalities from 1920-2016'] = [crashFrame['Fatalities:'].sum()]
tf_frame = pd.DataFrame(total_fatality_dict)
tf_frame = tf_frame.rename(index={0: 'Total Number:'})

# total_fatalities.to_html("Total fatalities from 1920-2016.html")
# print("Total fatalities from 1920-2016: {}".format(total_fatalities))
tf_frame.head()

Unnamed: 0,Total fatalities from 1920-2016
Total Number:,114575


In [142]:
crashFrame.describe()

Unnamed: 0,Aboard:,Fatalities:
count,5731.0,5731.0
mean,27.494329,19.992148
std,42.847366,32.953992
min,0.0,0.0
25%,5.0,3.0
50%,13.0,9.0
75%,30.0,22.0
max,644.0,583.0


In [140]:
# Top 3 airlines with the highest rate of incidents

airline_incident_groups = crashFrame.groupby(crashFrame['Operator:'])
airline_incident_frame = airline_incident_groups.sum()
airline_incident_frame = airline_incident_frame.sort_values(by='Fatalities:', ascending=False)
top_three_worst_airlines_frame = airline_incident_frame[0:3]
print("Top 3 airlines with the highest rate of incidents between 1920-2016")
top_three_worst_airlines_frame.to_html("Top 3 airlines with the highest rate of incidents between 1920-2016.html")
top_three_worst_airlines_frame.head()

Top 3 airlines with the highest rate of incidents between 1920-2016


Unnamed: 0_level_0,Aboard:,Fatalities:
Operator:,Unnamed: 1_level_1,Unnamed: 2_level_1
Aeroflot,11682,9158
Military - U.S. Air Force,4505,3718
Air France,2886,1743


In [138]:
# Year with the highest incidents
worst_year_groups = crashFrame.groupby(crashFrame['Year:'])
worst_year_frame = pd.DataFrame(worst_year_groups.size())
worst_year_frame.columns = ['# of Incidents']
worst_year_frame = worst_year_frame.sort_values(by='# of Incidents', ascending=False)

print("Year with the highest incidents: 1972 with 105 incidents")

worst_year_frame = worst_year_frame.head(5)
worst_year_frame.to_html("Years with the highest number of plane crash incidents.html")
worst_year_frame.head(5)

Year with the highest incidents: 1972 with 105 incidents


Unnamed: 0_level_0,# of Incidents
Year:,Unnamed: 1_level_1
1972,105
1968,98
1989,94
1967,93
1970,92


In [135]:
# Store data in sqlite db

SQLITE_FILE = "crashData.db"
conn = sqlite3.connect(SQLITE_FILE) #opens sqlite and database file
myCursor = conn.cursor() #provides connection to database
crashFrame.to_sql("crashTable", conn, if_exists="replace")


  chunksize=chunksize, dtype=dtype)
