# Investigating Airplane Accidents

In this Guided Assignment, we will clean and explore a dataset containing details about airplane accidents between January 1982 to September 2015.

The data provided for this assignment came in a text file where each entry was delimited by a new line. Each data point was separated by a pipe character. So, the first order of business will be to do some preliminary cleaning to get the data into a structure that we can work with. 

In [35]:
aviation_list = []
aviation_data = []

with open('AviationData.txt', 'r') as file:
    for line in file:
        aviation_data.append(line)
        text = line.split('|')
        words = []
        for word in text:
            word = word.strip()
            words.append(word)
        aviation_list.append(words)

print(aviation_data[1]) # 0th row contains the headers
        
print(aviation_list[1])

20150908X74637 | Accident | CEN15LA402 | 09/08/2015 | Freeport, IL | United States | 42.246111 | -89.581945 | KFEP | albertus Airport | Non-Fatal | Substantial | Unknown | N24TL | CLARKE REGINALD W | DRAGONFLY MK |  |  |  | Part 91: General Aviation |  | Personal |  |  | 1 |  |  | VMC | TAKEOFF | Preliminary | 09/09/2015 | 

['20150908X74637', 'Accident', 'CEN15LA402', '09/08/2015', 'Freeport, IL', 'United States', '42.246111', '-89.581945', 'KFEP', 'albertus Airport', 'Non-Fatal', 'Substantial', 'Unknown', 'N24TL', 'CLARKE REGINALD W', 'DRAGONFLY MK', '', '', '', 'Part 91: General Aviation', '', 'Personal', '', '', '1', '', '', 'VMC', 'TAKEOFF', 'Preliminary', '09/09/2015', '']


The data is improved but not yet in a form that makes it particularly useful.

In [36]:
def linear_search(code):
    lax_code = []    
    for row in aviation_list:
        for item in row:
            if item == code:
                lax_code.append(row)
    return lax_code


lin_search = linear_search('LAX94LA336')

print(lin_search[0])

['20001218X45447', 'Accident', 'LAX94LA336', '07/19/1962', 'BRIDGEPORT, CA', 'United States', '', '', '', '', 'Fatal(4)', 'Destroyed', '', 'N5069P', 'PIPER', 'PA24-180', 'No', '1', 'Reciprocating', '', '', 'Personal', '', '4', '0', '0', '0', 'UNK', 'UNKNOWN', 'Probable Cause', '09/19/1996', '']


We are able to search the data for a particular value but the row we are returned is still pretty meaningless. 

In [37]:
def dictionary(l):
    # Clean input and create a list of keys for a dictionary    
    not_yet_keys = l[0].split('|')
    keys = []
    for key in not_yet_keys:
        key = key.strip()
        keys.append(key)
    
    # Get the values for the keys
    values = []
    for n in range(1, len(l)):
        not_yet_values = l[n].split('|')
        clean_values = []
        for value in not_yet_values:
            value = value.strip()
            clean_values.append(value)
        values.append(clean_values)
     
    # Pair the values to the keys
    aviation_dict_list = []
    for y in range(0, len(values)):
        paired = {}
        for x in range(0, len(keys)):        
            paired[keys[x]] = values[y][x]
        aviation_dict_list.append(paired)    
    return aviation_dict_list
        

        
aviation_dict_list = dictionary(aviation_data)
aviation_dict_list[1]

{'': '',
 'Accident Number': 'ERA15LA339',
 'Air Carrier': '',
 'Aircraft Category': 'Weight-Shift',
 'Aircraft Damage': 'Substantial',
 'Airport Code': 'LCI',
 'Airport Name': 'Laconia Municipal Airport',
 'Amateur Built': 'No',
 'Broad Phase of Flight': 'MANEUVERING',
 'Country': 'United States',
 'Engine Type': 'Reciprocating',
 'Event Date': '09/05/2015',
 'Event Id': '20150906X32704',
 'FAR Description': 'Part 91: General Aviation',
 'Injury Severity': 'Fatal(1)',
 'Investigation Type': 'Accident',
 'Latitude': '43.606389',
 'Location': 'Laconia, NH',
 'Longitude': '-71.452778',
 'Make': 'EVOLUTION AIRCRAFT INC',
 'Model': 'REVO',
 'Number of Engines': '1',
 'Publication Date': '09/10/2015',
 'Purpose of Flight': 'Personal',
 'Registration Number': 'N2264X',
 'Report Status': 'Preliminary',
 'Schedule': '',
 'Total Fatal Injuries': '1',
 'Total Minor Injuries': '',
 'Total Serious Injuries': '',
 'Total Uninjured': '',
 'Weather Condition': 'VMC'}

The headers are now associated with their values. We can search this new dictionary again for the same "LAX94LA336" incident from before.

In [38]:
def dict_search(dict_list, target):
    lax_dict = []
    for x in range(0, len(dict_list)):
        for value in dict_list[x].values():
            if value == target:
                lax_dict.append(dict_list[x])
    return lax_dict


lax_dict = dict_search(aviation_dict_list, "LAX94LA336")

lax_dict[0]

{'': '',
 'Accident Number': 'LAX94LA336',
 'Air Carrier': '',
 'Aircraft Category': '',
 'Aircraft Damage': 'Destroyed',
 'Airport Code': '',
 'Airport Name': '',
 'Amateur Built': 'No',
 'Broad Phase of Flight': 'UNKNOWN',
 'Country': 'United States',
 'Engine Type': 'Reciprocating',
 'Event Date': '07/19/1962',
 'Event Id': '20001218X45447',
 'FAR Description': '',
 'Injury Severity': 'Fatal(4)',
 'Investigation Type': 'Accident',
 'Latitude': '',
 'Location': 'BRIDGEPORT, CA',
 'Longitude': '',
 'Make': 'PIPER',
 'Model': 'PA24-180',
 'Number of Engines': '1',
 'Publication Date': '09/19/1996',
 'Purpose of Flight': 'Personal',
 'Registration Number': 'N5069P',
 'Report Status': 'Probable Cause',
 'Schedule': '',
 'Total Fatal Injuries': '4',
 'Total Minor Injuries': '0',
 'Total Serious Injuries': '0',
 'Total Uninjured': '0',
 'Weather Condition': 'UNK'}

# Explore the Data

Now that our data has been better formatted we can begin doing exploratory analysis. 

In [39]:
from collections import Counter

def most_state_accidents(data):
    state_accidents = []
    for x in range(0, len(data)):
        state_accidents.append(data[x]['Location'][-2:])
    state_count = Counter(state_accidents)
    return state_accidents, state_count.most_common(5)

state_accidents, accident_prone_states = most_state_accidents(aviation_dict_list)

accident_prone_states

[('CA', 8032), ('FL', 5118), ('TX', 5112), ('AK', 5049), ('AZ', 2502)]

The states with the highest number of aiplane accidents are: California, Flordia, Texas, Alaska, and Arizona. Unfortunately, this dataset does not have the number of successful plane rides/flight hours and so we cannot create a rate statistic for a more fair comparison between states.

Next, we will look at which months have the most accidents.

In [40]:
def worst_month_accidents(data):
    months = []
    change_month = {"01":"January",
                    "02":"February",
                    "03":"March",
                    "04":"April",
                    "05":"May",
                    "06":"June",
                    "07":"July",
                    "08":"August",
                    "09":"September",
                    "10":"October",
                    "11":"November",
                    "12":"December"}
    
    for x in range(0, len(data)):
        month = data[x]['Event Date'][0:2]
        try:
            month = change_month[month]
        except KeyError:
            month = data[x]['Event Id'][4:6]
            month = change_month[month]
        if data[x]['Event Date'] != '':
            year = data[x]['Event Date'][-4:]
        else:
            year = data[x]['Event Id'][0:4]
        months.append(month + ' ' + year)
        
    worst_months = Counter(months)
    return worst_months, worst_months.most_common(3)

month_count_accidents, worst_3_months_acc = worst_month_accidents(aviation_dict_list)

worst_3_months_acc

[('July 1982', 433), ('August 1983', 421), ('July 1983', 413)]

The months with the most accidents are in the dead of summer and come at the beginning of our dataset. Next we will see if these months are also the ones that had the most injuries.

In [41]:
def worst_month_injuries(data):
    injuries_by_month = {}
    change_month = {"01":"January",
                    "02":"February",
                    "03":"March",
                    "04":"April",
                    "05":"May",
                    "06":"June",
                    "07":"July",
                    "08":"August",
                    "09":"September",
                    "10":"October",
                    "11":"November",
                    "12":"December"}
    for x in range(0, len(data)):
        injuries = 0
        month = data[x]['Event Date'][0:2]
        try: 
            month = change_month[month]
        except KeyError:
            month = data[x]['Event Id'][4:6]
            month = change_month[month]
        if data[x]['Event Date'] != '':
            year = data[x]['Event Date'][-4:]
        else:
            year = data[x]['Event Id'][0:4]
        month = month + ' ' + year
        fatal = data[x]['Total Fatal Injuries']
        serious = data[x]['Total Serious Injuries']
        # Skip the blanks        
        if fatal:
            injuries += int(fatal)
        if serious:
            injuries += int(serious)
        injuries_by_month[month] = injuries
        injuries_by_month = Counter(injuries_by_month)        
        
    return injuries_by_month, injuries_by_month.most_common(3)
           
month_count_injuries, worst_3_months_inj  = worst_month_injuries(aviation_dict_list)

worst_3_months_inj

[('January 2007', 102), ('July 2002', 71), ('November 1993', 5)]

Interestingly, the 3 worst months for injuries are not the same as the 3 worst months for accidents! 

There are many more questions to explore with this data. It would be interesting to see the number of injuries or accidents in a time series chart to see if there is any trend. A histogram showing the frequency of injuries and accidents in each month of the year would show if there is any reason to suspect season effects. 