# Aviation Project

Data from National Transportation Safety Board that includes aviation accident data from 1962 to 2023 about civil aviation accidents and selected incidents in the United States and international waters

Data Cleaning and Analysis:

In [225]:
##import packages 

import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns

aviation_data = None
state_codes_data = None

In [226]:
##load the csv into dataframes

aviation_data = pd.read_csv('data/AviationData.csv', encoding='ISO-8859-1', dtype= {'column_name_6': 'int64', 'column_name_7': 'float64', 'column_name_28': 'float64'})
state_codes_data = pd.read_csv('data/USState_Codes.csv')

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


In [227]:
state_codes_data.head()

Unnamed: 0,US_State,Abbreviation
0,Alabama,AL
1,Alaska,AK
2,Arizona,AZ
3,Arkansas,AR
4,California,CA


In [228]:
aviation_data.head()

Unnamed: 0,Event.Id,Investigation.Type,Accident.Number,Event.Date,Location,Country,Latitude,Longitude,Airport.Code,Airport.Name,...,Purpose.of.flight,Air.carrier,Total.Fatal.Injuries,Total.Serious.Injuries,Total.Minor.Injuries,Total.Uninjured,Weather.Condition,Broad.phase.of.flight,Report.Status,Publication.Date
0,20001218X45444,Accident,SEA87LA080,1948-10-24,"MOOSE CREEK, ID",United States,,,,,...,Personal,,2.0,0.0,0.0,0.0,UNK,Cruise,Probable Cause,
1,20001218X45447,Accident,LAX94LA336,1962-07-19,"BRIDGEPORT, CA",United States,,,,,...,Personal,,4.0,0.0,0.0,0.0,UNK,Unknown,Probable Cause,19-09-1996
2,20061025X01555,Accident,NYC07LA005,1974-08-30,"Saltville, VA",United States,36.9222,-81.8781,,,...,Personal,,3.0,,,,IMC,Cruise,Probable Cause,26-02-2007
3,20001218X45448,Accident,LAX96LA321,1977-06-19,"EUREKA, CA",United States,,,,,...,Personal,,2.0,0.0,0.0,0.0,IMC,Cruise,Probable Cause,12-09-2000
4,20041105X01764,Accident,CHI79FA064,1979-08-02,"Canton, OH",United States,,,,,...,Personal,,1.0,2.0,,0.0,VMC,Approach,Probable Cause,16-04-1980


In [229]:
aviation_data.columns

Index(['Event.Id', 'Investigation.Type', 'Accident.Number', 'Event.Date',
       'Location', 'Country', 'Latitude', 'Longitude', 'Airport.Code',
       'Airport.Name', 'Injury.Severity', 'Aircraft.damage',
       'Aircraft.Category', 'Registration.Number', 'Make', 'Model',
       'Amateur.Built', 'Number.of.Engines', 'Engine.Type', 'FAR.Description',
       'Schedule', 'Purpose.of.flight', 'Air.carrier', 'Total.Fatal.Injuries',
       'Total.Serious.Injuries', 'Total.Minor.Injuries', 'Total.Uninjured',
       'Weather.Condition', 'Broad.phase.of.flight', 'Report.Status',
       'Publication.Date'],
      dtype='object')

In [230]:
aviation_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 88889 entries, 0 to 88888
Data columns (total 31 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   Event.Id                88889 non-null  object 
 1   Investigation.Type      88889 non-null  object 
 2   Accident.Number         88889 non-null  object 
 3   Event.Date              88889 non-null  object 
 4   Location                88837 non-null  object 
 5   Country                 88663 non-null  object 
 6   Latitude                34382 non-null  object 
 7   Longitude               34373 non-null  object 
 8   Airport.Code            50248 non-null  object 
 9   Airport.Name            52790 non-null  object 
 10  Injury.Severity         87889 non-null  object 
 11  Aircraft.damage         85695 non-null  object 
 12  Aircraft.Category       32287 non-null  object 
 13  Registration.Number     87572 non-null  object 
 14  Make                    88826 non-null

In [231]:
airplane_df['Injury.Severity'].value_counts()
airplane_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5456 entries, 7 to 88873
Data columns (total 31 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   Event.Id                5456 non-null   object 
 1   Investigation.Type      5456 non-null   object 
 2   Accident.Number         5456 non-null   object 
 3   Event.Date              5456 non-null   object 
 4   Location                5456 non-null   object 
 5   Country                 5454 non-null   object 
 6   Latitude                4388 non-null   object 
 7   Longitude               4386 non-null   object 
 8   Airport.Code            3802 non-null   object 
 9   Airport.Name            3925 non-null   object 
 10  Injury.Severity         5456 non-null   object 
 11  Aircraft.damage         5456 non-null   object 
 12  Aircraft.Category       5456 non-null   object 
 13  Registration.Number     5439 non-null   object 
 14  Make                    5456 non-null  

## Data Cleaning:

In [235]:
#check how many null values we have
aviation_data.isna().sum()
aviation_data['Aircraft.Category'].value_counts()

Airplane             27617
Helicopter            3440
Glider                 508
Balloon                231
Gyrocraft              173
Weight-Shift           161
Powered Parachute       91
Ultralight              30
Unknown                 14
WSFT                     9
Powered-Lift             5
Blimp                    4
UNK                      2
Rocket                   1
ULTR                     1
Name: Aircraft.Category, dtype: int64

In [236]:
## create airplane_df dataframe that contains only airplane data, as that is all we are interested in.
airplane_df = aviation_data[aviation_data['Aircraft.Category'] == "Airplane"]

In [262]:
##removing null values of make/model
airplane_df = airplane_df.dropna(subset=["Make", 'Model'])
airplane_df = airplane_df.dropna(subset=["Total.Fatal.Injuries", "Total.Serious.Injuries", "Total.Minor.Injuries", "Total.Uninjured"])
airplane_df = airplane_df.dropna(subset=["Weather.Condition"])

In [257]:
#double check model/make is clean
airplane_df['Model'].isna().sum()

0

In [258]:
#drop injury severity and aircraft damage empty rows
airplane_df = airplane_df.dropna(subset=['Injury.Severity', "Aircraft.damage"])

airplane_df.isna().sum()
airplane_df['Make'].value_counts()

cessna                            4502
piper                             1889
beech                              352
cirrus design corp                 189
mooney                             151
boeing                             114
cirrus                             112
aeronca                             85
bellanca                            81
champion                            60
luscombe                            58
grumman                             35
grumman acft eng cor-schweizer      10
Name: Make, dtype: int64

In [248]:
#get rid of makes and models that are frequency less than 50
airplane_df = airplane_df[airplane_df.groupby('Make')['Make'].transform('size') > 50]
airplane_df = airplane_df[airplane_df.groupby('Model')['Model'].transform('size') > 50]

In [251]:
##change all "Make" to lower case
airplane_df['Make'] = airplane_df['Make'].str.lower()

In [253]:
print(airplane_df['Make'].value_counts())
airplane_df['Model'].value_counts()

cessna                            5263
piper                             2193
beech                              384
cirrus design corp                 189
mooney                             169
cirrus                             116
boeing                             115
aeronca                            102
bellanca                            98
luscombe                            75
champion                            70
grumman                             45
grumman acft eng cor-schweizer      10
Name: Make, dtype: int64


172      843
152      442
182      340
172N     309
172S     267
        ... 
U206      53
U206G     53
PA46      52
150G      51
185       51
Name: Model, Length: 74, dtype: int64

In [264]:
airplane_df.isna().sum()

Event.Id                     0
Investigation.Type           0
Accident.Number              0
Event.Date                   0
Location                     0
Country                      3
Latitude                  1363
Longitude                 1363
Airport.Code              2061
Airport.Name              1850
Injury.Severity              0
Aircraft.damage              0
Aircraft.Category            0
Registration.Number          0
Make                         0
Model                        0
Amateur.Built                0
Number.of.Engines          125
Engine.Type                500
FAR.Description              5
Schedule                  6689
Purpose.of.flight          196
Air.carrier               3950
Total.Fatal.Injuries         0
Total.Serious.Injuries       0
Total.Minor.Injuries         0
Total.Uninjured              0
Weather.Condition            0
Broad.phase.of.flight     5710
Report.Status              675
Publication.Date            59
dtype: int64

In [221]:
##airplane_df is clean now. Let's start plotting:
##export airplane_df to csv to look at in tableau

airplane_df['Total.Serious.Injuries'].value_counts()
airplane_df.to_csv('airplane_df.csv')

In [141]:
##create status_df that contains all the reports with the status of pilot's failure
##drop null values of report status
#status_df = airplane_df.dropna(subset=['Report.Status'])

##count rows that contain pilot's failure
#print(status_df['Report.Status'].str.contains("pilot's failure").sum())
#status_df = status_df[status_df['Report.Status'].str.contains("pilot's failure")]
#status_df.info()

In [142]:
#status_df.head()