# Aviation Project

Data from National Transportation Safety Board that includes aviation accident data from 1962 to 2023 about civil aviation accidents and selected incidents in the United States and international waters

Data Cleaning and Analysis:

In [5]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

aviation_data = None
state_codes_data = None

In [6]:
aviation_data = pd.read_csv('data/AviationData.csv', encoding='ISO-8859-1', dtype= {'column_name_6': 'int64', 'column_name_7': 'float64', 'column_name_28': 'float64'})
state_codes_data = pd.read_csv('data/USState_Codes.csv')

In [7]:
state_codes_data.head()

Unnamed: 0,US_State,Abbreviation
0,Alabama,AL
1,Alaska,AK
2,Arizona,AZ
3,Arkansas,AR
4,California,CA


In [8]:
aviation_data.head()

Unnamed: 0,Event.Id,Investigation.Type,Accident.Number,Event.Date,Location,Country,Latitude,Longitude,Airport.Code,Airport.Name,...,Purpose.of.flight,Air.carrier,Total.Fatal.Injuries,Total.Serious.Injuries,Total.Minor.Injuries,Total.Uninjured,Weather.Condition,Broad.phase.of.flight,Report.Status,Publication.Date
0,20001218X45444,Accident,SEA87LA080,1948-10-24,"MOOSE CREEK, ID",United States,,,,,...,Personal,,2.0,0.0,0.0,0.0,UNK,Cruise,Probable Cause,
1,20001218X45447,Accident,LAX94LA336,1962-07-19,"BRIDGEPORT, CA",United States,,,,,...,Personal,,4.0,0.0,0.0,0.0,UNK,Unknown,Probable Cause,19-09-1996
2,20061025X01555,Accident,NYC07LA005,1974-08-30,"Saltville, VA",United States,36.9222,-81.8781,,,...,Personal,,3.0,,,,IMC,Cruise,Probable Cause,26-02-2007
3,20001218X45448,Accident,LAX96LA321,1977-06-19,"EUREKA, CA",United States,,,,,...,Personal,,2.0,0.0,0.0,0.0,IMC,Cruise,Probable Cause,12-09-2000
4,20041105X01764,Accident,CHI79FA064,1979-08-02,"Canton, OH",United States,,,,,...,Personal,,1.0,2.0,,0.0,VMC,Approach,Probable Cause,16-04-1980


In [5]:
aviation_data.columns

Index(['Event.Id', 'Investigation.Type', 'Accident.Number', 'Event.Date',
       'Location', 'Country', 'Latitude', 'Longitude', 'Airport.Code',
       'Airport.Name', 'Injury.Severity', 'Aircraft.damage',
       'Aircraft.Category', 'Registration.Number', 'Make', 'Model',
       'Amateur.Built', 'Number.of.Engines', 'Engine.Type', 'FAR.Description',
       'Schedule', 'Purpose.of.flight', 'Air.carrier', 'Total.Fatal.Injuries',
       'Total.Serious.Injuries', 'Total.Minor.Injuries', 'Total.Uninjured',
       'Weather.Condition', 'Broad.phase.of.flight', 'Report.Status',
       'Publication.Date'],
      dtype='object')

In [9]:
aviation_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 88889 entries, 0 to 88888
Data columns (total 31 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   Event.Id                88889 non-null  object 
 1   Investigation.Type      88889 non-null  object 
 2   Accident.Number         88889 non-null  object 
 3   Event.Date              88889 non-null  object 
 4   Location                88837 non-null  object 
 5   Country                 88663 non-null  object 
 6   Latitude                34382 non-null  object 
 7   Longitude               34373 non-null  object 
 8   Airport.Code            50248 non-null  object 
 9   Airport.Name            52790 non-null  object 
 10  Injury.Severity         87889 non-null  object 
 11  Aircraft.damage         85695 non-null  object 
 12  Aircraft.Category       32287 non-null  object 
 13  Registration.Number     87572 non-null  object 
 14  Make                    88826 non-null

In [38]:
airplane_df['Make'].value_counts()

CESSNA                        4867
Cessna                        3608
PIPER                         2805
Piper                         1910
BOEING                        1037
                              ... 
Waco Classic Aircraft Corp       1
Lee Harold Swarthout             1
KOSTRAZEWA ANDRE                 1
SIEGEL GERALD                    1
BAILEY ROBERT                    1
Name: Make, Length: 3874, dtype: int64

In [7]:
aviation_data.isna().sum()

Event.Id                      0
Investigation.Type            0
Accident.Number               0
Event.Date                    0
Location                     52
Country                     226
Latitude                  54507
Longitude                 54516
Airport.Code              38641
Airport.Name              36099
Injury.Severity            1000
Aircraft.damage            3194
Aircraft.Category         56602
Registration.Number        1317
Make                         63
Model                        92
Amateur.Built               102
Number.of.Engines          6084
Engine.Type                7077
FAR.Description           56866
Schedule                  76307
Purpose.of.flight          6192
Air.carrier               72241
Total.Fatal.Injuries      11401
Total.Serious.Injuries    12510
Total.Minor.Injuries      11933
Total.Uninjured            5912
Weather.Condition          4492
Broad.phase.of.flight     27165
Report.Status              6381
Publication.Date          13771
dtype: i

In [25]:
aviation_data.head()
#aviation_data['State'] = aviation_data['Location']
#aviation_data['StateAbbr'] = aviation_data['Location'].str[-2:]
#state_abbr = aviation_data['StateAbbr']

In [47]:
#aviation_data['StateAbbr'] = state_codes_data['Abbreviation'] in aviation_data['StateAbbr']

In [26]:
aviation_data['Aircraft.Category'].value_counts()
airplane_df = aviation_data[aviation_data['Aircraft.Category'] == "Airplane"]

In [40]:
airplane_df.info()
airplane_df.isna().sum()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 27617 entries, 5 to 88886
Data columns (total 31 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   Event.Id                27617 non-null  object 
 1   Investigation.Type      27617 non-null  object 
 2   Accident.Number         27617 non-null  object 
 3   Event.Date              27617 non-null  object 
 4   Location                27610 non-null  object 
 5   Country                 27610 non-null  object 
 6   Latitude                22092 non-null  object 
 7   Longitude               22083 non-null  object 
 8   Airport.Code            17773 non-null  object 
 9   Airport.Name            18256 non-null  object 
 10  Injury.Severity         26803 non-null  object 
 11  Aircraft.damage         26335 non-null  object 
 12  Aircraft.Category       27617 non-null  object 
 13  Registration.Number     27391 non-null  object 
 14  Make                    27608 non-null

Event.Id                      0
Investigation.Type            0
Accident.Number               0
Event.Date                    0
Location                      7
Country                       7
Latitude                   5525
Longitude                  5534
Airport.Code               9844
Airport.Name               9361
Injury.Severity             814
Aircraft.damage            1282
Aircraft.Category             0
Registration.Number         226
Make                          9
Model                        31
Amateur.Built                17
Number.of.Engines          2754
Engine.Type                4226
FAR.Description             499
Schedule                  24627
Purpose.of.flight          3739
Air.carrier               16350
Total.Fatal.Injuries       3165
Total.Serious.Injuries     3224
Total.Minor.Injuries       2878
Total.Uninjured             900
Weather.Condition          3053
Broad.phase.of.flight     21209
Report.Status              4971
Publication.Date           1001
dtype: i

In [44]:
##removing null values of make/model
airplane_df.dropna(subset=["Make", 'Model'])

Unnamed: 0,Event.Id,Investigation.Type,Accident.Number,Event.Date,Location,Country,Latitude,Longitude,Airport.Code,Airport.Name,...,Purpose.of.flight,Air.carrier,Total.Fatal.Injuries,Total.Serious.Injuries,Total.Minor.Injuries,Total.Uninjured,Weather.Condition,Broad.phase.of.flight,Report.Status,Publication.Date
5,20170710X52551,Accident,NYC79AA106,1979-09-17,"BOSTON, MA",United States,42.4453,-70.7583,,,...,,Air Canada,,,1.0,44.0,VMC,Climb,Probable Cause,19-09-2017
7,20020909X01562,Accident,SEA82DA022,1982-01-01,"PULLMAN, WA",United States,,,,BLACKBURN AG STRIP,...,Personal,,0.0,0.0,0.0,2.0,VMC,Takeoff,Probable Cause,01-01-1982
8,20020909X01561,Accident,NYC82DA015,1982-01-01,"EAST HANOVER, NJ",United States,,,N58,HANOVER,...,Business,,0.0,0.0,0.0,2.0,IMC,Landing,Probable Cause,01-01-1982
12,20020917X02148,Accident,FTW82FRJ07,1982-01-02,"HOMER, LA",United States,,,,,...,Personal,,0.0,0.0,1.0,0.0,IMC,Cruise,Probable Cause,02-01-1983
13,20020917X02134,Accident,FTW82FRA14,1982-01-02,"HEARNE, TX",United States,,,T72,HEARNE MUNICIPAL,...,Personal,,1.0,0.0,0.0,0.0,IMC,Takeoff,Probable Cause,02-01-1983
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
88869,20221213106455,Accident,WPR23LA065,2022-12-13,"Lewistown, MT",United States,047257N,0109280W,KLWT,Lewiston Municipal Airport,...,,,0.0,0.0,0.0,1.0,,,,14-12-2022
88873,20221215106463,Accident,ERA23LA090,2022-12-14,"San Juan, PR",United States,182724N,0066554W,SIG,FERNANDO LUIS RIBAS DOMINICCI,...,Personal,SKY WEST AVIATION INC TRUSTEE,0.0,0.0,0.0,1.0,VMC,,,27-12-2022
88876,20221219106475,Accident,WPR23LA069,2022-12-15,"Wichita, KS",United States,373829N,0972635W,ICT,WICHITA DWIGHT D EISENHOWER NT,...,,,0.0,0.0,0.0,1.0,,,,19-12-2022
88877,20221219106470,Accident,ERA23LA091,2022-12-16,"Brooksville, FL",United States,282825N,0822719W,BKV,BROOKSVILLE-TAMPA BAY RGNL,...,Personal,GERBER RICHARD E,0.0,1.0,0.0,0.0,VMC,,,23-12-2022


In [53]:
airplane_df.dropna(subset=['Injury.Severity'], inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  airplane_df.dropna(subset=['Injury.Severity'], inplace=True)


Event.Id                      0
Investigation.Type            0
Accident.Number               0
Event.Date                    0
Location                      7
Country                       7
Latitude                   4879
Longitude                  4888
Airport.Code               9206
Airport.Name               8734
Injury.Severity               0
Aircraft.damage             863
Aircraft.Category             0
Registration.Number         219
Make                          0
Model                         0
Amateur.Built                17
Number.of.Engines          2203
Engine.Type                3579
FAR.Description             355
Schedule                  24123
Purpose.of.flight          2954
Air.carrier               15953
Total.Fatal.Injuries       3159
Total.Serious.Injuries     3216
Total.Minor.Injuries       2871
Total.Uninjured             894
Weather.Condition          2308
Broad.phase.of.flight     20383
Report.Status              4233
Publication.Date            802
dtype: i

In [81]:
airplane_df['Make'].value_counts()

CESSNA                          4793
Cessna                          3601
PIPER                           2779
Piper                           1907
BEECH                           1003
                                ... 
Taylor                             1
ERTZ MARK G                        1
Aeronca Aircraft Corporation       1
BARNETT ALLEN S                    1
BAILEY ROBERT                      1
Name: Make, Length: 3854, dtype: int64

In [74]:
airplane_df['Aircraft.damage'].value_counts()
airplane_df['Report.Status'].value_counts()

Probable Cause                                                                                                                                                                                                                                                         6386
Foreign                                                                                                                                                                                                                                                                 320
<br /><br />                                                                                                                                                                                                                                                            117
The pilot's failure to maintain directional control during the landing roll.                                                                                                                        

In [79]:
#create status_df that contains all the reports with the status of pilot's failure
#drop null values of report status
status_df = airplane_df.dropna(subset=['Report.Status'])

#count rows that contain pilot's failure
print(status_df['Report.Status'].str.contains("pilot's failure").sum())
#status_df = status_df[status_df['Report.Status'].str.contains("pilot's failure")]
status_df.info()

3594
<class 'pandas.core.frame.DataFrame'>
Int64Index: 22535 entries, 5 to 88767
Data columns (total 31 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   Event.Id                22535 non-null  object 
 1   Investigation.Type      22535 non-null  object 
 2   Accident.Number         22535 non-null  object 
 3   Event.Date              22535 non-null  object 
 4   Location                22528 non-null  object 
 5   Country                 22528 non-null  object 
 6   Latitude                18793 non-null  object 
 7   Longitude               18783 non-null  object 
 8   Airport.Code            15601 non-null  object 
 9   Airport.Name            16235 non-null  object 
 10  Injury.Severity         22535 non-null  object 
 11  Aircraft.damage         22039 non-null  object 
 12  Aircraft.Category       22535 non-null  object 
 13  Registration.Number     22320 non-null  object 
 14  Make                    22535 non

In [75]:
status_df.head()

Unnamed: 0,Event.Id,Investigation.Type,Accident.Number,Event.Date,Location,Country,Latitude,Longitude,Airport.Code,Airport.Name,...,Purpose.of.flight,Air.carrier,Total.Fatal.Injuries,Total.Serious.Injuries,Total.Minor.Injuries,Total.Uninjured,Weather.Condition,Broad.phase.of.flight,Report.Status,Publication.Date
63914,20080109X00036,Accident,DFW08CA054,2008-01-01,"Arcola, TX",United States,293022N,0952836W,AXH,HOUSTON-SOUTHWEST,...,Personal,,0.0,0.0,0.0,1.0,VMC,,The pilot's failure to maintain directional co...,25-09-2020
63921,20080115X00047,Accident,MIA08LA035,2008-01-05,"Spotsylvania, VA",United States,038128N,0773515W,,,...,Personal,Neil T. Wallace,0.0,1.0,0.0,0.0,VMC,,The pilot's failure to follow the checklist an...,25-09-2020
63927,20080312X00299,Accident,MIA08CA037,2008-01-06,"Ellsworth, ME",United States,044387N,0683757W,NONE,Philbrick Mountain,...,Personal,,0.0,0.0,0.0,2.0,VMC,,The pilot's failure to maintain directional co...,25-09-2020
63930,20080220X00212,Accident,LAX08CA044,2008-01-07,"Exeter, CA",United States,361419N,0119835W,O63,THUNDERHAWK FIELD,...,Personal,,0.0,0.0,0.0,2.0,VMC,,The pilot's failure to maintain directional co...,25-09-2020
63936,20080117X00068,Accident,SEA08LA058,2008-01-10,"Perris, CA",United States,033535N,0117158W,,,...,Personal,,1.0,0.0,0.0,0.0,VMC,,The pilot's failure to maintain clearance from...,25-09-2020
