## Data Cleaning
___

In [1]:
import pandas as pd

import re

## Data Cleaning & Exploration
___

### Pull and Explore Aviation Data
___

In [2]:
# Load the aviation CSV from the local data folder into a DataFrame.
aviation_csv_path = './data/aviation_data.csv'
airline_df = pd.read_csv(aviation_csv_path)

In [3]:
# Preview the first rows to check the column names and value formats.
airline_df.head()

Unnamed: 0,NtsbNo,EventType,Mkey,EventDate,City,State,Country,ReportNo,N,HasSafetyRec,...,PurposeOfFlight,FAR,AirCraftDamage,WeatherCondition,Operator,ReportStatus,RepGenFlag,DocketUrl,DocketPublishDate,Unnamed: 37
0,DCA24LA051,ACC,193561,2023-12-19T19:30:00Z,St. Louis,Missouri,United States,,N8514F,False,...,,121,Substantial,VMC,SOUTHWEST AIRLINES CO,Completed,False,https://data.ntsb.gov/Docket?ProjectID=193561,3/12/2024 6:00:00 PM,
1,DCA24LA034,INC,193459,2023-11-30T08:14:00Z,Kahului,Hawaii,United States,,N494HA,False,...,,121,Minor,VMC,Hawaiian Airlines,In work,False,,,
2,ENG24FA003,INC,193272,2023-10-18T17:15:00Z,Memphis,Tennessee,United States,,N287FE,False,...,,121,Minor,,FEDERAL EXPRESS CORP,In work,False,,,
3,DCA24FA002,ACC,193196,2023-10-05T00:47:00Z,Chattanooga,Tennessee,United States,,N977FD,False,...,,121,Substantial,,FEDERAL EXPRESS CORP,In work,False,,,
4,DCA23LA468,ACC,193204,2023-09-30T16:00:00Z,Denver,Colorado,United States,,N37560,False,...,,121,Substantial,,UNITED AIRLINES INC,In work,False,,,


I referenced [Stack Overflow](https://stackoverflow.com/questions/1175208/elegant-python-function-to-convert-camelcase-to-snake-case) for converting the DataFrame's camel-case column names to snake case, and used ChatGPT for help with the regex.

In [4]:
def camel_to_snake(col):
    """Convert a CamelCase column name to lower-case snake_case.

    An underscore is inserted before every capital letter except one at the
    very start of the string, and the result is then lower-cased. Each
    capital is handled on its own, so a run of capitals is split letter by
    letter (for example ``'FAR'`` becomes ``'f_a_r'``).

    Parameters
    ----------
    col : str
        The name to convert.

    Returns
    -------
    str
        The converted name.
    """
    separated = re.sub(r'(?<!^)(?=[A-Z])', '_', col)
    return separated.lower()

In [5]:
# Convert every column header to snake_case with camel_to_snake.
airline_df.columns = list(map(camel_to_snake, airline_df.columns))

In [6]:
# Confirm the converted headers on a two-row sample.
airline_df.head(2)

Unnamed: 0,ntsb_no,event_type,mkey,event_date,city,state,country,report_no,n,has_safety_rec,...,purpose_of_flight,f_a_r,air_craft_damage,weather_condition,operator,report_status,rep_gen_flag,docket_url,docket_publish_date,unnamed: 37
0,DCA24LA051,ACC,193561,2023-12-19T19:30:00Z,St. Louis,Missouri,United States,,N8514F,False,...,,121,Substantial,VMC,SOUTHWEST AIRLINES CO,Completed,False,https://data.ntsb.gov/Docket?ProjectID=193561,3/12/2024 6:00:00 PM,
1,DCA24LA034,INC,193459,2023-11-30T08:14:00Z,Kahului,Hawaii,United States,,N494HA,False,...,,121,Minor,VMC,Hawaiian Airlines,In work,False,,,


In [7]:
# Review each column's dtype and non-null count to see where data is missing.
airline_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2563 entries, 0 to 2562
Data columns (total 38 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   ntsb_no                2563 non-null   object 
 1   event_type             2563 non-null   object 
 2   mkey                   2563 non-null   int64  
 3   event_date             2563 non-null   object 
 4   city                   2558 non-null   object 
 5   state                  2194 non-null   object 
 6   country                2504 non-null   object 
 7   report_no              85 non-null     object 
 8   n                      2559 non-null   object 
 9   has_safety_rec         2563 non-null   bool   
 10  report_type            2301 non-null   object 
 11  original_publish_date  2059 non-null   object 
 12  highest_injury_level   972 non-null    object 
 13  fatal_injury_count     2563 non-null   int64  
 14  serious_injury_count   2563 non-null   int64  
 15  mino

In [8]:
# Columns worth keeping; narrowing the frame streamlines the cleaning.
columns_to_keep = [
    'event_type',
    'event_date',
    'city',
    'state',
    'country',
    'highest_injury_level',
    'fatal_injury_count',
    'serious_injury_count',
    'minor_injury_count',
    'probable_cause',
    'make',
    'model',
    'airport_i_d',
    'airport_name',
    'number_of_engines',
    'air_craft_damage',
    'weather_condition',
    'operator',
]

# Take a copy so that later edits to the filtered frame do not touch airline_df.
airline_filtered_df = airline_df.loc[:, columns_to_keep].copy()

In [9]:
# Verify that only the selected columns remain and review their non-null counts.
airline_filtered_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2563 entries, 0 to 2562
Data columns (total 18 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   event_type            2563 non-null   object
 1   event_date            2563 non-null   object
 2   city                  2558 non-null   object
 3   state                 2194 non-null   object
 4   country               2504 non-null   object
 5   highest_injury_level  972 non-null    object
 6   fatal_injury_count    2563 non-null   int64 
 7   serious_injury_count  2563 non-null   int64 
 8   minor_injury_count    2563 non-null   int64 
 9   probable_cause        1828 non-null   object
 10  make                  2559 non-null   object
 11  model                 2559 non-null   object
 12  airport_i_d           1612 non-null   object
 13  airport_name          1634 non-null   object
 14  number_of_engines     2348 non-null   object
 15  air_craft_damage      1519 non-null   

In [10]:
# Change airport_i_d to airport_id and air_craft_damage to aircraft_damage.
# The result of rename() is assigned back rather than using inplace=True,
# which keeps the step explicit and chainable.
airline_filtered_df = airline_filtered_df.rename(
    columns={
        'airport_i_d': 'airport_id',
        'air_craft_damage': 'aircraft_damage',
    }
)

In [11]:
# Confirm the two renamed columns now appear as airport_id and aircraft_damage.
airline_filtered_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2563 entries, 0 to 2562
Data columns (total 18 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   event_type            2563 non-null   object
 1   event_date            2563 non-null   object
 2   city                  2558 non-null   object
 3   state                 2194 non-null   object
 4   country               2504 non-null   object
 5   highest_injury_level  972 non-null    object
 6   fatal_injury_count    2563 non-null   int64 
 7   serious_injury_count  2563 non-null   int64 
 8   minor_injury_count    2563 non-null   int64 
 9   probable_cause        1828 non-null   object
 10  make                  2559 non-null   object
 11  model                 2559 non-null   object
 12  airport_id            1612 non-null   object
 13  airport_name          1634 non-null   object
 14  number_of_engines     2348 non-null   object
 15  aircraft_damage       1519 non-null   

In [12]:
# There are only five instances of 'OCC', and they all occur at very early rows in the dataset. None of them
# appear to have any injuries listed, so all values of 'OCC' will be replaced with 'INC'.
airline_filtered_df['event_type'].value_counts()

event_type
ACC    1311
INC    1247
OCC       5
Name: count, dtype: int64

In [13]:
# Fold the 'OCC' events into 'INC'. Series.replace matches whole values, so
# only entries that are exactly 'OCC' change; .str.replace would instead
# rewrite 'OCC' wherever it appears inside a longer string.
airline_filtered_df['event_type'] = airline_filtered_df['event_type'].replace('OCC', 'INC')

In [14]:
# Check that no 'OCC' values remain after the replacement.
airline_filtered_df['event_type'].value_counts()

event_type
ACC    1311
INC    1252
Name: count, dtype: int64

In [15]:
# Inspect event_date: only the date portion of each value is wanted (not the time),
# and the column should then be changed to a datetime dtype.
airline_filtered_df['event_date']

0       2023-12-19T19:30:00Z
1       2023-11-30T08:14:00Z
2       2023-10-18T17:15:00Z
3       2023-10-05T00:47:00Z
4       2023-09-30T16:00:00Z
                ...         
2558    1982-01-23T21:37:00Z
2559    1982-01-22T10:55:00Z
2560    1982-01-13T18:01:00Z
2561    1982-01-13T05:00:00Z
2562    1982-01-12T20:30:00Z
Name: event_date, Length: 2563, dtype: object

In [19]:
# Keep only the first 10 characters of each event_date string (the date part, e.g. '2023-12-19').
airline_filtered_df['event_date'] = airline_filtered_df['event_date'].str[:10]