## Imports

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

In [2]:
wingman_data = pd.read_csv('raw_data/wingman_data.csv')

In [3]:
null = []
for column in wingman_data.columns:
    na = wingman_data[column].isnull().value_counts()
    null.append(na)

## Dropping duplicates

In [4]:
wingman_data = wingman_data.drop_duplicates()

## Dropping rows and columns

In [5]:
wingman_data_cleaned = wingman_data.dropna(subset=['acft_make', 'acft_model', 'acft_category'], how='any')
wingman_data_cleaned.drop(['afm_hrs_last_insp', 'elt_install', 'elt_type', 'oper_dba', 'crew_tox_perf', 'mr_faa_med_certf', 'eng_model', 'propeller_type', 'available_restraint', 'eng_no'], axis=1, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  wingman_data_cleaned.drop(['afm_hrs_last_insp', 'elt_install', 'elt_type', 'oper_dba', 'crew_tox_perf', 'mr_faa_med_certf', 'eng_model', 'propeller_type', 'available_restraint', 'eng_no'], axis=1, inplace=True)


In [6]:
wingman_data_cleaned.columns

Index(['id', 'eventsoe_no', 'far_part', 'flt_plan_filed', 'acft_make',
       'acft_model', 'total_seats', 'num_eng', 'fixed_retractable',
       'cert_max_gr_wt', 'acft_category', 'homebuilt', 'type_last_insp',
       'afm_hrs', 'certs_held', 'type_fly', 'second_pilot', 'dprt_apt_id',
       'dprt_time', 'dest_apt_id', 'site_seeing', 'air_medical',
       'flight_hours_mean', 'crew_category', 'crew_sex', 'pc_profession',
       'eng_type', 'eng_mfgr', 'power_units', 'hp_or_lbs',
       'carb_fuel_injection'],
      dtype='object')

## Imputing values

In [7]:
features_numeric_1 = ['dprt_time']
features_numeric_2 = ['cert_max_gr_wt', 'afm_hrs', 'total_seats']
features_cat = ['num_eng', 'type_last_insp', 'second_pilot', 'site_seeing', 'air_medical', 'crew_sex']
features_certs = ['certs_held']
features_5 = ['dprt_apt_id', 'dest_apt_id', 'flt_plan_filed']
features_6 = ['pc_profession', 'eng_type', 'carb_fuel_injection', 'type_fly']
features_7 = ['eng_mfgr']

In [8]:
imputer_numeric_1 = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
])

imputer_numeric_2 = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median'))
])

imputer_categoric = Pipeline(
    steps=[('imputer',
            SimpleImputer(strategy='most_frequent'))])

imputer_certs = Pipeline(
    steps=[('imputer',
            SimpleImputer(strategy='constant', fill_value="N"))])

imputer_5 = Pipeline(
    steps=[('imputer',
            SimpleImputer(strategy='constant', fill_value="NONE"))])

imputer_6 = Pipeline(
    steps=[('imputer',
            SimpleImputer(strategy='constant', fill_value="UNK"))])

imputer_7 = Pipeline(
    steps=[('imputer',
            SimpleImputer(strategy='constant', fill_value="Other"))])



In [9]:
preprocessor = ColumnTransformer(transformers=[('imputer_numeric_1',
                                                imputer_numeric_1,
                                                features_numeric_1),
                                               ('imputer_numeric_2', 
                                                imputer_numeric_2,
                                                features_numeric_2),
                                               ('imputer_categoric',
                                                imputer_categoric,
                                                features_cat),
                                               ('imputer_certs', 
                                               imputer_certs, 
                                               features_certs),
                                               ('imputer_5', 
                                                imputer_5, 
                                                features_5),
                                               ('imputer_6', 
                                                imputer_6, 
                                                features_6),
                                               ('imputer_7', 
                                               imputer_7, 
                                               features_7)])

preprocessor

In [10]:
preprocessor.fit(wingman_data_cleaned)

wingman_data_preproc = preprocessor.transform(wingman_data_cleaned)

## Merging datasets

In [11]:
c = ['dprt_time', 'cert_max_gr_wt', 'afm_hrs', 'total_seats', 'num_eng', 'type_last_insp', 'second_pilot', 'site_seeing', 'air_medical', 'crew_sex', 
     'certs_held', 'dprt_apt_id', 'dest_apt_id', 'flt_plan_filed', 'pc_profession', 'eng_type', 'carb_fuel_injection', 'type_fly', 'eng_mfgr']

In [12]:
wingman_data_preproc = pd.DataFrame(wingman_data_preproc, columns=c)
wingman_data_cleaned = wingman_data_cleaned.drop(columns=c)
wingman_data_cl_imp = pd.merge(wingman_data_cleaned, wingman_data_preproc, left_index=True, right_index=True)

## Fixing Dtypes

In [13]:
wingman_data_cl_imp['total_seats'] = wingman_data_cl_imp['total_seats'].astype('int64')
wingman_data_cl_imp['num_eng'] = wingman_data_cl_imp['num_eng'].astype('int64')
wingman_data_cl_imp['dprt_time'] = wingman_data_cl_imp['dprt_time'].astype('int64')
wingman_data_cl_imp['cert_max_gr_wt'] = wingman_data_cl_imp['cert_max_gr_wt'].astype('int64')
wingman_data_cl_imp['afm_hrs'] = wingman_data_cl_imp['afm_hrs'].astype('int64')

## Exporting

In [14]:
wingman_data_cl_imp.to_csv('/Users/danielderbyshire/Desktop/wingman_data_cleaned.csv')

## Exploring dataset

In [15]:
wingman_data_cl_imp.set_index('id', inplace=True)

wingman_data_cl_imp.select_dtypes(include='object')

Unnamed: 0_level_0,far_part,acft_make,acft_model,fixed_retractable,acft_category,homebuilt,crew_category,hp_or_lbs,type_last_insp,second_pilot,...,crew_sex,certs_held,dprt_apt_id,dest_apt_id,flt_plan_filed,pc_profession,eng_type,carb_fuel_injection,type_fly,eng_mfgr
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
20080107X000261,091,PIPER,PA 28-180,FIXD,AIR,N,PLT,HP,ANNL,N,...,M,Y,O69,OQ3,NONE,No,REC,CARB,PERS,Textron Lycoming
20080107X000262,091,Barnard/Stancil,Glastar,FIXD,AIR,Y,PLT,HP,COAW,N,...,M,Y,OQ3,O70,NONE,No,REC,CARB,PERS,Textron Lycoming
20080109X000361,091,Micco Aircraft Company,MAC-145B,RETR,AIR,N,PLT,HP,ANNL,N,...,F,Y,T20,AXH,NONE,No,REC,FINJ,PERS,Lycoming
20080107X000271,091,Pilatus,PC-12/45,RETR,AIR,N,PLT,HP,UNK,N,...,M,Y,PHX,PWA,IFR,UNK,TP,UNK,PERS,Pratt & Whitney Canada
20080115X000511,091,Cessna,152,FIXD,AIR,N,DSTU,HP,ANNL,N,...,M,Y,KFNL,KFNL,NONE,No,REC,CARB,INST,Lycoming
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
202303131068811,091,BEECH,E-55,RETR,AIR,N,PLT,HP,ANNL,N,...,M,Y,2CO5,PVT,NONE,No,REC,CARB,PERS,Lycoming
202303131068821,091,TRENDAK,TAURUS,FIXD,GYRO,Y,PLT,HP,ANNL,N,...,M,Y,ATW,LOT,IFR,No,REC,FINJ,PERS,Continental
202303131068841,091,MAULE,M-7-235B,FIXD,AIR,N,PLT,HP,COND,N,...,M,Y,3L2,KVGT,NONE,Yes,REC,CARB,PERS,Titan
202303131068851,091,BEECH,F33A,RETR,AIR,N,PLT,HP,ANNL,N,...,M,Y,NONE,NONE,NONE,Yes,TS,UNK,AAPL,ALLISON


In [16]:
wingman_data_cl_imp.select_dtypes(include='object').columns

Index(['far_part', 'acft_make', 'acft_model', 'fixed_retractable',
       'acft_category', 'homebuilt', 'crew_category', 'hp_or_lbs',
       'type_last_insp', 'second_pilot', 'site_seeing', 'air_medical',
       'crew_sex', 'certs_held', 'dprt_apt_id', 'dest_apt_id',
       'flt_plan_filed', 'pc_profession', 'eng_type', 'carb_fuel_injection',
       'type_fly', 'eng_mfgr'],
      dtype='object')

In [17]:
wingman_data_cl_imp.select_dtypes(include='object').columns

Index(['far_part', 'acft_make', 'acft_model', 'fixed_retractable',
       'acft_category', 'homebuilt', 'crew_category', 'hp_or_lbs',
       'type_last_insp', 'second_pilot', 'site_seeing', 'air_medical',
       'crew_sex', 'certs_held', 'dprt_apt_id', 'dest_apt_id',
       'flt_plan_filed', 'pc_profession', 'eng_type', 'carb_fuel_injection',
       'type_fly', 'eng_mfgr'],
      dtype='object')

## All steps

In [18]:

def data_cleaning(X):
    ## drop duplicates
    wingman_data = X.drop_duplicates()

    ## drop rows and columns
    wingman_data_cleaned = wingman_data.dropna(subset=['acft_make', 'acft_model', 'acft_category'], how='any')
    wingman_data_cleaned.drop(['afm_hrs_last_insp', 'elt_install', 'elt_type', 'oper_dba', 'crew_tox_perf', 'mr_faa_med_certf', 'eng_model', 'propeller_type', 'available_restraint', 'eng_no'], axis=1, inplace=True)

    ## imputing
    features_numeric_1 = ['dprt_time']
    features_numeric_2 = ['cert_max_gr_wt', 'afm_hrs', 'total_seats']
    features_cat = ['num_eng', 'type_last_insp', 'second_pilot', 'site_seeing', 'air_medical', 'crew_sex']
    features_certs = ['certs_held']
    features_5 = ['dprt_apt_id', 'dest_apt_id', 'flt_plan_filed']
    features_6 = ['pc_profession', 'eng_type', 'carb_fuel_injection', 'type_fly']
    features_7 = ['eng_mfgr']

    imputer_numeric_1 = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='mean')),
    ])
    imputer_numeric_2 = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='median'))
    ])
    imputer_categoric = Pipeline(
        steps=[('imputer',
                SimpleImputer(strategy='most_frequent'))])
    imputer_certs = Pipeline(
        steps=[('imputer',
                SimpleImputer(strategy='constant', fill_value="N"))])
    imputer_5 = Pipeline(
        steps=[('imputer',
                SimpleImputer(strategy='constant', fill_value="NONE"))])
    imputer_6 = Pipeline(
        steps=[('imputer',
                SimpleImputer(strategy='constant', fill_value="UNK"))])
    imputer_7 = Pipeline(
        steps=[('imputer',
                SimpleImputer(strategy='constant', fill_value="Other"))])

    preprocessor = ColumnTransformer(transformers=[('imputer_numeric_1',
                                                    imputer_numeric_1,
                                                    features_numeric_1),
                                                ('imputer_numeric_2', 
                                                    imputer_numeric_2,
                                                    features_numeric_2),
                                                ('imputer_categoric',
                                                    imputer_categoric,
                                                    features_cat),
                                                ('imputer_certs', 
                                                imputer_certs, 
                                                features_certs),
                                                ('imputer_5', 
                                                    imputer_5, 
                                                    features_5),
                                                ('imputer_6', 
                                                    imputer_6, 
                                                    features_6),
                                                ('imputer_7', 
                                                imputer_7, 
                                                features_7)])
    preprocessor.fit(wingman_data_cleaned)
    wingman_data_preproc = preprocessor.transform(wingman_data_cleaned)
    
    ## merging datasets
    c = ['dprt_time', 'cert_max_gr_wt', 'afm_hrs', 'total_seats', 'num_eng', 'type_last_insp', 'second_pilot', 'site_seeing', 'air_medical', 'crew_sex', 
        'certs_held', 'dprt_apt_id', 'dest_apt_id', 'flt_plan_filed', 'pc_profession', 'eng_type', 'carb_fuel_injection', 'type_fly', 'eng_mfgr']
    wingman_data_preproc = pd.DataFrame(wingman_data_preproc, columns=c)
    wingman_data_cleaned = wingman_data_cleaned.drop(columns=c)
    wingman_data_cl_imp = pd.merge(wingman_data_cleaned, wingman_data_preproc, left_index=True, right_index=True)

    ## fixing dtypes
    wingman_data_cl_imp['total_seats'] = wingman_data_cl_imp['total_seats'].astype('int64')
    wingman_data_cl_imp['num_eng'] = wingman_data_cl_imp['num_eng'].astype('int64')
    wingman_data_cl_imp['dprt_time'] = wingman_data_cl_imp['dprt_time'].astype('int64')
    wingman_data_cl_imp['cert_max_gr_wt'] = wingman_data_cl_imp['cert_max_gr_wt'].astype('int64')
    wingman_data_cl_imp['afm_hrs'] = wingman_data_cl_imp['afm_hrs'].astype('int64')
    
    return wingman_data_cl_imp