## Imports

In [61]:
# IMPORTS
import pandas as pd
import numpy as np

from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder

## Load Data

In [60]:
wingman_df = pd.read_csv('../raw_data/trimmed_data/wingman_data_cleaned.csv')

## Testing encoders before integrating into preprocessor.py

In [62]:
mask = wingman_df['hp_or_lbs'] == 'HP'

wingman_df = wingman_df[mask]

wingman_df.drop(columns=['hp_or_lbs'], axis=1, inplace=True)

wingman_df['power_units'].fillna(wingman_df['power_units'].median(), inplace=True)

In [63]:
temp = wingman_df.copy()

# Set the index for temp as 'id'
temp.set_index('id', inplace=True)

### temp DF Info Monitor

In [64]:
temp.shape

(16722, 29)

In [65]:
temp.nunique()

eventsoe_no              85
far_part                 13
acft_make              3002
acft_model             3840
fixed_retractable         2
acft_category             8
homebuilt                 2
flight_hours_mean      6793
crew_category             5
power_units             421
dprt_time              1095
cert_max_gr_wt         1413
afm_hrs                9621
total_seats             141
num_eng                   6
type_last_insp            6
second_pilot              2
site_seeing               2
air_medical               2
crew_sex                  2
certs_held                2
dprt_apt_id            5423
dest_apt_id            5049
flt_plan_filed            7
pc_profession             3
eng_type                 11
carb_fuel_injection       3
type_fly                 24
eng_mfgr                662
dtype: int64

In [66]:
temp.info()

<class 'pandas.core.frame.DataFrame'>
Index: 16722 entries, 20080107X000261 to 202303241069471
Data columns (total 29 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   eventsoe_no          16722 non-null  int64  
 1   far_part             16722 non-null  object 
 2   acft_make            16722 non-null  object 
 3   acft_model           16722 non-null  object 
 4   fixed_retractable    16722 non-null  object 
 5   acft_category        16722 non-null  object 
 6   homebuilt            16722 non-null  object 
 7   flight_hours_mean    16722 non-null  float64
 8   crew_category        16722 non-null  object 
 9   power_units          16722 non-null  float64
 10  dprt_time            16722 non-null  int64  
 11  cert_max_gr_wt       16722 non-null  float64
 12  afm_hrs              16722 non-null  float64
 13  total_seats          16722 non-null  int64  
 14  num_eng              16722 non-null  int64  
 15  type_last_insp   

In [67]:
temp.apply(lambda x: x.isin(['Y', 'N']).value_counts()).T.stack()

eventsoe_no          False    16722.0
far_part             False    16722.0
acft_make            False    16722.0
acft_model           False    16722.0
fixed_retractable    False    16722.0
acft_category        False    16722.0
homebuilt            True     16722.0
flight_hours_mean    False    16722.0
crew_category        False    16722.0
power_units          False    16722.0
dprt_time            False    16722.0
cert_max_gr_wt       False    16722.0
afm_hrs              False    16722.0
total_seats          False    16722.0
num_eng              False    16722.0
type_last_insp       False    16722.0
second_pilot         True     16722.0
site_seeing          True     16722.0
air_medical          True     16722.0
crew_sex             False    16722.0
certs_held           True     16722.0
dprt_apt_id          False    16722.0
dest_apt_id          False    16722.0
flt_plan_filed       False    16722.0
pc_profession        False    16722.0
eng_type             False    16722.0
carb_fuel_in

### Testing

##### transform_yes_no

In [80]:
def transform_yes_no(X: pd.DataFrame) -> pd.DataFrame:
    """Transforms the 'yes' and 'no' values to 1 and 0 respectively."""

    yn_categories = ["N", "Y"]
    bin_encoder_1 = OrdinalEncoder(categories=[yn_categories])


    return pd.DataFrame(bin_encoder_1.fit_transform(X), columns=X.columns)

t = transform_yes_no(temp[['air_medical']])
t

Unnamed: 0,air_medical
0,0.0
1,0.0
2,0.0
3,0.0
4,0.0
...,...
16717,0.0
16718,0.0
16719,0.0
16720,0.0


##### transform_gender

##### transform_type_insp

##### transform_type_fly

##### transform_eng_mfgr

##### transform_far_part

In [68]:
def transform_far_part(X: pd.DataFrame) -> pd.DataFrame:
    """Transforms far_part using OHE."""

    ohe_far_part = OneHotEncoder(sparse_output=False, min_frequency=300).fit(X)
    far_part_encoded = ohe_far_part.transform(X)
    
    far_part_encoded_df = pd.DataFrame(far_part_encoded, columns=ohe_far_part.get_feature_names_out())
    far_part_encoded_df.index = X.index
    
    return far_part_encoded_df

t = transform_far_part(temp[['far_part']])
t

Unnamed: 0_level_0,far_part_091,far_part_135,far_part_137,far_part_infrequent_sklearn
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
20080107X000261,1.0,0.0,0.0,0.0
20080107X000262,1.0,0.0,0.0,0.0
20080109X000361,1.0,0.0,0.0,0.0
20080107X000271,1.0,0.0,0.0,0.0
20080115X000511,1.0,0.0,0.0,0.0
...,...,...,...,...
202303131068811,1.0,0.0,0.0,0.0
202303131068821,1.0,0.0,0.0,0.0
202303131068841,1.0,0.0,0.0,0.0
202303131068851,1.0,0.0,0.0,0.0


##### transform_acft_make

##### transform_fixed_retractable

##### transform_acft_category

##### transform_homebuilt

##### transform_crew_category

In [69]:
def transform_crew_category(X: pd.DataFrame) -> pd.DataFrame:
    """Transforms crew_category using Custom functions and OHE."""

    X = X.replace({'KPLT':'PLT', 'CPLT':'PLT'})
    
    ohe_crew_cat = OneHotEncoder(sparse_output=False).fit(X)
    crew_cat_enc = ohe_crew_cat.transform(X)
    
    crew_cat_enc_df = pd.DataFrame(crew_cat_enc, columns=ohe_crew_cat.get_feature_names_out())
    crew_cat_enc_df.index = X.index
    
    return crew_cat_enc_df

t = transform_crew_category(temp[['crew_category']])
t

Unnamed: 0_level_0,crew_category_DSTU,crew_category_FLTI,crew_category_PLT
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
20080107X000261,0.0,0.0,1.0
20080107X000262,0.0,0.0,1.0
20080109X000361,0.0,0.0,1.0
20080107X000271,0.0,0.0,1.0
20080115X000511,1.0,0.0,0.0
...,...,...,...
202303131068811,0.0,0.0,1.0
202303131068821,0.0,0.0,1.0
202303131068841,0.0,0.0,1.0
202303131068851,0.0,0.0,1.0


##### transform_eng_type

##### transform_carb_fuel_injection

##### transform_dprt_dest_apt_id

##### transform_flt_plan_filed

##### transform_pc_professional

In [71]:
temp

Unnamed: 0_level_0,eventsoe_no,far_part,acft_make,acft_model,fixed_retractable,acft_category,homebuilt,flight_hours_mean,crew_category,power_units,...,crew_sex,certs_held,dprt_apt_id,dest_apt_id,flt_plan_filed,pc_profession,eng_type,carb_fuel_injection,type_fly,eng_mfgr
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
20080107X000261,250,091,PIPER,PA 28-180,FIXD,AIR,N,705.000000,PLT,180.0,...,M,Y,O69,OQ3,NONE,No,REC,CARB,PERS,Textron Lycoming
20080107X000262,250,091,Barnard/Stancil,Glastar,FIXD,AIR,Y,8300.000000,PLT,180.0,...,M,Y,OQ3,O70,NONE,No,REC,CARB,PERS,Textron Lycoming
20080109X000361,230,091,Micco Aircraft Company,MAC-145B,RETR,AIR,N,1000.000000,PLT,260.0,...,F,Y,T20,AXH,NONE,No,REC,FINJ,PERS,Lycoming
20080107X000271,81,091,Pilatus,PC-12/45,RETR,AIR,N,10571.000000,PLT,1200.0,...,M,Y,PHX,PWA,IFR,UNK,TP,UNK,PERS,Pratt & Whitney Canada
20080115X000511,240,091,Cessna,152,FIXD,AIR,N,18.000000,DSTU,110.0,...,M,Y,KFNL,KFNL,NONE,No,REC,CARB,INST,Lycoming
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
202303131068811,901,091,BEECH,E-55,RETR,AIR,N,10235.000000,PLT,285.0,...,M,Y,2CO5,PVT,NONE,No,REC,CARB,PERS,Lycoming
202303131068821,470,091,TRENDAK,TAURUS,FIXD,GYRO,Y,550.299988,PLT,160.0,...,M,Y,ATW,LOT,IFR,No,REC,FINJ,PERS,Continental
202303131068841,96,091,MAULE,M-7-235B,FIXD,AIR,N,632.000000,PLT,235.0,...,M,Y,3L2,KVGT,NONE,Yes,REC,CARB,PERS,Titan
202303131068851,191,091,BEECH,F33A,RETR,AIR,N,1655.000000,PLT,300.0,...,M,Y,NONE,NONE,NONE,Yes,TS,UNK,AAPL,ALLISON
