## Imports

In [1]:
# IMPORTS
import pandas as pd
import numpy as np

from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder

## Load Data

In [11]:
# Read the cleaned wingman CSV into a DataFrame. The path is relative, so it is
# resolved against the kernel's current working directory.
wingman_df = pd.read_csv('../raw_data/trimmed_data/wingman_data_cleaned.csv')

## Testing encoders before integrating into preprocessor.py

In [12]:
# Keep only the rows whose hp_or_lbs value is 'HP'.
mask = wingman_df['hp_or_lbs'] == 'HP'

# After filtering, hp_or_lbs holds a single value, so it is dropped. Building a
# new frame (instead of drop(..., inplace=True) on a filtered selection) avoids
# SettingWithCopy warnings.
wingman_df = wingman_df.loc[mask].drop(columns=['hp_or_lbs'])

# Fill missing power_units with the median of the filtered rows. The result is
# assigned back explicitly: calling fillna(inplace=True) on a column selection
# is chained assignment, which pandas warns about and which does not update the
# frame under Copy-on-Write.
wingman_df = wingman_df.assign(
    power_units=wingman_df['power_units'].fillna(wingman_df['power_units'].median())
)

In [57]:
# Build the working frame indexed by 'id'. set_index returns a new DataFrame,
# so wingman_df itself is left unchanged.
temp = wingman_df.set_index('id')

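### Inspecting the `temp` DataFrame

> Note: the cells in this section were executed out of order (see their execution counts), so some outputs still list `id` as a regular column even though it is now the index. Restart the kernel and run all cells to refresh them.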

In [58]:
# (rows, columns) of the working frame.
temp.shape

(16722, 29)

In [15]:
# Number of distinct values in each column.
temp.nunique()

id                     16722
eventsoe_no               85
far_part                  13
acft_make               3002
acft_model              3840
fixed_retractable          2
acft_category              8
homebuilt                  2
flight_hours_mean       6793
crew_category              5
power_units              421
dprt_time               1095
cert_max_gr_wt          1413
afm_hrs                 9621
total_seats              141
num_eng                    6
type_last_insp             6
second_pilot               2
site_seeing                2
air_medical                2
crew_sex                   2
certs_held                 2
dprt_apt_id             5423
dest_apt_id             5049
flt_plan_filed             7
pc_profession              3
eng_type                  11
carb_fuel_injection        3
type_fly                  24
eng_mfgr                 662
dtype: int64

In [16]:
# Per-column dtype and non-null count.
temp.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 16722 entries, 0 to 18806
Data columns (total 30 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   id                   16722 non-null  object 
 1   eventsoe_no          16722 non-null  int64  
 2   far_part             16722 non-null  object 
 3   acft_make            16722 non-null  object 
 4   acft_model           16722 non-null  object 
 5   fixed_retractable    16722 non-null  object 
 6   acft_category        16722 non-null  object 
 7   homebuilt            16722 non-null  object 
 8   flight_hours_mean    16722 non-null  float64
 9   crew_category        16722 non-null  object 
 10  power_units          16722 non-null  float64
 11  dprt_time            16722 non-null  int64  
 12  cert_max_gr_wt       16722 non-null  float64
 13  afm_hrs              16722 non-null  float64
 14  total_seats          16722 non-null  int64  
 15  num_eng              16722 non-null 

In [17]:
# For every column, count how many values are (True) or are not (False) one of
# 'Y' / 'N'. Transposing and stacking yields one row per (column, True/False)
# pair, so a column that shows only True consists entirely of 'Y'/'N' values.
temp.apply(lambda x: x.isin(['Y', 'N']).value_counts()).T.stack()

id                   False    16722.0
eventsoe_no          False    16722.0
far_part             False    16722.0
acft_make            False    16722.0
acft_model           False    16722.0
fixed_retractable    False    16722.0
acft_category        False    16722.0
homebuilt            True     16722.0
flight_hours_mean    False    16722.0
crew_category        False    16722.0
power_units          False    16722.0
dprt_time            False    16722.0
cert_max_gr_wt       False    16722.0
afm_hrs              False    16722.0
total_seats          False    16722.0
num_eng              False    16722.0
type_last_insp       False    16722.0
second_pilot         True     16722.0
site_seeing          True     16722.0
air_medical          True     16722.0
crew_sex             False    16722.0
certs_held           True     16722.0
dprt_apt_id          False    16722.0
dest_apt_id          False    16722.0
flt_plan_filed       False    16722.0
pc_profession        False    16722.0
eng_type    

### Testing

##### transform_yes_no

##### transform_gender

##### transform_type_insp

##### transform_type_fly

##### transform_eng_mfgr

##### transform_far_part

In [56]:
def transform_far_part(X: pd.DataFrame, min_frequency: int = 300) -> pd.DataFrame:
    """One-hot encode far_part, pooling rare categories into one column.

    Parameters
    ----------
    X : pd.DataFrame
        Frame holding the column(s) to encode, e.g. ``temp[['far_part']]``.
    min_frequency : int, default 300
        Forwarded to ``OneHotEncoder``. Categories occurring fewer times than
        this are grouped into a single "infrequent" output column.

    Returns
    -------
    pd.DataFrame
        Dense encoded columns, named by the encoder's
        ``get_feature_names_out()`` and carrying the same index as ``X``.
    """
    ohe_far_part = OneHotEncoder(sparse_output=False, min_frequency=min_frequency)

    # The encoder is fitted on the same data it transforms, so use fit_transform.
    far_part_encoded = ohe_far_part.fit_transform(X)

    # Pass the index at construction so the output rows line up with X.
    return pd.DataFrame(
        far_part_encoded,
        columns=ohe_far_part.get_feature_names_out(),
        index=X.index,
    )

# Try the encoder on the far_part column and display the encoded frame.
t = transform_far_part(temp[['far_part']])
t

Unnamed: 0_level_0,far_part_091,far_part_135,far_part_137,far_part_infrequent_sklearn
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
20080107X000261,1.0,0.0,0.0,0.0
20080107X000262,1.0,0.0,0.0,0.0
20080109X000361,1.0,0.0,0.0,0.0
20080107X000271,1.0,0.0,0.0,0.0
20080115X000511,1.0,0.0,0.0,0.0
...,...,...,...,...
202303131068811,1.0,0.0,0.0,0.0
202303131068821,1.0,0.0,0.0,0.0
202303131068841,1.0,0.0,0.0,0.0
202303131068851,1.0,0.0,0.0,0.0


##### transform_acft_make

##### transform_fixed_retractable

##### transform_acft_category

##### transform_homebuilt

##### transform_crew_category

In [55]:
def transform_crew_category(X: pd.DataFrame) -> pd.DataFrame:
    """One-hot encode crew_category after collapsing two codes into 'PLT'.

    Every 'KPLT' and 'CPLT' value in ``X`` is replaced with 'PLT' first. The
    result is then one-hot encoded into dense columns named by the encoder's
    ``get_feature_names_out()``. The returned frame keeps the index of ``X``,
    and ``X`` itself is not modified.
    """
    # replace() returns a new frame, so the caller's data stays untouched.
    merged = X.replace({'KPLT':'PLT', 'CPLT':'PLT'})

    encoder = OneHotEncoder(sparse_output=False)
    encoder.fit(merged)

    return pd.DataFrame(
        encoder.transform(merged),
        columns=encoder.get_feature_names_out(),
        index=merged.index,
    )

# Try the encoder on the crew_category column and display the encoded frame.
t = transform_crew_category(temp[['crew_category']])
t

Unnamed: 0_level_0,crew_category_DSTU,crew_category_FLTI,crew_category_PLT
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
20080107X000261,0.0,0.0,1.0
20080107X000262,0.0,0.0,1.0
20080109X000361,0.0,0.0,1.0
20080107X000271,0.0,0.0,1.0
20080115X000511,1.0,0.0,0.0
...,...,...,...
202303131068811,0.0,0.0,1.0
202303131068821,0.0,0.0,1.0
202303131068841,0.0,0.0,1.0
202303131068851,0.0,0.0,1.0


##### transform_eng_type

##### transform_carb_fuel_injection

##### transform_dprt_dest_apt_id

##### transform_flt_plan_filed

##### transform_pc_professional