## Imports

In [1]:
# IMPORTS
import pandas as pd
import numpy as np

from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder

## Load Data

In [11]:
# Load wingman_data_cleaned.csv into a DataFrame. The path is relative, so it
# resolves against the directory the notebook kernel is running in.
wingman_df = pd.read_csv('../raw_data/trimmed_data/wingman_data_cleaned.csv')

## Testing encoders before integrating into preprocessor.py

In [12]:
# Keep only the rows whose hp_or_lbs flag is 'HP' (presumably engines rated in
# horsepower rather than pounds of thrust -- confirm against the data
# dictionary). After filtering the flag column is constant, so it is dropped.
mask = wingman_df['hp_or_lbs'] == 'HP'

# .loc[...] followed by .drop(...) yields a brand-new frame, so later column
# assignments do not raise SettingWithCopyWarning.
wingman_df = wingman_df.loc[mask].drop(columns=['hp_or_lbs'])

# Impute missing power ratings with the median of the remaining rows.
# The result is assigned back to the column instead of calling
# fillna(inplace=True) on the selected Series: that form is chained
# assignment, which warns on a filtered frame and leaves the frame unchanged
# when pandas copy-on-write is active.
wingman_df['power_units'] = wingman_df['power_units'].fillna(
    wingman_df['power_units'].median()
)

In [21]:
# Experiment on a copy so the encoder tests below do not modify wingman_df.
temp = wingman_df.copy()

### temp DF Info Monitor

In [22]:
# (rows, columns) of the working copy.
temp.shape

(16722, 30)

In [15]:
# Number of distinct values per column -- a quick cardinality check when
# deciding how each categorical column should be encoded.
temp.nunique()

id                     16722
eventsoe_no               85
far_part                  13
acft_make               3002
acft_model              3840
fixed_retractable          2
acft_category              8
homebuilt                  2
flight_hours_mean       6793
crew_category              5
power_units              421
dprt_time               1095
cert_max_gr_wt          1413
afm_hrs                 9621
total_seats              141
num_eng                    6
type_last_insp             6
second_pilot               2
site_seeing                2
air_medical                2
crew_sex                   2
certs_held                 2
dprt_apt_id             5423
dest_apt_id             5049
flt_plan_filed             7
pc_profession              3
eng_type                  11
carb_fuel_injection        3
type_fly                  24
eng_mfgr                 662
dtype: int64

In [16]:
# Per-column dtype and non-null count of the working copy.
temp.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 16722 entries, 0 to 18806
Data columns (total 30 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   id                   16722 non-null  object 
 1   eventsoe_no          16722 non-null  int64  
 2   far_part             16722 non-null  object 
 3   acft_make            16722 non-null  object 
 4   acft_model           16722 non-null  object 
 5   fixed_retractable    16722 non-null  object 
 6   acft_category        16722 non-null  object 
 7   homebuilt            16722 non-null  object 
 8   flight_hours_mean    16722 non-null  float64
 9   crew_category        16722 non-null  object 
 10  power_units          16722 non-null  float64
 11  dprt_time            16722 non-null  int64  
 12  cert_max_gr_wt       16722 non-null  float64
 13  afm_hrs              16722 non-null  float64
 14  total_seats          16722 non-null  int64  
 15  num_eng              16722 non-null 

In [17]:
# For every column, count the cells that are exactly 'Y' or 'N' (True) versus
# anything else (False). Transposing and stacking yields one row per
# (column, True/False) pair, so columns made up purely of yes/no flags show
# up as a single True row.
temp.apply(lambda x: x.isin(['Y', 'N']).value_counts()).T.stack()

id                   False    16722.0
eventsoe_no          False    16722.0
far_part             False    16722.0
acft_make            False    16722.0
acft_model           False    16722.0
fixed_retractable    False    16722.0
acft_category        False    16722.0
homebuilt            True     16722.0
flight_hours_mean    False    16722.0
crew_category        False    16722.0
power_units          False    16722.0
dprt_time            False    16722.0
cert_max_gr_wt       False    16722.0
afm_hrs              False    16722.0
total_seats          False    16722.0
num_eng              False    16722.0
type_last_insp       False    16722.0
second_pilot         True     16722.0
site_seeing          True     16722.0
air_medical          True     16722.0
crew_sex             False    16722.0
certs_held           True     16722.0
dprt_apt_id          False    16722.0
dest_apt_id          False    16722.0
flt_plan_filed       False    16722.0
pc_profession        False    16722.0
eng_type    

### Testing

##### transform_yes_no /

In [None]:
def transform_yes_no(X: pd.DataFrame) -> pd.DataFrame:
    """Map every 'Y' / 'N' cell of ``X`` to 1 / 0 respectively.

    The replacement is applied to the whole frame (every column), and it is
    performed in place, so the caller's DataFrame is modified.

    Args:
        X: Frame whose 'Y'/'N' flags should become integers.

    Returns:
        The same ``X`` object, after the replacement.
    """
    # replace(..., inplace=True) returns None, so its result must not be
    # bound back to X -- otherwise the function hands None to the caller.
    X.replace({'Y': 1, 'N': 0}, inplace=True)
    return X

# Run the transform on the working copy (the return value is not used here),
# then display the frame to inspect the result.
transform_yes_no(temp)

temp

##### transform_gender /

In [None]:
def transform_gender(X: pd.DataFrame) -> pd.DataFrame:
    """Map every 'M' / 'F' cell of ``X`` to 1 / 0 respectively.

    The replacement is applied to the whole frame, not just one column, so any
    other column holding a bare 'M' or 'F' is rewritten too. It is performed
    in place, so the caller's DataFrame is modified.

    Args:
        X: Frame whose 'M'/'F' values should become integers.

    Returns:
        The same ``X`` object, after the replacement.
    """
    # replace(..., inplace=True) returns None, so its result must not be
    # bound back to X -- otherwise the function hands None to the caller.
    X.replace({'M': 1, 'F': 0}, inplace=True)
    return X

# Run the transform on the working copy (the return value is not used here),
# then display the frame to inspect the result.
transform_gender(temp)

temp

##### transform_type_insp

In [10]:
def transform_type_insp(X: pd.DataFrame) -> pd.DataFrame:
    """One-hot encode the ``type_last_insp`` column (ANNL, 100H, COND, COAW, AAIP).

    'UNK' is folded into 'ANNL' before encoding. ``X`` is modified in place:
    one ``type_last_insp_<value>`` column is added per category and the
    original ``type_last_insp`` column is dropped.

    Args:
        X: Frame containing a ``type_last_insp`` column.

    Returns:
        The same ``X`` object with the encoded columns in place of
        ``type_last_insp``.
    """
    # Replace 'UNK' with 'ANNL' in the 'type_last_insp' column only.
    X.replace({'type_last_insp': {'UNK': 'ANNL'}}, inplace=True)

    # `sparse_output` is the current keyword for requesting a dense array and
    # matches transform_far_part; the older `sparse` keyword was deprecated in
    # scikit-learn 1.2 and later removed.
    ohe = OneHotEncoder(sparse_output=False, drop='if_binary')
    type_insp_encoded = ohe.fit_transform(X[['type_last_insp']])

    # Assigning the raw array is positional, so the rows line up with X even
    # when X carries a non-default index (e.g. after row filtering).
    X[list(ohe.get_feature_names_out())] = type_insp_encoded
    X.drop(columns=['type_last_insp'], inplace=True)

    return X

# Rebind temp to the frame returned by the transform, then display it to
# inspect the new type_last_insp_* columns.
temp = transform_type_insp(temp)

temp



Unnamed: 0,id,eventsoe_no,acft_make,acft_model,fixed_retractable,acft_category,homebuilt,flight_hours_mean,crew_category,power_units,...,eng_mfgr,far_part_091,far_part_135,far_part_137,far_part_infrequent_sklearn,type_last_insp_100H,type_last_insp_AAIP,type_last_insp_ANNL,type_last_insp_COAW,type_last_insp_COND
0,20080107X000261,250,PIPER,PA 28-180,FIXD,AIR,N,705.000000,PLT,180.0,...,Textron Lycoming,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
1,20080107X000262,250,Barnard/Stancil,Glastar,FIXD,AIR,Y,8300.000000,PLT,180.0,...,Textron Lycoming,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2,20080109X000361,230,Micco Aircraft Company,MAC-145B,RETR,AIR,N,1000.000000,PLT,260.0,...,Lycoming,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
3,20080107X000271,81,Pilatus,PC-12/45,RETR,AIR,N,10571.000000,PLT,1200.0,...,Pratt & Whitney Canada,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
4,20080115X000511,240,Cessna,152,FIXD,AIR,N,18.000000,DSTU,110.0,...,Lycoming,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16715,20200616X024101,220,Air Tractor,AT502,FIXD,AIR,N,4500.000000,PLT,750.0,...,Lycoming,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
16716,20200616X529141,341,Cessna,180,FIXD,AIR,N,1501.000000,PLT,285.0,...,Rotax,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
16717,20200617X857041,220,Robinson,R22,FIXD,HELI,N,312.000000,PLT,145.0,...,Lycoming,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
16718,20200617X320491,30,DESTINY,XLT,FIXD,PPAR,Y,2.000000,PLT,65.0,...,Continental,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0


##### transform_type_fly

##### transform_eng_mfgr

##### transform_far_part

In [27]:
def transform_far_part(X: pd.DataFrame) -> pd.DataFrame:
    """One-hot encode the ``far_part`` column.

    Categories seen in fewer than 300 rows are pooled by the encoder into a
    single ``far_part_infrequent_sklearn`` column. ``X`` itself is not
    modified; a new frame is returned.

    Args:
        X: Frame containing a ``far_part`` column.

    Returns:
        A new DataFrame with ``far_part`` removed and the encoded
        ``far_part_*`` columns appended on the right.
    """
    ohe_far_part = OneHotEncoder(sparse_output=False, min_frequency=300, drop='if_binary')

    # Fit on the far_part column only. Fitting on the whole frame would
    # one-hot encode every column, including ids and numeric features.
    far_part_encoded = ohe_far_part.fit_transform(X[['far_part']])

    far_part_encoded_df = pd.DataFrame(
        far_part_encoded,
        columns=ohe_far_part.get_feature_names_out(),
        index=X.index,  # keep the original index so concat aligns row-for-row
    )

    return pd.concat([X.drop(columns=['far_part']), far_part_encoded_df], axis=1)

# Store the encoded frame under a separate name (transform_far_part builds a
# new frame via pd.concat rather than editing temp), then display it.
t = transform_far_part(temp)

t

Unnamed: 0,id,eventsoe_no,acft_make,acft_model,fixed_retractable,acft_category,homebuilt,flight_hours_mean,crew_category,power_units,...,flt_plan_filed,pc_profession,eng_type,carb_fuel_injection,type_fly,eng_mfgr,far_part_091,far_part_135,far_part_137,far_part_infrequent_sklearn
0,20080107X000261,250,PIPER,PA 28-180,FIXD,AIR,N,705.000000,PLT,180.0,...,NONE,No,REC,CARB,PERS,Textron Lycoming,1.0,0.0,0.0,0.0
1,20080107X000262,250,Barnard/Stancil,Glastar,FIXD,AIR,Y,8300.000000,PLT,180.0,...,NONE,No,REC,CARB,PERS,Textron Lycoming,1.0,0.0,0.0,0.0
2,20080109X000361,230,Micco Aircraft Company,MAC-145B,RETR,AIR,N,1000.000000,PLT,260.0,...,NONE,No,REC,FINJ,PERS,Lycoming,1.0,0.0,0.0,0.0
3,20080107X000271,81,Pilatus,PC-12/45,RETR,AIR,N,10571.000000,PLT,1200.0,...,IFR,UNK,TP,UNK,PERS,Pratt & Whitney Canada,1.0,0.0,0.0,0.0
4,20080115X000511,240,Cessna,152,FIXD,AIR,N,18.000000,DSTU,110.0,...,NONE,No,REC,CARB,INST,Lycoming,1.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18802,202303131068811,901,BEECH,E-55,RETR,AIR,N,10235.000000,PLT,285.0,...,NONE,No,REC,CARB,PERS,Lycoming,1.0,0.0,0.0,0.0
18803,202303131068821,470,TRENDAK,TAURUS,FIXD,GYRO,Y,550.299988,PLT,160.0,...,IFR,No,REC,FINJ,PERS,Continental,1.0,0.0,0.0,0.0
18804,202303131068841,96,MAULE,M-7-235B,FIXD,AIR,N,632.000000,PLT,235.0,...,NONE,Yes,REC,CARB,PERS,Titan,1.0,0.0,0.0,0.0
18805,202303131068851,191,BEECH,F33A,RETR,AIR,N,1655.000000,PLT,300.0,...,NONE,Yes,TS,UNK,AAPL,ALLISON,1.0,0.0,0.0,0.0


In [24]:
# Display the working copy.
temp

Unnamed: 0,id,eventsoe_no,far_part,acft_make,acft_model,fixed_retractable,acft_category,homebuilt,flight_hours_mean,crew_category,...,crew_sex,certs_held,dprt_apt_id,dest_apt_id,flt_plan_filed,pc_profession,eng_type,carb_fuel_injection,type_fly,eng_mfgr
0,20080107X000261,250,091,PIPER,PA 28-180,FIXD,AIR,N,705.000000,PLT,...,M,Y,O69,OQ3,NONE,No,REC,CARB,PERS,Textron Lycoming
1,20080107X000262,250,091,Barnard/Stancil,Glastar,FIXD,AIR,Y,8300.000000,PLT,...,M,Y,OQ3,O70,NONE,No,REC,CARB,PERS,Textron Lycoming
2,20080109X000361,230,091,Micco Aircraft Company,MAC-145B,RETR,AIR,N,1000.000000,PLT,...,F,Y,T20,AXH,NONE,No,REC,FINJ,PERS,Lycoming
3,20080107X000271,81,091,Pilatus,PC-12/45,RETR,AIR,N,10571.000000,PLT,...,M,Y,PHX,PWA,IFR,UNK,TP,UNK,PERS,Pratt & Whitney Canada
4,20080115X000511,240,091,Cessna,152,FIXD,AIR,N,18.000000,DSTU,...,M,Y,KFNL,KFNL,NONE,No,REC,CARB,INST,Lycoming
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18802,202303131068811,901,091,BEECH,E-55,RETR,AIR,N,10235.000000,PLT,...,M,Y,2CO5,PVT,NONE,No,REC,CARB,PERS,Lycoming
18803,202303131068821,470,091,TRENDAK,TAURUS,FIXD,GYRO,Y,550.299988,PLT,...,M,Y,ATW,LOT,IFR,No,REC,FINJ,PERS,Continental
18804,202303131068841,96,091,MAULE,M-7-235B,FIXD,AIR,N,632.000000,PLT,...,M,Y,3L2,KVGT,NONE,Yes,REC,CARB,PERS,Titan
18805,202303131068851,191,091,BEECH,F33A,RETR,AIR,N,1655.000000,PLT,...,M,Y,NONE,NONE,NONE,Yes,TS,UNK,AAPL,ALLISON


##### transform_acft_make

##### transform_fixed_retractable

##### transform_acft_category

##### transform_homebuilt

##### transform_crew_category

##### transform_eng_type

##### transform_carb_fuel_injection

##### transform_dprt_dest_apt_id

##### transform_flt_plan_filed **

##### transform_pc_profession