## Imports

In [55]:
# IMPORTS
import pandas as pd
import numpy as np

from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder

## Load Data

In [56]:
wingman_df = pd.read_csv('../raw_data/trimmed_data/wingman_data_cleaned.csv')

In [57]:
mask = wingman_df['hp_or_lbs'] == 'HP'

wingman_df = wingman_df[mask]

wingman_df.drop(columns=['hp_or_lbs'], axis=1, inplace=True)

wingman_df['power_units'].fillna(wingman_df['power_units'].median(), inplace=True)

In [59]:
temp = wingman_df.copy()

# Set the index for temp as 'id'
temp.set_index('id', inplace=True)

## Testing encoders before integrating into preprocessor.py

### temp DF Info Monitor

In [60]:
temp.shape

(16722, 29)

In [61]:
temp.nunique()

eventsoe_no              85
far_part                 13
acft_make              3002
acft_model             3840
fixed_retractable         2
acft_category             8
homebuilt                 2
flight_hours_mean      6793
crew_category             5
power_units             421
dprt_time              1095
cert_max_gr_wt         1413
afm_hrs                9621
total_seats             141
num_eng                   6
type_last_insp            6
second_pilot              2
site_seeing               2
air_medical               2
crew_sex                  2
certs_held                2
dprt_apt_id            5423
dest_apt_id            5049
flt_plan_filed            7
pc_profession             3
eng_type                 11
carb_fuel_injection       3
type_fly                 24
eng_mfgr                662
dtype: int64

In [62]:
temp.info()

<class 'pandas.core.frame.DataFrame'>
Index: 16722 entries, 20080107X000261 to 202303241069471
Data columns (total 29 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   eventsoe_no          16722 non-null  int64  
 1   far_part             16722 non-null  object 
 2   acft_make            16722 non-null  object 
 3   acft_model           16722 non-null  object 
 4   fixed_retractable    16722 non-null  object 
 5   acft_category        16722 non-null  object 
 6   homebuilt            16722 non-null  object 
 7   flight_hours_mean    16722 non-null  float64
 8   crew_category        16722 non-null  object 
 9   power_units          16722 non-null  float64
 10  dprt_time            16722 non-null  int64  
 11  cert_max_gr_wt       16722 non-null  float64
 12  afm_hrs              16722 non-null  float64
 13  total_seats          16722 non-null  int64  
 14  num_eng              16722 non-null  int64  
 15  type_last_insp   

In [63]:
temp.apply(lambda x: x.isin(['Y', 'N']).value_counts()).T.stack()

eventsoe_no          False    16722.0
far_part             False    16722.0
acft_make            False    16722.0
acft_model           False    16722.0
fixed_retractable    False    16722.0
acft_category        False    16722.0
homebuilt            True     16722.0
flight_hours_mean    False    16722.0
crew_category        False    16722.0
power_units          False    16722.0
dprt_time            False    16722.0
cert_max_gr_wt       False    16722.0
afm_hrs              False    16722.0
total_seats          False    16722.0
num_eng              False    16722.0
type_last_insp       False    16722.0
second_pilot         True     16722.0
site_seeing          True     16722.0
air_medical          True     16722.0
crew_sex             False    16722.0
certs_held           True     16722.0
dprt_apt_id          False    16722.0
dest_apt_id          False    16722.0
flt_plan_filed       False    16722.0
pc_profession        False    16722.0
eng_type             False    16722.0
carb_fuel_in

### Testing

##### transform_yes_no

In [64]:
def transform_yes_no(X: pd.DataFrame) -> pd.DataFrame:
    """Transforms the 'yes' and 'no' values to 1 and 0 respectively."""

    yn_categories = ["N", "Y"]
    bin_encoder_1 = OrdinalEncoder(categories=[yn_categories])


    return pd.DataFrame(bin_encoder_1.fit_transform(X), columns=X.columns)

t = transform_yes_no(temp[['air_medical']])
t

Unnamed: 0,air_medical
0,0.0
1,0.0
2,0.0
3,0.0
4,0.0
...,...
16717,0.0
16718,0.0
16719,0.0
16720,0.0


##### transform_gender

##### transform_type_insp

##### transform_type_fly

In [65]:
def transform_type_fly(X: pd.DataFrame) -> pd.DataFrame:
    """Transforms type_fly using Custom function."""

    wingman_data_enc = X
    
    top_9_categories = wingman_data_enc['type_fly'].value_counts().nlargest(9).index.tolist()
    
    wingman_data_enc[''] = np.where(wingman_data_enc['type_fly'].isin(top_9_categories), wingman_data_enc['type_fly'], 'Other')
    
    type_fly_encoded = pd.get_dummies(wingman_data_enc, columns=[''], dtype=int)
    type_fly_encoded = type_fly_encoded.drop(columns = ['type_fly'])

    return type_fly_encoded

t = transform_type_fly(temp[['type_fly']])
t

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  wingman_data_enc[''] = np.where(wingman_data_enc['type_fly'].isin(top_9_categories), wingman_data_enc['type_fly'], 'Other')


Unnamed: 0_level_0,_AAPL,_AOBV,_BUS,_FLTS,_INST,_OWRK,_Other,_PERS,_POSI,_UNK
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
20080107X000261,0,0,0,0,0,0,0,1,0,0
20080107X000262,0,0,0,0,0,0,0,1,0,0
20080109X000361,0,0,0,0,0,0,0,1,0,0
20080107X000271,0,0,0,0,0,0,0,1,0,0
20080115X000511,0,0,0,0,1,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...
202303131068811,0,0,0,0,0,0,0,1,0,0
202303131068821,0,0,0,0,0,0,0,1,0,0
202303131068841,0,0,0,0,0,0,0,1,0,0
202303131068851,1,0,0,0,0,0,0,0,0,0


##### transform_eng_mfgr

##### transform_far_part

In [66]:
def transform_far_part(X: pd.DataFrame) -> pd.DataFrame:
    """Transforms far_part using OHE."""

    ohe_far_part = OneHotEncoder(sparse_output=False, min_frequency=300).fit(X)
    far_part_encoded = ohe_far_part.transform(X)
    
    far_part_encoded_df = pd.DataFrame(far_part_encoded, columns=ohe_far_part.get_feature_names_out())
    far_part_encoded_df.index = X.index
    
    return far_part_encoded_df

t = transform_far_part(temp[['far_part']])
t

Unnamed: 0_level_0,far_part_091,far_part_135,far_part_137,far_part_infrequent_sklearn
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
20080107X000261,1.0,0.0,0.0,0.0
20080107X000262,1.0,0.0,0.0,0.0
20080109X000361,1.0,0.0,0.0,0.0
20080107X000271,1.0,0.0,0.0,0.0
20080115X000511,1.0,0.0,0.0,0.0
...,...,...,...,...
202303131068811,1.0,0.0,0.0,0.0
202303131068821,1.0,0.0,0.0,0.0
202303131068841,1.0,0.0,0.0,0.0
202303131068851,1.0,0.0,0.0,0.0


##### transform_acft_make

##### transform_fixed_retractable

##### transform_acft_category

##### transform_homebuilt

##### transform_crew_category

In [67]:
def transform_crew_category(X: pd.DataFrame) -> pd.DataFrame:
    """Transforms crew_category using Custom functions and OHE."""

    X = X.replace({'KPLT':'PLT', 'CPLT':'PLT'})
    
    ohe_crew_cat = OneHotEncoder(sparse_output=False).fit(X)
    crew_cat_enc = ohe_crew_cat.transform(X)
    
    crew_cat_enc_df = pd.DataFrame(crew_cat_enc, columns=ohe_crew_cat.get_feature_names_out())
    crew_cat_enc_df.index = X.index
    
    return crew_cat_enc_df

t = transform_crew_category(temp[['crew_category']])
t

Unnamed: 0_level_0,crew_category_DSTU,crew_category_FLTI,crew_category_PLT
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
20080107X000261,0.0,0.0,1.0
20080107X000262,0.0,0.0,1.0
20080109X000361,0.0,0.0,1.0
20080107X000271,0.0,0.0,1.0
20080115X000511,1.0,0.0,0.0
...,...,...,...
202303131068811,0.0,0.0,1.0
202303131068821,0.0,0.0,1.0
202303131068841,0.0,0.0,1.0
202303131068851,0.0,0.0,1.0


##### transform_eng_type

##### transform_carb_fuel_injection

##### transform_dprt_dest_apt_id

In [87]:
def transform_dest_apt_id(X: pd.DataFrame, field: str) -> pd.DataFrame:
    """Transforms certs_held using Custom functions."""

    # X[field] = X[field].where(X[field] != 'NONE', 0)
    # X[field] = X[field].where(X[field] != 'PVT', 0)
    # X[field] = X[field].where(X[field] == 0, 1)
    
    # condition_11 = X[field] == 'NONE' 
    # condition_12 = X[field] == 'PVT' 
    condition_1 = (X[field] == 'NONE') | (X[field] == 'PVT')
    
    X.loc[condition_1, field] = 0
    
    condition_2 = (X[field] != 0) 
    
    X.loc[condition_2, field] = 1

    
    # X[field] = X[field].where(X[field] != 'NONE', 0)
    # X[field] = X[field].where(X[field] != 'PVT', 0)
    # X[field] = X[field].where(X[field] == 0, 1)

    return X

t = transform_dest_apt_id(temp[['dest_apt_id']], 'dest_apt_id')

t.value_counts()


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X.loc[condition_1, field] = 0
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X.loc[condition_2, field] = 1


dest_apt_id
1              13383
0               3339
dtype: int64

In [91]:
def transform_pc_profession(X: pd.DataFrame) -> pd.DataFrame:
    """Transforms pc_profession using Custom functions."""

    # X['pc_profession'].replace('UNK', 'No', inplace=True)
    # X['pc_profession'].replace(['Yes', 'No'], [1, 0], inplace=True)
    X['pc_profession'] = X['pc_profession'].replace('UNK', 'No')
    X['pc_profession'] = X['pc_profession'].replace(['Yes', 'No'], [1, 0])

    return X
    
t1 = transform_pc_profession(temp[['pc_profession']])
t1.value_counts()
t1.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['pc_profession'] = X['pc_profession'].replace('UNK', 'No')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['pc_profession'] = X['pc_profession'].replace(['Yes', 'No'], [1, 0])


Unnamed: 0_level_0,pc_profession
id,Unnamed: 1_level_1
20080107X000261,0
20080107X000262,0
20080109X000361,0
20080107X000271,0
20080115X000511,0


In [74]:
def encode_column(df, column_name):
    
    # One-hot encode the column
    encoded_dataframe = pd.get_dummies(df[column_name])
    
    # Combine 'PVT' and 'NONE' into a single column and drop the original columns
    encoded_dataframe['PVT_NONE'] = encoded_dataframe['PVT'] | encoded_dataframe['NONE']
    encoded_dataframe.drop(['PVT', 'NONE'], axis=1, inplace=True)
    
    # Create a column for other categories
    encoded_dataframe['OTHER'] = 1 - encoded_dataframe['PVT_NONE']
    
    return encoded_dataframe[['PVT_NONE', 'OTHER']]

t = encode_column(temp[['dest_apt_id']], 'dest_apt_id')

t.value_counts()

PVT_NONE  OTHER
0         1        13383
1         0         3339
dtype: int64

In [75]:
temp['dprt_apt_id'].value_counts()

NONE    2247
PVT      183
DVT       52
FFZ       45
VGT       40
        ... 
MU27       1
18OR       1
MNF        1
I93        1
ATW        1
Name: dprt_apt_id, Length: 5423, dtype: int64

##### transform_flt_plan_filed

In [68]:
def transform_flt_filed(X: pd.DataFrame) -> pd.DataFrame:
    """Transforms flt_plan_filed using Custom functions and OHE."""

    X.replace('UNK', 'NONE', inplace=True)
    X.replace('VFIF', 'IFR', inplace=True)
    X.replace(['CVFR', 'MVFR'], 'VFR', inplace=True)

    ohe = OneHotEncoder(sparse_output=False).fit(X)

    ohe_df = pd.DataFrame(ohe.transform(X), columns=ohe.get_feature_names_out())

    return ohe_df

t = transform_flt_filed(temp[['flt_plan_filed']])
t

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X.replace('UNK', 'NONE', inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X.replace('VFIF', 'IFR', inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X.replace(['CVFR', 'MVFR'], 'VFR', inplace=True)


Unnamed: 0,flt_plan_filed_IFR,flt_plan_filed_NONE,flt_plan_filed_VFR
0,0.0,1.0,0.0
1,0.0,1.0,0.0
2,0.0,1.0,0.0
3,1.0,0.0,0.0
4,0.0,1.0,0.0
...,...,...,...
16717,0.0,1.0,0.0
16718,1.0,0.0,0.0
16719,0.0,1.0,0.0
16720,0.0,1.0,0.0


##### transform_pc_professional

In [None]:
def transform_pc_profession(X: pd.DataFrame) -> pd.DataFrame:
    """Transforms pc_profession using Custom functions."""

    X['pc_profession'].replace('UNK', 'No', inplace=True)
    X['pc_profession'].replace(['Yes', 'No'], [1, 0], inplace=True)

    return X

In [69]:
def transform_pc_profession(X: pd.DataFrame) -> pd.DataFrame:
    """Transforms pc_profession using Custom functions."""

    X.replace('UNK', 'No', inplace=True)
    X.replace(['Yes', 'No'], [1, 0], inplace=True)
    
    ohe = OneHotEncoder(sparse_output=False).fit(X)

    ohe_df = pd.DataFrame(ohe.transform(X), columns=ohe.get_feature_names_out())

    return ohe_df

t = transform_pc_profession(temp[['flt_plan_filed']])
t

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X.replace('UNK', 'No', inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X.replace(['Yes', 'No'], [1, 0], inplace=True)


TypeError: Encoders require their input to be uniformly strings or numbers. Got ['int', 'str']