In [46]:
import pandas as pd
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder

### Importing Data & Viewing


---



In [2]:
# Read the CSV into `df`. The path is relative to the current working
# directory (presumably a mounted Google Drive in Colab - confirm before running).
csv_path = 'drive/MyDrive/Colab Notebooks/data/wingman/wingman_data_cleaned.csv'
df = pd.read_csv(csv_path)

In [3]:
# Display the frame that was just loaded (the notebook renders a truncated preview).
df

Unnamed: 0,id,eventsoe_no,far_part,acft_make,acft_model,fixed_retractable,acft_category,homebuilt,flight_hours_mean,crew_category,...,crew_sex,certs_held,dprt_apt_id,dest_apt_id,flt_plan_filed,pc_profession,eng_type,carb_fuel_injection,type_fly,eng_mfgr
0,20080107X000261,250,091,PIPER,PA 28-180,FIXD,AIR,N,705.000000,PLT,...,M,Y,O69,OQ3,NONE,No,REC,CARB,PERS,Textron Lycoming
1,20080107X000262,250,091,Barnard/Stancil,Glastar,FIXD,AIR,Y,8300.000000,PLT,...,M,Y,OQ3,O70,NONE,No,REC,CARB,PERS,Textron Lycoming
2,20080109X000361,230,091,Micco Aircraft Company,MAC-145B,RETR,AIR,N,1000.000000,PLT,...,F,Y,T20,AXH,NONE,No,REC,FINJ,PERS,Lycoming
3,20080107X000271,81,091,Pilatus,PC-12/45,RETR,AIR,N,10571.000000,PLT,...,M,Y,PHX,PWA,IFR,UNK,TP,UNK,PERS,Pratt & Whitney Canada
4,20080115X000511,240,091,Cessna,152,FIXD,AIR,N,18.000000,DSTU,...,M,Y,KFNL,KFNL,NONE,No,REC,CARB,INST,Lycoming
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18802,202303131068811,901,091,BEECH,E-55,RETR,AIR,N,10235.000000,PLT,...,M,Y,2CO5,PVT,NONE,No,REC,CARB,PERS,Lycoming
18803,202303131068821,470,091,TRENDAK,TAURUS,FIXD,GYRO,Y,550.299988,PLT,...,M,Y,ATW,LOT,IFR,No,REC,FINJ,PERS,Continental
18804,202303131068841,96,091,MAULE,M-7-235B,FIXD,AIR,N,632.000000,PLT,...,M,Y,3L2,KVGT,NONE,Yes,REC,CARB,PERS,Titan
18805,202303131068851,191,091,BEECH,F33A,RETR,AIR,N,1655.000000,PLT,...,M,Y,NONE,NONE,NONE,Yes,TS,UNK,AAPL,ALLISON


In [12]:
# Keep only the five categorical columns that are encoded further down.
# .copy() makes `df` an independent frame rather than a selection of the loaded
# one, so the later column assignments do not raise SettingWithCopyWarning.
df = df[['type_last_insp', 'second_pilot','site_seeing', 'air_medical', 'crew_sex']].copy()

In [13]:
# Display the reduced frame to confirm that only the selected columns remain.
df

Unnamed: 0,type_last_insp,second_pilot,site_seeing,air_medical,crew_sex
0,ANNL,N,N,N,M
1,COAW,N,N,N,M
2,ANNL,N,N,N,F
3,UNK,N,N,N,M
4,ANNL,N,N,N,M
...,...,...,...,...,...
18802,ANNL,N,N,N,M
18803,ANNL,N,N,N,M
18804,COND,N,N,N,M
18805,ANNL,N,N,N,M


## Exploring Categories Of Fields

In [15]:
# Count how often each distinct `type_last_insp` value occurs (most frequent first).
df.loc[:, 'type_last_insp'].value_counts()

ANNL    11814
100H     2586
COND     2039
UNK       948
COAW      837
AAIP      583
Name: type_last_insp, dtype: int64

In [24]:
# Count how often each distinct `second_pilot` value occurs.
df.loc[:, 'second_pilot'].value_counts()

N    15633
Y     3174
Name: second_pilot, dtype: int64

In [17]:
# Count how often each distinct `site_seeing` value occurs.
df.loc[:, 'site_seeing'].value_counts()

N    18656
Y      151
Name: site_seeing, dtype: int64

In [18]:
# Count how often each distinct `air_medical` value occurs.
df.loc[:, 'air_medical'].value_counts()

N    18650
Y      157
Name: air_medical, dtype: int64

In [19]:
# Count how often each distinct `crew_sex` value occurs.
df.loc[:, 'crew_sex'].value_counts()

M    18243
F      564
Name: crew_sex, dtype: int64

## Yes/No Binary Encoding

In [43]:
# Explicit category order: "N" is encoded as 0.0 and "Y" as 1.0.
yn_categories = ["N", "Y"]
bin_encoder_1 = OrdinalEncoder(
    # `categories` takes one list per input column; one column is expected here.
    categories=[yn_categories],
)

In [37]:
# Fit the yes/no encoder on `second_pilot`, passed as a one-column DataFrame.
bin_encoder_1.fit(df.loc[:, ['second_pilot']])

In [39]:
# Encode `second_pilot`; the result is a float array with one column.
bin_encoder_1.transform(df.loc[:, ['second_pilot']])

array([[0.],
       [0.],
       [0.],
       ...,
       [0.],
       [0.],
       [0.]])

In [41]:
# Raw `second_pilot` values, for comparison with the encoded array above.
df.loc[:, ['second_pilot']].to_numpy()

array([['N'],
       ['N'],
       ['N'],
       ...,
       ['N'],
       ['N'],
       ['N']], dtype=object)

## Gender Binary Encoding

In [44]:
# Explicit category order: "M" is encoded as 0.0 and "F" as 1.0.
mf_categories = ["M", "F"]
bin_encoder_2 = OrdinalEncoder(
    # `categories` takes one list per input column; one column is expected here.
    categories=[mf_categories],
)

In [38]:
# Fit the M/F encoder on `crew_sex`, passed as a one-column DataFrame.
bin_encoder_2.fit(df.loc[:, ['crew_sex']])

In [40]:
# Encode `crew_sex`; the result is a float array with one column.
bin_encoder_2.transform(df.loc[:, ['crew_sex']])

array([[0.],
       [0.],
       [1.],
       ...,
       [0.],
       [0.],
       [0.]])

In [42]:
# Raw `crew_sex` values, for comparison with the encoded array above.
df.loc[:, ['crew_sex']].to_numpy()

array([['M'],
       ['M'],
       ['F'],
       ...,
       ['M'],
       ['M'],
       ['M']], dtype=object)

## One-Hot Encoding

In [55]:
# One-hot encoder for `type_last_insp` that returns a dense array.
# `sparse_output` replaces the `sparse` argument, which was deprecated in
# scikit-learn 1.2 and removed in 1.4.
# drop='if_binary' only drops a column for features with exactly two categories.
ohe = OneHotEncoder(sparse_output=False, drop='if_binary')
ohe.fit(df[['type_last_insp']])



In [48]:
# One-hot encode `type_last_insp`: one row per record, one column per category.
ohe.transform(df.loc[:, ['type_last_insp']])

array([[0., 0., 1., 0., 0., 0.],
       [0., 0., 0., 1., 0., 0.],
       [0., 0., 1., 0., 0., 0.],
       ...,
       [0., 0., 0., 0., 1., 0.],
       [0., 0., 1., 0., 0., 0.],
       [1., 0., 0., 0., 0., 0.]])

In [49]:
# Names of the generated one-hot columns, in the same order as the
# columns of the transformed array.
ohe.get_feature_names_out()

array(['type_last_insp_100H', 'type_last_insp_AAIP',
       'type_last_insp_ANNL', 'type_last_insp_COAW',
       'type_last_insp_COND', 'type_last_insp_UNK'], dtype=object)

## Final Preprocessing Code

In [50]:
# List the columns that the final preprocessing cell has to encode.
df.columns

Index(['type_last_insp', 'second_pilot', 'site_seeing', 'air_medical',
       'crew_sex'],
      dtype='object')

In [None]:
# Final preprocessing: replace the categorical string columns of `df` with
# numeric encodings.
# NOTE: the string columns are overwritten, so this cell cannot be re-run on
# its own; re-run the column-selection cell first to restore the raw values.

# `df` was created by selecting columns from the loaded frame; work on an
# independent copy so the assignments below do not raise SettingWithCopyWarning.
df = df.copy()

# yes/no ordinal encoding

# Fixed category order so that "N" -> 0.0 and "Y" -> 1.0 in every flag column.
yn_categories = ["N", "Y"]
yn_columns = ['second_pilot', 'site_seeing', 'air_medical']

# One category list per column. The encoder is fitted on all three columns at
# once instead of on `second_pilot` only: scikit-learn >= 1.2 raises a
# ValueError when transform() receives column names that differ from the ones
# seen during fit().
bin_encoder_1 = OrdinalEncoder(categories=[yn_categories] * len(yn_columns))

bin_encoder_1.fit(df[yn_columns])

# transforming yes/no fields
df[yn_columns] = bin_encoder_1.transform(df[yn_columns])

# gender ordinal encoding

# Fixed category order so that "M" -> 0.0 and "F" -> 1.0.
mf_categories = ["M", "F"]
bin_encoder_2 = OrdinalEncoder(categories=[mf_categories])

bin_encoder_2.fit(df[['crew_sex']])

# transforming gender fields
# transform() returns shape (n_rows, 1); flatten it before assigning to one column.
df['crew_sex'] = bin_encoder_2.transform(df[['crew_sex']]).ravel()

# one hot encoding
# `sparse_output` replaces the `sparse` argument, which was deprecated in
# scikit-learn 1.2 and removed in 1.4.
ohe = OneHotEncoder(sparse_output=False, drop='if_binary')
ohe.fit(df[['type_last_insp']])

# Adds one indicator column per category; the original `type_last_insp`
# column is left in place.
df[ohe.get_feature_names_out()] = ohe.transform(df[['type_last_insp']])