## Imports

In [5]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

from sklearn.preprocessing import OneHotEncoder


## Encoding type fly

In [6]:
wingman_data_cleaned = pd.read_csv('raw_data/wingman_data_cleaned.csv')
wingman_data_cleaned.dtypes

id                      object
eventsoe_no              int64
far_part                object
acft_make               object
acft_model              object
fixed_retractable       object
acft_category           object
homebuilt               object
flight_hours_mean      float64
crew_category           object
power_units            float64
hp_or_lbs               object
dprt_time                int64
cert_max_gr_wt         float64
afm_hrs                float64
total_seats              int64
num_eng                  int64
type_last_insp          object
second_pilot            object
site_seeing             object
air_medical             object
crew_sex                object
certs_held              object
dprt_apt_id             object
dest_apt_id             object
flt_plan_filed          object
pc_profession           object
eng_type                object
carb_fuel_injection     object
type_fly                object
eng_mfgr                object
dtype: object

In [7]:
wingman_data_enc = wingman_data_cleaned[['type_fly', 'eng_mfgr']]
wingman_data_enc['type_fly'].value_counts()

type_fly
PERS    11912
INST     2572
UNK      1090
AAPL      916
POSI      394
BUS       355
OWRK      247
AOBV      230
FLTS      224
SKYD       95
FERY       93
EXEC       84
EXLD       83
BUS        79
ASHO       73
PUBF       69
BANT       67
PUBL       63
PUBS       49
PUBU       34
GLDT       32
FIRF       21
UNK        19
ADRP        6
Name: count, dtype: int64

In [8]:
top_9_categories = wingman_data_enc['type_fly'].value_counts().nlargest(9).index.tolist()
wingman_data_enc[''] = np.where(wingman_data_enc['type_fly'].isin(top_9_categories), wingman_data_enc['type_fly'], 'Other')
type_fly_encoded = pd.get_dummies(wingman_data_enc, columns=[''], dtype=int)
type_fly_encoded = type_fly_encoded.drop(columns = ['type_fly', 'eng_mfgr'])
type_fly_encoded

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  wingman_data_enc[''] = np.where(wingman_data_enc['type_fly'].isin(top_9_categories), wingman_data_enc['type_fly'], 'Other')


Unnamed: 0,_AAPL,_AOBV,_BUS,_FLTS,_INST,_OWRK,_Other,_PERS,_POSI,_UNK
0,0,0,0,0,0,0,0,1,0,0
1,0,0,0,0,0,0,0,1,0,0
2,0,0,0,0,0,0,0,1,0,0
3,0,0,0,0,0,0,0,1,0,0
4,0,0,0,0,1,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...
18802,0,0,0,0,0,0,0,1,0,0
18803,0,0,0,0,0,0,0,1,0,0
18804,0,0,0,0,0,0,0,1,0,0
18805,1,0,0,0,0,0,0,0,0,0


## Encoding eng_mfgr

In [9]:
eng_mfgr = wingman_data_cleaned['eng_mfgr']
eng_mfgr = pd.DataFrame(eng_mfgr)

eng_mfgr.value_counts().head(25) 

eng_mfgr                   
LYCOMING                       3894
Lycoming                       3682
CONT MOTOR                     2242
Continental                    1923
Rotax                           878
Pratt & Whitney                 495
ROTAX                           376
Continental Motors              326
P&W                             269
Rolls Royce                     214
P&W CANADA                      167
Teledyne Continental Motors     135
Pratt & Whitney Canada          135
Rolls-Royce                     129
Honeywell                       128
Pratt and Whitney               122
Allison                         118
ALLISON                         111
Other                           111
Continental                     111
CONTINENTAL                     104
Jabiru                           99
Teledyne Continental             99
Turbomeca                        90
FRANKLIN                         81
Name: count, dtype: int64

In [10]:
eng_mfgr['eng_mfgr'] = eng_mfgr['eng_mfgr'].str.upper()
eng_mfgr['eng_mfgr'] = eng_mfgr['eng_mfgr'].str.strip()


In [11]:
eng_mfgr['eng_mfgr'].value_counts().head(25)

eng_mfgr
LYCOMING                       7651
CONT MOTOR                     2248
CONTINENTAL                    2143
ROTAX                          1261
PRATT & WHITNEY                 507
CONTINENTAL MOTORS              359
P&W                             270
ALLISON                         230
ROLLS ROYCE                     219
P&W CANADA                      204
ROLLS-ROYCE                     151
TURBOMECA                       150
FRANKLIN                        147
PRATT & WHITNEY CANADA          146
HONEYWELL                       144
PRATT AND WHITNEY               139
TELEDYNE CONTINENTAL MOTORS     138
JABIRU                          136
OTHER                           111
TELEDYNE CONTINENTAL            105
JACOBS                           81
GARRETT                          77
SUBARU                           77
GENERAL ELECTRIC                 74
GE                               71
Name: count, dtype: int64

In [12]:
eng_mfgr['eng_mfgr'] = eng_mfgr['eng_mfgr'].astype("category")
print(eng_mfgr['eng_mfgr'].cat.categories)
mapping = {"CONT MOTOR": "CONTINENTAL", "CONTINENTAL MOTORS": "CONTINENTAL",
           "PRATT & WHITNEY": "P&W", "P&W CANADA":"P&W", "PRATT & WHITNEY CANADA":"P&W",
           "PRATT AND WHITNEY": "P&W", "ROLLS-ROYCE": "ROLLS ROYCE", "TELEDYNE CONTINENTAL MOTORS": "TELEDYNE CONTINENTAL",
           "GE": "GENERAL ELECTRIC", "ROLLS-ROYC": "ROLLS ROYCE"}
eng_mfgr['eng_mfgr'] = eng_mfgr['eng_mfgr'].replace(mapping)
eng_mfgr['eng_mfgr'] = eng_mfgr['eng_mfgr'].cat.remove_unused_categories()

Index(['4 BURNER TYPE ENGINES', 'ADOLPHSON', 'AERO CONVERSIONS INC',
       'AERO ENGINES', 'AERO MOMENTUM', 'AERO SPORT', 'AERO SPORT POWER',
       'AERO SPORT POWER (LYCOMING)', 'AERO SPORT POWER LTD',
       'AERO SPORT POWER LTD.',
       ...
       'WRIGHT-HISPANO', 'WSK-PZL', 'WSK-PZL-K', 'WYNN CORVAIR', 'XTREEM 360',
       'YAMAHA', 'YUNEEC', 'ZANZOTTERA', 'ZENOAH', 'ZEONOA'],
      dtype='object', length=574)


In [13]:
eng_mfgr['eng_mfgr'].value_counts().head(15)

eng_mfgr
LYCOMING                7651
CONTINENTAL             4750
P&W                     1266
ROTAX                   1261
ROLLS ROYCE              431
TELEDYNE CONTINENTAL     243
ALLISON                  230
TURBOMECA                150
FRANKLIN                 147
GENERAL ELECTRIC         145
HONEYWELL                144
JABIRU                   136
OTHER                    111
JACOBS                    81
SUBARU                    77
Name: count, dtype: int64

In [14]:
def get_categories_above_percentage(column, percentage):
    total_count = len(column)
    value_counts = column.value_counts(normalize=True)
    cumulative_percentage = 0
    selected_categories = []
    for category, count in value_counts.items():
        cumulative_percentage += count
        if cumulative_percentage <= percentage:
            selected_categories.append(category)
        else:
            break
    return selected_categories

get_categories_above_percentage(eng_mfgr, 0.8)


[('LYCOMING',), ('CONTINENTAL',), ('P&W',), ('ROTAX',)]

In [15]:
def general_encoder(X, feature: str, min_frequency=None, max_categories=None) -> np.array:
    ohe = OneHotEncoder(sparse_output=False, min_frequency=min_frequency, max_categories=max_categories).fit(X[[feature]])
    feature_encoded = ohe.transform(X[[feature]])
    return feature_encoded

c = ['MFGR_LYCOMING', 'MFGR_CONTINENTAL', 'MFGR_P&W', 'MFGR_ROTAX', 'MFGR_ROLLS_ROYCE', 'MFGR_TELEDYNE_CONTINENTAL', 'MFGR_ALLISON', 'MFGR_TURBOMECA', 'MFGR_FRANKLIN', 
     'MFGR_GENERAL_ELECTRIC', 'MFGR_HONEYWELL', 'MFGR_JABIRU', 'MFGR_OTHER', 'MFGR_OTHER_MAKES']

eng_mfgr_enc = general_encoder(eng_mfgr, 'eng_mfgr', min_frequency=100)
eng_mfgr_enc = pd.DataFrame(eng_mfgr_enc, columns=c)

eng_mfgr_enc

Unnamed: 0,MFGR_LYCOMING,MFGR_CONTINENTAL,MFGR_P&W,MFGR_ROTAX,MFGR_ROLLS_ROYCE,MFGR_TELEDYNE_CONTINENTAL,MFGR_ALLISON,MFGR_TURBOMECA,MFGR_FRANKLIN,MFGR_GENERAL_ELECTRIC,MFGR_HONEYWELL,MFGR_JABIRU,MFGR_OTHER,MFGR_OTHER_MAKES
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18802,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
18803,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
18804,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
18805,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## All steps

In [17]:

def enc_type_fly_eng_mfgr(X):

    ## encoding type_fly
    wingman_data_enc = X[['type_fly', 'eng_mfgr']]
    top_9_categories = wingman_data_enc['type_fly'].value_counts().nlargest(9).index.tolist()
    wingman_data_enc[''] = np.where(wingman_data_enc['type_fly'].isin(top_9_categories), wingman_data_enc['type_fly'], 'Other')
    type_fly_encoded = pd.get_dummies(wingman_data_enc, columns=[''], dtype=int)
    type_fly_encoded = type_fly_encoded.drop(columns = ['type_fly', 'eng_mfgr'])

    ## encoding eng_mfgr
    eng_mfgr = wingman_data_cleaned['eng_mfgr']
    eng_mfgr = pd.DataFrame(eng_mfgr)
    eng_mfgr['eng_mfgr'] = eng_mfgr['eng_mfgr'].str.upper()
    eng_mfgr['eng_mfgr'] = eng_mfgr['eng_mfgr'].str.strip()
    eng_mfgr['eng_mfgr'] = eng_mfgr['eng_mfgr'].astype("category")
    mapping = {"CONT MOTOR": "CONTINENTAL", "CONTINENTAL MOTORS": "CONTINENTAL",
           "PRATT & WHITNEY": "P&W", "P&W CANADA":"P&W", "PRATT & WHITNEY CANADA":"P&W",
           "PRATT AND WHITNEY": "P&W", "ROLLS-ROYCE": "ROLLS ROYCE", "TELEDYNE CONTINENTAL MOTORS": "TELEDYNE CONTINENTAL",
           "GE": "GENERAL ELECTRIC", "ROLLS-ROYC": "ROLLS ROYCE"}
    eng_mfgr['eng_mfgr'] = eng_mfgr['eng_mfgr'].replace(mapping)
    eng_mfgr['eng_mfgr'] = eng_mfgr['eng_mfgr'].cat.remove_unused_categories()
    c  = ['MFGR_LYCOMING', 'MFGR_CONTINENTAL', 'MFGR_P&W', 'MFGR_ROTAX', 'MFGR_ROLLS_ROYCE', 'MFGR_TELEDYNE_CONTINENTAL', 'MFGR_ALLISON', 'MFGR_TURBOMECA', 'MFGR_FRANKLIN', 
     'MFGR_GENERAL_ELECTRIC', 'MFGR_HONEYWELL', 'MFGR_JABIRU', 'MFGR_OTHER', 'MFGR_OTHER_MAKES']
    eng_mfgr_enc = general_encoder(eng_mfgr, 'eng_mfgr', min_frequency=100)
    eng_mfgr_enc = pd.DataFrame(eng_mfgr_enc, columns=c)
    return type_fly_encoded, eng_mfgr_enc