## Imports

In [27]:
import pandas as pd
import numpy as np

import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.ensemble import RandomForestClassifier

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

from sklearn.utils import compute_class_weight

import sys

sys.path.insert(0, '../ml_logic')

from preprocessor import *
from data import *

In [28]:
# Import data from CSV (temp)
wingman_data = pd.read_csv("../raw_data/trimmed_data/wingman_data_v5.csv")

In [29]:
target_columns_v1 = ['phase_no', 'eventsoe_no']
target_columns_v2 = ['category_no', 'subcategory_no', 'section_no', 'subsection_no', 'modifier_no']
target_columns_v3 = ['category_no']
target_columns_v4 = ['eventsoe_no']
target_columns_v5 = ['subcategory_no']

In [30]:
# Clean
wingman_data_clean = clean_data(wingman_data)
# Preprocess
wingman_data_proc = preprocess_features(wingman_data_clean, target_columns_v5)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  wingman_data_cleaned.drop([
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  wingman_data_enc[''] = np.where(wingman_data_enc['type_fly'].isin(top_9_categories), wingman_data_enc['type_fly'], 'Other')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  eng_mfgr['eng_mfgr'] = eng_mfgr['eng_mfgr'].str.upper()
A value is trying to be set on a copy of a slice from 

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X[field] = X[field].apply(lambda x: 0 if x in ['NONE', 'PVT'] else 1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X[field] = X[field].apply(lambda x: 0 if x in ['NONE', 'PVT'] else 1)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X.replace('UNK', 'NONE', inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats 

## Train test split

In [31]:
X = wingman_data_proc.drop(columns=["subcategory_no"])
y = wingman_data_proc["subcategory_no"]

In [32]:
def calculate_class_weights(y):
    """
    Calculates the balanced class weights for a multiclass classification problem.
    Parameters:
    - y: array-like, shape (n_samples,)
        The target variable containing class labels.
    Returns:
    - class_weights: dict
        A dictionary containing the class weights for each class label.
    """
    
    class_weights = compute_class_weight(class_weight='balanced', classes=np.unique(y), y=y)
    return dict(zip(np.unique(y), class_weights))

y_class_weights = calculate_class_weights(y)
y_class_weights

{1: 22.071428571428573,
 2: 0.9146284467985668,
 3: 5.824404761904762,
 4: 4.765422077922078,
 5: 0.8847197106690777,
 6: 0.26702142174921545,
 7: 1.654268808114962}

In [33]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=1)

## Baseline model

In [34]:
baseline_mod = RandomForestClassifier(n_estimators=100, random_state=1)
baseline_mod.fit(X_train, y_train)

In [35]:
base = max(y.value_counts()/len(y))
print("Random Selection Accuracy: %.2f%%" % (base * 100.0))

Random Selection Accuracy: 53.50%


In [36]:
## baseline prediction

y_pred = baseline_mod.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
print("Baseline Accuracy: %.2f%%" % (accuracy * 100.0))

Baseline Accuracy: 52.79%


In [37]:
model = RandomForestClassifier(n_jobs=-1, verbose=0, class_weight=y_class_weights)

n_estimators = [int(x) for x in np.linspace(start = 100, stop = 1250, num = 50)]
criterion = ['gini', 'log_loss']
max_depth = [15, 20, 25]
max_features = ['sqrt', 'log2']
bootstrap = [True, False]


random_grid = {
    'n_estimators': n_estimators,
    'criterion': criterion,
    'max_depth': max_depth,
    'max_features': max_features,
    'bootstrap': bootstrap
    }

In [17]:
from sklearn.model_selection import RandomizedSearchCV
model_random = RandomizedSearchCV(estimator = model, param_distributions = random_grid, n_iter = 15, cv = 3, verbose=1, n_jobs = -1)
model_random.fit(X_train, y_train)

Fitting 3 folds for each of 15 candidates, totalling 45 fits




In [38]:
model_random.best_params_, model_random.best_score_

({'n_estimators': 1015,
  'max_features': 'log2',
  'max_depth': 25,
  'criterion': 'log_loss',
  'bootstrap': True},
 0.5228267417548773)

In [39]:
y_pred = model_random.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
print("Optimised Accuracy: %.2f%%" % (accuracy * 100.0))

Optimised Accuracy: 54.84%


## Model performance (not sure if this is needed for baseline)

In [40]:
# confusion matrix

confusion_matrix(y_test, y_pred)

array([[  0,   0,   0,   0,   0,   8,   0],
       [  0,  14,   0,   0,   4, 201,   1],
       [  0,   2,   0,   0,   2,  31,   0],
       [  0,   1,   0,   2,   3,  46,   0],
       [  0,   9,   0,   1,   6, 215,   0],
       [  0,  10,   0,   1,   8, 783,   1],
       [  0,   4,   0,   1,   1, 113,   0]])

In [41]:
# classification report

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           1       0.00      0.00      0.00         8
           2       0.35      0.06      0.11       220
           3       0.00      0.00      0.00        35
           4       0.40      0.04      0.07        52
           5       0.25      0.03      0.05       231
           6       0.56      0.98      0.71       803
           7       0.00      0.00      0.00       119

    accuracy                           0.55      1468
   macro avg       0.22      0.16      0.13      1468
weighted avg       0.41      0.55      0.42      1468



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [42]:
# Feature importance

opt_model = RandomForestClassifier(**model_random.best_params_, n_jobs=-1, verbose=0, class_weight=y_class_weights)
opt_model.fit(X_train, y_train)

feature_imp = pd.Series(opt_model.feature_importances_, index = X.columns).sort_values(ascending = False).head(15)
feature_imp

flight_hours_mean                   0.100387
afm_hrs                             0.092251
dprt_time                           0.090917
cert_max_gr_wt                      0.087396
power_units                         0.079177
total_seats                         0.042099
fixed_retractable_RETR              0.024945
acft_category_infrequent_sklearn    0.022219
acft_category_AIR                   0.021244
acft_make_infrequent_sklearn        0.019376
type_last_insp_ANNL                 0.018793
eng_mfgr_LYCOMING                   0.018293
eng_mfgr_CONTINENTAL                0.018221
pc_profession                       0.017610
carb_fuel_injection_CARB            0.017124
dtype: float64