In [33]:
import json
from pathlib import Path

import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_validate, train_test_split

In [52]:
# Dataset
PROJECT_DIR = Path.cwd().parent

parq_path = PROJECT_DIR / "data" / "all_stats.parquet"
df = pd.read_parquet(parq_path)
df.head()


Unnamed: 0,flight_id,before_after,date_diff,flight_length,label,num_flights_before,volt1_mean,volt1_std,volt1_min,volt1_max,...,NormAc_p25,NormAc_p75,AltMSL_mean,AltMSL_std,AltMSL_min,AltMSL_max,AltMSL_range,AltMSL_p25,AltMSL_p75,AltMSL_rate_mean
0,1,before,-1,4723.0,intake gasket leak/damage,-1,28.780796,0.04712894,28.7,29.3,...,-0.01,0.01,2764.796803,1616.009723,835.6,5162.4,4326.8,842.55,4048.9,0.000275
1,2,before,-2,4649.0,intake gasket leak/damage,-2,27.831276,0.7633934,25.0,28.2,...,-0.03,0.04,2205.734739,1287.575722,824.1,5261.6,4437.5,887.0,3509.8,-0.001678
2,3,same,0,40.0,intake gasket leak/damage,0,24.9,3.600403e-15,24.9,24.9,...,0.0,0.0,,,,,,,,
3,4,before,0,14.0,intake gasket leak/damage,0,25.4,3.697782e-15,25.4,25.4,...,0.0,0.0,,,,,,,,
4,5,same,0,683.0,intake gasket leak/damage,0,26.945095,1.271312,24.6,28.1,...,-0.02,0.02,832.528006,5.235779,812.9,846.0,33.1,831.6,835.4,0.031571


In [53]:
# add cluster (target) column
json_path = PROJECT_DIR / "data" / "label_cluster_map.json"
assert json_path.exists()

with json_path.open() as fp:
    label_cluster_map = json.loads(fp.read())

df["cluster"] = df["label"].map(label_cluster_map)

In [60]:
# limit labels
LABELS = [
    "baffle crack/damage/loose/miss",
    "cylinder compression issue",
    "engine failure/fire/time out",
    "intake gasket leak/damage",
]

label_mask = df["label"].isin(LABELS)
before_mask = df["before_after"] == "before"
df_before = df[label_mask]

In [61]:
# Choose features
features = [
    "AltMSL_max",
    "AltMSL_mean",
    "AltMSL_min",
    "AltMSL_p25",
    "AltMSL_p75",
    "AltMSL_range",
    "AltMSL_rate_mean",
    "AltMSL_std",
    # "E1_CHT1_max",
    # "E1_CHT1_mean",
    # "E1_CHT1_min",
    # "E1_CHT1_p25",
    # "E1_CHT1_p75",
    # "E1_CHT1_std",
    # "E1_CHT2_max",
    # "E1_CHT2_mean",
    # "E1_CHT2_min",
    # "E1_CHT2_p25",
    # "E1_CHT2_p75",
    # "E1_CHT2_std",
    "E1_CHT3_max",
    "E1_CHT3_mean",
    "E1_CHT3_min",
    "E1_CHT3_p25",
    "E1_CHT3_p75",
    "E1_CHT3_std",
    # "E1_CHT4_max",
    # "E1_CHT4_mean",
    # "E1_CHT4_min",
    # "E1_CHT4_p25",
    # "E1_CHT4_p75",
    # "E1_CHT4_std",
    "E1_CHT_max_all",
    "E1_CHT_mean_all",
    "E1_CHT_min_all",
    "E1_CHT_rate_max",
    "E1_CHT_spread_mean",
    "E1_CHT_std_all",
    # "E1_EGT1_max",
    # "E1_EGT1_mean",
    # "E1_EGT1_min",
    # "E1_EGT1_p25",
    # "E1_EGT1_p75",
    # "E1_EGT1_std",
    # "E1_EGT2_max",
    # "E1_EGT2_mean",
    # "E1_EGT2_min",
    # "E1_EGT2_p25",
    # "E1_EGT2_p75",
    # "E1_EGT2_std",
    # "E1_EGT3_max",
    # "E1_EGT3_mean",
    # "E1_EGT3_min",
    # "E1_EGT3_p25",
    # "E1_EGT3_p75",
    # "E1_EGT3_std",
    "E1_EGT4_max",
    "E1_EGT4_mean",
    "E1_EGT4_min",
    "E1_EGT4_p25",
    "E1_EGT4_p75",
    "E1_EGT4_std",
    "E1_EGT_max_all",
    "E1_EGT_mean_all",
    "E1_EGT_min_all",
    "E1_EGT_rate_max",
    "E1_EGT_spread_mean",
    "E1_EGT_std_all",
    "E1_FFlow_max",
    "E1_FFlow_mean",
    "E1_FFlow_min",
    "E1_FFlow_p25",
    "E1_FFlow_p75",
    "E1_FFlow_std",
    "E1_OilP_mean",
    "E1_OilP_min",
    "E1_OilP_p25",
    "E1_OilP_p75",
    "E1_OilP_std",
    "E1_OilT_max",
    "E1_OilT_mean",
    "E1_OilT_min",
    "E1_OilT_p25",
    "E1_OilT_p75",
    "E1_OilT_rate",
    "E1_OilT_std",
    "E1_RPM_max",
    "E1_RPM_mean",
    "E1_RPM_min",
    "E1_RPM_p25",
    "E1_RPM_p75",
    "E1_RPM_std",
    # "FQtyL_consumed",
    # "FQtyL_end",
    # "FQtyL_max",
    # "FQtyL_mean",
    # "FQtyL_min",
    # "FQtyL_p25",
    # "FQtyL_p75",
    # "FQtyL_rate",
    # "FQtyL_start",
    # "FQtyL_std",
    # "FQtyR_consumed",
    # "FQtyR_end",
    # "FQtyR_max",
    # "FQtyR_mean",
    # "FQtyR_min",
    # "FQtyR_p25",
    # "FQtyR_p75",
    # "FQtyR_rate",
    # "FQtyR_start",
    # "FQtyR_std",
    # "IAS_max",
    # "IAS_mean",
    # "IAS_min",
    # "IAS_p25",
    # "IAS_p75",
    # "IAS_rate_mean",
    # "IAS_std",
    # "NormAc_max",
    # "NormAc_mean",
    # "NormAc_min",
    # "NormAc_p25",
    # "NormAc_p75",
    # "NormAc_std",
    # "OAT_max",
    # "OAT_mean",
    # "OAT_min",
    # "OAT_p25",
    # "OAT_p75",
    # "OAT_std",
    # "VSpd_max",
    # "VSpd_mean",
    # "VSpd_min",
    # "VSpd_p25",
    # "VSpd_p75",
    # "VSpd_std",
    "amp1_max",
    "amp1_mean",
    "amp1_min",
    "amp1_p25",
    "amp1_p75",
    "amp1_std",
    "amp2_max",
    "amp2_mean",
    "amp2_min",
    "amp2_p25",
    "amp2_p75",
    "amp2_std",
    "date_diff",
    "flight_length",
    "fuel_imbalance_max",
    "fuel_imbalance_mean",
    "num_flights_before",
    "volt1_max",
    "volt1_mean",
    "volt1_min",
    "volt1_p25",
    "volt1_p75",
    "volt1_std",
    # "volt2_max",
    # "volt2_mean",
    # "volt2_min",
    # "volt2_p25",
    # "volt2_p75",
    # "volt2_std",
]

In [62]:
# drop na
df_before = df_before.loc[~df_before[features].isna().any(axis=1), :]
# df_before = df_before[features].dropna()
df_before.head()

Unnamed: 0,flight_id,before_after,date_diff,flight_length,label,num_flights_before,volt1_mean,volt1_std,volt1_min,volt1_max,...,NormAc_p75,AltMSL_mean,AltMSL_std,AltMSL_min,AltMSL_max,AltMSL_range,AltMSL_p25,AltMSL_p75,AltMSL_rate_mean,cluster
1,2,before,-2,4649.0,intake gasket leak/damage,-2,27.831276,0.763393,25.0,28.2,...,0.04,2205.734739,1287.575722,824.1,5261.6,4437.5,887.0,3509.8,-0.001678,28
4,5,same,0,683.0,intake gasket leak/damage,0,26.945095,1.271312,24.6,28.1,...,0.02,832.528006,5.235779,812.9,846.0,33.1,831.6,835.4,0.031571,28
5,7,after,1,3482.0,intake gasket leak/damage,1,27.998334,0.560423,24.6,28.2,...,0.02,1896.346439,913.875079,833.8,3022.9,2189.1,842.925,2909.0,-0.000144,28
6,8,same,0,263.0,intake gasket leak/damage,0,27.723954,0.524219,25.9,28.1,...,0.02,827.215419,3.395479,816.1,842.8,26.7,825.0,828.75,0.090265,28
7,9,before,-1,4979.0,intake gasket leak/damage,-1,28.035569,0.284447,25.0,28.2,...,0.02,2343.803375,1164.073461,828.0,3938.8,3110.8,838.8,3320.275,0.000743,28


In [63]:
df_before.shape

(10336, 170)

In [67]:
# train-test split
test_size = 0.20
random_state = 1350

train_set, test_set = train_test_split(
    df_before, test_size=test_size, random_state=random_state
)

# X, y train
X_train = train_set.loc[:, features]
y_train = train_set["cluster"]

# X, y test
X_test = test_set.loc[:, features]
y_test = test_set["cluster"]

In [66]:
X_train.shape

(8268, 79)

In [68]:
X_test.isna().any().any()

np.False_

In [69]:
y_train.isna().any()

np.False_

In [70]:
# Class distribution in test data
for cluster in y_train.unique():
    count = y_train[y_train == cluster].size
    pct = count / len(y_train) * 100
    print(f"{cluster=}, {count=}, {pct=:.1f}%)")

cluster=np.int64(4), count=909, pct=11.0%)
cluster=np.int64(28), count=6556, pct=79.3%)
cluster=np.int64(21), count=430, pct=5.2%)
cluster=np.int64(13), count=373, pct=4.5%)


In [71]:
#  train
n_estimators = 500
max_leaf_nodes = 16
class_weight = "balanced"  # "balanced_subsample"

rf_clf = RandomForestClassifier(
    n_estimators=500,
    max_leaf_nodes=max_leaf_nodes,
    class_weight=class_weight,
    n_jobs=-1,
)
rf_clf.fit(X_train, y_train)

0,1,2
,n_estimators,500
,criterion,'gini'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,16
,min_impurity_decrease,0.0
,bootstrap,True


In [72]:
# Run Cross-Validation
scoring = "f1_weighted"
cv_results = cross_validate(rf_clf, X_test, y_test, cv=5, scoring=scoring)


scores = cv_results["test_score"]
print(scores)

[0.54908629 0.52129066 0.56227814 0.56850233 0.57616991]
