In [None]:
# Decision Tree Pipeline for Jet Engine Fault Detection
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path

# Notebook-wide settings: default figure size for every plot, plus the
# dataset location and the name of the label column used by later cells.
plt.rc('figure', figsize=[19, 8])

TARGET_COL = 'Engine_Condition'
DATA_PATH = 'engine_fault_detection_dataset.csv'


In [None]:
# Read the CSV at DATA_PATH into a DataFrame, report its (rows, columns)
# shape, and preview the first five rows as the cell output.
engine_df = pd.read_csv(filepath_or_buffer=DATA_PATH)
print("Loaded:", engine_df.shape)
engine_df.head(n=5)


In [None]:
# EDA: column dtypes / non-null counts, summary statistics, and null counts.
# DataFrame.info() writes its report directly to stdout and returns None, so
# it is called bare; wrapping it in print() would append a stray "None" line.
engine_df.info()
# Transposed so each feature is a row, which reads better for wide frames.
print(engine_df.describe().T)
print("\nNull counts:\n", engine_df.isnull().sum())


In [None]:
# EDA: bar chart of how many rows carry each value of TARGET_COL.
# countplot returns the Axes it drew on, so the title is set on that object
# rather than through the pyplot "current axes" state.
sns.countplot(x=TARGET_COL, data=engine_df).set_title("Class Distribution")
plt.show()


In [None]:
# EDA: pairwise correlations between the numeric columns, drawn as a heatmap.
# num_df keeps only the numeric-dtype columns of the raw frame.
num_df = engine_df.select_dtypes(include='number')
sns.heatmap(data=num_df.corr(), cmap='coolwarm', annot=False).set_title("Correlation Heatmap")
plt.show()


In [None]:
# Preprocess: separate features from the label, hold out a test split, and
# define the scaling step that the model pipelines reuse.
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler

y = engine_df[TARGET_COL]
X = engine_df.drop(columns=[TARGET_COL])

# 20% of the rows are held out; the split is stratified on the label and
# seeded so it is reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
  X, y, stratify=y, random_state=42, test_size=0.2
)

# Only numeric columns are standardised. remainder='drop' means every
# non-numeric column is discarded before it reaches the model.
numeric_features = X_train.select_dtypes(include='number').columns.to_list()
preprocessor = ColumnTransformer(
  transformers=[("num", StandardScaler(), numeric_features)],
  remainder='drop',
)

print("Train:", X_train.shape, "Test:", X_test.shape)


In [None]:
# Decision Tree baseline: fit on the training split, score on the held-out split.
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# NOTE: no class weighting is applied in this cell, and ccp_alpha=0.0 leaves
# cost-complexity pruning switched off, so tree growth is limited only by the
# min_samples_split / min_samples_leaf settings below. DecisionTreeClassifier
# does accept a class_weight argument if class imbalance needs countering.
dt = DecisionTreeClassifier(
  random_state=42,
  criterion='gini',
  splitter='best',
  max_depth=None,
  min_samples_leaf=2,
  min_samples_split=4,
  ccp_alpha=0.0,
)

# Scaling and the tree are chained so the same preprocessing is applied at
# fit time and at predict time.
dt_pipeline = Pipeline(steps=[("preprocess", preprocessor), ("model", dt)])
dt_pipeline.fit(X_train, y_train)

dt_pred = dt_pipeline.predict(X_test)
print("DT Accuracy:", accuracy_score(y_test, dt_pred))
print("DT Classification Report:\n", classification_report(y_test, dt_pred))
print("DT Confusion Matrix:\n", confusion_matrix(y_test, dt_pred))


In [None]:
# Hyper-parameter search for the Decision Tree pipeline using 5-fold
# cross-validation, ranked by macro-averaged F1.
from sklearn.model_selection import GridSearchCV

# Keys use the "<step name>__<parameter>" form so they reach the "model"
# step inside dt_pipeline.
param_grid = dict(
  model__max_depth=[None, 6, 10, 16, 24],
  model__min_samples_split=[2, 4, 8, 16],
  model__min_samples_leaf=[1, 2, 4, 8],
  model__ccp_alpha=[0.0, 0.0005, 0.001, 0.005],
)

gs = GridSearchCV(
  dt_pipeline,
  param_grid,
  cv=5,
  scoring='f1_macro',
  verbose=1,
  n_jobs=-1,  # use every available core
)
gs.fit(X_train, y_train)
print("Best DT params:", gs.best_params_)

# Score the search's best estimator on the held-out test split.
dt_best = gs.best_estimator_
dt_best_pred = dt_best.predict(X_test)
print("DT Tuned Accuracy:", accuracy_score(y_test, dt_best_pred))
print("DT Tuned Classification Report:\n", classification_report(y_test, dt_best_pred))
print("DT Tuned Confusion Matrix:\n", confusion_matrix(y_test, dt_best_pred))


In [None]:
# Persist the tuned pipeline and the feature/label metadata side by side in
# the artifacts_dt directory (created if it does not exist yet).
import joblib
artifacts = Path('artifacts_dt')
artifacts.mkdir(exist_ok=True)

joblib.dump(dt_best, artifacts.joinpath('jet_fault_dt_model.pkl'))

# Metadata records which columns the pipeline scales, the label column name,
# and the sorted distinct label values found in the full dataset.
metadata = dict(
  numeric_features=numeric_features,
  target=TARGET_COL,
  classes_=sorted(y.unique()),
)
joblib.dump(metadata, artifacts.joinpath('feature_metadata.pkl'))
print('Saved Decision Tree model to artifacts_dt/jet_fault_dt_model.pkl')
