In [None]:
import yaml

# Read the project settings from the YAML file one directory above the
# working directory.
with open('../config.yaml') as f:
    config = yaml.safe_load(f)

# Example usage: pull the classification-related entries out of the nested
# mapping. None of these three names is referenced again by the cells shown
# in this notebook.
model_params, features, model_path = (
    config['modeling']['classification_model']['params'],
    config['features']['classification'],
    config['models']['classification_model'],
)


In [None]:
# Cell 1: Import libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, roc_curve
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import matplotlib.pyplot as plt
import seaborn as sns
import shap
import joblib
import warnings
warnings.filterwarnings('ignore')

print("Starting classification model development...")


In [None]:
# Cell 2: Load processed data
import json

# Processed classification table (path is relative to the working directory).
df = pd.read_csv('../data/processed/classification_data.csv')
print(f"Classification data loaded. Shape: {df.shape}")

# Feature metadata: the JSON document carries, under the
# 'classification_features' key, the list of columns used as model inputs.
with open('../data/processed/feature_info.json') as f:
    feature_info = json.load(f)
classification_features = feature_info['classification_features']

print(f"Number of features: {len(classification_features)}")


In [None]:
# Cell 3: Prepare data for modeling
print("=== PREPARING DATA FOR MODELING ===")

TARGET_COLUMN = 'is_delayed'

# Fail fast on inputs that would otherwise train "successfully" on bad data.
# A label that is also listed as a feature makes every downstream metric
# meaningless, and nothing later in the notebook would reveal it.
if TARGET_COLUMN in classification_features:
    raise ValueError(
        f"Target column '{TARGET_COLUMN}' is listed in classification_features; "
        "remove it to avoid label leakage."
    )

# Report every absent column at once instead of the first one pandas hits.
missing_columns = [
    col for col in [*classification_features, TARGET_COLUMN]
    if col not in df.columns
]
if missing_columns:
    raise KeyError(f"Columns missing from classification data: {missing_columns}")

# Features and target
X = df[classification_features]
y = df[TARGET_COLUMN]

print(f"Feature matrix shape: {X.shape}")
print("Target distribution:")
print(y.value_counts(normalize=True))

# Train-test split: 80/20, stratified on the target so both parts keep the
# class proportions printed above; random_state fixes the shuffle.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

print(f"Training set shape: {X_train.shape}")
print(f"Test set shape: {X_test.shape}")


In [None]:
# Cell 4: Model training - Random Forest
print("=== TRAINING RANDOM FOREST MODEL ===")

# Configure the forest and fit it on the training split in one expression
# (fit() hands back the estimator itself). random_state pins the result and
# n_jobs=-1 lets scikit-learn use every available core.
rf_model = RandomForestClassifier(
    n_estimators=100,
    max_depth=10,
    min_samples_leaf=5,
    min_samples_split=10,
    n_jobs=-1,
    random_state=42,
).fit(X_train, y_train)

# Test-set outputs: the probability column at index 1 (the second entry of
# rf_model.classes_ -- presumably the "delayed" class, confirm against the
# data) and the hard class labels.
y_pred_proba_rf = rf_model.predict_proba(X_test)[:, 1]
y_pred_rf = rf_model.predict(X_test)

print("Random Forest model trained successfully!")


In [None]:
# Cell 5: Model training - Logistic Regression
print("=== TRAINING LOGISTIC REGRESSION MODEL ===")

# Configure and fit the linear baseline in one expression (fit() hands back
# the estimator itself). class_weight='balanced' reweights the classes by
# their frequency; max_iter raises the solver's iteration cap to 1000.
lr_model = LogisticRegression(
    class_weight='balanced',
    max_iter=1000,
    random_state=42,
).fit(X_train, y_train)

# Test-set outputs: the probability column at index 1 (the second entry of
# lr_model.classes_ -- presumably the "delayed" class, confirm against the
# data) and the hard class labels.
y_pred_proba_lr = lr_model.predict_proba(X_test)[:, 1]
y_pred_lr = lr_model.predict(X_test)

print("Logistic Regression model trained successfully!")


In [None]:
# Cell 6: Model evaluation
print("=== MODEL EVALUATION ===")


def evaluate_model(y_true, y_pred, y_pred_proba, model_name):
    """Print and return the headline classification metrics for one model.

    Args:
        y_true: Ground-truth labels.
        y_pred: Predicted class labels, compared against ``y_true`` for
            accuracy, precision, recall and F1 (scikit-learn defaults).
        y_pred_proba: Scores passed to ``roc_auc_score`` for the AUC.
        model_name: Label used in the printed report header.

    Returns:
        dict with the keys ``'accuracy'``, ``'precision'``, ``'recall'``,
        ``'f1'`` and ``'auc'``, in that order.
    """
    print(f"\n{model_name} Performance:")
    print("-" * 40)

    # Collect every metric straight into the mapping that is returned.
    scores = {
        'accuracy': accuracy_score(y_true, y_pred),
        'precision': precision_score(y_true, y_pred),
        'recall': recall_score(y_true, y_pred),
        'f1': f1_score(y_true, y_pred),
        'auc': roc_auc_score(y_true, y_pred_proba),
    }

    print(f"Accuracy:  {scores['accuracy']:.4f}")
    print(f"Precision: {scores['precision']:.4f}")
    print(f"Recall:    {scores['recall']:.4f}")
    print(f"F1-Score:  {scores['f1']:.4f}")
    print(f"AUC-ROC:   {scores['auc']:.4f}")

    return scores


# Score both fitted models on the same held-out test split.
rf_metrics = evaluate_model(
    y_test, y_pred_rf, y_pred_proba_rf, model_name="Random Forest"
)
lr_metrics = evaluate_model(
    y_test, y_pred_lr, y_pred_proba_lr, model_name="Logistic Regression"
)


In [None]:
# Cell 7: Confusion Matrix and ROC Curve
import os

# savefig() does not create missing directories, so make sure the output
# folder exists before any plotting work is done.
FIGURE_DIR = '../visualizations/model_results'
os.makedirs(FIGURE_DIR, exist_ok=True)


def _plot_confusion_matrix(cm, ax, title):
    """Draw one annotated confusion-matrix heatmap on the given axes."""
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=ax)
    ax.set_title(title)
    ax.set_xlabel('Predicted')
    ax.set_ylabel('Actual')


fig, axes = plt.subplots(2, 2, figsize=(15, 12))

# Top row: confusion matrices of both models on the test split
cm_rf = confusion_matrix(y_test, y_pred_rf)
cm_lr = confusion_matrix(y_test, y_pred_lr)
_plot_confusion_matrix(cm_rf, axes[0, 0], 'Random Forest - Confusion Matrix')
_plot_confusion_matrix(cm_lr, axes[0, 1], 'Logistic Regression - Confusion Matrix')

# Bottom left: ROC curves of both models plus the chance diagonal
fpr_rf, tpr_rf, _ = roc_curve(y_test, y_pred_proba_rf)
fpr_lr, tpr_lr, _ = roc_curve(y_test, y_pred_proba_lr)

axes[1, 0].plot(fpr_rf, tpr_rf, label=f'Random Forest (AUC = {rf_metrics["auc"]:.3f})')
axes[1, 0].plot(fpr_lr, tpr_lr, label=f'Logistic Regression (AUC = {lr_metrics["auc"]:.3f})')
axes[1, 0].plot([0, 1], [0, 1], 'k--', label='Random Classifier')
axes[1, 0].set_xlabel('False Positive Rate')
axes[1, 0].set_ylabel('True Positive Rate')
axes[1, 0].set_title('ROC Curves Comparison')
axes[1, 0].legend()
axes[1, 0].grid(True, alpha=0.3)

# Bottom right: the 15 largest Random Forest feature importances
feature_importance = pd.DataFrame({
    'feature': classification_features,
    'importance': rf_model.feature_importances_
}).sort_values('importance', ascending=False).head(15)

axes[1, 1].barh(range(len(feature_importance)), feature_importance['importance'])
axes[1, 1].set_yticks(range(len(feature_importance)))
axes[1, 1].set_yticklabels(feature_importance['feature'])
# barh() puts position 0 at the bottom; flip the axis so the most important
# feature (first row after the descending sort) is drawn at the top.
axes[1, 1].invert_yaxis()
axes[1, 1].set_xlabel('Feature Importance')
axes[1, 1].set_title('Top 15 Feature Importances (Random Forest)')

plt.tight_layout()
plt.savefig(f'{FIGURE_DIR}/classification_model_evaluation.png', dpi=300, bbox_inches='tight')
plt.show()


In [None]:
# Cell 8: SHAP Analysis
import os

print("=== SHAP ANALYSIS ===")

# savefig() does not create missing directories, so make sure the output
# folder exists before the (slow) SHAP computation starts.
FIGURE_DIR = '../visualizations/model_results'
os.makedirs(FIGURE_DIR, exist_ok=True)

# Explain the Random Forest. Model selection only happens in a later cell, so
# this is not necessarily the model that ends up being chosen as "best".
explainer = shap.TreeExplainer(rf_model)
X_shap = X_test.iloc[:1000]  # Use subset for speed
shap_values = explainer.shap_values(X_shap)

# The return type depends on the installed SHAP release: older versions give
# a list with one (n_samples, n_features) array per class, newer ones a
# single (n_samples, n_features, n_classes) array. Indexing the latter with
# [1] would pick sample 1 rather than class 1, so normalise both layouts to
# the class-1 matrix before plotting.
if isinstance(shap_values, list):
    shap_values_class1 = shap_values[1]
else:
    shap_values_class1 = np.asarray(shap_values)
    if shap_values_class1.ndim == 3:
        shap_values_class1 = shap_values_class1[:, :, 1]

# SHAP summary plot
plt.figure(figsize=(12, 8))
shap.summary_plot(shap_values_class1, X_shap, show=False)
plt.title('SHAP Summary Plot - Classification Model')
plt.tight_layout()
plt.savefig(f'{FIGURE_DIR}/shap_summary_classification.png', dpi=300, bbox_inches='tight')
plt.show()

# SHAP feature importance
plt.figure(figsize=(10, 8))
shap.summary_plot(shap_values_class1, X_shap, plot_type="bar", show=False)
plt.title('SHAP Feature Importance - Classification Model')
plt.tight_layout()
plt.savefig(f'{FIGURE_DIR}/shap_importance_classification.png', dpi=300, bbox_inches='tight')
plt.show()


In [None]:
# Cell 9: OAI (Operational Adjustability Index) Calculation
print("=== OPERATIONAL ADJUSTABILITY INDEX (OAI) ===")

def calculate_oai_classification(X, y_true, y_pred_proba):
    """
    Calculate OAI for classification model
    OAI prioritizes controllable delays (carrier, late_aircraft)

    Each predicted probability is multiplied by the controllable share of the
    row's weighted factor total:

        controllable   = 2.0 * sum of the controllable columns present in X
        uncontrollable = 0.5 * sum of the uncontrollable columns present in X
        score = proba * controllable / (controllable + uncontrollable)

    Rows whose total is not strictly positive (zero, negative, or NaN because
    one of the columns is missing a value) keep the unmodified probability.

    Args:
        X: DataFrame of features, one row per prediction. Factor columns that
            are absent from X are skipped.
        y_true: Unused; kept so existing callers keep working.
        y_pred_proba: One predicted probability per row of X (array, list or
            Series). Values are matched to rows by position, never by index
            label.

    Returns:
        1-D float numpy array of OAI scores, one per row of X.

    Raises:
        ValueError: If y_pred_proba does not hold exactly one value per row.
    """
    # Define controllable features (higher weights)
    controllable_features = ['carrier_ct', 'late_aircraft_ct', 'controllable_delays']
    controllable_weight = 2.0

    # Define uncontrollable features (lower weights)
    uncontrollable_features = ['weather_ct', 'security_ct', 'uncontrollable_delays']
    uncontrollable_weight = 0.5

    n_rows = len(X)

    # Work on a plain array so a Series with a shuffled index (such as a
    # train/test split produces) is still read positionally.
    base_pred = np.asarray(y_pred_proba, dtype=float)
    if base_pred.shape != (n_rows,):
        raise ValueError(
            f"y_pred_proba must contain one value per row of X "
            f"(expected shape ({n_rows},), got {base_pred.shape})"
        )

    def _weighted_sum(feature_names, weight):
        """Sum weight * column over the listed columns that exist in X."""
        total = np.zeros(n_rows, dtype=float)
        for feature in feature_names:
            if feature in X.columns:
                # Column-at-a-time arithmetic replaces the per-row loop; NaN
                # values propagate into the total just as they did there.
                total = total + X[feature].to_numpy(dtype=float) * weight
        return total

    controllable_factor = _weighted_sum(controllable_features, controllable_weight)
    uncontrollable_factor = _weighted_sum(uncontrollable_features, uncontrollable_weight)
    total_factor = controllable_factor + uncontrollable_factor

    # NaN > 0 is False, so rows with missing factor values fall back to the
    # plain probability together with the zero and negative totals.
    has_positive_total = total_factor > 0

    # Share starts at 1.0 (score == probability) and is only overwritten
    # where the division is well defined.
    controllable_share = np.ones(n_rows, dtype=float)
    np.divide(
        controllable_factor,
        total_factor,
        out=controllable_share,
        where=has_positive_total,
    )

    # OAI score emphasizes controllable delays
    return base_pred * controllable_share

# Re-weight the Random Forest test-set probabilities with the OAI function
oai_scores = calculate_oai_classification(
    X=X_test, y_true=y_test, y_pred_proba=y_pred_proba_rf
)

# AUC of the raw probabilities versus AUC of the OAI-weighted scores, both
# measured against the same test labels
standard_auc = roc_auc_score(y_test, y_pred_proba_rf)
oai_auc = roc_auc_score(y_test, oai_scores)

print(f"Standard AUC: {standard_auc:.4f}")
print(f"OAI-weighted AUC: {oai_auc:.4f}")
print(f"OAI focuses on controllable delays - difference: {oai_auc - standard_auc:.4f}")


In [None]:
# Cell 10: Model Selection and Saving
import json
import os

print("=== MODEL SELECTION AND SAVING ===")

MODEL_DIR = '../data/models/trained_models'

# Select best model based on test-set F1 (a tie goes to Logistic Regression).
# The explainer written next to the model must describe that same model: the
# TreeExplainer built earlier belongs to the Random Forest, so a linear
# explainer is built when Logistic Regression wins instead of saving an
# explainer for a model that is not the one on disk.
if rf_metrics['f1'] > lr_metrics['f1']:
    best_model = rf_model
    best_model_name = "Random Forest"
    best_metrics = rf_metrics
    best_predictions = y_pred_proba_rf
    best_explainer = explainer
else:
    best_model = lr_model
    best_model_name = "Logistic Regression"
    best_metrics = lr_metrics
    best_predictions = y_pred_proba_lr
    # The training features serve as the background data for the explainer.
    best_explainer = shap.LinearExplainer(lr_model, X_train)

print(f"Best model: {best_model_name}")
print(f"Best F1-Score: {best_metrics['f1']:.4f}")

# Save the best model together with its matching explainer
os.makedirs(MODEL_DIR, exist_ok=True)

joblib.dump(best_model, f'{MODEL_DIR}/classification_model.pkl')
joblib.dump(best_explainer, f'{MODEL_DIR}/classification_explainer.pkl')

# Save model performance metrics (both candidates, plus the OAI comparison)
model_results = {
    'model_type': 'classification',
    'best_model': best_model_name,
    'metrics': {
        'random_forest': rf_metrics,
        'logistic_regression': lr_metrics
    },
    'oai_metrics': {
        'standard_auc': float(standard_auc),
        'oai_auc': float(oai_auc)
    },
    'feature_count': len(classification_features),
    'test_size': len(X_test)
}

with open(f'{MODEL_DIR}/classification_results.json', 'w') as f:
    json.dump(model_results, f, indent=2)

print("Classification model saved successfully!")
print("Model file: classification_model.pkl")
print("Results file: classification_results.json")


In [None]:
# Cell 11: Final Summary
# One print call; sep="\n" puts each entry on its own line, so the output is
# the same as printing the entries one by one.
print(
    "=== CLASSIFICATION MODEL SUMMARY ===",
    f"✅ Model Type: Flight Delay Classification (Yes/No)",
    f"✅ Best Model: {best_model_name}",
    f"✅ Dataset Size: {len(df):,} records",
    f"✅ Features Used: {len(classification_features)}",
    f"✅ Test Set Performance:",
    f"   - Accuracy:  {best_metrics['accuracy']:.4f}",
    f"   - Precision: {best_metrics['precision']:.4f}",
    f"   - Recall:    {best_metrics['recall']:.4f}",
    f"   - F1-Score:  {best_metrics['f1']:.4f}",
    f"   - AUC-ROC:   {best_metrics['auc']:.4f}",
    f"✅ OAI Analysis: Completed (AUC: {oai_auc:.4f})",
    f"✅ SHAP Analysis: Completed",
    f"✅ Model Saved: ../data/models/trained_models/classification_model.pkl",
    sep="\n",
)
