In [35]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import StratifiedKFold, cross_val_score, GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, roc_curve
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, GradientBoostingClassifier, StackingClassifier
from sklearn.decomposition import PCA
from imblearn.over_sampling import SMOTE
import xgboost as xgb
import warnings
warnings.filterwarnings('ignore')

In [36]:
# 1. Load the data
# Assume the dataset is saved as 'airline_delay.csv'
def load_data(filepath):
    df = pd.read_csv(filepath)
    print(f"Dataset shape: {df.shape}")
    print("\nSample data:")
    print(df.head())
    print("\nData types:")
    print(df.dtypes)
    print("\nMissing values:")
    print(df.isnull().sum())
    print("\nClass distribution:")
    print(df['Class'].value_counts())
    return df

def preprocess_data(df):
    data = df.copy()

    for col in data.columns:
        if data[col].isnull().sum() > 0:
            if data[col].dtype == 'object':
                data[col].fillna(data[col].mode()[0], inplace=True)
            else:
                data[col].fillna(data[col].median(), inplace=True)

    numeric_features = ['Time', 'Length'] # removed Flight
    categorical_features = ['Airline', 'AirportFrom', 'AirportTo', 'DayOfWeek']

    preprocessor = ColumnTransformer(
        transformers=[
            ('num', SimpleImputer(strategy='median'), numeric_features),
            ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
        ])

    X = data.drop('Class', axis=1)
    y = data['Class']

    return X, y, preprocessor, numeric_features, categorical_features

# 3. Model evaluation with cross-validation
def evaluate_model(model, X, y, preprocessor, cv=5, balance=False):
    # Create full pipeline with preprocessing
    pipeline = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('scaler', StandardScaler()),
        ('classifier', model)
    ])

    # Initialize StratifiedKFold
    skf = StratifiedKFold(n_splits=cv, shuffle=True, random_state=42)

    # For storing metrics
    y_true_all = []
    y_pred_all = []
    y_prob_all = []

    # Cross-validation loop
    for train_idx, test_idx in skf.split(X, y):
        X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
        y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

        # Apply class balancing if requested
        if balance:
            smote = SMOTE(random_state=42)
            X_train_preprocessed = preprocessor.fit_transform(X_train)
            X_train_preprocessed, y_train = smote.fit_resample(X_train_preprocessed, y_train)

            # Fit on balanced data
            pipeline.steps[0] = ('preprocessor', 'passthrough')  # Skip preprocessing as it's already done
            pipeline.fit(X_train_preprocessed, y_train)

            # For prediction, we need to preprocess test data
            X_test_preprocessed = preprocessor.transform(X_test)
            y_pred = pipeline.predict(X_test_preprocessed)
            y_prob = pipeline.predict_proba(X_test_preprocessed)[:, 1]
        else:
            # Regular fit without balancing
            pipeline.fit(X_train, y_train)
            y_pred = pipeline.predict(X_test)
            y_prob = pipeline.predict_proba(X_test)[:, 1]

        # Collect predictions
        y_true_all.extend(y_test)
        y_pred_all.extend(y_pred)
        y_prob_all.extend(y_prob)

    # Calculate metrics
    conf_matrix = confusion_matrix(y_true_all, y_pred_all)
    accuracy = accuracy_score(y_true_all, y_pred_all)
    precision = precision_score(y_true_all, y_pred_all, average=None)
    recall = recall_score(y_true_all, y_pred_all, average=None)
    f1 = f1_score(y_true_all, y_pred_all, average=None)
    f1_global = f1_score(y_true_all, y_pred_all, average='weighted')

    # Calculate AUC if possible
    try:
        auc = roc_auc_score(y_true_all, y_prob_all)
        fpr, tpr, _ = roc_curve(y_true_all, y_prob_all)
    except:
        auc = None
        fpr, tpr = None, None

    results = {
        'confusion_matrix': conf_matrix,
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'f1_score': f1,
        'f1_global': f1_global,
        'auc': auc,
        'roc_curve': (fpr, tpr),
        'y_true': y_true_all,
        'y_pred': y_pred_all,
        'y_prob': y_prob_all
    }

    return results

def report_results(model_name, results):
    print(f"\n=== {model_name} Results ===")
    print(f"Confusion Matrix:\n{results['confusion_matrix']}")
    print(f"Accuracy: {results['accuracy']:.4f}")

    # For binary classification
    print(f"Precision - Class 0: {results['precision'][0]:.4f}, Class 1: {results['precision'][1]:.4f}")
    print(f"Recall - Class 0: {results['recall'][0]:.4f}, Class 1: {results['recall'][1]:.4f}")
    print(f"F1 Score - Class 0: {results['f1_score'][0]:.4f}, Class 1: {results['f1_score'][1]:.4f}")
    print(f"Global F1 Score: {results['f1_global']:.4f}")

    if results['auc'] is not None:
        print(f"AUC: {results['auc']:.4f}")

    # Plot ROC curve if available
    if results['roc_curve'][0] is not None:
        plt.figure(figsize=(8, 6))
        plt.plot(results['roc_curve'][0], results['roc_curve'][1],
                 lw=2, label=f'{model_name} (AUC = {results["auc"]:.4f})')
        plt.plot([0, 1], [0, 1], 'k--', lw=2)
        plt.xlim([0.0, 1.0])
        plt.ylim([0.0, 1.05])
        plt.xlabel('False Positive Rate')
        plt.ylabel('True Positive Rate')
        plt.title(f'ROC Curve - {model_name}')
        plt.legend(loc="lower right")
        plt.show()

    return

In [37]:
def optimize_decision_tree(X, y, preprocessor, cv=5):
    print("\n=== Decision Tree Optimization ===")

    # Parameter grid
    param_grid = {
        'classifier__criterion': ['gini', 'entropy'],
        'classifier__max_depth': [None, 5, 10, 15, 20],
        'classifier__min_samples_split': [2, 5, 10],
        'classifier__min_samples_leaf': [1, 2, 4]
    }

    # Create base pipelne
    pipeline = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('scaler', StandardScaler(with_mean=False)),
        ('classifier', DecisionTreeClassifier(random_state=42))
    ])

    grid_search = GridSearchCV(pipeline, param_grid, cv=cv, scoring='f1_weighted', n_jobs=-1)
    grid_search.fit(X, y)

    print(f"Best parameters: {grid_search.best_params_}")
    print(f"Best cross-validation score: {grid_search.best_score_:.4f}")

    best_dt = DecisionTreeClassifier(**{k.replace('classifier__', ''): v
                                      for k, v in grid_search.best_params_.items()
                                      if k.startswith('classifier__')},
                                   random_state=42)

    # Visualize tree (optional)
    # We need to apply preprocessing first to visualize
    X_processed = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('scaler', StandardScaler(with_mean=False))
    ]).fit_transform(X)

    best_dt.fit(X_processed, y)

    plt.figure(figsize=(20, 10))
    plot_tree(best_dt, filled=True, max_depth=3, feature_names=None)
    plt.title("Decision Tree (Limited to depth 3 for visualization)")
    plt.show()

    # Return both the best model and feature importances
    importances = best_dt.feature_importances_

    return best_dt, importances


In [38]:
 df = load_data('airlines_delay.csv')

Dataset shape: (539382, 8)

Sample data:
   Flight    Time  Length Airline AirportFrom AirportTo  DayOfWeek  Class
0  2313.0  1296.0   141.0      DL         ATL       HOU          1      0
1  6948.0   360.0   146.0      OO         COS       ORD          4      0
2  1247.0  1170.0   143.0      B6         BOS       CLT          3      0
3    31.0  1410.0   344.0      US         OGG       PHX          6      0
4   563.0   692.0    98.0      FL         BMI       ATL          4      0

Data types:
Flight         float64
Time           float64
Length         float64
Airline         object
AirportFrom     object
AirportTo       object
DayOfWeek        int64
Class            int64
dtype: object

Missing values:
Flight         0
Time           0
Length         0
Airline        0
AirportFrom    0
AirportTo      0
DayOfWeek      0
Class          0
dtype: int64

Class distribution:
Class
0    299118
1    240264
Name: count, dtype: int64


In [39]:
 X, y, preprocessor, numeric_features, categorical_features = preprocess_data(df)

In [40]:
best_models = {}

In [41]:
best_dt, dt_importances = optimize_decision_tree(X, y, preprocessor)
best_models['dt'] = best_dt
dt_results = evaluate_model(best_dt, X, y, preprocessor)
report_results("Decision Tree", dt_results)


=== Decision Tree Optimization ===
Best parameters: {'classifier__criterion': 'gini', 'classifier__max_depth': 20, 'classifier__min_samples_leaf': 4, 'classifier__min_samples_split': 10}
Best cross-validation score: 0.6403


ValueError: Cannot center sparse matrices: pass `with_mean=False` instead. See docstring for motivation and alternatives.