In [None]:
import yaml

# Parse the project configuration file that sits one directory above the notebook.
with open('../config.yaml', 'r') as config_file:
    config = yaml.safe_load(config_file)

# Unpack the feature lists and target names from the 'features' and 'targets'
# sections of the configuration.
# NOTE(review): classification_features and regression_features are reassigned
# by the later "Define feature sets" cell, so the values read here do not reach
# the scaling/saving steps — confirm which source is meant to win.
feature_section = config['features']
classification_features = feature_section['classification']
regression_features = feature_section['regression']

target_section = config['targets']
classification_target = target_section['classification']
regression_target = target_section['regression']


In [None]:
# Cell 1: Import libraries
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler
import warnings
warnings.filterwarnings('ignore')

print("Starting feature engineering process...")


In [None]:
# Cell 2: Read the cleaned flight data from the processed-data directory
df = pd.read_csv(filepath_or_buffer='../data/processed/cleaned_flight_data.csv')

# Report the frame's dimensions and column names as a quick sanity check.
print(f"Data loaded. Shape: {df.shape}")
print("Columns:", list(df.columns))


In [None]:
# Cell 3: Create target variables
print("=== CREATING TARGET VARIABLES ===")

arrival_delay = df['arr_delay']

# Classification target: 1 where arr_delay is at least 15, otherwise 0.
# Missing arr_delay values compare as False and therefore become 0.
df['is_delayed'] = arrival_delay.ge(15).astype(int)

# Regression target: arr_delay with negative values raised to 0.
df['delay_duration'] = arrival_delay.clip(lower=0)

# Summary statistics for the two new targets.
delayed_share = df['is_delayed'].mean()
mean_duration = df['delay_duration'].mean()

print(f"Delay classification distribution:")
print(df['is_delayed'].value_counts())
print(f"\nDelay percentage: {(delayed_share * 100):.2f}%")
print(f"Average delay duration: {mean_duration:.2f} minutes")


In [None]:
# Cell 4: Feature selection based on correlation and domain knowledge
print("=== FEATURE SELECTION ===")

# Arrival-flight count followed by one incident-count column per delay cause
# (carrier, weather, NAS, security, late aircraft).
base_features = [
    'arr_flights',
    'carrier_ct',
    'weather_ct',
    'nas_ct',
    'security_ct',
    'late_aircraft_ct',
]

# Identifier-like columns that are encoded later, and the per-cause delay columns.
categorical_features = ['carrier', 'airport', 'month']
delay_features = [
    'carrier_delay',
    'weather_delay',
    'nas_delay',
    'security_delay',
    'late_aircraft_delay',
]

# Keep only the candidates that are actually present in the loaded frame,
# preserving the base -> categorical -> delay ordering.
candidate_features = base_features + categorical_features + delay_features
available_features = [name for name in candidate_features if name in df.columns]

print("Available features for modeling:", available_features)


In [None]:
# Cell 5: Create derived features
print("=== CREATING DERIVED FEATURES ===")

# Sum of the five per-cause incident counts.
df['total_delay_incidents'] = (
    df['carrier_ct']
    .add(df['weather_ct'])
    .add(df['nas_ct'])
    .add(df['security_ct'])
    .add(df['late_aircraft_ct'])
)

# Sum of the five per-cause delay columns.
df['total_delay_minutes'] = (
    df['carrier_delay']
    .add(df['weather_delay'])
    .add(df['nas_delay'])
    .add(df['security_delay'])
    .add(df['late_aircraft_delay'])
)

# Incidents per arriving flight; a zero flight count is swapped for 1 so the
# division never hits zero.
flights_denominator = df['arr_flights'].replace(0, 1)
df['delay_incident_rate'] = df['total_delay_incidents'].div(flights_denominator)

# Controllable vs uncontrollable delays (for OAI).
# NOTE(review): nas_ct is counted in neither bucket — confirm that is intended.
df['controllable_delays'] = df['carrier_ct'].add(df['late_aircraft_ct'])
df['uncontrollable_delays'] = df['weather_ct'].add(df['security_ct'])
df['controllable_delay_minutes'] = df['carrier_delay'].add(df['late_aircraft_delay'])

# Delay total divided by incident total, with the same zero -> 1 substitution
# in the denominator.
incidents_denominator = df['total_delay_incidents'].replace(0, 1)
df['avg_delay_per_incident'] = df['total_delay_minutes'].div(incidents_denominator)

print("Derived features created successfully!")


In [None]:
# Cell 6: Handle categorical variables
print("=== ENCODING CATEGORICAL VARIABLES ===")

# All encoding happens on a copy, so `df` keeps only the original and derived columns.
df_model = df.copy()

# Fitted encoders keyed by source column name, so the integer codes can be
# mapped back to the original labels.
label_encoders = {}

# carrier -> integer codes in 'carrier_encoded'
# NOTE(review): label codes impose an arbitrary ordering on the categories;
# confirm the downstream models tolerate that.
if 'carrier' in df_model.columns:
    carrier_text = df_model['carrier'].astype(str)
    le_carrier = LabelEncoder()
    label_encoders['carrier'] = le_carrier
    df_model['carrier_encoded'] = le_carrier.fit_transform(carrier_text)

# airport -> integer codes in 'airport_encoded'
if 'airport' in df_model.columns:
    airport_text = df_model['airport'].astype(str)
    le_airport = LabelEncoder()
    label_encoders['airport'] = le_airport
    df_model['airport_encoded'] = le_airport.fit_transform(airport_text)

# month -> one indicator column per distinct value, named 'month_<value>',
# appended to the right of the existing columns.
if 'month' in df_model.columns:
    month_dummies = pd.get_dummies(df_model['month'], prefix='month')
    df_model = pd.concat([df_model, month_dummies], axis=1)

print("Categorical encoding completed!")


In [None]:
# Cell 7: Define feature sets for different models
print("=== DEFINING FEATURE SETS ===")

# Indicator columns produced by the month one-hot encoding.
month_columns = [name for name in df_model.columns if name.startswith('month_')]

# Inputs for the classification model (predict is_delayed): flight volume,
# per-cause incident counts, derived count features, encoded identifiers,
# then the month indicators.
classification_features = (
    ['arr_flights']
    + ['carrier_ct', 'weather_ct', 'nas_ct', 'security_ct', 'late_aircraft_ct']
    + ['total_delay_incidents', 'delay_incident_rate']
    + ['controllable_delays', 'uncontrollable_delays']
    + ['carrier_encoded', 'airport_encoded']
    + month_columns
)

# Inputs for the regression model (predict delay_duration): everything above
# plus the per-cause delay columns and the features derived from them.
# NOTE(review): if arr_delay is the sum of the *_delay columns, then
# total_delay_minutes reproduces the regression target — confirm there is no leakage.
regression_features = (
    classification_features
    + ['carrier_delay', 'weather_delay', 'nas_delay', 'security_delay', 'late_aircraft_delay']
    + ['total_delay_minutes', 'controllable_delay_minutes', 'avg_delay_per_incident']
)

# Drop any name that is not a column of the modelling frame.
model_columns = df_model.columns
classification_features = [name for name in classification_features if name in model_columns]
regression_features = [name for name in regression_features if name in model_columns]

print(f"Classification features ({len(classification_features)}): {classification_features}")
print(f"Regression features ({len(regression_features)}): {regression_features}")


In [None]:
# Cell 8: Feature scaling and final preprocessing
print("=== FEATURE SCALING ===")

# Continuous columns that should be standardised. The encoded identifiers and
# the month indicator columns are deliberately left out.
# The delay-minute columns are listed unconditionally: the existence filter
# below removes any that are absent, which gives the same list the previous
# `any('delay' in col ...)` guard produced.
numerical_features = [
    'arr_flights', 'carrier_ct', 'weather_ct', 'nas_ct', 'security_ct',
    'late_aircraft_ct', 'total_delay_incidents', 'delay_incident_rate',
    'controllable_delays', 'uncontrollable_delays',
    'carrier_delay', 'weather_delay', 'nas_delay',
    'security_delay', 'late_aircraft_delay', 'total_delay_minutes',
    'controllable_delay_minutes', 'avg_delay_per_incident',
]

# Filter to existing columns
numerical_features = [f for f in numerical_features if f in df_model.columns]

# One scaler per model, because the two feature matrices contain different columns.
# NOTE(review): both scalers are fit on the full dataset; if a train/test split
# happens downstream, fit on the training portion only to avoid leakage.
scaler_classification = StandardScaler()
scaler_regression = StandardScaler()

# Scale features for classification.
# Only scale columns that are part of the classification matrix: the
# delay-minute columns exist in df_model but not in classification_features,
# and indexing X_classification with them raises a KeyError.
X_classification = df_model[classification_features].copy()
classification_numerical = [f for f in numerical_features if f in classification_features]
if classification_numerical:
    X_classification[classification_numerical] = scaler_classification.fit_transform(
        X_classification[classification_numerical]
    )

# Scale features for regression, restricted to its own columns in the same way.
X_regression = df_model[regression_features].copy()
regression_numerical = [f for f in numerical_features if f in regression_features]
if regression_numerical:
    X_regression[regression_numerical] = scaler_regression.fit_transform(
        X_regression[regression_numerical]
    )

print("Feature scaling completed!")


In [None]:
# Cell 9: Create final datasets and save
import os

print("=== CREATING FINAL DATASETS ===")

# Classification dataset: scaled features plus the binary target. The raw
# arr_delay column is carried along for reference only.
classification_data = X_classification.assign(
    is_delayed=df_model['is_delayed'],
    arr_delay=df_model['arr_delay'],
)

# Regression dataset: scaled features plus the delay_duration target.
# All rows are kept — nothing is filtered to delayed flights here. The
# is_delayed and arr_delay columns are carried along for reference only.
regression_data = X_regression.assign(
    delay_duration=df_model['delay_duration'],
    is_delayed=df_model['is_delayed'],
    arr_delay=df_model['arr_delay'],
)

# Make sure the output directory exists before writing anything.
os.makedirs('../data/processed', exist_ok=True)

# Write the two model-specific datasets, then the full modelling frame.
classification_data.to_csv('../data/processed/classification_data.csv', index=False)
regression_data.to_csv('../data/processed/regression_data.csv', index=False)
df_model.to_csv('../data/processed/model_ready_data.csv', index=False)

print("Datasets saved successfully!")
print(f"Classification data shape: {classification_data.shape}")
print(f"Regression data shape: {regression_data.shape}")


In [None]:
# Cell 10: Feature importance preview using correlation
import json

print("=== FEATURE IMPORTANCE PREVIEW ===")

# Target and reference columns stored next to the features in the saved datasets.
# They are removed from the rankings below. Otherwise each target tops its own
# list with a correlation of 1.0 and the reference columns crowd real features
# out of the top 10.
non_feature_columns = ['is_delayed', 'delay_duration', 'arr_delay']

# Absolute Pearson correlation of every feature with the classification target.
print("Top features correlated with is_delayed:")
classification_corr = (
    classification_data.corr()['is_delayed']
    .drop(labels=non_feature_columns, errors='ignore')
    .abs()
    .sort_values(ascending=False)
)
print(classification_corr.head(10))

# Absolute Pearson correlation of every feature with the regression target.
print("\nTop features correlated with delay_duration:")
regression_corr = (
    regression_data.corr()['delay_duration']
    .drop(labels=non_feature_columns, errors='ignore')
    .abs()
    .sort_values(ascending=False)
)
print(regression_corr.head(10))

# Save feature lists for future reference
feature_info = {
    'classification_features': classification_features,
    'regression_features': regression_features,
    'numerical_features': numerical_features
}

with open('../data/processed/feature_info.json', 'w') as f:
    json.dump(feature_info, f, indent=2)

print("\nFeature engineering completed successfully!")
