# In [None]:
# %% [markdown]
# # 06_utils_functions.ipynb
# 
# **Utility Functions for Flight Delay Analysis**
# 
# This notebook contains reusable functions for data loading, preprocessing, feature engineering, visualization, and model evaluation.
# 
# ---

# %% [markdown]
# ## 1. Imports

# %%
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import yaml
import joblib
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, mean_absolute_error, r2_score

# %% [markdown]
# ## 2. Config Loading

# %%
def load_config(config_path='../config.yaml'):
    """
    Load the project configuration from a YAML file.

    Args:
        config_path: Path to the YAML configuration file. Defaults to
            '../config.yaml'.

    Returns:
        The object produced by ``yaml.safe_load`` for the file contents
        (a dict when the document is a mapping; ``None`` when the file is
        empty).

    Raises:
        OSError: If the file cannot be opened (e.g. FileNotFoundError).
        yaml.YAMLError: If the contents are not valid YAML.
    """
    # Decode explicitly as UTF-8 so the parsed values do not depend on the
    # platform's locale encoding (e.g. cp1252 on Windows).
    with open(config_path, 'r', encoding='utf-8') as f:
        return yaml.safe_load(f)

# %%
# Example usage:
# config = load_config()

# %% [markdown]
# ## 3. Data Loading

# %%
def load_data(file_path):
    """
    Read a CSV source into a pandas DataFrame.

    Args:
        file_path: Anything ``pd.read_csv`` accepts as its first argument
            (a path string, path object, or file-like buffer).

    Returns:
        The DataFrame parsed by ``pd.read_csv`` with its default options.
    """
    frame = pd.read_csv(filepath_or_buffer=file_path)
    return frame

# %%
# Example usage:
# df = load_data(config['data']['raw'])

# %% [markdown]
# ## 4. Data Cleaning

# %%
def clean_data(df, config):
    """
    Return a cleaned version of the dataset without modifying the input.

    Steps:
    - Replace missing values with 0 in the delay-minute columns, the
      delay-count columns and ``arr_delay``. Columns that are not present
      in ``df`` are skipped.
    - Remove exact duplicate rows.

    Negative ``arr_delay`` values are left untouched by this function.

    Args:
        df: Input DataFrame. It is never mutated.
        config: Project configuration. Accepted for interface compatibility;
            this function does not currently read it.

    Returns:
        A new DataFrame with the fills applied and duplicate rows dropped.
    """
    zero_fill_cols = (
        # Delay minutes per cause
        'carrier_delay', 'weather_delay', 'nas_delay', 'security_delay',
        'late_aircraft_delay',
        # Delay counts per cause
        'carrier_ct', 'weather_ct', 'nas_ct', 'security_ct',
        'late_aircraft_ct',
        # Overall arrival delay
        'arr_delay',
    )
    fill_values = {col: 0 for col in zero_fill_cols if col in df.columns}

    # DataFrame.fillna returns a new frame, so the caller's DataFrame keeps
    # its original NaNs instead of being silently modified in place.
    cleaned = df.fillna(value=fill_values) if fill_values else df

    # De-duplicate after filling so rows that differ only by NaN vs 0 in the
    # filled columns collapse into one. drop_duplicates returns a new frame.
    return cleaned.drop_duplicates()

# %%
# Example usage:
# df = clean_data(df, config)

# %% [markdown]
# ## 5. Feature Engineering

# %%
def create_targets(df):
    """
    Add the classification and regression target columns to ``df``.

    Columns written (in place, on the frame that was passed in):
    - is_delayed: integer flag, 1 where arr_delay is at least 15, else 0
    - delay_duration: arr_delay with values below 0 raised to 0

    Returns the same DataFrame object it was given.
    """
    arrival = df['arr_delay']

    delayed_flag = arrival.ge(15).astype(int)
    df['is_delayed'] = delayed_flag

    non_negative_delay = arrival.clip(lower=0)
    df['delay_duration'] = non_negative_delay

    return df

# %%
# Example usage:
# df = create_targets(df)

# %% [markdown]
# ## 6. Visualization

# %%
def plot_delay_distribution(df, save_path=None):
    """
    Draw a histogram (100 bins, with KDE overlay) of the ``arr_delay`` column.

    If ``save_path`` is truthy the figure is written there before it is
    displayed with ``plt.show()``.
    """
    fig, ax = plt.subplots(figsize=(10, 5))
    sns.histplot(df['arr_delay'], bins=100, kde=True, ax=ax)

    ax.set_title('Arrival Delay Distribution')
    ax.set_xlabel('Delay (minutes)')
    ax.set_ylabel('Frequency')

    if save_path:
        fig.savefig(save_path)
    plt.show()

# %%
# Example usage:
# plot_delay_distribution(df, save_path=config['visualizations']['delay_distributions'] + 'delay_hist.png')

# %% [markdown]
# ## 7. Model Evaluation

# %%
def evaluate_classification(y_true, y_pred, y_pred_proba):
    """
    Report classification quality for a set of predictions.

    Prints the scikit-learn classification report and the AUC-ROC score
    (computed from ``y_pred_proba``), then displays the confusion matrix of
    ``y_true`` vs ``y_pred`` as an annotated heatmap.
    """
    report = classification_report(y_true, y_pred)
    print(report)

    auc = roc_auc_score(y_true, y_pred_proba)
    print(f"AUC-ROC: {auc:.4f}")

    matrix = confusion_matrix(y_true, y_pred)
    ax = sns.heatmap(matrix, annot=True, fmt='d', cmap='Blues')
    ax.set_title('Confusion Matrix')
    ax.set_xlabel('Predicted')
    ax.set_ylabel('Actual')
    plt.show()

# %%
def evaluate_regression(y_true, y_pred):
    """
    Print regression metrics for a set of predictions.

    Prints the mean absolute error (2 decimals) followed by the R-squared
    score (4 decimals), each on its own line.

    Args:
        y_true: Ground-truth target values.
        y_pred: Predicted target values.
    """
    print(f"MAE: {mean_absolute_error(y_true, y_pred):.2f}")
    # The label previously rendered as mojibake; "\u00b2" is the
    # superscript-two character, written as an escape so the label survives
    # any re-encoding of this source file.
    print(f"R\u00b2: {r2_score(y_true, y_pred):.4f}")

# %% [markdown]
# ## 8. Model Saving/Loading

# %%
def save_model(model, file_path):
    """
    Persist a trained model to ``file_path`` using ``joblib.dump``.

    Nothing is returned.
    """
    joblib.dump(value=model, filename=file_path)

def load_model(file_path):
    """
    Read back a model previously written to ``file_path``.

    Returns whatever object ``joblib.load`` reconstructs from the file.
    NOTE(review): joblib persistence is pickle-based, so only load files
    from a trusted source.
    """
    model = joblib.load(filename=file_path)
    return model

# %% [markdown]
# ## 9. Example Usage

# %%
# # Load config
# config = load_config()
#
# # Load and clean data
# df = load_data(config['data']['raw'])
# df = clean_data(df, config)
#
# # Create targets
# df = create_targets(df)
#
# # Visualize
# plot_delay_distribution(df, save_path=config['visualizations']['delay_distributions'] + 'delay_hist.png')
#
# # ... use other functions as needed

# %% [markdown]
# ## 10. Notes

# - **Copy and paste these functions into other notebooks as needed.**
# - **Alternatively, use `%run 06_utils_functions.ipynb` to load these functions into another notebook (not supported in all environments).**
# - **For best practices, consider moving these functions to a Python module (`.py` file) in a `src/` folder.**
