In [1]:
import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns

In [14]:
# Load the merged flight/weather CSV; the path is relative to the notebook's
# working directory.
# NOTE(review): the preview below shows an 'Unnamed: 0' column, presumably a
# row index written out when the file was saved -- confirm, and consider
# reading with index_col=0 so it does not travel downstream as a feature.
df = pd.read_csv('data/flight_weather_merged.csv')
# Last expression of the cell: renders the first five rows as the cell output.
df.head()

Unnamed: 0,Quarter,Month,DayofMonth,DayOfWeek,FlightDate,Reporting_Airline,Tail_Number,Flight_Number_Reporting_Airline,Origin,OriginCityName,...,HourlyWindGustSpeed,HourlyWindSpeed,time,HourlyWeatherType,HourlySkyCondit,wind_dir_sin,wind_dir_cos,wind_dir_vrb,airport,date
0,1,1,1,1,2024-01-01,AA,N871NN,1653.0,SDF,"Louisville, KY",...,0.0,7.0,500,snow,cloudy,-0.939693,0.34202,0.0,SDF,2024-01-01
1,1,1,1,1,2024-01-01,NK,N657NK,1201.0,SDF,"Louisville, KY",...,0.0,8.0,600,snow,cloudy,-0.866025,0.5,0.0,SDF,2024-01-01
2,1,1,1,1,2024-01-01,DL,N844DN,2574.0,SDF,"Louisville, KY",...,0.0,8.0,600,snow,cloudy,-0.866025,0.5,0.0,SDF,2024-01-01
3,1,1,1,1,2024-01-01,WN,N8789Q,3175.0,SDF,"Louisville, KY",...,0.0,8.0,600,snow,cloudy,-0.866025,0.5,0.0,SDF,2024-01-01
4,1,1,1,1,2024-01-01,AA,N662AW,1694.0,SDF,"Louisville, KY",...,0.0,8.0,600,snow,cloudy,-0.866025,0.5,0.0,SDF,2024-01-01


---

# All-in-one preprocessing for merged DF

In [15]:
def encode_time_hhmm(series, prefix):
    """
    Encode HHMM time values into cyclical sine and cosine features.

    Each value is read as an integer in HHMM form (e.g., 1345 = 13:45),
    converted to minutes since midnight, and mapped onto the unit circle
    with a 1440-minute period so that 23:59 and 00:00 end up adjacent.

    Value handling:
        - Missing or non-numeric entries are coerced to 0 (midnight), so
          they are indistinguishable from a genuine 00:00 after encoding.
        - 2400 is treated as midnight (same as 0000). Without this, the
          hour clip below would turn it into 23:00, a one-hour error.
        - Any other out-of-range value is clipped: hours to [0, 23] and
          minutes to [0, 59].

    Args:
        series (pd.Series): Time values in HHMM format.
        prefix (str): Prefix for generated feature names.

    Returns:
        pd.DataFrame: DataFrame containing:
            - {prefix}_sin
            - {prefix}_cos
        with the same index as the input series.
    """

    s = pd.to_numeric(series, errors="coerce").fillna(0).astype(int)

    # HHMM sources can write end-of-day midnight as 2400; fold it onto 0000
    # before clipping so it is not silently encoded as 23:00.
    s = s.where(s != 2400, 0)

    hh = (s // 100).clip(0, 23)
    mm = (s % 100).clip(0, 59)
    minutes = hh * 60 + mm
    angle = 2 * np.pi * minutes / 1440

    return pd.DataFrame({
        f"{prefix}_sin": np.sin(angle),
        f"{prefix}_cos": np.cos(angle),
    }, index=series.index)
#------------------------------------------------------
#------------------------------------------------------
def encode_quarter(series, prefix="Quarter"):
    """
    Map quarter numbers onto the unit circle (period of 4).

    Quarter 1 sits at angle 0 and each later quarter advances a quarter
    turn, so Q4 and Q1 end up adjacent.

    Missing or non-numeric entries are replaced by quarter 1. Numeric
    values outside 1-4 are not clamped; they simply wrap around the circle.

    Args:
        series (pd.Series): Quarter values (expected 1-4).
        prefix (str, optional): Prefix for the output column names.

    Returns:
        pd.DataFrame: Two columns, {prefix}_sin and {prefix}_cos,
        indexed like the input series.
    """
    quarters = pd.to_numeric(series, errors="coerce")
    quarters = quarters.fillna(1).astype(int)

    # Shift to a zero-based position before scaling to radians.
    theta = 2 * np.pi * (quarters - 1) / 4

    encoded = {
        f"{prefix}_sin": np.sin(theta),
        f"{prefix}_cos": np.cos(theta),
    }
    return pd.DataFrame(encoded, index=series.index)
#------------------------------------------------------
#------------------------------------------------------
def encode_month(series, prefix="Month"):
    """
    Map month numbers onto the unit circle (period of 12).

    January sits at angle 0 and each later month advances by 1/12 of a
    turn, which places December next to January instead of eleven steps
    away.

    Missing or non-numeric entries are replaced by month 1. Numeric values
    outside 1-12 are not clamped; they wrap around the circle.

    Args:
        series (pd.Series): Month values (expected 1-12).
        prefix (str, optional): Prefix for the output column names.

    Returns:
        pd.DataFrame: Two columns, {prefix}_sin and {prefix}_cos,
        indexed like the input series.
    """
    months = pd.to_numeric(series, errors="coerce").fillna(1).astype(int)

    # Zero-based month position scaled to radians.
    offset = months - 1
    theta = 2 * np.pi * offset / 12

    sin_name = f"{prefix}_sin"
    cos_name = f"{prefix}_cos"
    return pd.DataFrame(
        {sin_name: np.sin(theta), cos_name: np.cos(theta)},
        index=series.index,
    )
#------------------------------------------------------
#------------------------------------------------------
def encode_day_of_week(series, prefix="DayOfWeek"):
    """
    Map day-of-week numbers onto the unit circle (period of 7).

    Day 1 sits at angle 0 and each later day advances by 1/7 of a turn, so
    day 7 and day 1 are neighbours.

    Missing or non-numeric entries are replaced by day 1. Numeric values
    outside 1-7 are not clamped; they wrap around the circle.

    Args:
        series (pd.Series): Day-of-week values (expected 1-7).
        prefix (str, optional): Prefix for the output column names.

    Returns:
        pd.DataFrame: Two columns, {prefix}_sin and {prefix}_cos,
        indexed like the input series.
    """
    weekdays = pd.to_numeric(series, errors="coerce")
    weekdays = weekdays.fillna(1)
    weekdays = weekdays.astype(int)

    theta = 2 * np.pi * (weekdays - 1) / 7

    features = {}
    features[f"{prefix}_sin"] = np.sin(theta)
    features[f"{prefix}_cos"] = np.cos(theta)
    return pd.DataFrame(features, index=series.index)
#------------------------------------------------------
#------------------------------------------------------
def encode_day_of_month(series, prefix="DayOfMonth"):
    """
    Map day-of-month numbers onto the unit circle (fixed period of 31).

    Day 1 sits at angle 0 and each later day advances by 1/31 of a turn.
    The period is 31 for every row, whatever the actual month length.

    Missing or non-numeric entries are replaced by day 1. Numeric values
    outside 1-31 are not clamped; they wrap around the circle.

    Args:
        series (pd.Series): Day-of-month values (expected 1-31).
        prefix (str, optional): Prefix for the output column names.

    Returns:
        pd.DataFrame: Two columns, {prefix}_sin and {prefix}_cos,
        indexed like the input series.
    """
    days = pd.to_numeric(series, errors="coerce").fillna(1).astype(int)
    zero_based = days - 1
    theta = 2 * np.pi * zero_based / 31

    sin_part = np.sin(theta)
    cos_part = np.cos(theta)
    return pd.DataFrame(
        {f"{prefix}_sin": sin_part, f"{prefix}_cos": cos_part},
        index=series.index,
    )


In [16]:
#------------------------------------------------------

In [17]:
def preprocess_merged_data(df):
    """
    Encode temporal columns and prune unused columns from the merged
    flight-weather frame.

    Steps, in order:
        1. Append sine/cosine encodings for Quarter, Month, DayOfWeek and
           DayofMonth. The raw calendar columns are kept alongside the
           encodings.
        2. Drop identifier, location-name, arrival-related, delay-cause and
           merge-helper columns (see ``unused_columns`` below).
        3. For WheelsOff, CRSArrTime, CRSDepTime and DepTime, append the
           HHMM sine/cosine encoding and drop the raw column.
        4. Drop DepDelayMinutes and DepartureDelayGroups, which the original
           author treated as duplicates of other departure-delay columns.

    Args:
        df (pd.DataFrame): Merged flight and weather dataset. Every column
            named in the steps above must be present; a missing one makes
            the corresponding lookup or drop raise KeyError.

    Returns:
        pd.DataFrame: A new frame containing the remaining original columns
        followed by the encoded columns in the order they were created.

    Notes:
        - Works on a copy; the caller's frame is not modified.
        - Calling this on an already-preprocessed frame fails, because the
          encoded columns already exist and the dropped ones are gone.
    """
    out = df.copy()

    # Calendar features: (source column, encoder). The column name doubles
    # as the prefix of the generated *_sin / *_cos columns.
    calendar_encoders = (
        ("Quarter", encode_quarter),
        ("Month", encode_month),
        ("DayOfWeek", encode_day_of_week),
        ("DayofMonth", encode_day_of_month),
    )
    for column, encoder in calendar_encoders:
        out = out.join(encoder(out[column], column))

    # Columns removed wholesale before the time encodings are added.
    unused_columns = [
        'FlightDate', 'Tail_Number', 'Flight_Number_Reporting_Airline',
        'OriginCityName', 'OriginState', 'DestCityName', 'DestState',
        'WheelsOn', "TaxiIn", "ArrTime", "ArrDelay", "ArrivalDelayGroups",
        "ActualElapsedTime", "AirTime", "ArrTimeBlk",
        "CarrierDelay", "WeatherDelay", "NASDelay", "SecurityDelay",
        "LateAircraftDelay",
        "dt", "date", "time", "airport",
    ]
    out = out.drop(columns=unused_columns)

    # HHMM clock columns: append the encoding, then discard the raw value.
    for column in ("WheelsOff", "CRSArrTime", "CRSDepTime", "DepTime"):
        out = out.join(encode_time_hhmm(out[column], column))
        out = out.drop(columns=[column])

    # Drop the departure-delay columns regarded as duplicates.
    return out.drop(columns=["DepDelayMinutes", "DepartureDelayGroups"])

# Feature Engineering

In [19]:
def feature_engineering(df):
    """
    Append derived delay, time-of-day, weather and route features.

    Columns added, in this order:
        - dep_delay_ratio: DepDelay divided by CRSElapsedTime.
        - dep_delay_gt30 / dep_delay_gt60: 1 when DepDelay exceeds 30 / 60.
        - taxiout_long: 1 when TaxiOut exceeds the 75th percentile of
          TaxiOut computed on the frame passed in.
        - dep_time_shift_sin / dep_time_shift_cos: DepTime_* minus
          CRSDepTime_* for the sine and cosine encodings.
        - is_night_departure: 1 when CRSDepTime_cos > 0.7.
        - is_evening_departure: 1 when CRSDepTime_sin < -0.7.
        - low_visibility: 1 when HourlyVisibility < 3.
        - strong_wind: 1 when HourlyWindSpeed > 20.
        - wind_gust: 1 when HourlyWindGustSpeed > 25.
        - bad_weather_score: count (0-3) of HourlyPrecipitation > 0,
          HourlyVisibility < 5 and HourlyWindSpeed > 15.
        - is_hub_route: 1 when Origin or Dest is one of ATL, ORD, DFW,
          DEN, CLT.
        - is_roundtrip: 1 when Origin equals "SDF".

    Comparisons against missing values evaluate to False, so the flags are
    0 for rows where the source value is missing.

    Args:
        df (pd.DataFrame): Preprocessed flight-weather frame holding the
            source columns named above.

    Returns:
        pd.DataFrame: Copy of the input with the new columns appended.
        Existing columns are left as they were.
    """
    out = df.copy()

    # --- Departure delay: normalised value and severity flags -------------
    dep_delay = out["DepDelay"]
    out["dep_delay_ratio"] = dep_delay / out["CRSElapsedTime"]
    for minutes in (30, 60):
        out[f"dep_delay_gt{minutes}"] = (dep_delay > minutes).astype(int)

    # --- Taxi-out congestion: above the frame's own upper quartile --------
    taxi_out = out["TaxiOut"]
    taxi_threshold = taxi_out.quantile(0.75)
    out["taxiout_long"] = (taxi_out > taxi_threshold).astype(int)

    # --- Actual minus scheduled departure, per encoding component ---------
    for part in ("sin", "cos"):
        out[f"dep_time_shift_{part}"] = (
            out[f"DepTime_{part}"] - out[f"CRSDepTime_{part}"]
        )

    # --- Time-of-day flags from the scheduled-departure encoding ----------
    scheduled_cos = out["CRSDepTime_cos"]
    scheduled_sin = out["CRSDepTime_sin"]
    out["is_night_departure"] = (scheduled_cos > 0.7).astype(int)
    out["is_evening_departure"] = (scheduled_sin < -0.7).astype(int)

    # --- Weather severity flags -------------------------------------------
    visibility = out["HourlyVisibility"]
    wind_speed = out["HourlyWindSpeed"]
    out["low_visibility"] = (visibility < 3).astype(int)
    out["strong_wind"] = (wind_speed > 20).astype(int)
    out["wind_gust"] = (out["HourlyWindGustSpeed"] > 25).astype(int)

    # --- Aggregate score: one point per adverse condition -----------------
    has_precipitation = (out["HourlyPrecipitation"] > 0).astype(int)
    reduced_visibility = (visibility < 5).astype(int)
    elevated_wind = (wind_speed > 15).astype(int)
    out["bad_weather_score"] = has_precipitation + reduced_visibility + elevated_wind

    # --- Route flags -------------------------------------------------------
    hub_airports = ["ATL", "ORD", "DFW", "DEN", "CLT"]
    touches_hub = out["Origin"].isin(hub_airports) | out["Dest"].isin(hub_airports)
    out["is_hub_route"] = touches_hub.astype(int)
    out["is_roundtrip"] = (out["Origin"] == "SDF").astype(int)

    return out

In [20]:
# Run preprocessing, then feature engineering on its result.
# NOTE(review): `df` is rebound to the preprocessed frame here, so this cell
# is not idempotent -- running it a second time without re-running the load
# cell fails, because the encoded columns already exist and the dropped
# columns are gone. Consider binding the result to a new name if no later
# cell relies on `df` being the preprocessed frame.
df = preprocess_merged_data(df)
df_fe = feature_engineering(df)

In [21]:
# Persist the engineered frame as CSV; index=False keeps the row index out of
# the file. The path is relative to the notebook's working directory.
df_fe.to_csv('data/ready_df_with_dep.csv', index=False)

---