# In [None]:

import pandas as pd
import numpy as np
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler

# Load the dataset that the rest of this script preprocesses.
df = pd.read_csv("airline_delay.csv")

print("Dataset Info:")
# DataFrame.info() writes its report to stdout itself and returns None, so
# wrapping it in print() would only append a stray "None" line.
df.info()

print("\nMissing Values Before Preprocessing:")
print(df.isnull().sum())

# dropping columns with a high percentage of missing values
df = df.drop(columns=["CANCELLATION_REASON"])

# Categorical airport columns are normalised to strings. A plain astype(str)
# would also turn missing entries into the literal text "nan", which hides
# them from the most_frequent imputer and creates a bogus "nan" category in
# the one-hot encoding, so missing entries are restored after the cast.
for airport_column in ("ORIGIN_AIRPORT", "DESTINATION_AIRPORT"):
    missing_codes = df[airport_column].isna()
    df[airport_column] = df[airport_column].astype(str).mask(missing_codes)

# Column groups routed to the two preprocessing branches defined below.
categorical_features = [
    "AIRLINE",
    "ORIGIN_AIRPORT",
    "DESTINATION_AIRPORT",
]
numeric_features = [
    "YEAR",
    "MONTH",
    "DAY",
    "DAY_OF_WEEK",
    "FLIGHT_NUMBER",
    "SCHEDULED_DEPARTURE",
    "DEPARTURE_TIME",
    "DEPARTURE_DELAY",
    "TAXI_OUT",
    "WHEELS_OFF",
    "SCHEDULED_TIME",
    "ELAPSED_TIME",
    "AIR_TIME",
    "DISTANCE",
    "WHEELS_ON",
    "TAXI_IN",
    "SCHEDULED_ARRIVAL",
    "ARRIVAL_TIME",
    "DIVERTED",
    "CANCELLED",
]

# Numeric branch: fill gaps with the column median, then standardize.
numeric_steps = [
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
]
numeric_transformer = Pipeline(steps=numeric_steps)

# Categorical branch: fill gaps with the most frequent value, then one-hot
# encode into a dense array; categories unseen during fit are ignored.
categorical_steps = [
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(handle_unknown="ignore", sparse_output=False)),
]
categorical_transformer = Pipeline(steps=categorical_steps)

# Apply each branch to its own column group. The output places the numeric
# columns first, followed by the one-hot columns.
column_routes = [
    ("num", numeric_transformer, numeric_features),
    ("cat", categorical_transformer, categorical_features),
]
preprocessor = ColumnTransformer(transformers=column_routes)

# Show the frame's structure once before fitting the pipeline.
# DataFrame.info() prints its own report and returns None, so it is called
# directly instead of through print(), and the repeated calls are collapsed
# into a single one.
df.info()
# preprocessing pipeline
pipeline = Pipeline(steps=[
    ("preprocessor", preprocessor)
])

# Separate the target column from the features fed to the pipeline.
X = df.drop(columns=["ARRIVAL_DELAY"])
y = df["ARRIVAL_DELAY"]

X_processed = pipeline.fit_transform(X)

# Column labels for the transformed array: the numeric features keep their
# names, followed by the names the fitted one-hot encoder generated.
fitted_onehot = (
    pipeline.named_steps["preprocessor"]
    .named_transformers_["cat"]
    .named_steps["onehot"]
)
processed_columns = numeric_features + list(
    fitted_onehot.get_feature_names_out(categorical_features)
)

# Reuse X's index: the target is attached below by label alignment, so the
# processed frame must carry the same row labels as y rather than a fresh
# RangeIndex that only matches when df happens to have a default index.
df_processed = pd.DataFrame(
    X_processed, columns=processed_columns, index=X.index
)

# target variable back to the processed DataFrame
df_processed["ARRIVAL_DELAY"] = y
# new features 
def create_new_features(df):
    """Add a TOTAL_DELAY column holding DEPARTURE_DELAY + ARRIVAL_DELAY.

    The column is written onto the frame that was passed in (the argument is
    modified in place) and that same frame object is returned. Rows where
    either input value is missing get a missing TOTAL_DELAY.

    Raises:
        KeyError: if either input column is absent from ``df``.
    """
    departure = df["DEPARTURE_DELAY"]
    arrival = df["ARRIVAL_DELAY"]
    df["TOTAL_DELAY"] = departure.add(arrival)
    return df


# df_processed holds a standardized DEPARTURE_DELAY next to the unscaled
# ARRIVAL_DELAY target, so summing those two columns would mix scales.
# TOTAL_DELAY is therefore derived from the unscaled source columns. The
# copy keeps create_new_features' in-place write away from df, and the
# result is attached positionally because both frames have the same rows
# in the same order.
raw_delays = df[["DEPARTURE_DELAY", "ARRIVAL_DELAY"]].copy()
df_processed["TOTAL_DELAY"] = (
    create_new_features(raw_delays)["TOTAL_DELAY"].to_numpy()
)

# NOTE(review): this overwrites the file that was read at the top of the
# script. A second run would then fail on the CANCELLATION_REASON drop,
# because the processed file no longer has that column. Confirm that
# replacing the raw data is intended, or write to a separate path.
df_processed.to_csv("airline_delay.csv", index=False)

print("\nPreprocessing completed. The dataset has been updated and saved to 'airline_delay.csv'.")