In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder

file_path = 'dataset/Combined_Flights_2022.csv'  # Replace with your dataset path
df = pd.read_csv(file_path)
print(df.head())

   FlightDate                                    Airline Origin Dest  \
0  2022-04-04  Commutair Aka Champlain Enterprises, Inc.    GJT  DEN   
1  2022-04-04  Commutair Aka Champlain Enterprises, Inc.    HRL  IAH   
2  2022-04-04  Commutair Aka Champlain Enterprises, Inc.    DRO  DEN   
3  2022-04-04  Commutair Aka Champlain Enterprises, Inc.    IAH  GPT   
4  2022-04-04  Commutair Aka Champlain Enterprises, Inc.    DRO  DEN   

   Cancelled  Diverted  CRSDepTime  DepTime  DepDelayMinutes  DepDelay  ...  \
0      False     False        1133   1123.0              0.0     -10.0  ...   
1      False     False         732    728.0              0.0      -4.0  ...   
2      False     False        1529   1514.0              0.0     -15.0  ...   
3      False     False        1435   1430.0              0.0      -5.0  ...   
4      False     False        1135   1135.0              0.0       0.0  ...   

   WheelsOff  WheelsOn  TaxiIn  CRSArrTime  ArrDelay  ArrDel15  \
0     1140.0    1220.0    

In [2]:
# Drop unnecessary columns if any
columns_to_drop = [
    'FlightDate', 'Tail_Number', 'Operated_or_Branded_Code_Share_Partners',
    'OriginStateName', 'DestStateName', 'OriginCityName', 'DestCityName'
]
df = df.drop(columns=columns_to_drop, errors='ignore')

# Handling Missing Values
df = df.ffill()  # forward fill as an example, adjust as needed

# Optional: Subset data for testing (e.g., use 10% of data for faster model training)
df = df.sample(frac=0.3, random_state=42)  # Adjust fraction to balance time and performance

# One-Hot Encode all categorical columns
categorical_columns = df.select_dtypes(include=['object']).columns
df = pd.get_dummies(df, columns=categorical_columns, drop_first=True)

# Feature Engineering: Create a target column
df['Delay'] = df['DepDelayMinutes'].apply(lambda x: 1 if x > 15 else 0)
df = df.drop(['DepDelayMinutes', 'ArrDelayMinutes'], axis=1)

# Split into features and target
X = df.drop('Delay', axis=1)
y = df['Delay']

# Split the data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Scale numeric features only
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [3]:
print(categorical_columns)
print(df.head())

Index(['Airline', 'Origin', 'Dest', 'Marketing_Airline_Network',
       'IATA_Code_Marketing_Airline', 'Operating_Airline',
       'IATA_Code_Operating_Airline', 'OriginState', 'DestState', 'DepTimeBlk',
       'ArrTimeBlk'],
      dtype='object')
         Cancelled  Diverted  CRSDepTime  DepTime  DepDelay  ArrTime  AirTime  \
3811797      False     False        1831   1826.0      -5.0   1923.0     31.0   
615029       False     False        1605   1605.0       0.0   1812.0    194.0   
3228533      False     False        1719   1714.0      -5.0   2052.0    314.0   
3129490      False     False        1515   1533.0      18.0   2302.0    251.0   
1273418      False     False         715    709.0      -6.0    837.0     73.0   

         CRSElapsedTime  ActualElapsedTime  Distance  ...  \
3811797            62.0               57.0     125.0  ...   
615029            212.0              247.0    1162.0  ...   
3228533           330.0              338.0    2486.0  ...   
3129490           260

In [4]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score

# Using the best parameters found initially: {'class_weight': 'balanced', 'max_depth': 10, 'n_estimators': 50}
rf_model = RandomForestClassifier(class_weight='balanced', max_depth=10, n_estimators=30, random_state=42)
cv_scores = cross_val_score(rf_model, X_train, y_train, cv=3, scoring='f1')
print("Cross-validated F1 scores:", cv_scores)
print("Average F1 score:", cv_scores.mean())

# Step 3: Train Final Model on Larger Data with Best Parameters
rf_model.fit(X_train, y_train)

# Predictions and Probabilities
y_pred = rf_model.predict(X_test)
y_prob = rf_model.predict_proba(X_test)[:, 1]

# Model Evaluation
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))
print("AUC-ROC Score:", roc_auc_score(y_test, y_prob))

# Step 4: Feature Importance Analysis
importances = rf_model.feature_importances_
feature_names = X.columns
feature_importance_df = pd.DataFrame({'Feature': feature_names, 'Importance': importances})
feature_importance_df = feature_importance_df.sort_values(by='Importance', ascending=False)

# Display top 10 features
print("Top 10 Important Features:\n", feature_importance_df.head(10))

Cross-validated F1 scores: [0.99009341 0.98608247 0.986011  ]
Average F1 score: 0.9873956268001883
Confusion Matrix:
 [[286797   1831]
 [     0  78421]]
Classification Report:
               precision    recall  f1-score   support

           0       1.00      0.99      1.00    288628
           1       0.98      1.00      0.99     78421

    accuracy                           1.00    367049
   macro avg       0.99      1.00      0.99    367049
weighted avg       1.00      1.00      1.00    367049

AUC-ROC Score: 0.9999708252399929
Top 10 Important Features:
                  Feature  Importance
30  DepartureDelayGroups    0.320494
36              ArrDelay    0.143759
29              DepDel15    0.140702
4               DepDelay    0.139985
38    ArrivalDelayGroups    0.110194
37              ArrDel15    0.064025
32             WheelsOff    0.015286
5                ArrTime    0.010297
2             CRSDepTime    0.008981
33              WheelsOn    0.005791
