In [9]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder

# Load the dataset
file_path = 'Combined_Flights_2022.csv'  # Replace with your dataset path
df = pd.read_csv(file_path)
print(df.head())

   FlightDate                                    Airline Origin Dest  \
0  2022-04-04  Commutair Aka Champlain Enterprises, Inc.    GJT  DEN   
1  2022-04-04  Commutair Aka Champlain Enterprises, Inc.    HRL  IAH   
2  2022-04-04  Commutair Aka Champlain Enterprises, Inc.    DRO  DEN   
3  2022-04-04  Commutair Aka Champlain Enterprises, Inc.    IAH  GPT   
4  2022-04-04  Commutair Aka Champlain Enterprises, Inc.    DRO  DEN   

   Cancelled  Diverted  CRSDepTime  DepTime  DepDelayMinutes  DepDelay  ...  \
0      False     False        1133   1123.0              0.0     -10.0  ...   
1      False     False         732    728.0              0.0      -4.0  ...   
2      False     False        1529   1514.0              0.0     -15.0  ...   
3      False     False        1435   1430.0              0.0      -5.0  ...   
4      False     False        1135   1135.0              0.0       0.0  ...   

   WheelsOff  WheelsOn  TaxiIn  CRSArrTime  ArrDelay  ArrDel15  \
0     1140.0    1220.0    

In [10]:
# Drop unnecessary columns if any
columns_to_drop = [
    'FlightDate', 'Tail_Number', 'Operated_or_Branded_Code_Share_Partners',
    'OriginStateName', 'DestStateName', 'OriginCityName', 'DestCityName'
]
df = df.drop(columns=columns_to_drop, errors='ignore')

# Handling Missing Values
df = df.ffill()  # forward fill as an example, adjust as needed

# Optional: Subset data for testing (e.g., use 10% of data for faster model training)
df = df.sample(frac=0.2, random_state=42)  # Adjust fraction to balance time and performance

# One-Hot Encode all categorical columns
categorical_columns = df.select_dtypes(include=['object']).columns
df = pd.get_dummies(df, columns=categorical_columns, drop_first=True)

# Feature Engineering: Create a target column
df['Delay'] = df['DepDelayMinutes'].apply(lambda x: 1 if x > 15 else 0)
df = df.drop(['DepDelayMinutes', 'ArrDelayMinutes'], axis=1)

# Split into features and target
X = df.drop('Delay', axis=1)
y = df['Delay']

# Split the data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Scale numeric features only
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [11]:
print(categorical_columns)
print(df.head())

Index(['Airline', 'Origin', 'Dest', 'Marketing_Airline_Network',
       'IATA_Code_Marketing_Airline', 'Operating_Airline',
       'IATA_Code_Operating_Airline', 'OriginState', 'DestState', 'DepTimeBlk',
       'ArrTimeBlk'],
      dtype='object')
         Cancelled  Diverted  CRSDepTime  DepTime  DepDelay  ArrTime  AirTime  \
3811797      False     False        1831   1826.0      -5.0   1923.0     31.0   
615029       False     False        1605   1605.0       0.0   1812.0    194.0   
3228533      False     False        1719   1714.0      -5.0   2052.0    314.0   
3129490      False     False        1515   1533.0      18.0   2302.0    251.0   
1273418      False     False         715    709.0      -6.0    837.0     73.0   

         CRSElapsedTime  ActualElapsedTime  Distance  ...  \
3811797            62.0               57.0     125.0  ...   
615029            212.0              247.0    1162.0  ...   
3228533           330.0              338.0    2486.0  ...   
3129490           260

In [None]:
# from sklearn.model_selection import GridSearchCV
# from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score

# # Define the parameter grid
# param_grid_rf = {
#     'n_estimators': [50],         # Start with fewer trees
#     'max_depth': [10],            # Limit tree depth to control complexity
#     'class_weight': ['balanced']
# }

# rf = RandomForestClassifier(random_state=42, n_jobs=-1)  # Use all CPU cores
# grid_rf = GridSearchCV(rf, param_grid_rf, scoring='f1', cv=5)
# grid_rf.fit(X_train, y_train)
# # 
# # Retrieve the best estimator
# best_rf = grid_rf.best_estimator_
# print("Best parameters for Random Forest:", grid_rf.best_params_)


In [5]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score

# Using the best parameters found initially: {'class_weight': 'balanced', 'max_depth': 10, 'n_estimators': 50}
rf_model = RandomForestClassifier(class_weight='balanced', max_depth=10, n_estimators=30, random_state=42)
cv_scores = cross_val_score(rf_model, X_train, y_train, cv=3, scoring='f1')
print("Cross-validated F1 scores:", cv_scores)
print("Average F1 score:", cv_scores.mean())

# Step 3: Train Final Model on Larger Data with Best Parameters
rf_model.fit(X_train, y_train)

# Predictions and Probabilities
y_pred = rf_model.predict(X_test)
y_prob = rf_model.predict_proba(X_test)[:, 1]

# Model Evaluation
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))
print("AUC-ROC Score:", roc_auc_score(y_test, y_prob))

# Step 4: Feature Importance Analysis
importances = rf_model.feature_importances_
feature_names = X.columns
feature_importance_df = pd.DataFrame({'Feature': feature_names, 'Importance': importances})
feature_importance_df = feature_importance_df.sort_values(by='Importance', ascending=False)

# Display top 10 features
print("Top 10 Important Features:\n", feature_importance_df.head(10))

Cross-validated F1 scores: [0.9929949  0.98868282 0.98620656]
Average F1 score: 0.989294757924104
Confusion Matrix:
 [[190561   1319]
 [     0  52820]]
Classification Report:
               precision    recall  f1-score   support

           0       1.00      0.99      1.00    191880
           1       0.98      1.00      0.99     52820

    accuracy                           0.99    244700
   macro avg       0.99      1.00      0.99    244700
weighted avg       0.99      0.99      0.99    244700

AUC-ROC Score: 0.9998102345614375
Top 10 Important Features:
                  Feature  Importance
30  DepartureDelayGroups    0.326621
29              DepDel15    0.165277
36              ArrDelay    0.121037
4               DepDelay    0.104191
38    ArrivalDelayGroups    0.098288
37              ArrDel15    0.082403
32             WheelsOff    0.015200
5                ArrTime    0.012642
2             CRSDepTime    0.011004
3                DepTime    0.010100


In [17]:
from xgboost import XGBClassifier
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score

# Initialize XGBoost classifier
xgb = XGBClassifier(use_label_encoder=False, eval_metric='logloss')

# Train the model without early stopping
xgb.fit(X_train, y_train, verbose=True)

# Predictions and evaluation
y_pred_xgb = xgb.predict(X_test)

# Confusion Matrix
conf_matrix_xgb = confusion_matrix(y_test, y_pred_xgb)
print("Confusion Matrix:\n", conf_matrix_xgb)

# Classification Report
class_report_xgb = classification_report(y_test, y_pred_xgb)
print("Classification Report:\n", class_report_xgb)

# AUC-ROC Score
roc_auc_xgb = roc_auc_score(y_test, xgb.predict_proba(X_test)[:, 1])
print("AUC-ROC Score:", roc_auc_xgb)


Parameters: { "use_label_encoder" } are not used.



Confusion Matrix:
 [[191880      0]
 [     0  52820]]
Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00    191880
           1       1.00      1.00      1.00     52820

    accuracy                           1.00    244700
   macro avg       1.00      1.00      1.00    244700
weighted avg       1.00      1.00      1.00    244700

AUC-ROC Score: 1.0


In [19]:
from xgboost import XGBClassifier
from sklearn.model_selection import cross_validate, StratifiedKFold
from sklearn.metrics import make_scorer, roc_auc_score

# Initialize the XGBoost classifier
xgb = XGBClassifier(use_label_encoder=False, eval_metric='logloss')

# Define the cross-validation strategy (e.g., 5-fold stratified cross-validation)
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Define scoring metrics for cross-validation
scoring = {
    'accuracy': 'accuracy',
    'roc_auc': 'roc_auc',
    'f1': 'f1'
}

# Perform cross-validation with multiple scoring metrics
cv_results = cross_validate(xgb, X, y, cv=cv, scoring=scoring, n_jobs=-1)

# Print cross-validated scores
print("Cross-validated Accuracy scores:", cv_results['test_accuracy'])
print("Cross-validated ROC AUC scores:", cv_results['test_roc_auc'])
print("Cross-validated F1 scores:", cv_results['test_f1'])
print("Average Accuracy:", cv_results['test_accuracy'].mean())
print("Average ROC AUC:", cv_results['test_roc_auc'].mean())
print("Average F1 score:", cv_results['test_f1'].mean())


Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.



Cross-validated Accuracy scores: [1. 1. 1. 1. 1.]
Cross-validated ROC AUC scores: [1. 1. 1. 1. 1.]
Cross-validated F1 scores: [1. 1. 1. 1. 1.]
Average Accuracy: 1.0
Average ROC AUC: 1.0
Average F1 score: 1.0


In [20]:
from xgboost import XGBClassifier
from sklearn.model_selection import cross_validate, StratifiedKFold
from sklearn.metrics import roc_auc_score, classification_report, confusion_matrix

# Initialize the XGBoost classifier
xgb = XGBClassifier(eval_metric='logloss')  # 'eval_metric' set to 'logloss' for binary classification

# Define the cross-validation strategy
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Define scoring metrics for cross-validation
scoring = {
    'accuracy': 'accuracy',
    'roc_auc': 'roc_auc',
    'f1': 'f1'
}

# Perform cross-validation
cv_results = cross_validate(xgb, X, y, cv=cv, scoring=scoring, n_jobs=-1, return_train_score=True)

# Print cross-validated scores
print("Cross-validated Accuracy scores:", cv_results['test_accuracy'])
print("Cross-validated ROC AUC scores:", cv_results['test_roc_auc'])
print("Cross-validated F1 scores:", cv_results['test_f1'])
print("Average Accuracy:", cv_results['test_accuracy'].mean())
print("Average ROC AUC:", cv_results['test_roc_auc'].mean())
print("Average F1 score:", cv_results['test_f1'].mean())

# Fit the model on the entire training set
xgb.fit(X, y)

# Predict on the test set (replace X_test with your test data if separate)
y_pred = xgb.predict(X_test)
y_pred_proba = xgb.predict_proba(X_test)[:, 1]

# Calculate and display additional evaluation metrics
conf_matrix = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:\n", conf_matrix)

classification_rep = classification_report(y_test, y_pred)
print("Classification Report:\n", classification_rep)

auc_score = roc_auc_score(y_test, y_pred_proba)
print("AUC-ROC Score:", auc_score)


Cross-validated Accuracy scores: [1. 1. 1. 1. 1.]
Cross-validated ROC AUC scores: [1. 1. 1. 1. 1.]
Cross-validated F1 scores: [1. 1. 1. 1. 1.]
Average Accuracy: 1.0
Average ROC AUC: 1.0
Average F1 score: 1.0
Confusion Matrix:
 [[191880      0]
 [ 52662    158]]
Classification Report:
               precision    recall  f1-score   support

           0       0.78      1.00      0.88    191880
           1       1.00      0.00      0.01     52820

    accuracy                           0.78    244700
   macro avg       0.89      0.50      0.44    244700
weighted avg       0.83      0.78      0.69    244700

AUC-ROC Score: 0.5041177584248391


In [22]:
from xgboost import XGBClassifier
from sklearn.metrics import confusion_matrix, classification_report, roc_auc_score

# Instantiate XGBClassifier
xgb = XGBClassifier(
    scale_pos_weight=191880/52820,  # Adjust for class imbalance
    use_label_encoder=False,  # Suppress unnecessary warning
    eval_metric='logloss'     # Set evaluation metric
)

# Train the model
xgb.fit(X_train, y_train)

# Predictions and Evaluation
y_pred_xgb = xgb.predict(X_test)
y_proba_xgb = xgb.predict_proba(X_test)[:, 1]

# Confusion Matrix
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_xgb))

# Classification Report
print("Classification Report:\n", classification_report(y_test, y_pred_xgb))

# AUC-ROC Score
roc_auc = roc_auc_score(y_test, y_proba_xgb)
print("AUC-ROC Score:", roc_auc)


Parameters: { "use_label_encoder" } are not used.



Confusion Matrix:
 [[191880      0]
 [     0  52820]]
Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00    191880
           1       1.00      1.00      1.00     52820

    accuracy                           1.00    244700
   macro avg       1.00      1.00      1.00    244700
weighted avg       1.00      1.00      1.00    244700

AUC-ROC Score: 1.0
