In [23]:
import pandas as pd
train_df=pd.read_csv("merged_features_drop_cleaned_train_data.csv")
test_df=pd.read_csv("merged_features_drop_cleaned_test_data.csv")


In [25]:
# For demonstration, using forward fill. Adjust based on your data's characteristics.
train_df.fillna(method='ffill', inplace=True)
test_df.fillna(method='ffill', inplace=True)


  train_df.fillna(method='ffill', inplace=True)
  test_df.fillna(method='ffill', inplace=True)


In [29]:
train_df.isnull().sum()

status                          0
departure.iataCode              0
departure.icaoCode              0
departure.terminal              0
departure.scheduledTime         0
departure.estimatedTime         0
departure.actualTime            0
departure.estimatedRunway       0
departure.actualRunway          0
arrival.scheduledTime           0
arrival.estimatedTime           0
airline.iataCode                0
airline.icaoCode                0
flight.number                   0
flight.iataNumber               0
flight.icaoNumber               0
arrival.actualTime              0
arrival.estimatedRunway      1028
arrival.actualRunway            0
departure.delay_minutes         0
departure.hour_of_day           0
departure.month                 0
status_encoded                  0
Departure_Date                  0
Departure_Month                 0
Departure_Day                   0
Departure_Hour                  0
Month                           0
Day                             0
Temperature (°

In [None]:
datetime_cols = [
    'departure.estimatedTime',
    'departure.actualTime']

for col in datetime_cols:
    train_df[col] = pd.to_datetime(train_df[col], errors='coerce')
    test_df[col] = pd.to_datetime(test_df[col], errors='coerce')

# After conversion, handle any new missing values introduced by errors='coerce'
train_df.fillna(method='ffill', inplace=True)
#test_df.fillna(method='ffill', inplace=True)


In [None]:
import pandas as pd
import numpy as np

# For preprocessing and modeling
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score, KFold
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import (accuracy_score, precision_score, recall_score, f1_score,
                             classification_report, confusion_matrix,
                             mean_absolute_error, mean_squared_error)
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
import matplotlib.pyplot as plt
import seaborn as sns

# Suppress warnings for cleaner output
import warnings
warnings.filterwarnings('ignore')

# ==============================================
# PHASE 3: Analytical and Predictive Tasks
# ==============================================

# 1. Load Data
train_df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')

# 2. Data Preprocessing

# Function to preprocess data
def preprocess_data(train, test):
    # Handle missing values
    train.fillna(method='ffill', inplace=True)
    test.fillna(method='ffill', inplace=True)
    
    # Convert object time columns to datetime
    datetime_cols = [
        'departure.scheduledTime',
        'departure.estimatedTime',
        'departure.actualTime',
        'departure.estimatedRunway',
        'departure.actualRunway',
        'arrival.scheduledTime',
        'arrival.estimatedTime',
        'arrival.actualTime',
        'arrival.estimatedRunway',
        'arrival.actualRunway'
    ]
    
    for col in datetime_cols:
        # Convert to datetime, coerce errors to NaT
        train[col] = pd.to_datetime(train[col], errors='coerce')
        test[col] = pd.to_datetime(test[col], errors='coerce')
    
    # After conversion, handle any new missing values
    train.fillna(method='ffill', inplace=True)
    test.fillna(method='ffill', inplace=True)
    
    # Feature Engineering: Extract datetime features
    def extract_datetime_features(df, datetime_columns):
        for col in datetime_columns:
            df[f"{col}_hour"] = df[col].dt.hour
            df[f"{col}_minute"] = df[col].dt.minute
            df[f"{col}_dayofweek"] = df[col].dt.dayofweek
            df[f"{col}_day"] = df[col].dt.day
            df[f"{col}_month"] = df[col].dt.month
            df[f"{col}_year"] = df[col].dt.year
        return df
    
    train = extract_datetime_features(train, datetime_cols)
    test = extract_datetime_features(test, datetime_cols)
    
    # Drop original datetime columns if not needed
    train.drop(columns=datetime_cols, inplace=True)
    test.drop(columns=datetime_cols, inplace=True)
    
    # Encode categorical variables if necessary
    # Identify categorical and numerical features
    categorical_features = train.select_dtypes(include=['object', 'category']).columns.tolist()
    categorical_features.remove('delay_status')  # Assuming 'delay_status' is similar to target
    categorical_features.remove('delay_category')  # Will be used for multi-class
    categorical_features.remove('delay')  # If present
    categorical_features = [col for col in categorical_features if col not in ['binary_target', 'multi_target']]
    
    numerical_features = train.select_dtypes(include=['int64', 'float64']).columns.tolist()
    numerical_features.remove('departure.delay_minutes')  # Target for regression
    
    # Remove target variables from features
    X_train = train.drop(['departure.delay_minutes'], axis=1)
    X_test = test.copy()
    
    return X_train, test, numerical_features, categorical_features

# Preprocess the data
X_train_full, X_test_full, numerical_features, categorical_features = preprocess_data(train_df, test_df)

# ==============================================
# 3. Define Target Variables
# ==============================================

# Binary Classification: 'on-time' vs 'delayed'
X_train_full['binary_target'] = X_train_full['departure.delay_minutes'].apply(lambda x: 'on-time' if x == 0 else 'delayed')

# Multi-Class Classification: No Delay, Short Delay, Moderate Delay, Long Delay
def categorize_delay(x):
    if x == 0:
        return 'No Delay'
    elif x < 45:
        return 'Short Delay'
    elif 45 <= x <= 175:
        return 'Moderate Delay'
    else:
        return 'Long Delay'

X_train_full['multi_target'] = X_train_full['departure.delay_minutes'].apply(categorize_delay)

# Regression Target: 'departure.delay_minutes'
y_reg = X_train_full['departure.delay_minutes']

# Prepare features for Binary and Multi-Class Classification
X_bin = X_train_full.drop(['departure.delay_minutes', 'multi_target'], axis=1)
y_bin = X_train_full['binary_target']

X_multi = X_train_full.drop(['departure.delay_minutes', 'binary_target'], axis=1)
y_multi = X_train_full['multi_target']

# ==============================================
# 4. Define Preprocessing Pipelines
# ==============================================

# Preprocessor for numerical and categorical features
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_features),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
    ])

# ==============================================
# 5. Binary Classification
# ==============================================

# Train-Test Split for Binary Classification
X_train_bin, X_val_bin, y_train_bin, y_val_bin = train_test_split(
    X_bin, y_bin, test_size=0.2, random_state=42, stratify=y_bin)

# Define Binary Classification Pipeline
binary_clf_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(random_state=42))
])

# Train the Binary Classification Model
binary_clf_pipeline.fit(X_train_bin, y_train_bin)

# Predict on Validation Set
y_pred_bin = binary_clf_pipeline.predict(X_val_bin)

# Evaluate Binary Classification
print("=== Binary Classification Metrics ===")
print("Accuracy:", accuracy_score(y_val_bin, y_pred_bin))
print("Precision:", precision_score(y_val_bin, y_pred_bin, pos_label='delayed'))
print("Recall:", recall_score(y_val_bin, y_pred_bin, pos_label='delayed'))
print("F1-Score:", f1_score(y_val_bin, y_pred_bin, pos_label='delayed'))
print("Confusion Matrix:\n", confusion_matrix(y_val_bin, y_pred_bin, labels=['on-time', 'delayed']))
print("Classification Report:\n", classification_report(y_val_bin, y_pred_bin))

# ==============================================
# 6. Multi-Class Classification
# ==============================================

# Train-Test Split for Multi-Class Classification
X_train_multi, X_val_multi, y_train_multi, y_val_multi = train_test_split(
    X_multi, y_multi, test_size=0.2, random_state=42, stratify=y_multi)

# Define Multi-Class Classification Pipeline
multi_clf_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(random_state=42))
])

# Train the Multi-Class Classification Model
multi_clf_pipeline.fit(X_train_multi, y_train_multi)

# Predict on Validation Set
y_pred_multi = multi_clf_pipeline.predict(X_val_multi)

# Evaluate Multi-Class Classification
print("=== Multi-Class Classification Metrics ===")
print("Accuracy:", accuracy_score(y_val_multi, y_pred_multi))
print("Classification Report:\n", classification_report(y_val_multi, y_pred_multi))
print("Confusion Matrix:\n", confusion_matrix(y_val_multi, y_pred_multi))

# ==============================================
# 7. Regression Analysis
# ==============================================

# Train-Test Split for Regression
X_train_reg, X_val_reg, y_train_reg, y_val_reg = train_test_split(
    X_train_full.drop(['binary_target', 'multi_target'], axis=1),
    y_reg,
    test_size=0.2,
    random_state=42
)

# Define Regression Pipeline
reg_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', RandomForestRegressor(random_state=42))
])

# Train the Regression Model
reg_pipeline.fit(X_train_reg, y_train_reg)

# Predict on Validation Set
y_pred_reg = reg_pipeline.predict(X_val_reg)

# Evaluate Regression
mae = mean_absolute_error(y_val_reg, y_pred_reg)
rmse = np.sqrt(mean_squared_error(y_val_reg, y_pred_reg))
print("=== Regression Metrics ===")
print("MAE:", mae)
print("RMSE:", rmse)

# ==============================================
# PHASE 4: Model Optimization and Evaluation
# ==============================================

# 1. Hyperparameter Tuning for Binary Classification
param_grid_bin = {
    'classifier__n_estimators': [100, 200, 300],
    'classifier__max_depth': [None, 10, 20, 30],
    'classifier__min_samples_split': [2, 5, 10]
}

grid_search_bin = GridSearchCV(binary_clf_pipeline, param_grid_bin, cv=5, scoring='f1', n_jobs=-1, verbose=2)
grid_search_bin.fit(X_train_bin, y_train_bin)

print("=== Best Parameters for Binary Classification ===")
print(grid_search_bin.best_params_)

best_binary_model = grid_search_bin.best_estimator_

# Evaluate Best Binary Model
y_pred_bin_best = best_binary_model.predict(X_val_bin)
print("=== Binary Classification Metrics After Hyperparameter Tuning ===")
print("Accuracy:", accuracy_score(y_val_bin, y_pred_bin_best))
print("Precision:", precision_score(y_val_bin, y_pred_bin_best, pos_label='delayed'))
print("Recall:", recall_score(y_val_bin, y_pred_bin_best, pos_label='delayed'))
print("F1-Score:", f1_score(y_val_bin, y_pred_bin_best, pos_label='delayed'))
print("Confusion Matrix:\n", confusion_matrix(y_val_bin, y_pred_bin_best, labels=['on-time', 'delayed']))
print("Classification Report:\n", classification_report(y_val_bin, y_pred_bin_best))

# 2. K-Fold Cross-Validation for Regression
kf = KFold(n_splits=5, shuffle=True, random_state=42)
cv_scores_reg = cross_val_score(reg_pipeline, X_train_reg, y_train_reg, cv=kf, scoring='neg_mean_squared_error')
rmse_scores_reg = np.sqrt(-cv_scores_reg)
print("=== Regression Cross-Validation RMSE ===")
print("Scores:", rmse_scores_reg)
print("Mean RMSE:", rmse_scores_reg.mean())

# 3. Model Comparison (Example: Random Forest vs. Logistic Regression vs. SVM for Binary Classification)

# Define multiple models
models = {
    'RandomForest': Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('classifier', RandomForestClassifier(random_state=42))
    ]),
    'LogisticRegression': Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('classifier', LogisticRegression(random_state=42, max_iter=1000))
    ]),
    'SVM': Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('classifier', SVC(random_state=42, probability=True))
    ]),
    'DecisionTree': Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('classifier', DecisionTreeClassifier(random_state=42))
    ]),
    'KNN': Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('classifier', KNeighborsClassifier())
    ]),
    'NaiveBayes': Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('classifier', GaussianNB())
    ])
}

# Evaluate each model using cross-validation
for name, model in models.items():
    cv_scores = cross_val_score(model, X_bin, y_bin, cv=5, scoring='f1', n_jobs=-1)
    print(f"{name} F1 Score: {cv_scores.mean():.4f} (+/- {cv_scores.std():.4f})")

# Choose the best performing model based on cross-validation scores
# For demonstration, assume RandomForest performed best

# ==============================================
# PHASE 5: Model Testing and Submission
# ==============================================

# 1. Make Predictions on the Test Dataset

# Prepare the test set features (ensure same preprocessing)
test_X = X_test_full.drop(['departure.delay_minutes', 'binary_target', 'multi_target'], axis=1, errors='ignore')

# Binary Classification Prediction using best_binary_model
test_pred_bin = best_binary_model.predict(test_X)

# Multi-Class Classification Prediction (if you have optimized multi-class model)
# Assuming you have a best_multi_model (similar to best_binary_model)
# For this example, we'll use the previously trained multi_clf_pipeline
test_pred_multi = multi_clf_pipeline.predict(test_X)

# Regression Prediction using reg_pipeline
test_pred_reg = reg_pipeline.predict(test_X)

# 2. Save Predictions in Kaggle Submission Format

# Define a function to create submission files
def create_submission(test_df, predictions, submission_type='binary'):
    # Ensure that the necessary columns are present in test_df
    required_columns = [
        'File Name', 'Flight Number', 'Type', 'Status',
        'Departure IATA Code', 'Departure ICAO Code',
        'Scheduled Time', 'Arrival IATA Code',
        'Arrival ICAO Code', 'Arrival Estimated Time'
    ]
    
    # Check if required columns exist
    for col in required_columns:
        if col not in test_df.columns:
            raise ValueError(f"Missing required column: {col}")
    
    submission = test_df[required_columns].copy()
    
    if submission_type == 'binary':
        # Convert predictions to string format
        submission['Delay'] = predictions
    elif submission_type == 'multi':
        submission['Delay'] = predictions
    elif submission_type == 'regression':
        # Convert delay minutes to integer or keep as float based on Kaggle requirements
        submission['Delay'] = predictions.round().astype(int)
    else:
        raise ValueError("Invalid submission type. Choose from 'binary', 'multi', or 'regression'.")
    
    return submission

# Create Binary Classification Submission
submission_bin = create_submission(test_df, test_pred_bin, submission_type='binary')
submission_bin.to_csv('submission_binary_classification.csv', index=False)
print("Binary Classification submission file created: 'submission_binary_classification.csv'")

# Create Multi-Class Classification Submission
submission_multi = create_submission(test_df, test_pred_multi, submission_type='multi')
submission_multi.to_csv('submission_multi_classification.csv', index=False)
print("Multi-Class Classification submission file created: 'submission_multi_classification.csv'")

# Create Regression Submission
submission_reg = create_submission(test_df, test_pred_reg, submission_type='regression')
submission_reg.to_csv('submission_regression.csv', index=False)
print("Regression submission file created: 'submission_regression.csv'")

# ==============================================
# Additional: Feature Importance Plot (Optional)
# ==============================================

# Plot feature importances for Binary Classification
def plot_feature_importances(model, preprocessor, numerical_features, categorical_features, top_n=20):
    # Get feature names after preprocessing
    ohe = preprocessor.named_transformers_['cat'].named_steps['onehot']
    ohe_features = ohe.get_feature_names_out(categorical_features)
    feature_names = numerical_features + list(ohe_features)
    
    # Get feature importances from the model
    importances = model.named_steps['classifier'].feature_importances_
    indices = np.argsort(importances)[::-1]
    
    # Plot
    plt.figure(figsize=(10, 8))
    sns.barplot(x=importances[indices][:top_n], y=np.array(feature_names)[indices][:top_n])
    plt.title(f"Top {top_n} Feature Importances")
    plt.xlabel("Importance")
    plt.ylabel("Feature")
    plt.tight_layout()
    plt.show()

# Example: Plot feature importances for best_binary_model
plot_feature_importances(best_binary_model, preprocessor, numerical_features, categorical_features, top_n=20)

# ==============================================
# END OF SCRIPT
# ==============================================


In [16]:
test_df

Unnamed: 0,status,departure.iataCode,departure.icaoCode,departure.scheduledTime,departure.estimatedRunway,departure.actualRunway,arrival.scheduledTime,arrival.estimatedTime,airline.iataCode,airline.icaoCode,flight.number,flight.iataNumber,flight.icaoNumber,departure.terminal,arrival.actualTime,arrival.estimatedRunway,arrival.actualRunway,departure.hour_of_day,departure.month,status_encoded,departure.day_of_week_Monday,departure.day_of_week_Saturday,departure.day_of_week_Sunday,departure.day_of_week_Thursday,departure.day_of_week_Tuesday,departure.day_of_week_Wednesday,airline.name_air arabia,airline.name_air canada,airline.name_air china ltd,airline.name_air france,airline.name_air mauritius,airline.name_airact,airline.name_airblue,airline.name_airsial,airline.name_alitalia,airline.name_american airlines,airline.name_ariana afghan airlines,airline.name_asiana airlines,airline.name_azal azerbaijan airlines,airline.name_batik air,airline.name_british airways,airline.name_cham wings airlines,airline.name_china southern airlines,airline.name_danish air,airline.name_egyptair,airline.name_emirates,airline.name_empty,airline.name_ethiopian airlines,airline.name_etihad airways,airline.name_evelop airlines,airline.name_fai rent-a-jet,airline.name_fitsair,airline.name_fly baghdad,airline.name_flydubai,airline.name_flyjinnah,airline.name_flynas,airline.name_georgian airlines,airline.name_gulf air,airline.name_harmony jets,airline.name_hi fly,...,airline.name_klm,airline.name_kuwait airways,airline.name_lion air,airline.name_mahan air,airline.name_malaysia airlines,airline.name_maleth-aero,airline.name_malindo air,airline.name_nomadic aviation,airline.name_oman air,airline.name_pakistan international airlines,airline.name_pegasus,airline.name_privilege style,airline.name_qatar airways,airline.name_redstar aviation,airline.name_royal air maroc,airline.name_royal jordanian,airline.name_rwandair,airline.name_salamair,airline.name_saudia,airline.name_scat airlines,airline.name_serene air,airline.name_silk way airlines,airline.name_smartlynx airlines,airline.name_smartwings,airline.name_somon air,airline.name_srilankan airlines,airline.name_swiss air-ambulance,airline.name_thai airways international,airline.name_turkish airlines,airline.name_uls airlines cargo,airline.name_virgin australia,airline.name_yto cargo airlines,Departure_Date,Departure_Month,Departure_Day,Month,Day,Temperature (°F) Max,Temperature (°F) Avg,Temperature (°F) Min,Dew Point (°F) Max,Dew Point (°F) Avg,Dew Point (°F) Min,Humidity (%) Max,Humidity (%) Avg,Humidity (%) Min,Wind Speed (mph) Max,Wind Speed (mph) Avg,Wind Speed (mph) Min,Pressure (in) Max,Pressure (in) Avg,Pressure (in) Min,Precipitation (in) Total,Departure_Hour,Departure_DayOfWeek_Monday,Departure_DayOfWeek_Saturday,Departure_DayOfWeek_Sunday,Departure_DayOfWeek_Thursday,Departure_DayOfWeek_Tuesday,Departure_DayOfWeek_Wednesday
0,0,lhe,opla,2023-07-17 20:35:00,2023-07-17 20:46:00,2023-07-17 20:46:00,2023-07-17 22:20:00,2023-07-17 22:12:00,9p,fjl,847,9p847,fjl847,Unknown,2023-07-17 22:12:00,,2023-07-17 22:20:00,20,7,0,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,Jul 17,Jul,17,Jul,17.0,95.0,88.6,82.0,81.0,78.1,75.0,89.0,71.3,59.0,12.0,6.9,0.0,28.8,28.8,28.7,0.0,20,True,False,False,False,False,False
1,0,lhe,opla,2023-07-27 08:00:00,2023-07-17 20:46:00,2023-07-27 08:00:00,2023-07-27 10:00:00,2023-07-17 22:12:00,pk,pia,725,pk725,pia725,m,2023-07-27 10:00:00,,2023-07-27 10:00:00,8,7,0,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,Jul 27,Jul,27,Jul,27.0,88.0,82.9,75.0,79.0,77.1,73.0,100.0,83.7,70.0,16.0,7.5,0.0,28.9,28.9,28.8,0.0,8,False,False,False,True,False,False
2,0,lhe,opla,2023-07-27 08:00:00,2023-07-17 20:46:00,2023-07-27 08:00:00,2023-07-27 10:00:00,2023-07-17 22:12:00,et,eth,4359,et4359,eth4359,m,2023-07-27 10:00:00,,2023-07-27 10:00:00,8,7,0,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,Jul 27,Jul,27,Jul,27.0,88.0,82.9,75.0,79.0,77.1,73.0,100.0,83.7,70.0,16.0,7.5,0.0,28.9,28.9,28.8,0.0,8,False,False,False,True,False,False
3,2,lhe,opla,2023-07-28 16:45:00,2023-07-17 20:46:00,2023-07-28 16:45:00,2023-07-28 20:30:00,2023-07-17 22:12:00,pa,abq,470,pa470,abq470,m,2023-07-28 20:30:00,,2023-07-28 20:30:00,16,7,2,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,Jul 28,Jul,28,Jul,28.0,93.0,86.3,81.0,79.0,76.7,75.0,89.0,73.8,59.0,13.0,7.4,0.0,28.8,28.8,28.7,0.0,16,False,False,False,False,False,False
4,0,lhe,opla,2023-07-19 04:15:00,2023-07-19 04:18:00,2023-07-19 04:18:00,2023-07-19 06:35:00,2023-07-19 06:08:00,kl,klm,3932,kl3932,klm3932,m,2023-07-19 06:08:00,,2023-07-19 06:35:00,4,7,0,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,...,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,Jul 19,Jul,19,Jul,19.0,90.0,84.4,81.0,81.0,79.8,77.0,100.0,85.3,75.0,23.0,11.3,0.0,28.8,28.8,28.7,0.0,4,False,False,False,False,False,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14977,0,lhe,opla,2023-11-23 23:15:00,2023-11-23 23:27:00,2023-11-23 23:27:00,2023-11-24 02:05:00,2023-11-24 01:34:00,pa,abq,412,pa412,abq412,m,2023-11-24 01:34:00,2023-11-24 00:34:00,2023-11-24 02:05:00,23,11,0,False,False,False,True,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,Nov 23,Nov,23,Nov,23.0,75.0,63.0,0.0,57.0,52.9,0.0,88.0,69.3,0.0,6.0,0.7,0.0,29.2,28.6,0.0,0.0,23,False,False,False,True,False,False
14978,0,lhe,opla,2023-11-25 18:45:00,2023-11-25 19:05:00,2023-11-25 19:05:00,2023-11-25 22:05:00,2023-11-25 22:09:00,pk,pia,859,pk859,pia859,m,2023-11-25 22:09:00,2023-11-24 00:34:00,2023-11-25 22:05:00,18,11,0,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,Nov 25,Nov,25,Nov,25.0,73.0,63.0,54.0,54.0,52.2,48.0,88.0,69.2,44.0,8.0,1.0,0.0,29.3,29.3,29.2,0.0,18,False,True,False,False,False,False
14979,0,lhe,opla,2023-11-25 02:55:00,2023-11-25 03:07:00,2023-11-25 03:07:00,2023-11-25 05:05:00,2023-11-25 04:33:00,ba,baw,6187,ba6187,baw6187,m,2023-11-25 04:33:00,2023-11-24 00:34:00,2023-11-25 05:05:00,2,11,0,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,Nov 25,Nov,25,Nov,25.0,73.0,63.0,54.0,54.0,52.2,48.0,88.0,69.2,44.0,8.0,1.0,0.0,29.3,29.3,29.2,0.0,2,False,True,False,False,False,False
14980,0,lhe,opla,2023-11-27 08:35:00,2023-11-25 03:07:00,2023-11-27 08:35:00,2023-11-27 10:45:00,2023-11-25 04:33:00,wy,oma,6032,wy6032,oma6032,m,2023-11-27 10:45:00,2023-11-24 00:34:00,2023-11-27 10:45:00,8,11,0,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,Nov 27,Nov,27,Nov,27.0,70.0,62.7,55.0,55.0,54.3,52.0,88.0,74.4,60.0,12.0,2.3,0.0,29.3,29.3,29.2,0.0,8,True,False,False,False,False,False


In [62]:
import pandas as pd 

# Load datasets (train and test data)
train_df = pd.read_csv('merged_features_drop_cleaned_train_data.csv')
test_df = pd.read_csv('merged_features_drop_cleaned_test_data.csv')

print(train_df)
print(test_df)
print(train_df.info())
print(test_df.info())
print(train_df.isnull().sum())
print(test_df.isnull().sum())


       status departure.iataCode  ... departure.actualTime_month departure.actualTime_year
0           0                lhe  ...                          7                      2023
1           0                lhe  ...                          7                      2023
2           0                lhe  ...                          7                      2023
3           0                lhe  ...                          7                      2023
4           0                lhe  ...                          7                      2023
...       ...                ...  ...                        ...                       ...
51866       0                lhe  ...                         11                      2023
51867       0                lhe  ...                         11                      2023
51868       0                lhe  ...                         11                      2023
51869       0                lhe  ...                         11                      2023

In [3]:
train_df=pd.read_csv('merged_features_drop_cleaned_train_data.csv')
test_df=pd.read_csv('merged_features_drop_cleaned_test_data.csv')


In [5]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score, KFold
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report, confusion_matrix, mean_absolute_error, mean_squared_error
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.linear_model import LinearRegression



##############################################
# PHASE 3: Analytical and Predictive Tasks
##############################################

# Load your train and test datasets
# train_df and test_df should be preprocessed (handle missing values, convert datetimes, feature engineering, etc.)
# Ensure that columns in test_df are in the same format as train_df.
# Assume that 'departure.delay_minutes' is present in train_df and needs to be predicted or classified.
#train_df = pd.read_csv("train.csv")
#test_df = pd.read_csv("test.csv")

In [7]:


# Example feature selection (adjust according to your dataset)
# Let's assume we've already chosen a set of relevant numerical and categorical features.
numeric_features = ['Temperature (°F) Avg', 'Humidity (%) Avg', 'Wind Speed (mph) Avg', 'Pressure (in) Avg']
categorical_features = ['departure.iataCode', 'departure.icaoCode', 'airline.iataCode']

# Common preprocessor
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numeric_features),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
    ])



### 1. Binary Classification ###
# Binary Criteria:
# delay = 0 -> on-time
# delay > 0 -> delayed


In [9]:


train_df['binary_target'] = train_df['departure.delay_minutes'].apply(lambda x: 'on-time' if x == 0 else 'delayed')
X_bin = train_df[numeric_features + categorical_features]
y_bin = train_df['binary_target']

X_train_bin, X_val_bin, y_train_bin, y_val_bin = train_test_split(X_bin, y_bin, test_size=0.2, random_state=42, stratify=y_bin)

binary_clf_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(random_state=42))
])

binary_clf_pipeline.fit(X_train_bin, y_train_bin)
y_pred_bin = binary_clf_pipeline.predict(X_val_bin)



In [11]:
# Evaluation for Binary Classification
acc = accuracy_score(y_val_bin, y_pred_bin)
prec = precision_score(y_val_bin, y_pred_bin, pos_label='delayed')
rec = recall_score(y_val_bin, y_pred_bin, pos_label='delayed')
f1 = f1_score(y_val_bin, y_pred_bin, pos_label='delayed')
cm = confusion_matrix(y_val_bin, y_pred_bin, labels=['on-time', 'delayed'])

print("Binary Classification Metrics:")
print("Accuracy:", acc)
print("Precision:", prec)
print("Recall:", rec)
print("F1-Score:", f1)
print("Confusion Matrix:\n", cm)
print("Classification Report:\n", classification_report(y_val_bin, y_pred_bin))


Binary Classification Metrics:
Accuracy: 0.7332048192771085
Precision: 0.7873000716161375
Recall: 0.8699551569506726
F1-Score: 0.8265664160401003
Confusion Matrix:
 [[1011 1782]
 [ 986 6596]]
Classification Report:
               precision    recall  f1-score   support

     delayed       0.79      0.87      0.83      7582
     on-time       0.51      0.36      0.42      2793

    accuracy                           0.73     10375
   macro avg       0.65      0.62      0.62     10375
weighted avg       0.71      0.73      0.72     10375



# Xgboost

In [139]:
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report, confusion_matrix
from xgboost import XGBClassifier

# Assuming 'train_df', 'numeric_features', and 'categorical_features' are already defined

# Convert the 'binary_target' to numeric (0: 'on-time', 1: 'delayed')
train_df['binary_target'] = train_df['departure.delay_minutes'].apply(lambda x: 0 if x == 0 else 1)

# Define the features and target
X_bin = train_df[numeric_features + categorical_features]
y_bin = train_df['binary_target']

# Split into train and validation sets
X_train_bin, X_val_bin, y_train_bin, y_val_bin = train_test_split(X_bin, y_bin, test_size=0.2, random_state=42, stratify=y_bin)

# Define the preprocessing for numeric and categorical features
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),  # Impute missing values with the mean
    ('scaler', StandardScaler())  # Scale numeric features
])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),  # Impute missing values with a placeholder
    ('onehot', OneHotEncoder(handle_unknown='ignore'))  # One-hot encode categorical features
])

# Combine the transformations using ColumnTransformer
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ])

# Define the pipeline with XGBClassifier
xgb_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', XGBClassifier(random_state=42, use_label_encoder=False, eval_metric='logloss'))
])

# Fit the model
xgb_pipeline.fit(X_train_bin, y_train_bin)

# Make predictions
y_pred_bin = xgb_pipeline.predict(X_val_bin)

# Evaluate the model
acc = accuracy_score(y_val_bin, y_pred_bin)
prec = precision_score(y_val_bin, y_pred_bin)
rec = recall_score(y_val_bin, y_pred_bin)
f1 = f1_score(y_val_bin, y_pred_bin)
cm = confusion_matrix(y_val_bin, y_pred_bin)

# Print the evaluation metrics
print("XGBoost Binary Classification Metrics:")
print("Accuracy:", acc)
print("Precision:", prec)
print("Recall:", rec)
print("F1-Score:", f1)
print("Confusion Matrix:\n", cm)
print("Classification Report:\n", classification_report(y_val_bin, y_pred_bin))


XGBoost Binary Classification Metrics:
Accuracy: 0.7661686746987951
Precision: 0.781072830353249
Recall: 0.9448694275916645
F1-Score: 0.8551987585054315
Confusion Matrix:
 [[ 785 2008]
 [ 418 7164]]
Classification Report:
               precision    recall  f1-score   support

           0       0.65      0.28      0.39      2793
           1       0.78      0.94      0.86      7582

    accuracy                           0.77     10375
   macro avg       0.72      0.61      0.62     10375
weighted avg       0.75      0.77      0.73     10375



In [142]:
train_df

Unnamed: 0,status,departure.iataCode,departure.icaoCode,departure.terminal,departure.scheduledTime,departure.estimatedTime,departure.actualTime,departure.estimatedRunway,departure.actualRunway,arrival.scheduledTime,...,Pressure (in) Max,Pressure (in) Avg,Pressure (in) Min,Precipitation (in) Total,delay_status,delay_category,airline.name,departure.day_of_week,binary_target,multi_target
0,0,lhe,opla,m,2023-07-20 20:50:00,2023-07-20 20:00:00,2023-07-20 20:15:00,2023-07-20 20:15:00,2023-07-20 20:15:00,2023-07-20 23:20:00,...,28.9,28.8,28.7,0.0,delayed,,saudia,3,1,Short Delay
1,0,lhe,opla,Unknown,2023-07-18 15:05:00,2023-07-18 15:05:00,2023-07-18 15:05:00,2023-07-20 20:15:00,2023-07-18 15:05:00,2023-07-18 16:50:00,...,28.8,28.8,28.7,0.0,on-time,No Delay,flyjinnah,1,0,No Delay
2,0,lhe,opla,Unknown,2023-07-23 09:50:00,2023-07-18 15:05:00,2023-07-23 09:50:00,2023-07-20 20:15:00,2023-07-23 09:50:00,2023-07-23 11:35:00,...,29.0,28.9,28.8,0.0,on-time,No Delay,flyjinnah,6,0,No Delay
3,0,lhe,opla,m,2023-07-26 23:30:00,2023-07-26 23:30:00,2023-07-26 23:51:00,2023-07-26 23:51:00,2023-07-26 23:51:00,2023-07-27 01:30:00,...,28.9,28.9,28.8,0.0,delayed,Short Delay,pakistan international airlines,2,1,Short Delay
4,0,lhe,opla,m,2023-07-20 11:35:00,2023-07-20 17:15:00,2023-07-20 17:15:00,2023-07-26 23:51:00,2023-07-20 11:35:00,2023-07-20 14:00:00,...,28.9,28.8,28.7,0.0,delayed,Long Delay,serene air,3,1,Long Delay
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
51866,0,lhe,opla,Unknown,2023-11-16 09:50:00,2023-11-16 09:50:00,2023-11-16 10:03:00,2023-11-16 10:03:00,2023-11-16 10:03:00,2023-11-16 11:45:00,...,29.4,29.3,29.3,0.0,delayed,Short Delay,flyjinnah,3,1,Short Delay
51867,0,lhe,opla,m,2023-11-16 11:40:00,2023-11-16 11:40:00,2023-11-16 11:48:00,2023-11-16 11:48:00,2023-11-16 11:48:00,2023-11-16 15:25:00,...,29.4,29.3,29.3,0.0,delayed,Short Delay,saudia,3,1,Short Delay
51868,0,lhe,opla,m,2023-11-27 10:50:00,2023-11-27 14:35:00,2023-11-27 14:35:00,2023-11-16 11:48:00,2023-11-27 10:50:00,2023-11-27 13:30:00,...,29.3,29.3,29.2,0.0,delayed,Long Delay,pakistan international airlines,0,1,Long Delay
51869,0,lhe,opla,m,2023-11-19 02:00:00,2023-11-19 11:00:00,2023-11-19 11:00:00,2023-11-16 11:48:00,2023-11-19 02:00:00,2023-11-19 10:30:00,...,29.4,28.7,0.0,0.0,delayed,Long Delay,pakistan international airlines,6,1,Long Delay



### 2. Multi-Class Classification ###
# Categories:
# No Delay (0)
# Short Delay (<45)
# Moderate Delay (45–175)
# Long Delay (>175)

In [13]:


def categorize_delay(x):
    if x == 0:
        return 'No Delay'
    elif x < 45:
        return 'Short Delay'
    elif 45 <= x <= 175:
        return 'Moderate Delay'
    else:
        return 'Long Delay'

train_df['multi_target'] = train_df['departure.delay_minutes'].apply(categorize_delay)
X_multi = train_df[numeric_features + categorical_features]
y_multi = train_df['multi_target']

X_train_multi, X_val_multi, y_train_multi, y_val_multi = train_test_split(X_multi, y_multi, test_size=0.2, random_state=42, stratify=y_multi)

multi_clf_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(random_state=42))
])




In [15]:
multi_clf_pipeline.fit(X_train_multi, y_train_multi)
y_pred_multi = multi_clf_pipeline.predict(X_val_multi)

# Evaluation for Multi-Class Classification
acc_multi = accuracy_score(y_val_multi, y_pred_multi)
print("Multi-Class Classification Accuracy:", acc_multi)
print("Classification Report:\n", classification_report(y_val_multi, y_pred_multi))
print("Confusion Matrix:\n", confusion_matrix(y_val_multi, y_pred_multi))

### 3. Regression Analysis ###
# Predict exact delay duration
X_reg = train_df[numeric_features + categorical_features]
y_reg = train_df['departure.delay_minutes']

X_train_reg, X_val_reg, y_train_reg, y_val_reg = train_test_split(X_reg, y_reg, test_size=0.2, random_state=42)

reg_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', RandomForestRegressor(random_state=42))
])

reg_pipeline.fit(X_train_reg, y_train_reg)
y_pred_reg = reg_pipeline.predict(X_val_reg)

# Evaluation for Regression
mae = mean_absolute_error(y_val_reg, y_pred_reg)
rmse = np.sqrt(mean_squared_error(y_val_reg, y_pred_reg))

print("Regression Metrics:")
print("MAE:", mae)
print("RMSE:", rmse)

Multi-Class Classification Accuracy: 0.6466506024096386
Classification Report:
                 precision    recall  f1-score   support

    Long Delay       0.06      0.03      0.04       229
Moderate Delay       0.15      0.05      0.08       625
      No Delay       0.47      0.40      0.43      2793
   Short Delay       0.72      0.82      0.77      6728

      accuracy                           0.65     10375
     macro avg       0.35      0.33      0.33     10375
  weighted avg       0.61      0.65      0.62     10375

Confusion Matrix:
 [[   6   13   83  127]
 [  11   33  166  415]
 [  34   58 1120 1581]
 [  43  121 1014 5550]]
Regression Metrics:
MAE: 27.49247157812209
RMSE: 68.74128282085401



##############################################
# PHASE 4: Model Optimization and Evaluation
##############################################

# Hyperparameter Tuning using GridSearch for binary classification

In [21]:

##############################################
# PHASE 4: Model Optimization and Evaluation
##############################################

# Example Hyperparameter Tuning using GridSearch for binary classification
param_grid = {
    'classifier__n_estimators': [100, 200],
    'classifier__max_depth': [None, 10, 20]
}

grid_search_bin = GridSearchCV(binary_clf_pipeline, param_grid, cv=5, scoring='f1', n_jobs=-1, verbose=1)
grid_search_bin.fit(X_train_bin, y_train_bin)

print("Best Params for Binary Classification:", grid_search_bin.best_params_)

best_binary_model = grid_search_bin.best_estimator_

# Apply K-Fold Cross-Validation for regression
kf = KFold(n_splits=5, shuffle=True, random_state=42)
cv_scores_reg = cross_val_score(reg_pipeline, X_reg, y_reg, cv=kf, scoring='neg_mean_squared_error')
print("Regression CV RMSE:", np.sqrt(-cv_scores_reg).mean())

# You can repeat similar steps for multi-class classification or other models.



Fitting 5 folds for each of 6 candidates, totalling 30 fits
Best Params for Binary Classification: {'classifier__max_depth': None, 'classifier__n_estimators': 100}
Regression CV RMSE: 70.64445365969823


In [172]:
test_df['binary_target'] = test_df['departure.delay_minutes'].apply(lambda x: 0 if x == 0 else 1)

#  Define the parameter grid for multi-class classification


In [22]:
from sklearn.model_selection import GridSearchCV

# Define the parameter grid for multi-class classification
param_grid_multi = {
    'classifier__n_estimators': [100, 200],
    'classifier__max_depth': [None, 10, 20],
    'classifier__min_samples_split': [2, 5]
}

# Initialize GridSearchCV for multi-class classification
grid_search_multi = GridSearchCV(
    multi_clf_pipeline,  # Your existing pipeline for multi-class
    param_grid_multi,
    cv=5,
    scoring='f1_macro',  # Use 'f1_macro' for multi-class
    n_jobs=-1,
    verbose=1
)

# Fit GridSearchCV
grid_search_multi.fit(X_train_multi, y_train_multi)

# Output the best parameters
print("Best Params for Multi-Class Classification:", grid_search_multi.best_params_)

# Extract the best estimator
best_multi_model = grid_search_multi.best_estimator_


Fitting 5 folds for each of 12 candidates, totalling 60 fits
Best Params for Multi-Class Classification: {'classifier__max_depth': None, 'classifier__min_samples_split': 2, 'classifier__n_estimators': 100}


In [23]:
from sklearn.model_selection import cross_val_score

# Perform cross-validation with the best multi-class model
cv_scores_multi = cross_val_score(
    best_multi_model,
    X_multi,
    y_multi,
    cv=5,
    scoring='f1_macro',
    n_jobs=-1
)

# Calculate and print the mean and standard deviation of F1 scores
print("Multi-Class Classification CV F1 Scores:", cv_scores_multi)
print("Mean CV F1 Score:", cv_scores_multi.mean())
print("Standard Deviation of CV F1 Scores:", cv_scores_multi.std())


Multi-Class Classification CV F1 Scores: [0.23999497 0.26658131 0.2450343  0.16141845 0.2504014 ]
Mean CV F1 Score: 0.23268608749922456
Standard Deviation of CV F1 Scores: 0.03673606595780098


In [None]:
from sklearn.svm import SVC

# Define a new pipeline for SVM
svm_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', SVC(random_state=42, probability=True))
])

# Define the parameter grid for SVM
param_grid_svm = {
    'classifier__C': [0.1, 1, 10],
    'classifier__kernel': ['linear', 'rbf'],
    'classifier__gamma': ['scale', 'auto']
}

# Initialize GridSearchCV for SVM
grid_search_svm = GridSearchCV(
    svm_pipeline,
    param_grid_svm,
    cv=5,
    scoring='f1_macro',
    n_jobs=-1,
    verbose=1
)

# Fit GridSearchCV
grid_search_svm.fit(X_train_multi, y_train_multi)

# Output the best parameters
print("Best Params for SVM Multi-Class Classification:", grid_search_svm.best_params_)

# Extract the best estimator
best_svm_multi = grid_search_svm.best_estimator_


Fitting 5 folds for each of 12 candidates, totalling 60 fits


# Define a new pipeline for Gradient Boosting

In [24]:
from sklearn.ensemble import GradientBoostingClassifier

# Define a new pipeline for Gradient Boosting
gb_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', GradientBoostingClassifier(random_state=42))
])

# Define the parameter grid for Gradient Boosting
param_grid_gb = {
    'classifier__n_estimators': [100, 200],
    'classifier__learning_rate': [0.01, 0.1, 0.2],
    'classifier__max_depth': [3, 5, 7]
}

# Initialize GridSearchCV for Gradient Boosting
grid_search_gb = GridSearchCV(
    gb_pipeline,
    param_grid_gb,
    cv=5,
    scoring='f1_macro',
    n_jobs=-1,
    verbose=1
)

# Fit GridSearchCV
grid_search_gb.fit(X_train_multi, y_train_multi)

# Output the best parameters
print("Best Params for Gradient Boosting Multi-Class Classification:", grid_search_gb.best_params_)

# Extract the best estimator
best_gb_multi = grid_search_gb.best_estimator_


Fitting 5 folds for each of 18 candidates, totalling 90 fits
Best Params for Gradient Boosting Multi-Class Classification: {'classifier__learning_rate': 0.2, 'classifier__max_depth': 7, 'classifier__n_estimators': 200}


# cross validation scores

In [None]:
cv_scores_svm_multi = cross_val_score(
    best_svm_multi,
    X_multi,
    y_multi,
    cv=5,
    scoring='f1_macro',
    n_jobs=-1
)

print("SVM Multi-Class Classification CV F1 Scores:", cv_scores_svm_multi)
print("Mean CV F1 Score:", cv_scores_svm_multi.mean())
print("Standard Deviation of CV F1 Scores:", cv_scores_svm_multi.std())


In [25]:
cv_scores_gb_multi = cross_val_score(
    best_gb_multi,
    X_multi,
    y_multi,
    cv=5,
    scoring='f1_macro',
    n_jobs=-1
)

print("Gradient Boosting Multi-Class Classification CV F1 Scores:", cv_scores_gb_multi)
print("Mean CV F1 Score:", cv_scores_gb_multi.mean())
print("Standard Deviation of CV F1 Scores:", cv_scores_gb_multi.std())


Gradient Boosting Multi-Class Classification CV F1 Scores: [0.27256008 0.28049962 0.23911537 0.17843221 0.27039693]
Mean CV F1 Score: 0.24820084413144236
Standard Deviation of CV F1 Scores: 0.037628334362432826


In [26]:
# Dictionary to store model performances
model_performance = {
    'RandomForest': grid_search_multi.best_score_,
    'SVM': grid_search_svm.best_score_,
    'GradientBoosting': grid_search_gb.best_score_
}

# Display model performances
for model_name, score in model_performance.items():
    print(f"{model_name} Multi-Class F1 Score: {score:.4f}")


NameError: name 'grid_search_svm' is not defined

In [30]:
# Dictionary to store model performances
model_performance = {
    'RandomForest': grid_search_multi.best_score_,
    'GradientBoosting': grid_search_gb.best_score_
}

# Display model performances
for model_name, score in model_performance.items():
    print(f"{model_name} Multi-Class F1 Score: {score:.4f}")


RandomForest Multi-Class F1 Score: 0.3408
GradientBoosting Multi-Class F1 Score: 0.3456


# Define a new pipeline for KNN

In [32]:
from sklearn.neighbors import KNeighborsClassifier

# Define a new pipeline for KNN
knn_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', KNeighborsClassifier())
])

# Define the parameter grid for KNN
param_grid_knn = {
    'classifier__n_neighbors': [5, 10, 15],
    'classifier__weights': ['uniform', 'distance'],
    'classifier__metric': ['euclidean', 'manhattan']
}

# Initialize GridSearchCV for KNN
grid_search_knn = GridSearchCV(
    knn_pipeline,
    param_grid_knn,
    cv=5,
    scoring='f1_macro',
    n_jobs=-1,
    verbose=1
)

# Fit GridSearchCV
grid_search_knn.fit(X_train_multi, y_train_multi)

# Output the best parameters
print("Best Params for KNN Multi-Class Classification:", grid_search_knn.best_params_)

# Extract the best estimator
best_knn_multi = grid_search_knn.best_estimator_

# Cross-Validation for KNN
cv_scores_knn_multi = cross_val_score(
    best_knn_multi,
    X_multi,
    y_multi,
    cv=5,
    scoring='f1_macro',
    n_jobs=-1
)

print("KNN Multi-Class Classification CV F1 Scores:", cv_scores_knn_multi)
print("Mean CV F1 Score:", cv_scores_knn_multi.mean())
print("Standard Deviation of CV F1 Scores:", cv_scores_knn_multi.std())


Fitting 5 folds for each of 12 candidates, totalling 60 fits
Best Params for KNN Multi-Class Classification: {'classifier__metric': 'euclidean', 'classifier__n_neighbors': 5, 'classifier__weights': 'uniform'}
KNN Multi-Class Classification CV F1 Scores: [0.26125886 0.29312501 0.28596028 0.21052747 0.28140239]
Mean CV F1 Score: 0.2664548044433126
Standard Deviation of CV F1 Scores: 0.029900029432135426


In [None]:
# Example selection based on highest F1 Score
best_binary_model = grid_search_bin.best_estimator_
best_multi_model = grid_search_gb.best_estimator_  # Assuming Gradient Boosting performed best

# Optionally, you can keep multiple models for ensemble methods or further experimentation


In [34]:
# Example selection based on highest F1 Score
best_binary_model = grid_search_bin.best_estimator_
best_multi_model = grid_search_gb.best_estimator_  # Assuming Gradient Boosting performed best

# Optionally, keep other best models for ensemble methods or further experimentation
#best_svm_multi = grid_search_svm.best_estimator_
best_knn_multi = grid_search_knn.best_estimator_

# ==============================================
# Phase 5: Model Testing and Submission
# ==============================================


In [98]:
test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14982 entries, 0 to 14981
Data columns (total 57 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   status                     14982 non-null  int64  
 1   departure.iataCode         14982 non-null  object 
 2   departure.icaoCode         14982 non-null  object 
 3   departure.scheduledTime    14982 non-null  object 
 4   departure.estimatedRunway  8730 non-null   object 
 5   departure.actualRunway     14982 non-null  object 
 6   arrival.scheduledTime      14982 non-null  object 
 7   arrival.estimatedTime      8429 non-null   object 
 8   airline.iataCode           14976 non-null  object 
 9   airline.icaoCode           14981 non-null  object 
 10  flight.number              14982 non-null  int64  
 11  flight.iataNumber          14976 non-null  object 
 12  flight.icaoNumber          14981 non-null  object 
 13  departure.terminal         14982 non-null  obj

In [65]:
print(test_df.info())
test_X.info()
test_X.isnull().sum()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14982 entries, 0 to 14981
Data columns (total 47 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   status                     14982 non-null  int64  
 1   departure.iataCode         14982 non-null  object 
 2   departure.icaoCode         14982 non-null  object 
 3   departure.scheduledTime    14982 non-null  object 
 4   departure.estimatedRunway  8730 non-null   object 
 5   departure.actualRunway     14982 non-null  object 
 6   arrival.scheduledTime      14982 non-null  object 
 7   arrival.estimatedTime      8429 non-null   object 
 8   airline.iataCode           14976 non-null  object 
 9   airline.icaoCode           14981 non-null  object 
 10  flight.number              14982 non-null  int64  
 11  flight.iataNumber          14976 non-null  object 
 12  flight.icaoNumber          14981 non-null  object 
 13  departure.terminal         14982 non-null  obj

status                           0
departure.iataCode               0
departure.icaoCode               0
departure.scheduledTime          0
departure.estimatedRunway     6252
departure.actualRunway           0
arrival.scheduledTime            0
arrival.estimatedTime         6553
airline.iataCode                 6
airline.icaoCode                 1
flight.number                    0
flight.iataNumber                6
flight.icaoNumber                1
departure.terminal               0
arrival.actualTime               0
arrival.estimatedRunway      14907
arrival.actualRunway             0
departure.hour_of_day            0
departure.month                  0
status_encoded                   0
Departure_Date                   0
Departure_Month                  0
Departure_Day                    0
Month                         4622
Day                           4622
Temperature (°F) Max             0
Temperature (°F) Avg             0
Temperature (°F) Min             0
Dew Point (°F) Max  

In [49]:
import pandas as pd
import numpy as np

# 1. Load the Test Dataset
test_df = pd.read_csv('merged_features_drop_cleaned_test_data.csv')

# 2. Prepare Test Features
test_X = test_df.drop(['departure.delay_minutes', 'binary_target', 'multi_target'], axis=1, errors='ignore')
test_X = test_X[numeric_features + categorical_features]

# 3. Create Required Columns for Submission

# a. Create 'File Name'
# Option 1: Using 'flight.number' as 'File Name'
test_df['File Name'] = test_df['flight.number'].astype(str)

# Option 2: Using a unique identifier
# test_df['File Name'] = 'Test_' + test_df.index.astype(str)

# b. Assign 'Flight Number'
# Using 'flight.iataNumber' if preferred
test_df['Flight Number'] = test_df['flight.iataNumber']

# c. Assign 'Type'
# Using 'departure.terminal' as an example; adjust based on competition's definition of 'Type'
test_df['Type'] = test_df['departure.terminal']

# d. Assign 'Status'
test_df['Status'] = test_df['status']

# e. Assign 'Departure IATA Code' and 'Departure ICAO Code'
test_df['Departure IATA Code'] = test_df['departure.iataCode']
test_df['Departure ICAO Code'] = test_df['departure.icaoCode']

# f. Assign 'Scheduled Time' and 'Arrival Estimated Time'
test_df['Scheduled Time'] = test_df['departure.scheduledTime']
test_df['Arrival Estimated Time'] = test_df['arrival.estimatedTime']

# g. Assign 'Arrival IATA Code' and 'Arrival ICAO Code'
# If missing, assign 'UNKNOWN' or derive them
if 'arrival.iataCode' in test_df.columns:
    test_df['Arrival IATA Code'] = test_df['arrival.iataCode']
else:
    test_df['Arrival IATA Code'] = 'UNKNOWN'

if 'arrival.icaoCode' in test_df.columns:
    test_df['Arrival ICAO Code'] = test_df['arrival.icaoCode']
else:
    test_df['Arrival ICAO Code'] = 'UNKNOWN'

# 4. Make Predictions Using Trained Models
# Ensure that the following models are already trained:
# - best_binary_model
# - best_multi_model
# - reg_pipeline

test_pred_bin = best_binary_model.predict(test_X)
test_pred_multi = best_multi_model.predict(test_X)
test_pred_reg = reg_pipeline.predict(test_X)

# 5. Create Submission Files


In [51]:

# Define the submission creation function as above

# a. Binary Classification Submission
try:
    submission_bin = create_submission(test_df, test_pred_bin, submission_type='binary')
    submission_bin.to_csv('submission_binary_classification.csv', index=False)
    print("Binary Classification submission file created: 'submission_binary_classification.csv'")
except Exception as e:
    print(f"Error creating Binary Classification submission: {e}")

# b. Multi-Class Classification Submission
try:
    submission_multi = create_submission(test_df, test_pred_multi, submission_type='multi')
    submission_multi.to_csv('submission_multi_classification.csv', index=False)
    print("Multi-Class Classification submission file created: 'submission_multi_classification.csv'")
except Exception as e:
    print(f"Error creating Multi-Class Classification submission: {e}")

# c. Regression Submission
try:
    submission_reg = create_submission(test_df, test_pred_reg, submission_type='regression')
    submission_reg.to_csv('submission_regression.csv', index=False)
    print("Regression submission file created: 'submission_regression.csv'")
except Exception as e:
    print(f"Error creating Regression submission: {e}")



Binary Classification submission file created: 'submission_binary_classification.csv'
Multi-Class Classification submission file created: 'submission_multi_classification.csv'
Regression submission file created: 'submission_regression.csv'


NameError: name 'verify_submission' is not defined

In [38]:

# a. Binary Classification Prediction
# Ensure that 'best_binary_model' is already defined and trained
test_pred_bin = best_binary_model.predict(test_X)

# b. Multi-Class Classification Prediction
# Ensure that 'best_multi_model' is already defined and trained
test_pred_multi = best_multi_model.predict(test_X)

# c. Regression Prediction
# Ensure that 'reg_pipeline' is already defined and trained
test_pred_reg = reg_pipeline.predict(test_X)


In [47]:
test_df.columns

Index(['status', 'departure.iataCode', 'departure.icaoCode',
       'departure.scheduledTime', 'departure.estimatedRunway',
       'departure.actualRunway', 'arrival.scheduledTime',
       'arrival.estimatedTime', 'airline.iataCode', 'airline.icaoCode',
       'flight.number', 'flight.iataNumber', 'flight.icaoNumber',
       'departure.terminal', 'arrival.actualTime', 'arrival.estimatedRunway',
       'arrival.actualRunway', 'departure.hour_of_day', 'departure.month',
       'status_encoded', 'Departure_Date', 'Departure_Month', 'Departure_Day',
       'Month', 'Day', 'Temperature (°F) Max', 'Temperature (°F) Avg',
       'Temperature (°F) Min', 'Dew Point (°F) Max', 'Dew Point (°F) Avg',
       'Dew Point (°F) Min', 'Humidity (%) Max', 'Humidity (%) Avg',
       'Humidity (%) Min', 'Wind Speed (mph) Max', 'Wind Speed (mph) Avg',
       'Wind Speed (mph) Min', 'Pressure (in) Max', 'Pressure (in) Avg',
       'Pressure (in) Min', 'Precipitation (in) Total', 'Departure_Hour',
       'dep

In [45]:

# 4. Define a Function to Create Submission Files
def create_submission(test_df, predictions, submission_type='binary'):
    """
    Creates a submission DataFrame for Kaggle competition.

    Parameters:
    - test_df (pd.DataFrame): The test dataset.
    - predictions (array-like): The predictions made by the model.
    - submission_type (str): Type of submission ('binary', 'multi', 'regression').

    Returns:
    - submission (pd.DataFrame): The formatted submission DataFrame.
    """
    # Define the required columns as per Kaggle's submission format
    required_columns = [
        'File Name', 'Flight Number', 'Type', 'Status',
        'Departure IATA Code', 'Departure ICAO Code',
        'Scheduled Time', 'Arrival IATA Code',
        'Arrival ICAO Code', 'Arrival Estimated Time'
    ]
    
    # Check if all required columns are present in test_df
    missing_cols = [col for col in required_columns if col not in test_df.columns]
    if missing_cols:
        raise ValueError(f"The following required columns are missing in test_df: {missing_cols}")
    
    # Create the submission DataFrame with required columns
    submission = test_df[required_columns].copy()
    
    # Assign the predictions to the 'Delay' column based on submission type
    if submission_type == 'binary':
        # Ensure predictions are strings: "on-time" or "delayed"
        if not pd.api.types.is_string_dtype(predictions):
            predictions = predictions.astype(str)
        submission['Delay'] = predictions
    elif submission_type == 'multi':
        # Ensure predictions are strings: "No Delay", "Short Delay", etc.
        if not pd.api.types.is_string_dtype(predictions):
            predictions = predictions.astype(str)
        submission['Delay'] = predictions
    elif submission_type == 'regression':
        # Ensure predictions are integers (minutes)
        predictions = np.round(predictions).astype(int)
        submission['Delay'] = predictions
    else:
        raise ValueError("Invalid submission_type. Choose from 'binary', 'multi', or 'regression'.")
    
    return submission

# 5. Create Submission Files

# a. Binary Classification Submission
try:
    submission_bin = create_submission(test_df, test_pred_bin, submission_type='binary')
    submission_bin.to_csv('submission_binary_classification.csv', index=False)
    print("Binary Classification submission file created: 'submission_binary_classification.csv'")
except Exception as e:
    print(f"Error creating Binary Classification submission: {e}")

# b. Multi-Class Classification Submission
try:
    submission_multi = create_submission(test_df, test_pred_multi, submission_type='multi')
    submission_multi.to_csv('submission_multi_classification.csv', index=False)
    print("Multi-Class Classification submission file created: 'submission_multi_classification.csv'")
except Exception as e:
    print(f"Error creating Multi-Class Classification submission: {e}")

# c. Regression Submission
try:
    submission_reg = create_submission(test_df, test_pred_reg, submission_type='regression')
    submission_reg.to_csv('submission_regression.csv', index=False)
    print("Regression submission file created: 'submission_regression.csv'")
except Exception as e:
    print(f"Error creating Regression submission: {e}")



Error creating Binary Classification submission: The following required columns are missing in test_df: ['File Name', 'Flight Number', 'Type', 'Status', 'Departure IATA Code', 'Departure ICAO Code', 'Scheduled Time', 'Arrival IATA Code', 'Arrival ICAO Code', 'Arrival Estimated Time']
Error creating Multi-Class Classification submission: The following required columns are missing in test_df: ['File Name', 'Flight Number', 'Type', 'Status', 'Departure IATA Code', 'Departure ICAO Code', 'Scheduled Time', 'Arrival IATA Code', 'Arrival ICAO Code', 'Arrival Estimated Time']
Error creating Regression submission: The following required columns are missing in test_df: ['File Name', 'Flight Number', 'Type', 'Status', 'Departure IATA Code', 'Departure ICAO Code', 'Scheduled Time', 'Arrival IATA Code', 'Arrival ICAO Code', 'Arrival Estimated Time']


In [None]:
# 6. Verify Submission Files (Optional but Recommended)
def verify_submission(file_path, submission_type='binary'):
    """
    Verifies the submission file for correctness.

    Parameters:
    - file_path (str): Path to the submission CSV file.
    - submission_type (str): Type of submission ('binary', 'multi', 'regression').

    Returns:
    - None
    """
    submission = pd.read_csv(file_path)
    
    # Check for required columns
    required_columns = [
        'File Name', 'Flight Number', 'Type', 'Status',
        'Departure IATA Code', 'Departure ICAO Code',
        'Scheduled Time', 'Arrival IATA Code',
        'Arrival ICAO Code', 'Arrival Estimated Time',
        'Delay'
    ]
    
    missing_cols = [col for col in required_columns if col not in submission.columns]
    if missing_cols:
        print(f"Submission file {file_path} is missing columns: {missing_cols}")
        return
    
    # Check for correct data types
    if submission_type in ['binary', 'multi']:
        if not submission['Delay'].dtype == object:
            print(f"'Delay' column in {file_path} should be of type object (string).")
    elif submission_type == 'regression':
        if not np.issubdtype(submission['Delay'].dtype, np.integer) and not np.issubdtype(submission['Delay'].dtype, np.floating):
            print(f"'Delay' column in {file_path} should be numeric (int or float).")
    
    # Check for missing values in 'Delay' column
    if submission['Delay'].isnull().any():
        print(f"'Delay' column in {file_path} contains missing values.")
    
    print(f"Submission file {file_path} passed verification.")



In [None]:
# Verify each submission file
verify_submission('submission_binary_classification.csv', submission_type='binary')
verify_submission('submission_multi_classification.csv', submission_type='multi')
verify_submission('submission_regression.csv', submission_type='regression')

# ==============================================
# End of Phase 5: Model Testing and Submission
# ==============================================


In [None]:
# Choose the best performing model based on cross-validation scores
# For demonstration, assume RandomForest performed best

# ==============================================
# PHASE 5: Model Testing and Submission
# ==============================================

# 1. Make Predictions on the Test Dataset

# Prepare the test set features (ensure same preprocessing)
test_X = X_test_full.drop(['departure.delay_minutes', 'binary_target', 'multi_target'], axis=1, errors='ignore')

# Binary Classification Prediction using best_binary_model
test_pred_bin = best_binary_model.predict(test_X)

# Multi-Class Classification Prediction (if you have optimized multi-class model)
# Assuming you have a best_multi_model (similar to best_binary_model)
# For this example, we'll use the previously trained multi_clf_pipeline
test_pred_multi = multi_clf_pipeline.predict(test_X)

# Regression Prediction using reg_pipeline
test_pred_reg = reg_pipeline.predict(test_X)

# 2. Save Predictions in Kaggle Submission Format

# Define a function to create submission files
def create_submission(test_df, predictions, submission_type='binary'):
    # Ensure that the necessary columns are present in test_df
    required_columns = [
        'File Name', 'Flight Number', 'Type', 'Status',
        'Departure IATA Code', 'Departure ICAO Code',
        'Scheduled Time', 'Arrival IATA Code',
        'Arrival ICAO Code', 'Arrival Estimated Time'
    ]
    
    # Check if required columns exist
    for col in required_columns:
        if col not in test_df.columns:
            raise ValueError(f"Missing required column: {col}")
    
    submission = test_df[required_columns].copy()
    
    if submission_type == 'binary':
        # Convert predictions to string format
        submission['Delay'] = predictions
    elif submission_type == 'multi':
        submission['Delay'] = predictions
    elif submission_type == 'regression':
        # Convert delay minutes to integer or keep as float based on Kaggle requirements
        submission['Delay'] = predictions.round().astype(int)
    else:
        raise ValueError("Invalid submission type. Choose from 'binary', 'multi', or 'regression'.")
    
    return submission

# Create Binary Classification Submission
submission_bin = create_submission(test_df, test_pred_bin, submission_type='binary')
submission_bin.to_csv('submission_binary_classification.csv', index=False)
print("Binary Classification submission file created: 'submission_binary_classification.csv'")

# Create Multi-Class Classification Submission
submission_multi = create_submission(test_df, test_pred_multi, submission_type='multi')
submission_multi.to_csv('submission_multi_classification.csv', index=False)
print("Multi-Class Classification submission file created: 'submission_multi_classification.csv'")

# Create Regression Submission
submission_reg = create_submission(test_df, test_pred_reg, submission_type='regression')
submission_reg.to_csv('submission_regression.csv', index=False)
print("Regression submission file created: 'submission_regression.csv'")

# ==============================================
# Additional: Feature Importance Plot (Optional)
# ==============================================

# Plot feature importances for Binary Classification
def plot_feature_importances(model, preprocessor, numerical_features, categorical_features, top_n=20):
    # Get feature names after preprocessing
    ohe = preprocessor.named_transformers_['cat'].named_steps['onehot']
    ohe_features = ohe.get_feature_names_out(categorical_features)
    feature_names = numerical_features + list(ohe_features)
    
    # Get feature importances from the model
    importances = model.named_steps['classifier'].feature_importances_
    indices = np.argsort(importances)[::-1]
    
    # Plot
    plt.figure(figsize=(10, 8))
    sns.barplot(x=importances[indices][:top_n], y=np.array(feature_names)[indices][:top_n])
    plt.title(f"Top {top_n} Feature Importances")
    plt.xlabel("Importance")
    plt.ylabel("Feature")
    plt.tight_layout()
    plt.show()

# Example: Plot feature importances for best_binary_model
plot_feature_importances(best_binary_model, preprocessor, numerical_features, categorical_features, top_n=20)

# ==============================================
# END OF SCRIPT
# ==============================================

In [None]:
##############################################
# PHASE 5: Model Testing and Submission
##############################################

# Use best models to predict on test set
# Make sure test_df has the same preprocessing applied, i.e., same columns/features.
test_X = test_df[numeric_features + categorical_features]

# Example for Binary Classification Prediction on Test Set
test_pred_bin = best_binary_model.predict(test_X)

# For submission, ensure the format matches Kaggle's requirements
submission = test_df[['File Name', 'Flight Number', 'Type', 'Status', 
                      'Departure IATA Code', 'Departure ICAO Code', 
                      'Scheduled Time', 'Arrival IATA Code', 
                      'Arrival ICAO Code', 'Arrival Estimated Time']].copy()

# Delay must be string like "on-time" or "delayed"
submission['Delay'] = test_pred_bin.astype(str)

submission.to_csv('submission_binary_classification.csv', index=False)

# For multi-class classification and regression, do the same:
# test_pred_multi = multi_clf_pipeline.predict(test_X)
# test_pred_reg = reg_pipeline.predict(test_X)

# Then save their submissions accordingly, ensuring correct format and data types.


In [None]:
import pandas as pd
import numpy as np

# ==============================================
# Phase 5: Model Testing and Submission
# ==============================================

# 1. Load the Test Dataset
# Replace 'test.csv' with your actual test file path if different
test_df = pd.read_csv('test.csv')

# 2. Make Predictions Using Trained Models
# Assuming you have already made predictions and stored them in the following variables:
# - test_pred_bin: Predictions for Binary Classification ('on-time' or 'delayed')
# - test_pred_multi: Predictions for Multi-Class Classification ('No Delay', 'Short Delay', etc.)
# - test_pred_reg: Predictions for Regression (delay in minutes)

# For demonstration, let's assume the predictions are already made:
# Uncomment and replace with your actual prediction code if not already done.

# Example:
# test_pred_bin = best_binary_model.predict(test_X)
# test_pred_multi = best_multi_model.predict(test_X)
# test_pred_reg = reg_pipeline.predict(test_X)

# For the purpose of this example, let's create dummy predictions.
# Remove these lines when you have actual predictions.
# ---------------------------------------------------
# test_pred_bin = np.random.choice(['on-time', 'delayed'], size=test_df.shape[0])
# test_pred_multi = np.random.choice(['No Delay', 'Short Delay', 'Moderate Delay', 'Long Delay'], size=test_df.shape[0])
# test_pred_reg = np.random.randint(0, 300, size=test_df.shape[0])
# ---------------------------------------------------

# 3. Define a Function to Create Submission Files
def create_submission(test_df, predictions, submission_type='binary'):
    """
    Creates a submission DataFrame for Kaggle competition.

    Parameters:
    - test_df (pd.DataFrame): The test dataset.
    - predictions (array-like): The predictions made by the model.
    - submission_type (str): Type of submission ('binary', 'multi', 'regression').

    Returns:
    - submission (pd.DataFrame): The formatted submission DataFrame.
    """
    # Define the required columns as per Kaggle's submission format
    required_columns = [
        'File Name', 'Flight Number', 'Type', 'Status',
        'Departure IATA Code', 'Departure ICAO Code',
        'Scheduled Time', 'Arrival IATA Code',
        'Arrival ICAO Code', 'Arrival Estimated Time'
    ]
    
    # Check if all required columns are present in test_df
    missing_cols = [col for col in required_columns if col not in test_df.columns]
    if missing_cols:
        raise ValueError(f"The following required columns are missing in test_df: {missing_cols}")
    
    # Create the submission DataFrame with required columns
    submission = test_df[required_columns].copy()
    
    # Assign the predictions to the 'Delay' column based on submission type
    if submission_type == 'binary':
        # Ensure predictions are strings: "on-time" or "delayed"
        if not pd.api.types.is_string_dtype(predictions):
            predictions = predictions.astype(str)
        submission['Delay'] = predictions
    elif submission_type == 'multi':
        # Ensure predictions are strings: "No Delay", "Short Delay", etc.
        if not pd.api.types.is_string_dtype(predictions):
            predictions = predictions.astype(str)
        submission['Delay'] = predictions
    elif submission_type == 'regression':
        # Ensure predictions are integers (minutes)
        predictions = np.round(predictions).astype(int)
        submission['Delay'] = predictions
    else:
        raise ValueError("Invalid submission_type. Choose from 'binary', 'multi', or 'regression'.")
    
    return submissiontest_pred_bin

# 4. Create Submission Files

# a. Binary Classification Submission
try:
    submission_bin = create_submission(test_df, test_pred_bin, submission_type='binary')
    submission_bin.to_csv('submission_binary_classification.csv', index=False)
    print("Binary Classification submission file created: 'submission_binary_classification.csv'")
except Exception as e:
    print(f"Error creating Binary Classification submission: {e}")

# b. Multi-Class Classification Submission
try:
    submission_multi = create_submission(test_df, test_pred_multi, submission_type='multi')
    submission_multi.to_csv('submission_multi_classification.csv', index=False)
    print("Multi-Class Classification submission file created: 'submission_multi_classification.csv'")
except Exception as e:
    print(f"Error creating Multi-Class Classification submission: {e}")

# c. Regression Submission
try:
    submission_reg = create_submission(test_df, test_pred_reg, submission_type='regression')
    submission_reg.to_csv('submission_regression.csv', index=False)
    print("Regression submission file created: 'submission_regression.csv'")
except Exception as e:
    print(f"Error creating Regression submission: {e}")

# 5. Verify Submission Files (Optional but Recommended)
def verify_submission(file_path, submission_type='binary'):
    """
    Verifies the submission file for correctness.

    Parameters:
    - file_path (str): Path to the submission CSV file.
    - submission_type (str): Type of submission ('binary', 'multi', 'regression').

    Returns:
    - None
    """
    submission = pd.read_csv(file_path)
    
    # Check for required columns
    required_columns = [
        'File Name', 'Flight Number', 'Type', 'Status',
        'Departure IATA Code', 'Departure ICAO Code',
        'Scheduled Time', 'Arrival IATA Code',
        'Arrival ICAO Code', 'Arrival Estimated Time',
        'Delay'
    ]
    
    missing_cols = [col for col in required_columns if col not in submission.columns]
    if missing_cols:
        print(f"Submission file {file_path} is missing columns: {missing_cols}")
        return
    
    # Check for correct data types
    if submission_type in ['binary', 'multi']:
        if not submission['Delay'].dtype == object:
            print(f"'Delay' column in {file_path} should be of type object (string).")
    elif submission_type == 'regression':
        if not np.issubdtype(submission['Delay'].dtype, np.integer) and not np.issubdtype(submission['Delay'].dtype, np.floating):
            print(f"'Delay' column in {file_path} should be numeric (int or float).")
    
    # Check for missing values in 'Delay' column
    if submission['Delay'].isnull().any():
        print(f"'Delay' column in {file_path} contains missing values.")
    
    print(f"Submission file {file_path} passed verification.")

# Verify each submission file
verify_submission('submission_binary_classification.csv', submission_type='binary')
verify_submission('submission_multi_classification.csv', submission_type='multi')
verify_submission('submission_regression.csv', submission_type='regression')

# ==============================================
# End of Phase 5: Model Testing and Submission
# ==============================================


# sample code

In [None]:
# ==============================================
# 5. Create Submission Files
# ==============================================

def create_submission(test_df, predictions, submission_type='binary'):
    """
    Creates a submission DataFrame for Kaggle competition.

    Parameters:
    - test_df (pd.DataFrame): The test dataset.
    - predictions (array-like): The predictions made by the model.
    - submission_type (str): Type of submission ('binary', 'multi', 'regression').

    Returns:
    - submission (pd.DataFrame): The formatted submission DataFrame.
    """
    # Create an 'ID' column starting from 1 for each row in the test dataset
    submission = pd.DataFrame({
        'ID': test_df.index + 1  # ID should be from 1, not 0
    })

    if submission_type == 'binary':
        # Map the binary predictions to 'on-time' and 'delayed'
        submission['Delay'] = np.where(predictions == 0, 'on-time', 'delayed')

    elif submission_type == 'multi':
        # Map multi-class predictions to 'No Delay', 'Short Delay', 'Moderate Delay', 'Long Delay'
        # Assuming the model outputs values like 0, 1, 2, 3 for these categories
        delay_map = {0: 'No Delay', 1: 'Short Delay', 2: 'Moderate Delay', 3: 'Long Delay'}
        submission['Delay'] = [delay_map.get(pred, 'UNKNOWN') for pred in predictions]

    elif submission_type == 'regression':
        # For regression, the predictions are continuous, so directly assign the predicted delay
        submission['Delay'] = predictions

    # Ensure the submission format matches Kaggle requirements
    submission = submission[['ID', 'Delay']]
    return submission

# a. Binary Classification Submission
try:
    submission_bin = create_submission(test_df, test_pred_bin, submission_type='binary')
    submission_bin.to_csv('submission_binary_classification.csv', index=False)
    print("Binary Classification submission file created: 'submission_binary_classification.csv'")
except Exception as e:
    print(f"Error creating Binary Classification submission: {e}")

# b. Multi-Class Classification Submission
try:
    submission_multi = create_submission(test_df, test_pred_multi, submission_type='multi')
    submission_multi.to_csv('submission_multi_classification.csv', index=False)
    print("Multi-Class Classification submission file created: 'submission_multi_classification.csv'")
except Exception as e:
    print(f"Error creating Multi-Class Classification submission: {e}")

# c. Regression Submission
try:
    submission_reg = create_submission(test_df, test_pred_reg, submission_type='regression')
    submission_reg.to_csv('submission_regression.csv', index=False)
    print("Regression submission file created: 'submission_regression.csv'")
except Exception as e:
    print(f"Error creating Regression submission: {e}")


# 1. Training the Model (for Binary Classification):


In [None]:
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# Step 1: Load and prepare your training data
train_df = pd.read_csv('merged_features_drop_cleaned_train_data.csv')  # Load your training data

# Ensure the training data has the correct number of rows
train_df = train_df.head(12914)

# Prepare features for training
numeric_features = ['Temperature (°F) Avg', 'Humidity (%) Avg', 'Wind Speed (mph) Avg', 'Pressure (in) Avg']
categorical_features = ['departure.iataCode', 'departure.icaoCode', 'airline.iataCode']

# Separate features (X) and target variable (y)
X_train = train_df[numeric_features + categorical_features]
y_train = train_df['Delay']  # Assuming 'Delay' is the target column

# Step 2: Preprocessing pipeline for categorical and numeric features
numeric_transformer = StandardScaler()
categorical_transformer = Pipeline(steps=[
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse=False))
])

# Combine both numeric and categorical transformers in a column transformer
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ])

# Step 3: Define the pipeline with XGBoost classifier
xgb_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', XGBClassifier(random_state=42, use_label_encoder=False, eval_metric='logloss'))
])

# Step 4: Fit the binary classification model
xgb_pipeline.fit(X_train, y_train)

# Save the trained model for later use
import joblib
joblib.dump(xgb_pipeline, 'best_binary_model.pkl')

# Now you can use 'best_binary_model' to make predictions on the test set
