In [1]:
import pandas as pd

# Read the CSV file with a specified encoding
df = pd.read_csv("cleaned_flight_data.csv", encoding="ISO-8859-1")

# Display the first 5 rows
df.head()

  df = pd.read_csv("cleaned_flight_data.csv", encoding="ISO-8859-1")


Unnamed: 0,Year,Quarter,Month,DayofMonth,DayOfWeek,FlightDate,Reporting_Airline,DOT_ID_Reporting_Airline,IATA_CODE_Reporting_Airline,Tail_Number,...,Div1WheelsOff,Div1TailNum,Div2Airport,Div2AirportID,Div2AirportSeqID,Div2WheelsOn,Div2TotalGTime,Div2LongestGTime,Div2WheelsOff,Div2TailNum
0,1998,1,1,2,5,1998-01-02,NW,19386,NW,N297US,...,,,,,,,,,,
1,2009,2,5,28,4,2009-05-28,FL,20437,FL,N946AT,...,,,,,,,,,,
2,2013,2,6,29,6,2013-06-29,MQ,20398,MQ,N665MQ,...,,,,,,,,,,
3,2010,3,8,31,2,2010-08-31,DL,19790,DL,N6705Y,...,,,,,,,,,,
4,2006,1,1,15,7,2006-01-15,US,20355,US,N504AU,...,,,,,,,,,,


We view the shape of the dataset after cleaning, to see what we are working with.

In [2]:
# Display the number of rows and columns in the dataset
print(f"\nShape of the dataset: {df.shape}")


Shape of the dataset: (2000000, 85)


**RandomForest Model**

Random Forest Model-Attempt #1:

- We created a target variable DelayCategory is created based on flight delays, and categorical variables, like DelayCategory, are encoded using LabelEncoder. 

- During exploration, it was found that flights were usually delayed during the months of June and December, peak holiday seasons. Moreso, during 3am-4am, there were the longest delays.

- Time features (DepTime and ArrTime) are extracted into hours and minutes and then combined into single time features. 

- Additional features are engineered to capture seasonal delays and flight-specific patterns, such as LateNightFlight. 

- The data is split into training and test sets, then trained using a RandomForestClassifier to predict flight delay categories.

In [35]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
from sklearn.preprocessing import LabelEncoder

# Assuming 'df' is already loaded

# Fill missing values only for numerical columns
numerical_cols = df.select_dtypes(include=['float64', 'int64']).columns
df[numerical_cols] = df[numerical_cols].fillna(df[numerical_cols].median())

# Create target variable 'DelayCategory' for classification
def delay_category(row):
    if row['DepDelay'] > 0:
        return 'Departure Delay'
    elif row['ArrDelay'] > 0:
        return 'Arrival Delay'
    else:
        return 'No Delay'

df['DelayCategory'] = df.apply(delay_category, axis=1)

# Encode the categorical variable 'DelayCategory'
le = LabelEncoder()
df['DelayCategory'] = le.fit_transform(df['DelayCategory'])

# Ensure 'DepTime' and 'ArrTime' columns exist before creating hour/minute columns
if 'DepTime' in df.columns:
    df['DepTime'] = df['DepTime'].fillna(0).astype(int)
    df['DepTime_hour'] = df['DepTime'] // 100
    df['DepTime_minute'] = df['DepTime'] % 100
else:
    print("Column 'DepTime' is missing!")

if 'ArrTime' in df.columns:
    df['ArrTime'] = df['ArrTime'].fillna(0).astype(int)
    df['ArrTime_hour'] = df['ArrTime'] // 100
    df['ArrTime_minute'] = df['ArrTime'] % 100
else:
    print("Column 'ArrTime' is missing!")

# Feature engineering: Combine hour and minute into a single time feature
if 'DepTime_hour' in df.columns and 'DepTime_minute' in df.columns:
    df['DepTime_combined'] = df['DepTime_hour'] * 60 + df['DepTime_minute']
else:
    print("Skipping 'DepTime_combined' due to missing columns!")

if 'ArrTime_hour' in df.columns and 'ArrTime_minute' in df.columns:
    df['ArrTime_combined'] = df['ArrTime_hour'] * 60 + df['ArrTime_minute']
else:
    print("Skipping 'ArrTime_combined' due to missing columns!")

# Adding features based on the exploration
df['HighDelaySeason'] = df['Month'].apply(lambda x: 1 if x == 6 or x == 12 else 0)
df['LateNightFlight'] = df['DepTime_combined'].apply(lambda x: 1 if 180 <= x % 1440 <= 240 else 0)
df['LateArrivalFlight'] = df['ArrTime_combined'].apply(lambda x: 1 if 180 <= x % 1440 <= 240 else 0)

# New arrival-related features
df['CarrierDelay'] = df['DepDelay'].apply(lambda x: 1 if pd.notna(x) and x > 0 else 0)
df['WeatherDelay'] = df['ArrDelay'].apply(lambda x: 1 if pd.notna(x) and x > 0 else 0)
df['ArrDelayFlag'] = df['ArrDelay'].apply(lambda x: 1 if pd.notna(x) and x > 0 else 0)

# Features for training
features = ['Year', 'Quarter', 'Month', 'DayofMonth', 'DayOfWeek', 'DepTime_combined', 'ArrTime_combined',
            'CRSElapsedTime', 'ActualElapsedTime', 'AirTime', 'Flights', 'Distance', 
            'HighDelaySeason', 'LateNightFlight', 'LateArrivalFlight', 'ArrDelayFlag']

# Features matrix and target vector
X = df[features]
y = df['DelayCategory']

# Apply LabelEncoder to all categorical columns in X (if any)
label_encoder = LabelEncoder()
for col in X.select_dtypes(include=['object']).columns:
    X[col] = label_encoder.fit_transform(X[col].astype(str))

# Ensure all features are numeric and fill any remaining missing values
X = X.fillna(X.median())

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize RandomForestClassifier with optimized parameters
clf = RandomForestClassifier(n_estimators=200, max_depth=15, random_state=42, class_weight='balanced')

# Train the model
clf.fit(X_train, y_train)

# Predictions on the test set
y_pred = clf.predict(X_test)

# Evaluate the model
print("Classification Report:")
print(classification_report(y_test, y_pred))

print("Accuracy Score:", accuracy_score(y_test, y_pred))

Classification Report:
              precision    recall  f1-score   support

           0       0.50      0.89      0.64     57077
           1       0.91      0.42      0.57    155057
           2       0.83      1.00      0.90    187866

    accuracy                           0.76    400000
   macro avg       0.74      0.77      0.71    400000
weighted avg       0.81      0.76      0.74    400000

Accuracy Score: 0.7577425


Random Forest Model-Attempt #2:

- Building on the feedback from our meeting, we combined the departure and arrival delays into a single "Delay" category. 

- This led to improved results. 

- Additionally, we now have a clear binary classifier where a value of 1 indicates a "Delay" and 0 indicates "No Delay."

In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
from sklearn.preprocessing import LabelEncoder

# Assuming 'df' is already loaded

# Fill missing values only for numerical columns
numerical_cols = df.select_dtypes(include=['float64', 'int64']).columns
df[numerical_cols] = df[numerical_cols].fillna(df[numerical_cols].median())

# Create target variable 'DelayCategory' for binary classification (0 = no delay, 1 = delay)
def delay_category(row):
    if row['DepDelay'] > 0 or row['ArrDelay'] > 0:
        return 1  # Delay
    else:
        return 0  # No Delay

df['DelayCategory'] = df.apply(delay_category, axis=1)

# Encode the categorical variable 'DelayCategory'
le = LabelEncoder()
df['DelayCategory'] = le.fit_transform(df['DelayCategory'])

# Ensure 'DepTime' and 'ArrTime' columns exist before creating hour/minute columns
if 'DepTime' in df.columns:
    df['DepTime'] = df['DepTime'].fillna(0).astype(int)
    df['DepTime_hour'] = df['DepTime'] // 100
    df['DepTime_minute'] = df['DepTime'] % 100
else:
    print("Column 'DepTime' is missing!")

if 'ArrTime' in df.columns:
    df['ArrTime'] = df['ArrTime'].fillna(0).astype(int)
    df['ArrTime_hour'] = df['ArrTime'] // 100
    df['ArrTime_minute'] = df['ArrTime'] % 100
else:
    print("Column 'ArrTime' is missing!")

# Feature engineering: Combine hour and minute into a single time feature
if 'DepTime_hour' in df.columns and 'DepTime_minute' in df.columns:
    df['DepTime_combined'] = df['DepTime_hour'] * 60 + df['DepTime_minute']
else:
    print("Skipping 'DepTime_combined' due to missing columns!")

if 'ArrTime_hour' in df.columns and 'ArrTime_minute' in df.columns:
    df['ArrTime_combined'] = df['ArrTime_hour'] * 60 + df['ArrTime_minute']
else:
    print("Skipping 'ArrTime_combined' due to missing columns!")

# Adding features based on the exploration
df['HighDelaySeason'] = df['Month'].apply(lambda x: 1 if x == 6 or x == 12 else 0)
df['LateNightFlight'] = df['DepTime_combined'].apply(lambda x: 1 if 180 <= x % 1440 <= 240 else 0)
df['LateArrivalFlight'] = df['ArrTime_combined'].apply(lambda x: 1 if 180 <= x % 1440 <= 240 else 0)

# New arrival-related features
df['CarrierDelay'] = df['DepDelay'].apply(lambda x: 1 if pd.notna(x) and x > 0 else 0)
df['WeatherDelay'] = df['ArrDelay'].apply(lambda x: 1 if pd.notna(x) and x > 0 else 0)
df['ArrDelayFlag'] = df['ArrDelay'].apply(lambda x: 1 if pd.notna(x) and x > 0 else 0)

# Features for training
features = ['Year', 'Quarter', 'Month', 'DayofMonth', 'DayOfWeek', 'DepTime_combined', 'ArrTime_combined',
            'CRSElapsedTime', 'ActualElapsedTime', 'AirTime', 'Flights', 'Distance', 
            'HighDelaySeason', 'LateNightFlight', 'LateArrivalFlight', 'ArrDelayFlag']

# Features matrix and target vector
X = df[features]
y = df['DelayCategory']

# Apply LabelEncoder to all categorical columns in X (if any)
label_encoder = LabelEncoder()
for col in X.select_dtypes(include=['object']).columns:
    X[col] = label_encoder.fit_transform(X[col].astype(str))

# Ensure all features are numeric and fill any remaining missing values
X = X.fillna(X.median())

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize RandomForestClassifier with optimized parameters
clf = RandomForestClassifier(n_estimators=200, max_depth=15, random_state=42, class_weight='balanced')

# Train the model
clf.fit(X_train, y_train)

# Predictions on the test set
y_pred = clf.predict(X_test)

# Evaluate the model
print("Classification Report:")
print(classification_report(y_test, y_pred))

print("Accuracy Score:", accuracy_score(y_test, y_pred))

Classification Report:
              precision    recall  f1-score   support

           0       0.83      1.00      0.90    187866
           1       1.00      0.81      0.90    212134

    accuracy                           0.90    400000
   macro avg       0.91      0.91      0.90    400000
weighted avg       0.92      0.90      0.90    400000

Accuracy Score: 0.90031


We calculated the ROC AUC score by predicting probabilities for the positive class and evaluating the model's performance with the roc_auc_score function.

In [4]:
from sklearn.metrics import roc_auc_score

# Predict probabilities for ROC AUC
y_pred_prob = clf.predict_proba(X_test)[:, 1]  # Probability of the positive class

# Compute ROC AUC score
roc_auc = roc_auc_score(y_test, y_pred_prob)
print("ROC AUC Score:", roc_auc)

ROC AUC Score: 0.9488911386288418


We evaluated the model on the training set by calculating the classification report, accuracy score, and ROC AUC score, with predictions on both the training set's labels and probabilities.

In [5]:
# Evaluate the model on training set
y_train_pred = clf.predict(X_train)
print("Training Classification Report:")
print(classification_report(y_train, y_train_pred))
print("Training Accuracy Score:", accuracy_score(y_train, y_train_pred))

# ROC AUC on training set
y_train_pred_proba = clf.predict_proba(X_train)[:, 1]
train_roc_auc = roc_auc_score(y_train, y_train_pred_proba)
print("Training ROC AUC Score:", train_roc_auc)

Training Classification Report:
              precision    recall  f1-score   support

           0       0.83      1.00      0.91    751659
           1       1.00      0.81      0.90    848341

    accuracy                           0.90   1600000
   macro avg       0.91      0.91      0.90   1600000
weighted avg       0.92      0.90      0.90   1600000

Training Accuracy Score: 0.901685625
Training ROC AUC Score: 0.960859596068427


Random Forest Model-Attempt #3:

- We included July in the peak months to expand the analysis, fixing the previous limitations and improving the model’s ability to account for delays during the summer months.

In [7]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
from sklearn.preprocessing import LabelEncoder

# Assuming 'df' is already loaded

# Fill missing values only for numerical columns
numerical_cols = df.select_dtypes(include=['float64', 'int64']).columns
df[numerical_cols] = df[numerical_cols].fillna(df[numerical_cols].median())

# Create target variable 'DelayCategory' for binary classification (0 = no delay, 1 = delay)
def delay_category(row):
    if row['DepDelay'] > 0 or row['ArrDelay'] > 0:
        return 1  # Delay
    else:
        return 0  # No Delay

df['DelayCategory'] = df.apply(delay_category, axis=1)

# Encode the categorical variable 'DelayCategory'
le = LabelEncoder()
df['DelayCategory'] = le.fit_transform(df['DelayCategory'])

# Ensure 'DepTime' and 'ArrTime' columns exist before creating hour/minute columns
if 'DepTime' in df.columns:
    df['DepTime'] = df['DepTime'].fillna(0).astype(int)
    df['DepTime_hour'] = df['DepTime'] // 100
    df['DepTime_minute'] = df['DepTime'] % 100
else:
    print("Column 'DepTime' is missing!")

if 'ArrTime' in df.columns:
    df['ArrTime'] = df['ArrTime'].fillna(0).astype(int)
    df['ArrTime_hour'] = df['ArrTime'] // 100
    df['ArrTime_minute'] = df['ArrTime'] % 100
else:
    print("Column 'ArrTime' is missing!")

# Feature engineering: Combine hour and minute into a single time feature
if 'DepTime_hour' in df.columns and 'DepTime_minute' in df.columns:
    df['DepTime_combined'] = df['DepTime_hour'] * 60 + df['DepTime_minute']
else:
    print("Skipping 'DepTime_combined' due to missing columns!")

if 'ArrTime_hour' in df.columns and 'ArrTime_minute' in df.columns:
    df['ArrTime_combined'] = df['ArrTime_hour'] * 60 + df['ArrTime_minute']
else:
    print("Skipping 'ArrTime_combined' due to missing columns!")

# Adding features based on the exploration
df['HighDelaySeason'] = df['Month'].apply(lambda x: 1 if x in [6, 7, 12] else 0)  # June, July, December
df['LateNightFlight'] = df['DepTime_combined'].apply(lambda x: 1 if 180 <= x % 1440 <= 240 else 0)  # Between 3-4 AM
df['LateArrivalFlight'] = df['ArrTime_combined'].apply(lambda x: 1 if 180 <= x % 1440 <= 240 else 0)  # Between 3-4 AM

# New arrival-related features
df['CarrierDelay'] = df['DepDelay'].apply(lambda x: 1 if pd.notna(x) and x > 0 else 0)
df['WeatherDelay'] = df['ArrDelay'].apply(lambda x: 1 if pd.notna(x) and x > 0 else 0)
df['ArrDelayFlag'] = df['ArrDelay'].apply(lambda x: 1 if pd.notna(x) and x > 0 else 0)

# Features for training
features = ['Year', 'Quarter', 'Month', 'DayofMonth', 'DayOfWeek', 'DepTime_combined', 'ArrTime_combined',
            'CRSElapsedTime', 'ActualElapsedTime', 'AirTime', 'Flights', 'Distance', 
            'HighDelaySeason', 'LateNightFlight', 'LateArrivalFlight', 'ArrDelayFlag']

# Features matrix and target vector
X = df[features]
y = df['DelayCategory']

# Apply LabelEncoder to all categorical columns in X (if any)
label_encoder = LabelEncoder()
for col in X.select_dtypes(include=['object']).columns:
    X[col] = label_encoder.fit_transform(X[col].astype(str))

# Ensure all features are numeric and fill any remaining missing values
X = X.fillna(X.median())

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize RandomForestClassifier with optimized parameters
clf = RandomForestClassifier(n_estimators=200, max_depth=15, random_state=42, class_weight='balanced')

# Train the model
clf.fit(X_train, y_train)

# Predictions on the test set
y_pred = clf.predict(X_test)

# Evaluate the model
print("Classification Report:")
print(classification_report(y_test, y_pred))

print("Accuracy Score:", accuracy_score(y_test, y_pred))

Classification Report:
              precision    recall  f1-score   support

           0       0.83      1.00      0.90    187866
           1       1.00      0.81      0.90    212134

    accuracy                           0.90    400000
   macro avg       0.91      0.91      0.90    400000
weighted avg       0.92      0.90      0.90    400000

Accuracy Score: 0.9003275


We noticed only a slight improvement.

We calculated the ROC AUC score by predicting probabilities for the positive class and evaluating the model's performance with the roc_auc_score function.

In [8]:
from sklearn.metrics import roc_auc_score

# Predict probabilities for ROC AUC
y_pred_prob = clf.predict_proba(X_test)[:, 1]  # Probability of the positive class

# Compute ROC AUC score
roc_auc = roc_auc_score(y_test, y_pred_prob)
print("ROC AUC Score:", roc_auc)

ROC AUC Score: 0.9487515829705504


We evaluated the model using cross-validation with fewer folds (3) and parallel processing to speed up the evaluation process.

In [10]:
from sklearn.model_selection import cross_val_score

# Evaluate using cross-validation with fewer folds and parallel processing
cv_scores = cross_val_score(clf, X, y, cv=3, n_jobs=-1)
print("Cross-validation scores:", cv_scores)
print("Average CV score:", cv_scores.mean())

Cross-validation scores: [0.90058005 0.90000705 0.9001929 ]
Average CV score: 0.9002599999664501


Random Forest - Model Attempt #4:

- Data exploration revealed that both Weather delay and Carrier delay are positively correlated with overall delay. Specifically, Weather delay significantly increases flight delays, particularly beyond 1250 minutes. Carrier delay shows a concentration between 0 and 250 minutes, with varying departure delays, suggesting the influence of other delay factors.

- To resolve the issue where Weather and Carrier delays returned a result of 1.0, we created new delay-related features in the code: 'CarrierDelay', 'WeatherDelay', 'ArrDelayFlag', and derived features like 'WeatherDelayHigh' and 'CarrierDelayMid'.

   1) Weather Delay: We created the feature WeatherDelayHigh to specifically flag weather delays over 1250 minutes. This was derived from the ArrDelay column, where any value over 0 was considered a delay, and further filtered for extreme weather delays.

   2) Carrier Delay: We generated the CarrierDelayMid feature, which flags flights with any carrier delay, represented by values greater than 0 in DepDelay.

- After selecting relevant features and applying LabelEncoder to categorical variables, the data was split for training and testing. We trained a RandomForestClassifier with specific hyperparameters and evaluated the model using a classification report and accuracy score.

In [13]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
from sklearn.preprocessing import LabelEncoder

# Fill missing values for numerical columns
numerical_cols = df.select_dtypes(include=['float64', 'int64']).columns
df[numerical_cols] = df[numerical_cols].fillna(df[numerical_cols].median())

# Create binary target variable 'DelayCategory'
df['DelayCategory'] = df.apply(lambda row: 1 if row['DepDelay'] > 0 or row['ArrDelay'] > 0 else 0, axis=1)

# Encode 'DelayCategory'
le = LabelEncoder()
df['DelayCategory'] = le.fit_transform(df['DelayCategory'])

# Feature engineering (time-related features)
df['DepTime_combined'] = df['DepTime'] // 100 * 60 + df['DepTime'] % 100
df['ArrTime_combined'] = df['ArrTime'] // 100 * 60 + df['ArrTime'] % 100

# Adding seasonal and time-of-day features
df['HighDelaySeason'] = df['Month'].apply(lambda x: 1 if x in [6, 7, 12] else 0)
df['LateNightFlight'] = df['DepTime_combined'].apply(lambda x: 1 if 180 <= x % 1440 <= 240 else 0)
df['LateArrivalFlight'] = df['ArrTime_combined'].apply(lambda x: 1 if 180 <= x % 1440 <= 240 else 0)

# Delay-related features
df['CarrierDelay'] = df['DepDelay'].apply(lambda x: 1 if x > 0 else 0)
df['WeatherDelay'] = df['ArrDelay'].apply(lambda x: 1 if x > 0 else 0)
df['ArrDelayFlag'] = df['ArrDelay'].apply(lambda x: 1 if x > 0 else 0)

df['WeatherDelayHigh'] = df['WeatherDelay'].apply(lambda x: 1 if x > 1250 else 0)
df['CarrierDelayMid'] = df['CarrierDelay'].apply(lambda x: 1 if x == 1 else 0)

# Feature selection for training
features = ['Year', 'Quarter', 'Month', 'DayofMonth', 'DayOfWeek', 'DepTime_combined', 'ArrTime_combined', 
            'CRSElapsedTime', 'ActualElapsedTime', 'AirTime', 'Flights', 'Distance', 
            'HighDelaySeason', 'LateNightFlight', 'LateArrivalFlight', 'WeatherDelayHigh', 'CarrierDelayMid']

# Prepare feature matrix and target vector
X = df[features]
y = df['DelayCategory']

# Apply LabelEncoder to categorical features
label_encoder = LabelEncoder()
for col in X.select_dtypes(include=['object']).columns:
    X[col] = label_encoder.fit_transform(X[col].astype(str))

# Fill missing values and ensure all features are numeric
X = X.fillna(X.median())

# Split data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize RandomForestClassifier with hyperparameters
clf = RandomForestClassifier(n_estimators=200, max_depth=15, max_features='sqrt', min_samples_split=10, 
                             min_samples_leaf=5, random_state=42, class_weight='balanced')

# Train the model
clf.fit(X_train, y_train)

# Make predictions and evaluate the model
y_pred = clf.predict(X_test)
print("Classification Report:")
print(classification_report(y_test, y_pred))

print("Accuracy Score:", accuracy_score(y_test, y_pred))

Classification Report:
              precision    recall  f1-score   support

           0       0.85      1.00      0.92    187866
           1       1.00      0.85      0.92    212134

    accuracy                           0.92    400000
   macro avg       0.92      0.92      0.92    400000
weighted avg       0.93      0.92      0.92    400000

Accuracy Score: 0.91758


In [14]:
from sklearn.model_selection import cross_val_score

# Perform cross-validation with 3 folds and parallel processing
cv_scores = cross_val_score(clf, X, y, cv=3, scoring='accuracy', n_jobs=-1)

# Print the mean and standard deviation of the cross-validation scores
print(f"Cross-validation Accuracy: {cv_scores.mean():.4f} ± {cv_scores.std():.4f}")

Cross-validation Accuracy: 0.9175 ± 0.0008


Random Forest Model Attempt #5:

- We adjusted the Carrier Delay range, focusing on the 0-250 minutes range based on the data exploration. This resulted in a slight increase in accuracy but did not produce other significant improvements.

- We observed that while the overall Weather Delay improves accuracy, training on all Weather data leads to overfitting. Therefore, we limited the Weather feature to the higher delays, which helped avoid overfitting and produced better results overall.

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
from sklearn.preprocessing import LabelEncoder

# Fill missing values for numerical columns
numerical_cols = df.select_dtypes(include=['float64', 'int64']).columns
df[numerical_cols] = df[numerical_cols].fillna(df[numerical_cols].median())

# Create binary target variable 'DelayCategory'
df['DelayCategory'] = df.apply(lambda row: 1 if row['DepDelay'] > 0 or row['ArrDelay'] > 0 else 0, axis=1)

# Encode 'DelayCategory'
le = LabelEncoder()
df['DelayCategory'] = le.fit_transform(df['DelayCategory'])

# Feature engineering (time-related features)
df['DepTime_combined'] = df['DepTime'] // 100 * 60 + df['DepTime'] % 100
df['ArrTime_combined'] = df['ArrTime'] // 100 * 60 + df['ArrTime'] % 100

# Adding seasonal and time-of-day features
df['HighDelaySeason'] = df['Month'].apply(lambda x: 1 if x in [6, 7, 12] else 0)
df['LateNightFlight'] = df['DepTime_combined'].apply(lambda x: 1 if 180 <= x % 1440 <= 240 else 0)
df['LateArrivalFlight'] = df['ArrTime_combined'].apply(lambda x: 1 if 180 <= x % 1440 <= 240 else 0)

# Delay-related features
df['CarrierDelay'] = df['DepDelay'].apply(lambda x: 1 if x > 0 else 0)
df['WeatherDelay'] = df['ArrDelay'].apply(lambda x: 1 if x > 0 else 0)
df['ArrDelayFlag'] = df['ArrDelay'].apply(lambda x: 1 if x > 0 else 0)

df['WeatherDelayHigh'] = df['WeatherDelay'].apply(lambda x: 1 if x > 1250 else 0)
df['CarrierDelayMid'] = df['CarrierDelay'].apply(lambda x: 1 if 0 < x <= 250 else 0)

# Feature selection for training
features = ['Year', 'Quarter', 'Month', 'DayofMonth', 'DayOfWeek', 'DepTime_combined', 'ArrTime_combined', 
            'CRSElapsedTime', 'ActualElapsedTime', 'AirTime', 'Flights', 'Distance', 
            'HighDelaySeason', 'LateNightFlight', 'LateArrivalFlight', 'WeatherDelayHigh', 'CarrierDelayMid']

# Prepare feature matrix and target vector
X = df[features]
y = df['DelayCategory']

# Apply LabelEncoder to categorical features
label_encoder = LabelEncoder()
for col in X.select_dtypes(include=['object']).columns:
    X[col] = label_encoder.fit_transform(X[col].astype(str))

# Fill missing values and ensure all features are numeric
X = X.fillna(X.median())

# Split data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize RandomForestClassifier with hyperparameters
clf = RandomForestClassifier(n_estimators=200, max_depth=15, max_features='sqrt', min_samples_split=10, 
                             min_samples_leaf=5, random_state=42, class_weight='balanced')

# Train the model
clf.fit(X_train, y_train)

# Make predictions and evaluate the model
y_pred = clf.predict(X_test)
print("Classification Report:")
print(classification_report(y_test, y_pred))

print("Accuracy Score:", accuracy_score(y_test, y_pred))

Classification Report:
              precision    recall  f1-score   support

           0       0.85      1.00      0.92    187866
           1       1.00      0.85      0.92    212134

    accuracy                           0.92    400000
   macro avg       0.92      0.92      0.92    400000
weighted avg       0.93      0.92      0.92    400000

Accuracy Score: 0.91758
