In [5]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import make_pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score

# Step 1: Load datasets and create DELAYED column
df_1991 = pd.read_csv('1991_cleaned.csv.gz').sample(n=10000, random_state=42)
df_2001 = pd.read_csv('2001_cleaned.csv.gz').sample(n=10000, random_state=42)

df_1991['DELAYED'] = (df_1991['ArrDelay'] > 0).astype(int)
df_2001['DELAYED'] = (df_2001['ArrDelay'] > 0).astype(int)

df_1991 = df_1991.drop('ArrDelay', axis=1)
df_2001 = df_2001.drop('ArrDelay', axis=1)

# Step 2: Feature engineering
# Selecting relevant columns
cat_cols = ["UniqueCarrier", "Origin", "Dest"]
num_cols_1991 = ['Month', 'DayofMonth', 'DayOfWeek', 'DepTime', 'FlightNum',
                 'ActualElapsedTime', 'DepDelay', 'Distance',
                 'Cancelled', 'Diverted']
num_cols_2001 = ['Month', 'DayofMonth', 'DayOfWeek', 'DepTime', 'FlightNum',
                 'ActualElapsedTime', 'AirTime', 'DepDelay',
                 'Distance', 'TaxiIn', 'TaxiOut', 'Cancelled', 'Diverted']

# Step 3: Create transformers for numerical and categorical columns
num_transformer = StandardScaler()
cat_transformer = OneHotEncoder(handle_unknown='ignore')

# Combine transformers into a preprocessor
preprocessor_1991 = ColumnTransformer(
    transformers=[
        ('num', num_transformer, num_cols_1991),
        ('cat', cat_transformer, cat_cols)
    ])

preprocessor_2001 = ColumnTransformer(
    transformers=[
        ('num', num_transformer, num_cols_2001),
        ('cat', cat_transformer, cat_cols)
    ])

# Step 4: Train and evaluate ML models
def train_and_evaluate_model(model, X_train, y_train, X_test, y_test):
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    return accuracy

# Split the data into training and testing sets
X_train_1991, X_test_1991, y_train_1991, y_test_1991 = train_test_split(
    df_1991.drop('DELAYED', axis=1), df_1991['DELAYED'], test_size=0.2, random_state=42)

X_train_2001, X_test_2001, y_train_2001, y_test_2001 = train_test_split(
    df_2001.drop('DELAYED', axis=1), df_2001['DELAYED'], test_size=0.2, random_state=42)

# Initialize models
models = {
    'Random Forest': RandomForestClassifier(),
    'SVM': SVC(),
    'Decision Tree': DecisionTreeClassifier(),
    'XGBoost': XGBClassifier()
}

# Loop through models
for model_name, model in models.items():
    print(f"\nTraining and evaluating {model_name} on 1991 data:")
    model_pipe_1991 = make_pipeline(preprocessor_1991, model)
    accuracy_1991 = train_and_evaluate_model(model_pipe_1991, X_train_1991, y_train_1991, X_test_1991, y_test_1991)
    print(f"{model_name} Accuracy on 1991 data: {accuracy_1991 * 100:.3f}%")

    print(f"\nTraining and evaluating {model_name} on 2001 data:")
    model_pipe_2001 = make_pipeline(preprocessor_2001, model)
    accuracy_2001 = train_and_evaluate_model(model_pipe_2001, X_train_2001, y_train_2001, X_test_2001, y_test_2001)
    print(f"{model_name} Accuracy on 2001 data: {accuracy_2001 * 100:.3f}%")

    # Get feature importance for 1991
    if hasattr(model, 'feature_importances_'):
        feature_importance_1991 = model.feature_importances_
        features_1991 = X_train_1991.columns
        importance_dict_1991 = dict(zip(features_1991, feature_importance_1991))
        print(f"\nFeature Importance for {model_name} on 1991 data:")
        total_importance_1991 = sum(importance_dict_1991.values())
        for feature, importance in sorted(importance_dict_1991.items(), key=lambda x: x[1], reverse=True):
            importance_percentage_1991 = (importance / total_importance_1991) * 100
            print(f"{feature}: {importance_percentage_1991:.2f}%")

    # Get feature importance for 2001
    if hasattr(model, 'feature_importances_'):
        feature_importance_2001 = model.feature_importances_
        features_2001 = X_train_2001.columns
        importance_dict_2001 = dict(zip(features_2001, feature_importance_2001))
        print(f"\nFeature Importance for {model_name} on 2001 data:")
        total_importance_2001 = sum(importance_dict_2001.values())
        for feature, importance in sorted(importance_dict_2001.items(), key=lambda x: x[1], reverse=True):
            importance_percentage_2001 = (importance / total_importance_2001) * 100
            print(f"{feature}: {importance_percentage_2001:.2f}%")

# # Create empty DataFrames for accuracies and feature importance
# accuracy_table_1991 = pd.DataFrame(index=['Accuracy'])
# feature_importance_table_1991 = pd.DataFrame()

# accuracy_table_2001 = pd.DataFrame(index=['Accuracy'])
# feature_importance_table_2001 = pd.DataFrame()

# # Loop through models
# for model_name, model in models.items():
#     # Training and evaluating on 1991 data
#     model_pipe_1991 = make_pipeline(preprocessor_1991, model)
#     accuracy_1991 = train_and_evaluate_model(model_pipe_1991, X_train_1991, y_train_1991, X_test_1991, y_test_1991)
#     accuracy_table_1991[model_name] = [f"{accuracy_1991 * 100:.3f}%"]

#     # Get feature importance for 1991
#     if hasattr(model, 'feature_importances_'):
#         feature_importance_1991 = model.feature_importances_
#         features_1991 = X_train_1991.columns
#         importance_dict_1991 = dict(zip(features_1991, feature_importance_1991))
#         feature_importance_table_1991[model_name] = pd.Series(importance_dict_1991).apply(lambda x: f"{x * 100:.3f}%")

#     # Training and evaluating on 2001 data
#     model_pipe_2001 = make_pipeline(preprocessor_2001, model)
#     accuracy_2001 = train_and_evaluate_model(model_pipe_2001, X_train_2001, y_train_2001, X_test_2001, y_test_2001)
#     accuracy_table_2001[model_name] = [f"{accuracy_2001 * 100:.3f}%"]

#     # Get feature importance for 2001
#     if hasattr(model, 'feature_importances_'):
#         feature_importance_2001 = model.feature_importances_
#         features_2001 = X_train_2001.columns
#         importance_dict_2001 = dict(zip(features_2001, feature_importance_2001))
#         feature_importance_table_2001[model_name] = pd.Series(importance_dict_2001).apply(lambda x: f"{x * 100:.3f}%")

# # Display the tables
# print("Accuracy Table for 1991 Dataset:")
# print(accuracy_table_1991)

# print("\nAccuracy Table for 2001 Dataset:")
# print(accuracy_table_2001)

# print("\nFeature Importance Table for 1991 Dataset:")
# print(feature_importance_table_1991)

# print("\nFeature Importance Table for 2001 Dataset:")
# print(feature_importance_table_2001)



Training and evaluating Random Forest on 1991 data:
Random Forest Accuracy on 1991 data: 71.650%

Training and evaluating Random Forest on 2001 data:
Random Forest Accuracy on 2001 data: 80.450%

Feature Importance for Random Forest on 1991 data:
DepDelay: 35.21%
Distance: 12.97%
FlightNum: 7.36%
DepTime: 6.97%
ActualElapsedTime: 6.44%
Origin: 6.39%
UniqueCarrier: 6.03%
Dest: 5.77%
DayofMonth: 5.03%
Month: 4.32%
DayOfWeek: 3.50%
Cancelled: 0.00%
Diverted: 0.00%

Feature Importance for Random Forest on 2001 data:
ActualElapsedTime: 34.66%
Origin: 12.77%
FlightNum: 7.24%
DepTime: 6.87%
TailNum: 6.34%
AirTime: 6.29%
UniqueCarrier: 5.93%
DepDelay: 5.68%
DayofMonth: 4.95%
Month: 4.25%
DayOfWeek: 3.45%
TaxiIn: 0.63%
Cancelled: 0.42%
Diverted: 0.41%
TaxiOut: 0.10%
Dest: 0.00%
Distance: 0.00%

Training and evaluating SVM on 1991 data:
SVM Accuracy on 1991 data: 79.350%

Training and evaluating SVM on 2001 data:
SVM Accuracy on 2001 data: 83.350%

Training and evaluating Decision Tree on 1991 