In [1]:
import os
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import joblib

# Load each cleaned yearly table and replace the raw arrival delay with a binary label
def _load_labelled_flights(csv_path):
    """Read one flights CSV and swap 'ArrDelay' for a 0/1 'DELAYED' column.

    DELAYED is 1 where ArrDelay > 0 and 0 otherwise. A missing ArrDelay
    compares as False, so it also yields 0.
    """
    flights = pd.read_csv(csv_path)
    arrived_late = flights['ArrDelay'] > 0
    flights['DELAYED'] = arrived_late.astype(int)
    # DELAYED is derived from ArrDelay, so the raw column is dropped afterwards
    return flights.drop(columns='ArrDelay')


df_1991 = _load_labelled_flights('1991_cleaned.csv.gz')
df_2001 = _load_labelled_flights('2001_cleaned.csv.gz')

# Numerical and categorical feature columns used for each dataset.
# The 2001 data carries three timing columns that the 1991 feature set does not use.
num_cols_2001 = [
    'Month', 'DayofMonth', 'DayOfWeek', 'DepTime', 'FlightNum',
    'ActualElapsedTime', 'AirTime', 'DepDelay', 'Distance',
    'TaxiIn', 'TaxiOut', 'Cancelled', 'Diverted',
]
# Same columns in the same order, minus the 2001-only ones
num_cols_1991 = [
    column for column in num_cols_2001
    if column not in ('AirTime', 'TaxiIn', 'TaxiOut')
]

# Both years share the same categorical columns; keep two independent lists
cat_cols_1991 = ["UniqueCarrier", "Origin", "Dest"]
cat_cols_2001 = list(cat_cols_1991)

# Name of the label column, kept as a list so selections return a one-column DataFrame
target_variable = ['DELAYED']

# 1991: independent copy of the label column, plus the frame without it
df_1991_target = df_1991.loc[:, target_variable].copy()
df_1991_features = df_1991.drop(columns=target_variable)

# 2001: same separation
df_2001_target = df_2001.loc[:, target_variable].copy()
df_2001_features = df_2001.drop(columns=target_variable)

# Integer-encode the categorical columns with pd.factorize.
# Codes are assigned independently per dataset and per column.
for frame, cat_cols in ((df_1991_features, cat_cols_1991),
                        (df_2001_features, cat_cols_2001)):
    for col in cat_cols:
        codes = pd.factorize(frame[col])[0]
        frame[col] = codes

# Model input table per dataset: numerical columns first, then the encoded categoricals
df_1991_combined = df_1991_features[num_cols_1991 + cat_cols_1991]
df_2001_combined = df_2001_features[num_cols_2001 + cat_cols_2001]

# Split each dataset first and only then standardize it, so that validation and
# test rows never contribute to the scaler's mean / standard deviation.
def _split_then_scale(features, target, seed=42):
    """Split one dataset into train/val/test and standardize with train-only statistics.

    The split mirrors two chained ``train_test_split`` calls with
    ``test_size=0.2``: 20% of the rows become the test set, and 20% of the
    remainder becomes the validation set (64% / 16% / 20% overall).

    Parameters
    ----------
    features : pandas.DataFrame
        Numeric model inputs (categoricals already integer-encoded).
    target : pandas.DataFrame or pandas.Series
        Single-column DELAYED labels, aligned with ``features`` by row position.
    seed : int
        ``random_state`` passed to both ``train_test_split`` calls.

    Returns
    -------
    tuple
        ``(scaler, scaled_all, train, val, test)``. ``scaler`` is the
        ``StandardScaler`` fitted on the training rows; the four frames hold
        the scaled feature columns plus ``DELAYED``.
    """
    feature_names = list(features.columns)

    # Attach the labels positionally before splitting so rows and labels stay together
    labelled = features.reset_index(drop=True)
    labelled['DELAYED'] = target.reset_index(drop=True)

    train_val, test_part = train_test_split(labelled, test_size=0.2, random_state=seed)
    train_part, val_part = train_test_split(train_val, test_size=0.2, random_state=seed)

    # Fit on the training rows only; every other split is merely transformed
    scaler = StandardScaler().fit(train_part[feature_names])

    def _apply(part):
        """Scale the feature columns of ``part`` and re-attach its labels."""
        scaled = pd.DataFrame(
            scaler.transform(part[feature_names]),
            columns=feature_names,
            index=part.index,
        )
        scaled['DELAYED'] = part['DELAYED']
        return scaled

    return (scaler, _apply(labelled),
            _apply(train_part), _apply(val_part), _apply(test_part))


(scaler_1991, df_1991_scaled,
 train_data_1991, val_data_1991, test_data_1991) = _split_then_scale(df_1991_combined, df_1991_target)

(scaler_2001, df_2001_scaled,
 train_data_2001, val_data_2001, test_data_2001) = _split_then_scale(df_2001_combined, df_2001_target)

# Export a small sample of each scaled table for inspection.
# The sample is seeded so re-running the notebook writes the same rows, and
# capped at the frame length so a small dataset does not raise.
os.makedirs("engineering", exist_ok=True)
for scaled_frame, sample_path in ((df_1991_scaled, 'engineering/engineering_1991.csv'),
                                  (df_2001_scaled, 'engineering/engineering_2001.csv')):
    sample_size = min(50, len(scaled_frame))
    scaled_frame.sample(sample_size, random_state=42).to_csv(sample_path, index=False)

# Feature matrix (X) and label vector (y) for every split of each dataset
feature_cols_1991 = num_cols_1991 + cat_cols_1991
X_train_1991, y_train_1991 = train_data_1991[feature_cols_1991], train_data_1991['DELAYED']
X_val_1991, y_val_1991 = val_data_1991[feature_cols_1991], val_data_1991['DELAYED']
X_test_1991, y_test_1991 = test_data_1991[feature_cols_1991], test_data_1991['DELAYED']

feature_cols_2001 = num_cols_2001 + cat_cols_2001
X_train_2001, y_train_2001 = train_data_2001[feature_cols_2001], train_data_2001['DELAYED']
X_val_2001, y_val_2001 = val_data_2001[feature_cols_2001], val_data_2001['DELAYED']
X_test_2001, y_test_2001 = test_data_2001[feature_cols_2001], test_data_2001['DELAYED']

# Create the output folders; folders that already exist are left untouched
folders = ['models', 'feature_importance', 'accuracies']
for folder in folders:
    # exist_ok=True replaces the separate existence check, which could race with
    # another process creating the same folder between the check and the call
    os.makedirs(folder, exist_ok=True)

# Pair each feature name with the importance a fitted tree-based model assigned to it
def get_tree_feature_importance(model, feature_names):
    """Map feature names to the model's ``feature_importances_`` values.

    Parameters
    ----------
    model : object
        Fitted estimator exposing a ``feature_importances_`` sequence.
    feature_names : iterable
        Names in the same order as the columns the model was fitted on.

    Returns
    -------
    dict
        ``{feature_name: importance}``. Pairing stops at the shorter of the
        two sequences.
    """
    return {
        name: score
        for name, score in zip(feature_names, model.feature_importances_)
    }

# Column order the models are fitted on; reused to label the importances
model_features_1991 = num_cols_1991 + cat_cols_1991
model_features_2001 = num_cols_2001 + cat_cols_2001

# Decision Tree: one seeded classifier per dataset, fitted on its training split
dt_model_1991 = DecisionTreeClassifier(random_state=42).fit(X_train_1991, y_train_1991)
dt_model_2001 = DecisionTreeClassifier(random_state=42).fit(X_train_2001, y_train_2001)

# Decision Tree feature importances, keyed by feature name
feature_importance_dt_1991 = get_tree_feature_importance(dt_model_1991, model_features_1991)
feature_importance_dt_2001 = get_tree_feature_importance(dt_model_2001, model_features_2001)

# XGBoost: same setup, one seeded classifier per dataset
xgb_model_1991 = XGBClassifier(random_state=42).fit(X_train_1991, y_train_1991)
xgb_model_2001 = XGBClassifier(random_state=42).fit(X_train_2001, y_train_2001)

# XGBoost feature importances, keyed by feature name
feature_importance_xgb_1991 = get_tree_feature_importance(xgb_model_1991, model_features_1991)
feature_importance_xgb_2001 = get_tree_feature_importance(xgb_model_2001, model_features_2001)

# Print every importance table, most important feature first, as percentages
importance_reports = (
    ("\nDecision Tree Feature Importance for 1991 Dataset:", feature_importance_dt_1991),
    ("\nDecision Tree Feature Importance for 2001 Dataset:", feature_importance_dt_2001),
    ("\nXGBoost Feature Importance for 1991 Dataset:", feature_importance_xgb_1991),
    ("\nXGBoost Feature Importance for 2001 Dataset:", feature_importance_xgb_2001),
)
for heading, scores in importance_reports:
    print(heading)
    ranked = sorted(scores.items(), key=lambda x: x[1], reverse=True)
    for feature, importance in ranked:
        print(f"{feature}: {importance*100:.2f}%")

# Persist every fitted model under models/ with joblib (Decision Trees first, then XGBoost)
model_dump_targets = (
    (dt_model_1991, 'models/DT_1991_model.joblib'),
    (dt_model_2001, 'models/DT_2001_model.joblib'),
    (xgb_model_1991, 'models/XGB_1991_model.joblib'),
    (xgb_model_2001, 'models/XGB_2001_model.joblib'),
)
for fitted_model, dump_path in model_dump_targets:
    joblib.dump(fitted_model, dump_path)

# Write one CSV per dataset with the Decision Tree and XGBoost importances side by side.
# A feature missing from either importance dict is recorded as 0.
importance_rows_1991 = num_cols_1991 + cat_cols_1991
feature_importance_dt_xgb_1991 = {
    'Feature': list(importance_rows_1991),
    'DT_Importance': [feature_importance_dt_1991.get(name, 0) for name in importance_rows_1991],
    'XGB_Importance': [feature_importance_xgb_1991.get(name, 0) for name in importance_rows_1991],
}

importance_rows_2001 = num_cols_2001 + cat_cols_2001
feature_importance_dt_xgb_2001 = {
    'Feature': list(importance_rows_2001),
    'DT_Importance': [feature_importance_dt_2001.get(name, 0) for name in importance_rows_2001],
    'XGB_Importance': [feature_importance_xgb_2001.get(name, 0) for name in importance_rows_2001],
}

df_feature_importance_1991 = pd.DataFrame(feature_importance_dt_xgb_1991)
df_feature_importance_1991.to_csv('feature_importance/feature_importance_1991.csv', index=False)

df_feature_importance_2001 = pd.DataFrame(feature_importance_dt_xgb_2001)
df_feature_importance_2001.to_csv('feature_importance/feature_importance_2001.csv', index=False)

# Decision Tree, 1991: predict on the test split and report accuracy
predictions_dt_1991 = dt_model_1991.predict(X_test_1991)
accuracy_dt_1991 = accuracy_score(y_true=y_test_1991, y_pred=predictions_dt_1991)
print(f"\nDecision Tree Accuracy for 1991 Dataset: {accuracy_dt_1991*100:.2f}%")

# Decision Tree, 2001
predictions_dt_2001 = dt_model_2001.predict(X_test_2001)
accuracy_dt_2001 = accuracy_score(y_true=y_test_2001, y_pred=predictions_dt_2001)
print(f"Decision Tree Accuracy for 2001 Dataset: {accuracy_dt_2001*100:.2f}%")

# XGBoost, 1991
predictions_xgb_1991 = xgb_model_1991.predict(X_test_1991)
accuracy_xgb_1991 = accuracy_score(y_true=y_test_1991, y_pred=predictions_xgb_1991)
print(f"\nXGBoost Accuracy for 1991 Dataset: {accuracy_xgb_1991*100:.2f}%")

# XGBoost, 2001
predictions_xgb_2001 = xgb_model_2001.predict(X_test_2001)
accuracy_xgb_2001 = accuracy_score(y_true=y_test_2001, y_pred=predictions_xgb_2001)
print(f"XGBoost Accuracy for 2001 Dataset: {accuracy_xgb_2001*100:.2f}%")

# One row per model, one accuracy column per dataset.
# Values are stored as fractions; only the printed lines are percentages.
accuracy_columns = {
    'Model': ['Decision Tree', 'XGBoost'],
    '1991 Accuracy': [accuracy_dt_1991, accuracy_xgb_1991],
    '2001 Accuracy': [accuracy_dt_2001, accuracy_xgb_2001],
}
df_accuracies = pd.DataFrame(accuracy_columns)

df_accuracies.to_csv('accuracies/accuracies.csv', index=False)


Decision Tree Feature Importance for 1991 Dataset:
DepDelay: 24.10%
ActualElapsedTime: 16.75%
Distance: 15.85%
DepTime: 8.50%
Origin: 7.65%
Dest: 6.50%
FlightNum: 6.27%
Month: 5.81%
UniqueCarrier: 3.83%
DayofMonth: 3.02%
DayOfWeek: 1.72%
Cancelled: 0.00%
Diverted: 0.00%

Decision Tree Feature Importance for 2001 Dataset:
DepDelay: 35.04%
TaxiOut: 11.09%
Distance: 9.61%
ActualElapsedTime: 8.39%
DepTime: 5.59%
Origin: 4.88%
FlightNum: 4.69%
TaxiIn: 3.75%
Dest: 3.69%
AirTime: 3.44%
Month: 3.31%
DayofMonth: 2.86%
UniqueCarrier: 2.05%
DayOfWeek: 1.62%
Cancelled: 0.00%
Diverted: 0.00%

XGBoost Feature Importance for 1991 Dataset:
DepDelay: 54.37%
Distance: 10.80%
ActualElapsedTime: 9.65%
Origin: 5.86%
DayOfWeek: 4.76%
UniqueCarrier: 4.55%
Dest: 3.66%
Month: 2.49%
DepTime: 1.78%
FlightNum: 1.61%
DayofMonth: 0.47%
Cancelled: 0.00%
Diverted: 0.00%

XGBoost Feature Importance for 2001 Dataset:
DepDelay: 47.77%
TaxiOut: 18.29%
TaxiIn: 9.58%
Distance: 3.95%
AirTime: 3.83%
ActualElapsedTime: 3.59%