In [35]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.preprocessing import MinMaxScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from xgboost import XGBClassifier
import numpy as np

In [9]:
# Load both datasets to inspect their structures and identify opportunities for feature engineering
flight_test_df = pd.read_csv('../data/final_test_processed_df.csv')
flight_history_df = pd.read_csv('../data/final_processed_history_df.csv')

# Displaying the first few rows of each dataframe to understand their structure
flight_history_df.shape

(161057, 21)

In [336]:
# Displaying the first few rows of each dataframe to understand their structure
features_to_drop=['uniqueid','tailnum','time_hour']

flight_test_df = flight_test_df.drop(features_to_drop,axis=1)

flight_test_df.head()

print(f"shape of dataframe {flight_test_df.shape}")

shape of dataframe (5786, 12)


In [337]:
flight_test_df.head()

Unnamed: 0,year,month,day,sched_dep_time,sched_arr_time,carrier,flight,origin,dest,hour,minute,lateflight
0,2013,7,14,1440,1749,DL,1902,LGA,PBI,14,40,0
1,2013,7,7,945,1305,AA,1871,LGA,MIA,9,45,0
2,2013,7,5,1200,1435,DL,1947,LGA,ATL,12,0,0
3,2013,7,22,650,808,EV,5811,EWR,BUF,6,50,1
4,2013,7,2,1630,1930,AA,881,JFK,DFW,16,30,1


### Feature enginnering

#### Feature - 1 Carrier_dep_delay_rate - Percentage

In [10]:
# Calculate the number of delayed flights for each carrier
num_delayed_flights = flight_history_df[flight_history_df['dep_delay'] > 0].groupby('carrier').size()

# Calculate the total number of flights for each carrier
total_flights = flight_history_df.groupby('carrier').size()

# Calculate the delay rate as a percentage
carrier_dep_delay_rate_percentage = (num_delayed_flights / total_flights * 100).round(2).to_dict()


flight_test_df['carrier_dep_delay_rate'] = flight_test_df['carrier'].map(carrier_dep_delay_rate_percentage)

In [11]:
num_delayed_flights = flight_history_df[flight_history_df['arr_delay'] > 0].groupby('carrier').size()

total_flights = flight_history_df.groupby('carrier').size()

# Calculate the delay rate as a percentage
carrier_arr_delay_rate_percentage = (num_delayed_flights / total_flights * 100).round(2).to_dict()


flight_test_df['carrier_arr_delay_rate'] = flight_test_df['carrier'].map(carrier_arr_delay_rate_percentage)

In [12]:
def time_of_day(hour):
    if 5 <= hour <= 10:
        return 'Morning'
    elif 11 <= hour <= 16:
        return 'Afternoon'
    elif 17 <= hour <= 21:
        return 'Evening'
    else:
        return 'Night'
flight_test_df['time_of_day'] = flight_test_df['hour'].apply(time_of_day)
flight_history_df['time_of_day'] = flight_history_df['hour'].apply(time_of_day)

In [13]:
flight_test_df['date'] = pd.to_datetime(flight_test_df[['year', 'month', 'day']])
flight_test_df['is_weekend'] = flight_test_df['date'].dt.dayofweek.apply(lambda x: 1 if x >= 5 else 0)

In [14]:
def map_org_dest_rate(delay_rate_dict,row):
    return delay_rate_dict.get((row['origin'], row['dest']), None)
orig_dest_distance = flight_history_df.groupby(['origin','dest'])['distance'].size().to_dict()
flight_test_df['distance'] = flight_history_df.apply(lambda x :map_org_dest_rate(orig_dest_distance,x), axis=1)


#### Carrier based on-time arrival ratio

In [15]:
on_time_arrival_ratio = flight_history_df[flight_history_df['arr_delay'] <= 0].groupby('carrier').size() / flight_history_df.groupby('carrier').size()
on_time_arrival_ratio = on_time_arrival_ratio.to_dict()

# Map this ratio to the test dataset
flight_test_df['on_time_arrival_ratio'] = flight_test_df['carrier'].map(on_time_arrival_ratio)

#### Average departure delay at Origin

In [16]:
origin_dep_delay = flight_history_df.groupby('origin')['dep_delay'].mean().to_dict()
flight_test_df['origin_dep_delay_rate'] = flight_test_df['origin'].map(origin_dep_delay)

#### Average arrival delay at Destination

In [17]:
dest_arr_delay = flight_history_df.groupby('dest')['arr_delay'].mean().to_dict()
flight_test_df['origin_arr_delay_rate'] = flight_test_df['dest'].map(dest_arr_delay)

In [18]:
june_flights = flight_history_df[flight_history_df['month'] == 6]
june_delayed_flights = june_flights[june_flights['arr_delay'] > 15].groupby('flight')['arr_delay'].count().reset_index(name='june_delay_count')


In [19]:
flight_test_df = pd.merge(flight_test_df, june_delayed_flights, on='flight', how='left')


In [20]:
from sklearn.preprocessing import LabelEncoder
encoder = LabelEncoder()

flight_test_df['time_of_day_encoded'] = encoder.fit_transform(flight_test_df['time_of_day'])

# Display the modified DataFrame



In [311]:
flight_test_df[['carrier_arr_delay_rate','lateflight']].corr()

Unnamed: 0,carrier_arr_delay_rate,lateflight
carrier_arr_delay_rate,1.0,0.091708
lateflight,0.091708,1.0


In [21]:
flight_history_df['total_delay'] = flight_history_df['arr_delay'] + flight_history_df['dep_delay']

In [22]:
def map_org_dest_rate(delay_rate_dict,row):
    return delay_rate_dict.get((row['origin'], row['dest'],row['carrier']),None )
total_delays_by_pair = flight_history_df.groupby(['origin','dest','carrier'])['total_delay'].mean().to_dict()




In [23]:
flight_history_df.columns

Index(['year', 'month', 'day', 'dep_time', 'sched_dep_time', 'dep_delay',
       'arr_time', 'sched_arr_time', 'arr_delay', 'carrier', 'flight',
       'tailnum', 'origin', 'dest', 'air_time', 'distance', 'hour', 'minute',
       'time_hour', 'is_ArrDelay', 'is_DepDelay', 'time_of_day',
       'total_delay'],
      dtype='object')

In [24]:
flight_test_df['total_delay_rate'] = flight_history_df.apply(lambda x :map_org_dest_rate(total_delays_by_pair,x), axis=1)


In [25]:
flight_test_df.head()

Unnamed: 0,uniqueid,year,month,day,sched_dep_time,sched_arr_time,carrier,flight,tailnum,origin,...,time_of_day,date,is_weekend,distance,on_time_arrival_ratio,origin_dep_delay_rate,origin_arr_delay_rate,june_delay_count,time_of_day_encoded,total_delay_rate
0,1,2013,7,14,1440,1749,DL,1902,N965DL,LGA,...,Afternoon,2013-07-14,1,1929,0.653248,10.793939,10.405358,9.0,0,17.657854
1,2,2013,7,7,945,1305,AA,1871,N3AJAA,LGA,...,Morning,2013-07-07,1,1464,0.65634,10.793939,3.842847,5.0,2,12.41735
2,3,2013,7,5,1200,1435,DL,1947,N608DA,LGA,...,Afternoon,2013-07-05,0,1625,0.653248,10.793939,10.631522,7.0,0,8.20864
3,4,2013,7,22,650,808,EV,5811,N16919,EWR,...,Morning,2013-07-22,0,307,0.465802,16.890126,10.472865,,2,12.439739
4,5,2013,7,2,1630,1930,AA,881,N3FCAA,JFK,...,Afternoon,2013-07-02,0,5081,0.65634,12.703958,2.747318,9.0,0,13.805462


In [26]:
flight_test_df[['total_delay_rate','lateflight']].corr()

Unnamed: 0,total_delay_rate,lateflight
total_delay_rate,1.0,-0.004073
lateflight,-0.004073,1.0


In [27]:
delay_rates = flight_history_df.groupby(['time_of_day', 'origin', 'dest']).agg(
    arr_delay_pct=('arr_delay', 'mean'),
    dep_delay_pct=('dep_delay', 'mean')
).reset_index()
# delay_rates['arr_delay_pct'] *= 100
# delay_rates['dep_delay_pct'] *= 100

In [28]:
delay_rates

Unnamed: 0,time_of_day,origin,dest,arr_delay_pct,dep_delay_pct
0,Afternoon,EWR,ALB,13.279070,23.015504
1,Afternoon,EWR,ATL,17.032753,18.509724
2,Afternoon,EWR,AUS,0.329341,10.143713
3,Afternoon,EWR,AVL,-5.000000,-2.250000
4,Afternoon,EWR,BDL,14.112782,23.744361
...,...,...,...,...,...
538,Night,JFK,SYR,17.761628,20.813953
539,Night,LGA,BTV,5.000000,10.000000
540,Night,LGA,MHT,40.416667,49.583333
541,Night,LGA,PWM,4.812500,17.031250


In [29]:
flight_test_df = pd.merge(
    flight_test_df,
    delay_rates,
    how='left',  # Use 'left' to keep all rows from other_df
    on=['time_of_day', 'origin', 'dest']
)

In [30]:
# round the values to 2 decimal places for the columns
flight_test_df[['on_time_arrival_ratio','origin_dep_delay_rate','origin_arr_delay_rate','june_delay_count','total_delay_rate','arr_delay_pct','dep_delay_pct']] = flight_test_df[['on_time_arrival_ratio','origin_dep_delay_rate','origin_arr_delay_rate','june_delay_count','total_delay_rate','arr_delay_pct','dep_delay_pct']].round(2)

In [31]:
flight_test_df.head()

Unnamed: 0,uniqueid,year,month,day,sched_dep_time,sched_arr_time,carrier,flight,tailnum,origin,...,is_weekend,distance,on_time_arrival_ratio,origin_dep_delay_rate,origin_arr_delay_rate,june_delay_count,time_of_day_encoded,total_delay_rate,arr_delay_pct,dep_delay_pct
0,1,2013,7,14,1440,1749,DL,1902,N965DL,LGA,...,1,1929,0.65,10.79,10.41,9.0,0,17.66,16.94,17.99
1,2,2013,7,7,945,1305,AA,1871,N3AJAA,LGA,...,1,1464,0.66,10.79,3.84,5.0,2,12.42,-3.34,0.4
2,3,2013,7,5,1200,1435,DL,1947,N608DA,LGA,...,0,1625,0.65,10.79,10.63,7.0,0,8.21,12.22,10.69
3,4,2013,7,22,650,808,EV,5811,N16919,EWR,...,0,307,0.47,16.89,10.47,,2,12.44,4.23,11.32
4,5,2013,7,2,1630,1930,AA,881,N3FCAA,JFK,...,0,5081,0.66,12.7,2.75,9.0,0,13.81,5.47,12.98


In [32]:
flight_test_df.fillna(0,inplace=True)

In [41]:
# features = ['carrier_dep_delay_rate','carrier_arr_delay_rate','time_of_day',
#             'is_weekend','on_time_arrival_ratio','carrier','dest', 'origin','distance','origin_dep_delay_rate',
#             'origin_arr_delay_rate','june_delay_count','total_delay_rate','arr_delay_pct','dep_delay_pct']
features = ['carrier_dep_delay_rate','carrier_arr_delay_rate','june_delay_count','time_of_day_encoded']
target = 'lateflight'

X = flight_test_df[features]
y = flight_test_df[target] # Temporary fill NA for demonstration


print(X.shape)
print(y.shape)

# Encoding categorical features and scaling numerical features
categorical_features = ['time_of_day']
# numerical_features = ['carrier_dep_delay_rate', 'carrier_arr_delay_rate',
#                       'on_time_arrival_ratio','june_delay_count','total_delay_rate','dep_delay_pct','dep_delay_pct']
numerical_features = ['carrier_dep_delay_rate','carrier_arr_delay_rate','june_delay_count']
# Updated preprocessor without LabelEncoder
preprocessor = ColumnTransformer(
    transformers=[
        ('num', MinMaxScaler(), numerical_features),
        # ('cat', OneHotEncoder(), categorical_features)
    ])
# Assuming preprocessor is defined elsewhere
models = {
    "Logistic": Pipeline([("preprocessor", preprocessor), ("classifier", LogisticRegression())]),
    "Random Forest": Pipeline([("preprocessor", preprocessor), ("classifier", RandomForestClassifier(random_state=42))]),
    "XGBoost": Pipeline([("preprocessor", preprocessor), ("classifier", XGBClassifier(random_state=42, n_estimators=300, max_depth=5, subsample=0.5, learning_rate=0.01))]),
    "Decision Tree": Pipeline([("preprocessor", preprocessor), ("classifier", DecisionTreeClassifier(random_state=42))])
}

# Dataframe to store results
results_df = pd.DataFrame(columns=["Model", "Accuracy", "Precision", "Recall", "F1"])

for name, pipeline in models.items():
    print(name)
    # Lists to store metrics for each iteration
    accuracies, precisions, recalls, f1_scores = [], [], [], []
    
    for i in range(10):  # Looping through different splits to test robustness
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, stratify=y, random_state=i)
        
        # Fit model
        pipeline.fit(X_train, y_train)
        
        # Make predictions
        y_pred = pipeline.predict(X_test)
        
        # Calculate metrics
        accuracies.append(accuracy_score(y_test, y_pred))
        print(accuracy_score(y_test, y_pred) , i)
        precisions.append(precision_score(y_test, y_pred))
        recalls.append(recall_score(y_test, y_pred))
        f1_scores.append(f1_score(y_test, y_pred))
    
    # Calculate average metrics
    max_accuracy = max(accuracies)
    max_precision = max(precisions)
    max_recall = max(recalls)
    max_f1 = max(f1_scores)
    
    # Store results in DataFrame
    temp_df = pd.DataFrame({
        "Model": [name], 
        "Accuracy": [max_accuracy], 
        "Precision": [max_precision], 
        "Recall": [max_recall], 
        "F1": [max_f1]
    })

    results_df = pd.concat([results_df, temp_df], ignore_index=True)
# Display results
print(results_df)


(5786, 4)
(5786,)
Logistic


AttributeError: 'LogisticRegression' object has no attribute 'best_estimators_'

In [39]:
results_df

Unnamed: 0,Model,Accuracy,Precision,Recall,F1
0,Logistic,0.580311,0.575419,0.381481,0.458797
1,Random Forest,0.607945,0.597285,0.522222,0.537678
2,XGBoost,0.613126,0.606481,0.503704,0.539095
3,Decision Tree,0.607945,0.602871,0.485185,0.526096


In [43]:
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import GradientBoostingClassifier

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from xgboost import XGBClassifier
# Splitting the dataset
classifiers = {
    'Random Forest': {
        'model': RandomForestClassifier(random_state=42),
        'params': {
            'classifier__n_estimators': [100, 200, 300],
            'classifier__max_depth': [5, 10, 15, None],
            'classifier__min_samples_split': [2, 5, 10],
            'classifier__min_samples_leaf': [1, 2, 4]
        }
    },
    'Logistic Regression': {
        'model': LogisticRegression(random_state=42),
        'params': {
            'classifier__C': [0.01, 0.1, 1, 10, 100],
            'classifier__penalty': ['l1', 'l2'],
            'classifier__solver': ['liblinear', 'saga']
        }
    },
    'XGBoost': {
        'model': XGBClassifier(random_state=42, use_label_encoder=False, eval_metric='logloss'),
        'params': {
            'classifier__n_estimators': [100, 200, 300],
            'classifier__learning_rate': [0.01, 0.1, 0.2],
            'classifier__max_depth': [3, 5, 8],
            'classifier__min_child_weight': [1, 3, 5]
        }
    },
    'Decision Tree': {
        'model': DecisionTreeClassifier(random_state=42),
        'params': {
            'classifier__max_depth': [None, 5, 10, 15],
            'classifier__min_samples_split': [2, 5, 10],
            'classifier__min_samples_leaf': [1, 2, 4]
        }
    }
}


# Iterate over classifiers, perform hyperparameter tuning, and evaluate each model
results = []
feature_importances = {}

for classifier_name, classifier_dict in classifiers.items():
    print(classifier_dict['model'])
    # Create a pipeline with the preprocessor and the classifier
    model_pipeline = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('classifier', classifier_dict['model'])
    ])
    
    # Set up the GridSearchCV
    grid_search = GridSearchCV(model_pipeline, classifier_dict['params'], cv=5, scoring='accuracy')
    
    # Fit the model
    grid_search.fit(X_train, y_train)
    
    # Make predictions
    y_pred = grid_search.predict(X_test)
    
    # Calculate metrics
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    
    # Store results
    results.append({
        'Model': classifier_name,
        'Best Params': grid_search.best_params_,
        'Accuracy': accuracy,
        'Precision': precision,
        'Recall': recall,
        'F1 Score': f1
    })


    # Display results for each model
for result in results:
    print(result)

    # Optionally, print or plot feature importances for models that support it


RandomForestClassifier(random_state=42)
LogisticRegression(random_state=42)
XGBClassifier(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=None, device=None, early_stopping_rounds=None,
              enable_categorical=False, eval_metric='logloss',
              feature_types=None, gamma=None, grow_policy=None,
              importance_type=None, interaction_constraints=None,
              learning_rate=None, max_bin=None, max_cat_threshold=None,
              max_cat_to_onehot=None, max_delta_step=None, max_depth=None,
              max_leaves=None, min_child_weight=None, missing=nan,
              monotone_constraints=None, multi_strategy=None, n_estimators=None,
              n_jobs=None, num_parallel_tree=None, random_state=42, ...)
DecisionTreeClassifier(random_state=42)
{'Model': 'Random Forest', 'Best Params': {'classifier__max_depth': 5, 'classifier__min_samples_leaf': 4, 'classifier__min_