In [16]:
import pandas as pd

# Read the CSV file with a specified encoding
df = pd.read_csv("cleaned_flight_data.csv", encoding="ISO-8859-1")

# Display the first 5 rows
df.head()

  df = pd.read_csv("cleaned_flight_data.csv", encoding="ISO-8859-1")


Unnamed: 0,Year,Quarter,Month,DayofMonth,DayOfWeek,FlightDate,Reporting_Airline,DOT_ID_Reporting_Airline,IATA_CODE_Reporting_Airline,Tail_Number,...,Div1WheelsOff,Div1TailNum,Div2Airport,Div2AirportID,Div2AirportSeqID,Div2WheelsOn,Div2TotalGTime,Div2LongestGTime,Div2WheelsOff,Div2TailNum
0,1998,1,1,2,5,1998-01-02,NW,19386,NW,N297US,...,,,,,,,,,,
1,2009,2,5,28,4,2009-05-28,FL,20437,FL,N946AT,...,,,,,,,,,,
2,2013,2,6,29,6,2013-06-29,MQ,20398,MQ,N665MQ,...,,,,,,,,,,
3,2010,3,8,31,2,2010-08-31,DL,19790,DL,N6705Y,...,,,,,,,,,,
4,2006,1,1,15,7,2006-01-15,US,20355,US,N504AU,...,,,,,,,,,,


In [18]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
from sklearn.preprocessing import LabelEncoder, StandardScaler

In [19]:
# --- Preprocessing ---
# Fill missing values for numerical columns
numerical_cols = df.select_dtypes(include=['float64', 'int64']).columns
df[numerical_cols] = df[numerical_cols].fillna(df[numerical_cols].median())

In [20]:
# Create binary target variable 'DelayCategory': 1 if there was a delay, 0 if there wasn't
df['DelayCategory'] = df.apply(lambda row: 1 if row['DepDelay'] > 0 or row['ArrDelay'] > 0 else 0, axis=1)

In [21]:
# Encode 'DelayCategory'
le = LabelEncoder()
df['DelayCategory'] = le.fit_transform(df['DelayCategory'])

In [22]:
# Feature engineering (time-related features)
df['DepTime_combined'] = df['DepTime'] // 100 * 60 + df['DepTime'] % 100
df['ArrTime_combined'] = df['ArrTime'] // 100 * 60 + df['ArrTime'] % 100

# Time of day: 7 AM to 1 PM window for both departure and arrival
df['DayTimeFlight'] = df['DepTime_combined'].apply(lambda x: 1 if 420 <= x <= 780 else 0)  # 7 AM to 1 PM
df['DayArrivalFlight'] = df['ArrTime_combined'].apply(lambda x: 1 if 420 <= x <= 780 else 0)  # 7 AM to 1 PM

# Seasonal and time-of-day features
# Updated months for seasonal delays: December, February, April, July, August
df['HighDelaySeason'] = df['Month'].apply(lambda x: 1 if x in [2, 4, 7, 8, 12] else 0)

# Create binary features based on TaxiIn and TaxiOut times
df['TaxiIn_Long'] = df['TaxiIn'].apply(lambda x: 1 if x > 10 else 0)
df['TaxiOut_Long'] = df['TaxiOut'].apply(lambda x: 1 if x > 10 else 0)

# Add the weekend feature (IsWeekend)
df['IsWeekend'] = df['DayOfWeek'].apply(lambda x: 1 if x in [5, 6] else 0)  # 5 for Saturday, 6 for Sunday

# Add the airline delay flag based on specific airlines (e.g., 'PI', 'AAPS')
df['HighDelayAirline'] = df['IATA_CODE_Reporting_Airline'].apply(lambda x: 1 if x in ['PI', 'AA', 'PS'] else 0)

# Define the top states
high_delay_origin_states = ['CA', 'TX', 'IL', 'FL', 'GA', 'NY', 'CO', 'NC', 'PA', 'AZ']
high_delay_dest_states = ['CA', 'TX', 'FL', 'IL', 'GA', 'NY', 'NC', 'CO', 'PA', 'AZ']

# Create the features
df['HighDelayOriginState'] = df['OriginState'].apply(lambda x: 1 if x in high_delay_origin_states else 0)
df['HighDelayDestState'] = df['DestState'].apply(lambda x: 1 if x in high_delay_dest_states else 0)

# Features for training
features = ['Year', 'Quarter', 'Month', 'DayofMonth', 'DayOfWeek', 'DepTime_combined', 'ArrTime_combined',
            'CRSElapsedTime', 'ActualElapsedTime', 'AirTime', 'Flights', 'Distance', 
            'HighDelaySeason', 'DayTimeFlight', 'DayArrivalFlight', 'TaxiOut_Long', 'TaxiIn_Long', 'IsWeekend', 
            'HighDelayAirline', 'HighDelayOriginState', 'HighDelayDestState']

# Features matrix and target vector
X = df[features]
y = df['DelayCategory']

In [23]:
# Label encode any object/categorical columns (just in case)
label_encoder = LabelEncoder()
for col in X.select_dtypes(include=['object']).columns:
    X[col] = label_encoder.fit_transform(X[col].astype(str))

# Fill any remaining missing values
X = X.fillna(X.median())

# --- Standardization (important for Logistic Regression) ---
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

In [24]:
# --- Train-test split ---
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

In [21]:
# Calculate correlation matrix
correlation_matrix = df[['TaxiIn', 'TaxiOut', 'DepDelay', 'ArrDelay']].corr()

# Display the correlation matrix
print(correlation_matrix)

            TaxiIn   TaxiOut  DepDelay  ArrDelay
TaxiIn    1.000000  0.049393  0.012460  0.066857
TaxiOut   0.049393  1.000000  0.073118  0.257508
DepDelay  0.012460  0.073118  1.000000  0.898746
ArrDelay  0.066857  0.257508  0.898746  1.000000


In [20]:
# Assuming 'TaxiIn' and 'TaxiOut' are columns in the dataset
print(df[['TaxiIn', 'TaxiOut']].head())  # Preview the first 5 rows of TaxiIn and TaxiOut data

   TaxiIn  TaxiOut
0     3.0     24.0
1     8.0     10.0
2     6.0      9.0
3     7.0     23.0
4     8.0     19.0


In [22]:
# Set a threshold for TaxiIn or TaxiOut (e.g., greater than 10 minutes)
threshold = 10

# Filter for rows where TaxiIn or TaxiOut is greater than the threshold
high_taxi_in = df[df['TaxiIn'] > threshold]
high_taxi_out = df[df['TaxiOut'] > threshold]

# Check the delays for these cases
high_taxi_in_delays = high_taxi_in[['TaxiIn', 'DepDelay', 'ArrDelay']]
high_taxi_out_delays = high_taxi_out[['TaxiOut', 'DepDelay', 'ArrDelay']]

# Calculate average delays for these cases
avg_delay_taxi_in = high_taxi_in_delays.mean()
avg_delay_taxi_out = high_taxi_out_delays.mean()

# Display the results
print("Average delays for flights with TaxiIn > {} minutes:".format(threshold))
print(avg_delay_taxi_in)

print("Average delays for flights with TaxiOut > {} minutes:".format(threshold))
print(avg_delay_taxi_out)

Average delays for flights with TaxiIn > 10 minutes:
TaxiIn      16.580352
DepDelay     9.038974
ArrDelay    12.262727
dtype: float64
Average delays for flights with TaxiOut > 10 minutes:
TaxiOut     17.257763
DepDelay     8.584457
ArrDelay     7.474424
dtype: float64


In [2]:
#pip install xgboost scikit-learn pandas

^C
Note: you may need to restart the kernel to use updated packages.


Collecting xgboost
  Using cached xgboost-3.0.0-py3-none-win_amd64.whl.metadata (2.1 kB)
Downloading xgboost-3.0.0-py3-none-win_amd64.whl (150.0 MB)
   ---------------------------------------- 0.0/150.0 MB ? eta -:--:--
   ---------------------------------------- 0.3/150.0 MB ? eta -:--:--
   ---------------------------------------- 1.0/150.0 MB 3.6 MB/s eta 0:00:41
   ---------------------------------------- 1.6/150.0 MB 3.5 MB/s eta 0:00:43
    --------------------------------------- 2.1/150.0 MB 3.4 MB/s eta 0:00:44
    --------------------------------------- 2.6/150.0 MB 3.0 MB/s eta 0:00:50
    --------------------------------------- 2.9/150.0 MB 2.5 MB/s eta 0:00:59
    --------------------------------------- 2.9/150.0 MB 2.5 MB/s eta 0:00:59
    --------------------------------------- 3.1/150.0 MB 2.0 MB/s eta 0:01:14
    --------------------------------------- 3.1/150.0 MB 2.0 MB/s eta 0:01:14
    --------------------------------------- 3.1/150.0 MB 2.0 MB/s eta 0:01:14
    ---

In [26]:
from xgboost import XGBClassifier
from sklearn.metrics import confusion_matrix

In [27]:
# XGBoost model
xgb_model = XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42)
xgb_model.fit(X_train, y_train)

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


In [28]:
# Predict and evaluate
y_pred_xgb = xgb_model.predict(X_test)
print("\n--- XGBoost Classification Report ---")
print(classification_report(y_test, y_pred_xgb))
print("XGBoost Accuracy Score:", accuracy_score(y_test, y_pred_xgb))                                                                     
print("XGBoost Confusion Matrix:\n", confusion_matrix(y_test, y_pred_xgb))


--- XGBoost Classification Report ---
              precision    recall  f1-score   support

           0       0.69      0.78      0.73    187866
           1       0.78      0.69      0.73    212134

    accuracy                           0.73    400000
   macro avg       0.73      0.73      0.73    400000
weighted avg       0.73      0.73      0.73    400000

XGBoost Accuracy Score: 0.7294275
XGBoost Confusion Matrix:
 [[145627  42239]
 [ 65990 146144]]
