In [1]:
import pandas as pd

# Read the CSV file with a specified encoding
df = pd.read_csv("cleaned_flight_data.csv", encoding="ISO-8859-1")

# Display the first 5 rows
df.head()

  df = pd.read_csv("cleaned_flight_data.csv", encoding="ISO-8859-1")


Unnamed: 0,Year,Quarter,Month,DayofMonth,DayOfWeek,FlightDate,Reporting_Airline,DOT_ID_Reporting_Airline,IATA_CODE_Reporting_Airline,Tail_Number,...,Div1WheelsOff,Div1TailNum,Div2Airport,Div2AirportID,Div2AirportSeqID,Div2WheelsOn,Div2TotalGTime,Div2LongestGTime,Div2WheelsOff,Div2TailNum
0,1998,1,1,2,5,1998-01-02,NW,19386,NW,N297US,...,,,,,,,,,,
1,2009,2,5,28,4,2009-05-28,FL,20437,FL,N946AT,...,,,,,,,,,,
2,2013,2,6,29,6,2013-06-29,MQ,20398,MQ,N665MQ,...,,,,,,,,,,
3,2010,3,8,31,2,2010-08-31,DL,19790,DL,N6705Y,...,,,,,,,,,,
4,2006,1,1,15,7,2006-01-15,US,20355,US,N504AU,...,,,,,,,,,,


In [2]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
from sklearn.preprocessing import LabelEncoder, StandardScaler

In [3]:
# --- Preprocessing ---
# Fill missing values for numerical columns
numerical_cols = df.select_dtypes(include=['float64', 'int64']).columns
df[numerical_cols] = df[numerical_cols].fillna(df[numerical_cols].median())

In [4]:
# Create binary target variable 'DelayCategory': 1 if there was a delay, 0 if there wasn't
df['DelayCategory'] = df.apply(lambda row: 1 if row['DepDelay'] > 0 or row['ArrDelay'] > 0 else 0, axis=1)

In [5]:
# Encode 'DelayCategory'
le = LabelEncoder()
df['DelayCategory'] = le.fit_transform(df['DelayCategory'])

In [6]:
# Feature engineering (time-related features)
df['DepTime_combined'] = df['DepTime'] // 100 * 60 + df['DepTime'] % 100
df['ArrTime_combined'] = df['ArrTime'] // 100 * 60 + df['ArrTime'] % 100

# Seasonal and time-of-day features
df['HighDelaySeason'] = df['Month'].apply(lambda x: 1 if x in [6, 7, 12] else 0)
df['LateNightFlight'] = df['DepTime_combined'].apply(lambda x: 1 if 180 <= x % 1440 <= 240 else 0)
df['LateArrivalFlight'] = df['ArrTime_combined'].apply(lambda x: 1 if 180 <= x % 1440 <= 240 else 0)

# Delay-related features
df['CarrierDelay'] = df['DepDelay'].apply(lambda x: 1 if x > 0 else 0)
df['WeatherDelay'] = df['ArrDelay'].apply(lambda x: 1 if x > 0 else 0)
df['ArrDelayFlag'] = df['ArrDelay'].apply(lambda x: 1 if x > 0 else 0)

df['WeatherDelayHigh'] = df['WeatherDelay'].apply(lambda x: 1 if x > 1250 else 0)
df['CarrierDelayMid'] = df['CarrierDelay'].apply(lambda x: 1 if 0 < x <= 250 else 0)

In [7]:
# Feature selection
features = ['Year', 'Quarter', 'Month', 'DayofMonth', 'DayOfWeek', 'DepTime_combined', 'ArrTime_combined', 
            'CRSElapsedTime', 'ActualElapsedTime', 'AirTime', 'Flights', 'Distance', 
            'HighDelaySeason', 'LateNightFlight', 'LateArrivalFlight', 'WeatherDelayHigh', 'CarrierDelayMid']

In [8]:
# Prepare feature matrix and target vector
X = df[features]
y = df['DelayCategory']

In [9]:
# Label encode any object/categorical columns (just in case)
label_encoder = LabelEncoder()
for col in X.select_dtypes(include=['object']).columns:
    X[col] = label_encoder.fit_transform(X[col].astype(str))

# Fill any remaining missing values
X = X.fillna(X.median())

# --- Standardization (important for Logistic Regression) ---
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

In [10]:
# --- Train-test split ---
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

In [11]:
# --- Logistic Regression Model ---
logreg = LogisticRegression(max_iter=1000, class_weight='balanced', random_state=42)
logreg.fit(X_train, y_train)

# --- Predictions and Evaluation ---
y_pred = logreg.predict(X_test)

print("Classification Report:")
print(classification_report(y_test, y_pred))
print ("Confusion Matrix: ")
print(confusion_matrix(y_test, y_pred))

print("Accuracy Score:", accuracy_score(y_test, y_pred))

Classification Report:
              precision    recall  f1-score   support

           0       0.90      0.97      0.94    187866
           1       0.98      0.90      0.94    212134

    accuracy                           0.94    400000
   macro avg       0.94      0.94      0.94    400000
weighted avg       0.94      0.94      0.94    400000

Confusion Matrix: 
[[183032   4834]
 [ 20468 191666]]
Accuracy Score: 0.936745
