In [1]:
import pandas as pd
import matplotlib.pyplot as plt

# Load the flights dataset from its CSV file into a DataFrame.
FLIGHTS_CSV = 'data/flights.csv'
flights = pd.read_csv(FLIGHTS_CSV)

In [2]:
# Preview the first rows of the loaded frame (head() defaults to five rows).
flights.head()

Unnamed: 0,Year,Month,DayofMonth,DayOfWeek,Carrier,OriginAirportID,OriginAirportName,OriginCity,OriginState,DestAirportID,DestAirportName,DestCity,DestState,CRSDepTime,DepDelay,DepDel15,CRSArrTime,ArrDelay,ArrDel15,Cancelled
0,2013,9,16,1,DL,15304,Tampa International,Tampa,FL,12478,John F. Kennedy International,New York,NY,1539,4,0.0,1824,13,0,0
1,2013,9,23,1,WN,14122,Pittsburgh International,Pittsburgh,PA,13232,Chicago Midway International,Chicago,IL,710,3,0.0,740,22,1,0
2,2013,9,7,6,AS,14747,Seattle/Tacoma International,Seattle,WA,11278,Ronald Reagan Washington National,Washington,DC,810,-3,0.0,1614,-7,0,0
3,2013,7,22,1,OO,13930,Chicago O'Hare International,Chicago,IL,11042,Cleveland-Hopkins International,Cleveland,OH,804,35,1.0,1027,33,1,0
4,2013,5,16,4,DL,13931,Norfolk International,Norfolk,VA,10397,Hartsfield-Jackson Atlanta International,Atlanta,GA,545,-1,0.0,728,-9,0,0


In [3]:
# Assemble one date per row from the Year / Month / DayofMonth columns.
# pd.to_datetime expects the parts to be labelled 'year', 'month' and 'day'.
date_parts = flights[['Year', 'Month', 'DayofMonth']].rename(
    columns={'Year': 'year', 'Month': 'month', 'DayofMonth': 'day'}
)
flight_dates = pd.to_datetime(date_parts)

# Store the date as MM-DD-YYYY text; the resulting column holds strings,
# not datetime values.
flights['DateTime'] = flight_dates.dt.strftime('%m-%d-%Y')

In [4]:
# Collect the column labels of flights as a plain Python list and show them.
columns = list(flights.columns)
print(columns)

['Year', 'Month', 'DayofMonth', 'DayOfWeek', 'Carrier', 'OriginAirportID', 'OriginAirportName', 'OriginCity', 'OriginState', 'DestAirportID', 'DestAirportName', 'DestCity', 'DestState', 'CRSDepTime', 'DepDelay', 'DepDel15', 'CRSArrTime', 'ArrDelay', 'ArrDel15', 'Cancelled', 'DateTime']


In [5]:
# Preview the first rows of flights again (head() defaults to five rows).
flights.head()

Unnamed: 0,Year,Month,DayofMonth,DayOfWeek,Carrier,OriginAirportID,OriginAirportName,OriginCity,OriginState,DestAirportID,...,DestCity,DestState,CRSDepTime,DepDelay,DepDel15,CRSArrTime,ArrDelay,ArrDel15,Cancelled,DateTime
0,2013,9,16,1,DL,15304,Tampa International,Tampa,FL,12478,...,New York,NY,1539,4,0.0,1824,13,0,0,09-16-2013
1,2013,9,23,1,WN,14122,Pittsburgh International,Pittsburgh,PA,13232,...,Chicago,IL,710,3,0.0,740,22,1,0,09-23-2013
2,2013,9,7,6,AS,14747,Seattle/Tacoma International,Seattle,WA,11278,...,Washington,DC,810,-3,0.0,1614,-7,0,0,09-07-2013
3,2013,7,22,1,OO,13930,Chicago O'Hare International,Chicago,IL,11042,...,Cleveland,OH,804,35,1.0,1027,33,1,0,07-22-2013
4,2013,5,16,4,DL,13931,Norfolk International,Norfolk,VA,10397,...,Atlanta,GA,545,-1,0.0,728,-9,0,0,05-16-2013


In [6]:
# Count null entries per column once, then reuse the result for the grand total.
missing_per_column = flights.isna().sum()
print("Missing values in each column:")
print(missing_per_column)
print("\nTotal missing values:", missing_per_column.sum())

# Count rows that exactly repeat an earlier row.
duplicate_count = flights.duplicated().sum()
print("\nDuplicate rows:", duplicate_count)

Missing values in each column:
Year                    0
Month                   0
DayofMonth              0
DayOfWeek               0
Carrier                 0
OriginAirportID         0
OriginAirportName       0
OriginCity              0
OriginState             0
DestAirportID           0
DestAirportName         0
DestCity                0
DestState               0
CRSDepTime              0
DepDelay                0
DepDel15             2761
CRSArrTime              0
ArrDelay                0
ArrDel15                0
Cancelled               0
DateTime                0
dtype: int64

Total missing values: 2761

Duplicate rows: 0


In [7]:
# Replace missing DepDel15 entries with 0, leaving the non-null values untouched.
# NOTE(review): this presumably treats an unknown flag as "not delayed" - confirm
# that matches the DepDelay values on the affected rows.
depdel15_missing = flights['DepDel15'].isna()
flights.loc[depdel15_missing, 'DepDel15'] = 0

In [8]:
# Recount null entries per column and overall.
null_counts = flights.isna().sum()
print("Missing values in each column:")
print(null_counts)
print("\nTotal missing values:", null_counts.sum())

Missing values in each column:
Year                 0
Month                0
DayofMonth           0
DayOfWeek            0
Carrier              0
OriginAirportID      0
OriginAirportName    0
OriginCity           0
OriginState          0
DestAirportID        0
DestAirportName      0
DestCity             0
DestState            0
CRSDepTime           0
DepDelay             0
DepDel15             0
CRSArrTime           0
ArrDelay             0
ArrDel15             0
Cancelled            0
DateTime             0
dtype: int64

Total missing values: 0


In [9]:
# Threshold: the 0.90 quantile of the departure delays.
percentile_90 = flights['DepDelay'].quantile(0.90)

# Keep only the flights whose departure delay is at or below the threshold.
# flights is rebound to the filtered frame, so running this cell again
# recomputes the threshold on the already-trimmed data.
within_threshold = flights['DepDelay'].le(percentile_90)
flights = flights.loc[within_threshold]

In [10]:
# Drop the two airport-name columns from flights.
# errors='ignore' keeps the cell re-runnable: once the columns are gone, a
# second execution is a no-op instead of raising KeyError.
flights = flights.drop(
    columns=['OriginAirportName', 'DestAirportName'],
    errors='ignore',
)

In [11]:
# Show the current dimensions of flights as (rows, columns).
flights.shape

(245192, 19)

In [12]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler

# Predictors and label for the arrival-delay classifier.
# NOTE(review): earlier notes here read "airport id" and "YEAR-MONTH-DAY",
# presumably candidate features for a later iteration - confirm.
# NOTE(review): DestAirportID looks like an identifier but is standardized below
# as if it were a continuous quantity - confirm that this is intended.
feature_columns = ['DayOfWeek', 'DestAirportID']
X = flights[feature_columns]
y = flights['ArrDel15']

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Learn the scaling statistics from the training rows only, then apply the
# same transformation to both partitions.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Fit the logistic-regression classifier on the standardized training rows.
model = LogisticRegression(random_state=42).fit(X_train_scaled, y_train)

# Report mean accuracy on each partition.
train_accuracy = model.score(X_train_scaled, y_train)
test_accuracy = model.score(X_test_scaled, y_test)
print(f"Training accuracy: {train_accuracy:.5f}")
print(f"Testing accuracy: {test_accuracy:.5f}")

Training accuracy: 0.86835
Testing accuracy: 0.87049


In [13]:
import numpy as np

# Pick one held-out flight at random; the fixed seed makes the choice repeatable.
np.random.seed(30)  # for reproducibility
random_idx = np.random.choice(X_test.index)

# Select the row as a one-row DataFrame (note the double brackets) so the
# column names seen when the scaler was fitted are still present when it
# transforms the sample; a bare NumPy array would lose them.
sample = X_test.loc[[random_idx]]
sample_scaled = scaler.transform(sample)

# Class probabilities for the single sample: one row, one column per class.
prediction = model.predict_proba(sample_scaled)
print(f"Prediction probability for class 0 and 1: {prediction}")

# Look up the full record for the same index label in flights, which still
# carries the descriptive columns that were not used as features.
flight_details = flights.loc[random_idx]

print("Flight Details:")
print(f"Date: {flight_details['DateTime']}")
print(f"Origin City: {flight_details['OriginCity']}, {flight_details['OriginState']}")
print(f"Destination City: {flight_details['DestCity']}, {flight_details['DestState']}")
# The second probability column is compared against 0.5 to call a delay.
print(f"Predicted Arrival Delay > 15 min: {prediction[0][1] > 0.5}")
print(f"Actual Arrival Delay > 15 min: {bool(flight_details['ArrDel15'])}")

Prediction probability for class 0 and 1: [[0.86849606 0.13150394]]
[[0.86849606 0.13150394]]
Flight Details:
Date: 07-11-2013
Origin City: Orlando, FL
Destination City: Washington, DC
Predicted Arrival Delay > 15 min: False
Actual Arrival Delay > 15 min: False




In [14]:
import pickle

# Persist both fitted objects with pickle. The scaler is stored next to the
# model because it is needed again for future predictions.
artifacts = {
    'flight_delay_model.pkl': model,
    'scaler.pkl': scaler,
}
for filename, fitted_object in artifacts.items():
    with open(filename, 'wb') as file:
        pickle.dump(fitted_object, file)