In [1]:
import joblib
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [2]:
# Read the raw flight records from the CSV file into a DataFrame
flights_csv = 'data/flights.csv'
data = pd.read_csv(flights_csv)

# Preview the top five rows to sanity-check the columns that were loaded
data.head(5)

Unnamed: 0,Year,Month,DayofMonth,DayOfWeek,Carrier,OriginAirportID,OriginAirportName,OriginCity,OriginState,DestAirportID,DestAirportName,DestCity,DestState,CRSDepTime,DepDelay,DepDel15,CRSArrTime,ArrDelay,ArrDel15,Cancelled
0,2013,9,16,1,DL,15304,Tampa International,Tampa,FL,12478,John F. Kennedy International,New York,NY,1539,4,0.0,1824,13,0,0
1,2013,9,23,1,WN,14122,Pittsburgh International,Pittsburgh,PA,13232,Chicago Midway International,Chicago,IL,710,3,0.0,740,22,1,0
2,2013,9,7,6,AS,14747,Seattle/Tacoma International,Seattle,WA,11278,Ronald Reagan Washington National,Washington,DC,810,-3,0.0,1614,-7,0,0
3,2013,7,22,1,OO,13930,Chicago O'Hare International,Chicago,IL,11042,Cleveland-Hopkins International,Cleveland,OH,804,35,1.0,1027,33,1,0
4,2013,5,16,4,DL,13931,Norfolk International,Norfolk,VA,10397,Hartsfield-Jackson Atlanta International,Atlanta,GA,545,-1,0.0,728,-9,0,0


In [None]:
# Clean the data: replace every missing (NaN/None) entry with 0.
# The fill is applied to the whole frame, so text columns are affected as well as numeric ones.
data = data.fillna(value=0)


In [8]:
# Name the label column and the predictor columns used for modelling
target = 'DepDel15'
features = ['DayOfWeek', 'OriginAirportID', 'DestAirportID']

# Discard any row that is missing the label or one of the predictors.
# If missing values were already replaced by an earlier fillna step, nothing is dropped here.
data = data.dropna(subset=[target, *features])

# Split the frame into the feature matrix (X) and the label vector (y)
X = data.loc[:, features]
y = data.loc[:, target]

In [9]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Build a 100-tree random forest, seeded for reproducibility
model = RandomForestClassifier(random_state=42, n_estimators=100)

# Fit the forest on the training portion
model.fit(X_train, y_train)

In [10]:
# Predict labels for the held-out rows
y_pred = model.predict(X_test)

# Report the overall accuracy, followed by the per-class classification report
accuracy = accuracy_score(y_test, y_pred)
print('Accuracy:', accuracy)
print(classification_report(y_test, y_pred))

Accuracy: 0.7876885355524185
              precision    recall  f1-score   support

         0.0       0.80      0.98      0.88     42946
         1.0       0.30      0.04      0.06     10890

    accuracy                           0.79     53836
   macro avg       0.55      0.51      0.47     53836
weighted avg       0.70      0.79      0.72     53836



In [11]:
# Save the trained model to a file for use in an external application.
# `joblib` is imported in the notebook's import cell at the top rather than here,
# so every dependency is visible in one place.
# The path is defined once so the file written and the message printed cannot drift apart.
MODEL_PATH = 'flight_delay_model.pkl'
joblib.dump(model, MODEL_PATH)
print(f"Model saved as {MODEL_PATH}")


Model saved as flight_delay_model.pkl


In [14]:
# Build a lookup table of every airport (ID + name) that appears in the flight data,
# whether as an origin or as a destination.

# Unique (ID, name) pairs for each role, relabelled to the shared
# airport_id / airport_name column names so the two frames can be stacked.
origin_airports = (
    data[['OriginAirportID', 'OriginAirportName']]
    .drop_duplicates()
    .rename(columns={'OriginAirportID': 'airport_id', 'OriginAirportName': 'airport_name'})
)
dest_airports = (
    data[['DestAirportID', 'DestAirportName']]
    .drop_duplicates()
    .rename(columns={'DestAirportID': 'airport_id', 'DestAirportName': 'airport_name'})
)

# Stack the two frames with pd.concat rather than converting to NumPy arrays first:
# a mixed int/str array is stored as `object`, which would make airport_id an
# object column. Concatenating DataFrames keeps each column's dtype.
# ignore_index=True renumbers the stacked rows 0..n-1.
airports = pd.concat([origin_airports, dest_airports], ignore_index=True)

# An airport that is both an origin and a destination now appears twice; keep the first copy
airports_df = airports.drop_duplicates()

airports_df.head()

Unnamed: 0,airport_id,airport_name
0,15304,Tampa International
1,14122,Pittsburgh International
2,14747,Seattle/Tacoma International
3,13930,Chicago O'Hare International
4,13931,Norfolk International


In [15]:
# Write the airport lookup table to a CSV file; index=False leaves the row index out of the file
airports_df.to_csv(path_or_buf='data/airports.csv', index=False)