# Import Required Libraries
Import the necessary libraries, including pandas, NumPy, and scikit-learn.

In [13]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
import joblib


# Load and Preprocess Data
Load the flight data from a CSV file, handle missing values, and perform initial data cleaning.

In [14]:
# Load the flight data from a CSV file
flight_data = pd.read_csv('data/flights.csv')

# Print the renamed columns to verify
print("Renamed columns in the dataset:")
print(flight_data.columns)

# Identify null values
print("\nNull values in the DataFrame:")
print(flight_data.isnull().sum())

# Replace null values with zero
# flight_data.fillna(0, inplace=True)
flight_data['DepDel15'].fillna(0, inplace=True)
flight_data['ArrDel15'].fillna(0, inplace=True)

# Create a binary target variable 'DELAYED' using  DepDelay15 or ArrDelay15
flight_data['DELAYED'] = np.where((flight_data['ArrDel15'] == 1), 1, 0)


Renamed columns in the dataset:
Index(['Year', 'Month', 'DayofMonth', 'DayOfWeek', 'Carrier',
       'OriginAirportID', 'OriginAirportName', 'OriginCity', 'OriginState',
       'DestAirportID', 'DestAirportName', 'DestCity', 'DestState',
       'CRSDepTime', 'DepDelay', 'DepDel15', 'CRSArrTime', 'ArrDelay',
       'ArrDel15', 'Cancelled'],
      dtype='object')

Null values in the DataFrame:
Year                    0
Month                   0
DayofMonth              0
DayOfWeek               0
Carrier                 0
OriginAirportID         0
OriginAirportName       0
OriginCity              0
OriginState             0
DestAirportID           0
DestAirportName         0
DestCity                0
DestState               0
CRSDepTime              0
DepDelay                0
DepDel15             2761
CRSArrTime              0
ArrDelay                0
ArrDel15                0
Cancelled               0
dtype: int64


# Feature Engineering
Create new features that may help in predicting flight delays, such as day of the week, month, and airport pair.

In [15]:
x = flight_data[['DayOfWeek', 'DestAirportID']]
y = flight_data['DELAYED']

# Split Data into Training and Testing Sets
Split the dataset into training and testing sets to evaluate the model's performance.

In [16]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(x, y)

# Display the shapes of the training and testing sets
print("\nShapes of the training and testing sets:")
print(f"X_train: {X_train.shape}, X_test: {X_test.shape}")
print(f"y_train: {y_train.shape}, y_test: {y_test.shape}")


Shapes of the training and testing sets:
X_train: (203955, 2), X_test: (67985, 2)
y_train: (203955,), y_test: (67985,)


# Train the Model
Use a machine learning algorithm, such as logistic regression or random forest, to train the model on the training data.

In [17]:
# Train the Model
# Use a machine learning algorithm, such as logistic regression or random forest, to train the model on the training data.

# Create the model with valid feature names
# model = RandomForestClassifier(n_estimators=100, random_state=42)
model = LogisticRegression(solver='lbfgs')

# Train the model on the training data
model.fit(X_train, y_train)

# Calculate the accuracy of the model
accuracy = model.score(X_test, y_test)
# report = classification_report(y_test, y_pred)

# Display the accuracy and classification report
print(f"Model accuracy: {accuracy * 100:.2f}%")
# print("Classification Report:")
# print(report)

Model accuracy: 78.41%


Write the prediction model to file.

In [18]:
# Save the model to a file
#pickle.dump(model, open('data/model.pkl', 'wb'))
joblib.dump(model, 'server/flight_delay_model.pkl')
print("Model saved to 'server/flight_delay_model.pkl'")

Model saved to 'server/flight_delay_model.pkl'


In [19]:
# Load the flight data from the CSV file
flight_data = pd.read_csv('data/flights.csv')

# Extract unique airport IDs and names for origin and destination airports
origin_airports = flight_data[['OriginAirportID', 'OriginAirportName']].drop_duplicates()
dest_airports = flight_data[['DestAirportID', 'DestAirportName']].drop_duplicates()

# Rename columns for consistency
origin_airports.columns = ['AirportID', 'AirportName']
dest_airports.columns = ['AirportID', 'AirportName']

#combine origin and destination airports into one list with AirportID and AirportName only and remove duplicates
unique_airports = pd.concat([origin_airports, dest_airports]).drop_duplicates()

# Save the unique airport IDs and names to a new CSV file
unique_airports.to_csv('data/airports.csv', index=False)

print("Airports data has been saved to 'data/airports.csv'")

Airports data has been saved to 'data/airports.csv'
