In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
# Load the flights table from the CSV file under the relative data/ folder
flights_csv = 'data/flights.csv'
flights = pd.read_csv(flights_csv)

In [None]:
# Preview the first five rows of the freshly loaded table
flights.head(n=5)

In [3]:
# Assemble a calendar date from the separate Year / Month / DayofMonth columns.
# pd.to_datetime accepts a frame whose columns are named year / month / day.
date_parts = pd.DataFrame(
    {
        'year': flights['Year'],
        'month': flights['Month'],
        'day': flights['DayofMonth'],
    }
)
flight_dates = pd.to_datetime(date_parts)

# Store the date as a 'YYYY-MM-DD' string (not a datetime dtype); cells that
# need .dt accessors later have to parse the column again.
flights['DateTime'] = flight_dates.dt.strftime('%Y-%m-%d')

In [None]:
# Collect the column labels into a plain Python list and show them
columns = list(flights.columns)
print(columns)

In [None]:
# Preview the first five rows again, now that the DateTime column exists
flights.head(n=5)

In [6]:
from sklearn.preprocessing import LabelEncoder

# Turn the Carrier column into integer labels.
# The fitted encoder object is kept in `carrier_encoder` so the integer codes
# can be mapped back to the original carrier values later.
carrier_encoder = LabelEncoder()
carrier_codes = carrier_encoder.fit_transform(flights['Carrier'])
flights['Carrier_Encoded'] = carrier_codes


In [None]:
# Preview the first five rows, including the new Carrier_Encoded column
flights.head(n=5)

In [None]:
# Count nulls per column once, then report the per-column and overall totals
null_counts = flights.isnull().sum()
print("Missing values in each column:")
print(null_counts)
print("\nTotal missing values:", null_counts.sum())

# Count rows that are exact duplicates of an earlier row
duplicate_count = flights.duplicated().sum()
print("\nDuplicate rows:", duplicate_count)

In [9]:
# Replace missing DepDel15 entries with 0, leaving every other value untouched
missing_dep_flag = flights['DepDel15'].isna()
flights['DepDel15'] = flights['DepDel15'].mask(missing_dep_flag, 0)

In [None]:
# Re-run the null report to confirm the effect of the DepDel15 fill
remaining_nulls = flights.isna().sum()
print("Missing values in each column:")
print(remaining_nulls)
print("\nTotal missing values:", remaining_nulls.sum())

In [11]:
# Cut-off value: the 90th percentile of DepDelay
percentile_90 = flights['DepDelay'].quantile(0.90)

# Keep only the rows at or below the cut-off. Rows whose DepDelay is missing
# fail the comparison and are therefore dropped as well.
# NOTE: `flights` is rebound here, so running this cell a second time trims
# the already-trimmed table again.
within_cutoff = flights['DepDelay'].le(percentile_90)
flights = flights[within_cutoff]

In [12]:
# The airport name columns are not used as model inputs, so discard them
flights = flights.drop(columns=['OriginAirportName', 'DestAirportName'])

In [None]:
# (rows, columns) remaining after the filtering and column drops above
(len(flights.index), len(flights.columns))

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler

# Columns used as model inputs. The carrier code and the airport IDs are fed
# to the model as plain numbers.
# Author's notes on feature ideas: airport id, YEAR-MONTH-DAY
feature_columns = [
    'Carrier_Encoded',
    'DayofMonth',
    'DayOfWeek',
    'Month',
    'DestAirportID',
    'OriginAirportID',
]
X = flights[feature_columns]

# Target column: ArrDel15
y = flights['ArrDel15']

# Hold out 20% of the rows for testing; the fixed seed keeps the split stable
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit the scaler on the training rows only, then apply it to both splits
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Fit the classifier on the scaled training data
model = LogisticRegression(random_state=42).fit(X_train_scaled, y_train)

# Report accuracy on both splits
train_accuracy = model.score(X_train_scaled, y_train)
test_accuracy = model.score(X_test_scaled, y_test)
print(f"Training accuracy: {train_accuracy:.5f}")
print(f"Testing accuracy: {test_accuracy:.5f}")

In [None]:
# Class probabilities for every row of the test split
y_test_probs = model.predict_proba(X_test_scaled)

# Draw one test-row position at random (unseeded, so it changes on every run)
random_idx = np.random.randint(len(X_test))

print(len(X_test))

# Probabilities predicted for the drawn row
sample_prob = y_test_probs[random_idx]
print(sample_prob)
# Unscaled feature values of the same row
test_case = X_test.iloc[random_idx]

print("Flight Details:")
print("--------------")
# (printed label, feature column) pairs, in display order
detail_fields = [
    ("Carrier (encoded)", 'Carrier_Encoded'),
    ("Day of Month", 'DayofMonth'),
    ("Day of Week", 'DayOfWeek'),
    ("Month", 'Month'),
    ("Origin Airport ID", 'OriginAirportID'),
    ("Destination Airport ID", 'DestAirportID'),
]
for field_label, field_column in detail_fields:
    print(f"{field_label}: {test_case[field_column]}")

print("\nPrediction Probabilities:")
print("-----------------------")
print(f"Probability of On-Time: {sample_prob[0]:.2%}")
print(f"Probability of Delay: {sample_prob[1]:.2%}")

# Compare with the true label of the drawn row
actual_outcome = 'Delayed' if y_test.iloc[random_idx] == 1 else 'On-Time'
print(f"\nActual Outcome: {actual_outcome}")

In [16]:
import pickle

# Persist everything needed to score new flights with the logistic regression.
# The existing file names are kept exactly as they were so that any code
# loading these artifacts keeps working.

# Save the model to a file
with open('fd_model_logistric_regression.pkl', 'wb') as file:
    pickle.dump(model, file)

# Also save the scaler since we'll need it for future predictions
with open('fd_model_logistric_regression_scaler.pkl', 'wb') as file:
    pickle.dump(scaler, file)

# The model is trained on Carrier_Encoded, so the fitted carrier encoder is
# required as well to turn a raw Carrier value into the same integer code.
with open('fd_model_logistric_regression_carrier_encoder.pkl', 'wb') as file:
    pickle.dump(carrier_encoder, file)

In [None]:
# Create a pie chart of average prediction probabilities
# Average each class probability over every row of the test set. Indexing
# y_test_probs with random_idx would plot a single flight, not the average
# that the chart title announces.
avg_probs = y_test_probs.mean(axis=0)

print(avg_probs)
plt.figure(figsize=(8, 8))
plt.pie(avg_probs, labels=['On-Time', 'Delayed'], autopct='%1.1f%%')
plt.title('Average Probability Distribution of Flight Delays')
plt.show()

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# 1. Bar plot of delay frequency by airport
plt.figure(figsize=(12, 6))
sns.countplot(data=flights, x='DestAirportID', hue='ArrDel15')
plt.title('Delay Frequency by Airport')
plt.xticks(rotation=45)
plt.show()

# 'DateTime' is stored as a string, so parse it once and reuse the result
# for every derived field below.
parsed_dates = pd.to_datetime(flights['DateTime'])

# 2. Time series of delays
plt.figure(figsize=(15, 6))
flights['Date'] = parsed_dates.dt.date
# The mean of the ArrDel15 column per day is the share of delayed flights that day
daily_delays = flights.groupby('Date')['ArrDel15'].mean()
plt.plot(daily_delays.index, daily_delays.values)
plt.title('Daily Delay Rate Over Time')
plt.xlabel('Date')
plt.ylabel('Delay Rate')
plt.show()

# 3. Heatmap of delays by hour and day
plt.figure(figsize=(10, 6))
# NOTE(review): 'DateTime' is built from Year/Month/DayofMonth only and
# formatted as '%Y-%m-%d', so it has no time of day and Hour is 0 for every
# row; the heatmap therefore has a single row.
# TODO: derive the hour from a departure-time column if the dataset has one.
flights['Hour'] = parsed_dates.dt.hour
# Build the pivot input as a separate small frame instead of assigning to
# flights['DayOfWeek']: that column is a model feature, and replacing it with
# pandas' 0-6 weekday would silently change the features if earlier cells
# were re-run.
heatmap_source = pd.DataFrame(
    {
        'ArrDel15': flights['ArrDel15'],
        'Hour': flights['Hour'],
        'DayOfWeek': parsed_dates.dt.dayofweek,
    }
)
delay_pivot = heatmap_source.pivot_table(
    values='ArrDel15',
    index='Hour',
    columns='DayOfWeek',
    aggfunc='mean'
)
sns.heatmap(delay_pivot, cmap='YlOrRd')
plt.title('Delay Probability by Hour and Day of Week')
plt.show()

## Now with Neural Networks

In [None]:
import torch
from torch.utils.data import Dataset, DataLoader

import torch.nn as nn

# Custom dataset class
class FlightDataset(Dataset):
    """In-memory dataset pairing a feature matrix with one target per row.

    Both inputs are copied into float32 tensors at construction time.

    Parameters
    ----------
    X : array-like
        Feature rows; anything ``np.asarray`` can turn into a numeric array
        (NumPy array, pandas DataFrame, nested list).
    y : array-like
        One target value per row of ``X`` (pandas Series, NumPy array, list).

    Raises
    ------
    ValueError
        If ``X`` and ``y`` do not contain the same number of rows.
    """

    def __init__(self, X, y):
        # np.asarray gives a uniform path for pandas and NumPy inputs, so the
        # caller is not required to pass an object that has a `.values`
        # attribute. torch.tensor always copies and casts to float32.
        self.X = torch.tensor(np.asarray(X), dtype=torch.float32)
        self.y = torch.tensor(np.asarray(y), dtype=torch.float32)
        if len(self.X) != len(self.y):
            raise ValueError(
                f"X has {len(self.X)} rows but y has {len(self.y)} entries"
            )

    def __len__(self):
        """Return the number of rows in the dataset."""
        return len(self.X)

    def __getitem__(self, idx):
        """Return the ``(features, target)`` pair stored at position ``idx``."""
        return self.X[idx], self.y[idx]

# Define the neural network
class FlightNet(nn.Module):
    """Fully connected network: input_size -> 64 -> 32 -> 16 -> 1.

    Each hidden layer is followed by a ReLU; the first two hidden layers are
    also followed by dropout with p=0.2. The single output goes through a
    sigmoid, so the forward pass returns values between 0 and 1.

    Parameters
    ----------
    input_size : int
        Number of input features per sample.
    """

    def __init__(self, input_size):
        super().__init__()
        # (in_features, out_features, dropout probability or None) per hidden layer
        hidden_spec = [
            (input_size, 64, 0.2),
            (64, 32, 0.2),
            (32, 16, None),
        ]
        stack = []
        for fan_in, fan_out, drop_p in hidden_spec:
            stack.append(nn.Linear(fan_in, fan_out))
            stack.append(nn.ReLU())
            if drop_p is not None:
                stack.append(nn.Dropout(drop_p))
        # Output head: one unit squashed by a sigmoid
        stack.append(nn.Linear(16, 1))
        stack.append(nn.Sigmoid())
        self.layers = nn.Sequential(*stack)

    def forward(self, x):
        """Run ``x`` through the layer stack and return the sigmoid outputs."""
        probabilities = self.layers(x)
        return probabilities

# Seed PyTorch's global RNG so weight initialisation, batch shuffling and
# dropout masks are the same on every run (same seed value as the
# random_state used for the train/test split and the logistic regression).
torch.manual_seed(42)

# Create datasets from the scaled feature arrays and the label series
train_dataset = FlightDataset(X_train_scaled, y_train)
test_dataset = FlightDataset(X_test_scaled, y_test)

# Create dataloaders; only the training batches are shuffled
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=32)

# Initialize model, loss and optimizer
# NOTE: this rebinds `model`, which until this point referred to the fitted
# LogisticRegression. Cells that call model.predict_proba or pickle `model`
# must not be re-run after this cell without re-running the logistic
# regression cell first.
model = FlightNet(X_train.shape[1])
# FlightNet ends in a Sigmoid, so the loss is plain BCE on probabilities
criterion = nn.BCELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

# Training loop
num_epochs = 10
for epoch in range(num_epochs):
    model.train()  # enables dropout
    running_loss = 0.0
    for inputs, labels in train_loader:
        optimizer.zero_grad()
        outputs = model(inputs)
        # labels are (batch,); add a trailing dimension to match the
        # (batch, 1) network output
        loss = criterion(outputs, labels.unsqueeze(1))
        loss.backward()
        optimizer.step()
        running_loss += loss.item()

    # Print epoch statistics (mean batch loss over the epoch)
    print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {running_loss/len(train_loader):.4f}')

# Evaluation
model.eval()  # disables dropout
with torch.no_grad():
    correct = 0
    total = 0
    for inputs, labels in test_loader:
        outputs = model(inputs)
        # Threshold the sigmoid output at 0.5 to get a hard 0/1 prediction
        predicted = (outputs > 0.5).float()
        total += labels.size(0)
        # squeeze(1) removes only the output dimension, so the batch
        # dimension is kept even when the last batch holds a single row
        correct += (predicted.squeeze(1) == labels).sum().item()

    print(f'Test Accuracy: {100 * correct / total:.2f}%')

In [None]:
# Write only the network's parameters (its state_dict) to disk; loading them
# back requires constructing a FlightNet with the same input size first.
nn_state = model.state_dict()
torch.save(nn_state, 'fd_model_neural_network.pt')
