## Import dependencies

In [1]:
import numpy as np
import pandas as pd

from pathlib import Path
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sqlalchemy import create_engine

## Connect to database and import data

In [2]:
# Open the local SQLite database (Flights_DB) through SQLAlchemy.

db_path = Path('Resources/Flights_DB.sqlite')

# The SQLite URL is the 'sqlite:///' prefix followed by the file path.
db_url = f'sqlite:///{db_path}'
engine = create_engine(db_url)

# Keep an open connection handle alongside the engine.
conn = engine.connect()

In [10]:
# Load the Flights table, LEFT JOINing Cancelled_Flights and Delayed_Flights on ID
# so every flight row carries its cancellation code and delay-reason columns.
# Flights with no matching row in either table get nulls in those columns.
#
# The dtype mapping uses explicit nanosecond units: pandas 2.x refuses to cast to
# the unit-less np.datetime64 / np.timedelta64 dtypes, while 'datetime64[ns]' /
# 'timedelta64[ns]' yield the same result on older versions as well.
#
# NOTE(review): numeric values cast to 'timedelta64[ns]' are interpreted as
# nanoseconds. If the *_Delay_Minutes columns store minute counts, they still need
# converting (e.g. pd.to_timedelta(col, unit='min')) during preprocessing - confirm.
# NOTE(review): in the displayed output the Scheduled_*_Time columns carry a date
# that differs from Flight_Date; presumably the database stores time-of-day only
# and the date part is filled in at parse time - confirm before using it.

flights_df = pd.read_sql_query('''
                                  SELECT F.Operating_Airline
                                      ,F.Origin_Airport
                                      ,F.Destination_Airport
                                      ,F.Flight_Date
                                      ,F.Scheduled_Arrival_Time
                                      ,F.Scheduled_Departure_Time
                                      ,D.Carrier_Delay_Minutes
                                      ,D.Weather_Delay_Minutes
                                      ,D.NAS_Delay_Minutes
                                      ,D.Security_Delay_Minutes
                                      ,D.Late_Aircraft_Delay_Minutes
                                      ,C.Cancellation_Code
                                      ,F.Target
                                  FROM Flights F
                                  LEFT JOIN Cancelled_Flights C ON F.ID = C.ID
                                  LEFT JOIN Delayed_Flights D ON F.ID = D.ID
                               ''',
                               con=engine,
                               dtype={
                                   # Categorical identifiers stay as plain Python objects (strings)
                                   'Operating_Airline': object,
                                   'Origin_Airport': object,
                                   'Destination_Airport': object,
                                   # Date/time columns: explicit unit required by pandas 2.x
                                   'Flight_Date': 'datetime64[ns]',
                                   'Scheduled_Arrival_Time': 'datetime64[ns]',
                                   'Scheduled_Departure_Time': 'datetime64[ns]',
                                   # Delay-reason columns; null (NaT) where the LEFT JOIN found no match
                                   'Carrier_Delay_Minutes': 'timedelta64[ns]',
                                   'Weather_Delay_Minutes': 'timedelta64[ns]',
                                   'NAS_Delay_Minutes': 'timedelta64[ns]',
                                   'Security_Delay_Minutes': 'timedelta64[ns]',
                                   'Late_Aircraft_Delay_Minutes': 'timedelta64[ns]',
                                   'Cancellation_Code': object,
                                   # Class label used as the modelling target
                                   'Target': np.int32
                               })

# Preview the first rows to sanity-check the join and the dtype conversions
flights_df.head(10)

Unnamed: 0,Operating_Airline,Origin_Airport,Destination_Airport,Flight_Date,Scheduled_Arrival_Time,Scheduled_Departure_Time,Carrier_Delay_Minutes,Weather_Delay_Minutes,NAS_Delay_Minutes,Security_Delay_Minutes,Late_Aircraft_Delay_Minutes,Cancellation_Code,Target
0,DL,FLL,LGA,2022-01-06,2023-02-14 14:19:00,2023-02-14 11:26:00,NaT,NaT,NaT,NaT,NaT,A,2
1,DL,ATL,FLL,2022-01-06,2023-02-14 18:21:00,2023-02-14 16:31:00,NaT,NaT,NaT,NaT,NaT,,0
2,DL,FLL,ATL,2022-01-06,2023-02-14 21:27:00,2023-02-14 19:31:00,NaT,NaT,NaT,NaT,NaT,,0
3,DL,FLL,RDU,2022-01-06,2023-02-14 12:27:00,2023-02-14 10:24:00,NaT,NaT,NaT,NaT,NaT,,0
4,DL,ATL,JAN,2022-01-06,2023-02-14 11:42:00,2023-02-14 11:17:00,NaT,NaT,NaT,NaT,NaT,,0
5,DL,JAN,ATL,2022-01-06,2023-02-14 14:58:00,2023-02-14 12:37:00,NaT,NaT,NaT,NaT,NaT,,0
6,DL,RIC,ATL,2022-01-06,2023-02-14 10:50:00,2023-02-14 09:00:00,NaT,NaT,NaT,NaT,NaT,,0
7,DL,MSP,RSW,2022-01-06,2023-02-14 14:33:00,2023-02-14 10:00:00,0 days,0 days,0 days,0 days,0 days,,1
8,DL,ATL,RDU,2022-01-06,2023-02-14 15:33:00,2023-02-14 14:14:00,0 days,0 days,0 days,0 days,0 days,,1
9,DL,RDU,ATL,2022-01-06,2023-02-14 18:14:00,2023-02-14 16:43:00,NaT,NaT,NaT,NaT,NaT,,0


## Preprocessing
1) Split Flight_Date into month and day components (and possibly day of week?)
2) Fix the scheduled arrival/departure times and the delay timedeltas
3) Fill NaT/None values
4) Make a copy of the DataFrame and look into combining delay/cancellation reasons