# Retrieve Data from Database

In order to connect to the database, first, make sure:
1. …you have a local (running) database with the `hostname`, `database` name, `username`, and `port` number as specified in `/resources/config/gadb_pg_config.py`.
2. …the `airports` and `flights_and_weather` tables have data.

If the `airports` and `flights_and_weather` tables do not have data, you must first run `/download_and_prepare_datasets/02_prepare_and_store_data.ipynb` to populate the database.

In [1]:
import os
import time

In [2]:
os.chdir(os.path.join('..','resources'))

## Connect to Database

In [3]:
# Database configuration details
from config import gadb_pg_config as cfg

# To connect to SQL database
import sqlalchemy as db
# from sqlalchemy import create_engine, MetaData, Table, text, types

# To enter passwords without exposing them
from getpass import getpass

Most of the database information is already in `cfg` (above). However, you will have to enter your password below.

In [4]:
password = getpass('Enter database password')

Enter database password········


In [5]:
start_time = time.time()

In [6]:
db_string = f"postgresql+psycopg2://{cfg.username}:{password}@{cfg.hostname}:{cfg.port}/{cfg.database}"

In [7]:
engine = db.create_engine(
    future=True,
#     echo=True,
    url=db_string
)

## Access Database Tables and Create Table Aliases

In [8]:
db_meta = db.MetaData()

In [9]:
# Access the `airports` and `flights_and_weather` tables and assign them to variables
airports_table = db.Table('airports', db_meta, autoload_with=engine)
faw_table = db.Table('flights_and_weather', db_meta, autoload_with=engine)
faw = faw_table.alias('faw')
orig = airports_table.alias('orig')
dest = airports_table.alias('dest')

## Extract Full, Joined Dataset to Pandas Dataframe

In [10]:
from io import StringIO
import pandas as pd

In [11]:
# Function to extract from database query to Pandas dataframe
# Adapted from <https://towardsdatascience.com/optimizing-pandas-read-sql-for-postgres-f31cd7f707ab>
def read_sql_inmem(query, db_engine, **kwargs):
    copy_sql = "COPY ({query}) TO STDOUT WITH CSV {head}".format(
       query=query, head="HEADER"
    )
    conn = db_engine.raw_connection()
    cur = conn.cursor()
    store = StringIO()
    cur.copy_expert(copy_sql, store)
    store.seek(0)
    df = pd.read_csv(store, **kwargs)
    return df

In [12]:
# The query composed as an SQLAlchemy table object
get_all = (
    db.select([
        faw,
        orig.c.lat_decimal.label('origin_lat'),
        orig.c.lon_decimal.label('origin_lon'),
        dest.c.lat_decimal.label('destination_lat'),
        dest.c.lon_decimal.label('destination_lon')
    ])
    .select_from(
        faw
        .join(orig, orig.c.iata_code == faw.c.origin_airport, isouter=True)
        .join(dest, dest.c.iata_code == faw.c.destination_airport, isouter=True)
    )
    .order_by(faw.c.id)
)

In [13]:
def print_shape(df):
    df_rows, df_cols = df.shape
    print(f"{df_rows:,} rows × {df_cols:,} columns")

In [14]:
def df_details(df):
    return pd.concat(
        [
            df.dtypes,
            df.isna().sum(),
            pd.Series(
                data=[df[col].dropna().is_unique for col in df.columns],
                index=df.columns
            )
        ],
        axis=1,
        keys=['data_type','null_count','unique']
    )

In [15]:
# Run the query, and save the result to a dataframe
combined_df = read_sql_inmem(
    query=get_all,
#     query=get_all.limit(10).compile(engine, compile_kwargs={"literal_binds": True}),
    db_engine=engine,
    index_col='id',
    converters={'cancelled':(lambda x: True if x == 't' else False)}
)
# .sort_index(kind='mergesort')

print_shape(combined_df)

5,468,069 rows × 35 columns


In [16]:
with pd.option_context('display.max_columns',None):
    display(combined_df.head())

Unnamed: 0_level_0,carrier_code,flight_number,origin_airport,destination_airport,flight_date,scheduled_elapsed_time,tail_number,departure_delay,arrival_delay,delay_carrier,delay_weather,delay_national_aviation_system,delay_security,delay_late_aircarft_arrival,cancelled,scheduled_departure_dt,scheduled_arrival_dt,actual_departure_dt,actual_arrival_dt,station_x,hourlydrybulbtemperature_x,hourlyprecipitation_x,hourlystationpressure_x,hourlyvisibility_x,hourlywindspeed_x,station_y,hourlydrybulbtemperature_y,hourlyprecipitation_y,hourlystationpressure_y,hourlyvisibility_y,hourlywindspeed_y,origin_lat,origin_lon,destination_lat,destination_lon
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1
0,AS,121,SEA,ANC,2019-05-01,215,N615AS,-8,-16,0,0,0,0,0,False,2019-05-01 00:40:00,2019-05-01 03:15:00,2019-05-01 00:32:00,2019-05-01 02:59:00,72793020000.0,49.0,0.0,29.59,10.0,8.0,70272530000.0,42.0,0.0,30.16,10.0,3.0,47.449,-122.309,61.174,-149.996
1,F9,402,LAX,DEN,2019-05-01,147,N701FR,17,-4,0,0,0,0,0,False,2019-05-01 00:59:00,2019-05-01 04:26:00,2019-05-01 01:16:00,2019-05-01 04:22:00,72295020000.0,58.0,0.0,29.65,10.0,3.0,72565000000.0,34.0,0.0,24.43,4.0,0.0,33.942,-118.408,39.858,-104.667
2,F9,662,SFO,DEN,2019-05-01,158,N346FR,44,27,0,0,0,0,27,False,2019-05-01 00:50:00,2019-05-01 04:28:00,2019-05-01 01:34:00,2019-05-01 04:55:00,72494020000.0,51.0,0.0,29.98,10.0,6.0,72565000000.0,34.0,0.0,24.43,4.0,0.0,37.619,-122.375,39.858,-104.667
3,F9,790,PDX,DEN,2019-05-01,156,N332FR,24,10,0,0,0,0,0,False,2019-05-01 00:55:00,2019-05-01 04:31:00,2019-05-01 01:19:00,2019-05-01 04:41:00,72698020000.0,48.0,0.0,29.98,10.0,0.0,72565000000.0,34.0,0.0,24.43,4.0,0.0,45.589,-122.597,39.858,-104.667
4,AS,108,ANC,SEA,2019-05-01,210,N548AS,-9,-31,0,0,0,0,0,False,2019-05-01 00:10:00,2019-05-01 04:40:00,2019-05-01 00:01:00,2019-05-01 04:09:00,70272530000.0,43.0,0.0,30.18,10.0,5.0,72793020000.0,44.0,0.0,29.58,10.0,7.0,61.174,-149.996,47.449,-122.309


In [17]:
df_details(combined_df)

Unnamed: 0,data_type,null_count,unique
carrier_code,object,0,False
flight_number,int64,0,False
origin_airport,object,0,False
destination_airport,object,0,False
flight_date,object,0,False
scheduled_elapsed_time,int64,0,False
tail_number,object,6973,False
departure_delay,int64,0,False
arrival_delay,int64,0,False
delay_carrier,int64,0,False


In [18]:
print(f'Elapsed time: {(time.time() - start_time):,.3f} seconds.')

Elapsed time: 160.136 seconds.


In [19]:
combined_df['cancelled'].value_counts()

False    5426150
True       41919
Name: cancelled, dtype: int64

In [20]:
combined_df['scheduled_departure_dt']

id
0          2019-05-01 00:40:00
1          2019-05-01 00:59:00
2          2019-05-01 00:50:00
3          2019-05-01 00:55:00
4          2019-05-01 00:10:00
                  ...         
5512898    2019-12-31 16:53:00
5512899    2019-12-31 17:45:00
5512900    2019-12-31 19:59:00
5512901    2019-12-31 22:20:00
5512902    2019-12-31 22:45:00
Name: scheduled_departure_dt, Length: 5468069, dtype: object

In [21]:
combined_df.columns

Index(['carrier_code', 'flight_number', 'origin_airport',
       'destination_airport', 'flight_date', 'scheduled_elapsed_time',
       'tail_number', 'departure_delay', 'arrival_delay', 'delay_carrier',
       'delay_weather', 'delay_national_aviation_system', 'delay_security',
       'delay_late_aircarft_arrival', 'cancelled', 'scheduled_departure_dt',
       'scheduled_arrival_dt', 'actual_departure_dt', 'actual_arrival_dt',
       'station_x', 'hourlydrybulbtemperature_x', 'hourlyprecipitation_x',
       'hourlystationpressure_x', 'hourlyvisibility_x', 'hourlywindspeed_x',
       'station_y', 'hourlydrybulbtemperature_y', 'hourlyprecipitation_y',
       'hourlystationpressure_y', 'hourlyvisibility_y', 'hourlywindspeed_y',
       'origin_lat', 'origin_lon', 'destination_lat', 'destination_lon'],
      dtype='object')

### 1.) origin_airport
### 2.) destination_airport
### 3.) departure_delay
### 4.) arrival_delay
### 5.) 'cancelled'
### 6.) station_x
### 7.) hourlydrybulbtemperature_x
### 8.) hourlyprecipitation_x
### 9.) hourlystationpressure_x
### 10.) 'hourlyvisibility_x', 
### 11.) 'hourlywindspeed_x',
### 12.) 'station_y', 
### 13.) 'hourlydrybulbtemperature_y', 
### 15.) 'hourlyprecipitation_y',
###   16.)  'hourlystationpressure_y',
### 17.) 'hourlyvisibility_y',
### 18.) 'hourlywindspeed_y',
### 19.) 'origin_lat', 
### 20 .) 'origin_lon', 
### 21.) 'destination_lat',
### 22.) 'destination_lon'

In [22]:
columns = ['origin_airport',
            'destination_airport',
            'departure_delay',
            'arrival_delay',
            'cancelled',
            'station_x',
            'hourlydrybulbtemperature_x',
            'hourlyprecipitation_x',
            'hourlystationpressure_x',
            'hourlyvisibility_x', 
            'hourlywindspeed_x',
            'station_y', 
            'hourlydrybulbtemperature_y', 
            'hourlyprecipitation_y',
            'hourlystationpressure_y',
            'hourlyvisibility_y',
            'hourlywindspeed_y',
            'origin_lat', 
            'origin_lon', 
            'destination_lat',
            'destination_lon']
columns

['origin_airport',
 'destination_airport',
 'departure_delay',
 'arrival_delay',
 'cancelled',
 'station_x',
 'hourlydrybulbtemperature_x',
 'hourlyprecipitation_x',
 'hourlystationpressure_x',
 'hourlyvisibility_x',
 'hourlywindspeed_x',
 'station_y',
 'hourlydrybulbtemperature_y',
 'hourlyprecipitation_y',
 'hourlystationpressure_y',
 'hourlyvisibility_y',
 'hourlywindspeed_y',
 'origin_lat',
 'origin_lon',
 'destination_lat',
 'destination_lon']

In [23]:
len(columns)

21

In [24]:
df = combined_df[columns]

In [25]:
df

Unnamed: 0_level_0,origin_airport,destination_airport,departure_delay,arrival_delay,cancelled,station_x,hourlydrybulbtemperature_x,hourlyprecipitation_x,hourlystationpressure_x,hourlyvisibility_x,...,station_y,hourlydrybulbtemperature_y,hourlyprecipitation_y,hourlystationpressure_y,hourlyvisibility_y,hourlywindspeed_y,origin_lat,origin_lon,destination_lat,destination_lon
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,SEA,ANC,-8,-16,False,7.279302e+10,49.0,0.0,29.59,10.0,...,7.027253e+10,42.0,0.0,30.16,10.0,3.0,47.449,-122.309,61.174,-149.996
1,LAX,DEN,17,-4,False,7.229502e+10,58.0,0.0,29.65,10.0,...,7.256500e+10,34.0,0.0,24.43,4.0,0.0,33.942,-118.408,39.858,-104.667
2,SFO,DEN,44,27,False,7.249402e+10,51.0,0.0,29.98,10.0,...,7.256500e+10,34.0,0.0,24.43,4.0,0.0,37.619,-122.375,39.858,-104.667
3,PDX,DEN,24,10,False,7.269802e+10,48.0,0.0,29.98,10.0,...,7.256500e+10,34.0,0.0,24.43,4.0,0.0,45.589,-122.597,39.858,-104.667
4,ANC,SEA,-9,-31,False,7.027253e+10,43.0,0.0,30.18,10.0,...,7.279302e+10,44.0,0.0,29.58,10.0,7.0,61.174,-149.996,47.449,-122.309
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5512898,ATL,HSV,-5,-16,False,7.221901e+10,54.0,0.0,28.89,10.0,...,7.232300e+10,37.0,0.0,29.38,10.0,3.0,33.640,-84.427,,
5512899,ATL,BHM,-6,-13,False,7.221901e+10,50.0,0.0,28.89,10.0,...,7.222801e+10,42.0,0.0,29.44,10.0,5.0,33.640,-84.427,33.563,-86.753
5512900,ATL,BHM,-5,-16,False,7.221901e+10,46.0,0.0,28.91,10.0,...,7.222801e+10,42.0,0.0,29.44,10.0,5.0,33.640,-84.427,33.563,-86.753
5512901,ATL,BHM,-7,-16,False,7.221901e+10,44.0,0.0,28.93,10.0,...,7.222801e+10,42.0,0.0,29.44,10.0,5.0,33.640,-84.427,33.563,-86.753


In [26]:
df['cancelled'].value_counts()

False    5426150
True       41919
Name: cancelled, dtype: int64

In [27]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5468069 entries, 0 to 5512902
Data columns (total 21 columns):
 #   Column                      Dtype  
---  ------                      -----  
 0   origin_airport              object 
 1   destination_airport         object 
 2   departure_delay             int64  
 3   arrival_delay               int64  
 4   cancelled                   bool   
 5   station_x                   float64
 6   hourlydrybulbtemperature_x  float64
 7   hourlyprecipitation_x       float64
 8   hourlystationpressure_x     float64
 9   hourlyvisibility_x          float64
 10  hourlywindspeed_x           float64
 11  station_y                   float64
 12  hourlydrybulbtemperature_y  float64
 13  hourlyprecipitation_y       float64
 14  hourlystationpressure_y     float64
 15  hourlyvisibility_y          float64
 16  hourlywindspeed_y           float64
 17  origin_lat                  float64
 18  origin_lon                  float64
 19  destination_lat      

In [28]:
df.isnull().sum()

origin_airport                     0
destination_airport                0
departure_delay                    0
arrival_delay                      0
cancelled                          0
station_x                       2073
hourlydrybulbtemperature_x      2073
hourlyprecipitation_x           9881
hourlystationpressure_x         2073
hourlyvisibility_x              2073
hourlywindspeed_x               2073
station_y                       2078
hourlydrybulbtemperature_y      2078
hourlyprecipitation_y           9896
hourlystationpressure_y         2078
hourlyvisibility_y              2078
hourlywindspeed_y               2078
origin_lat                    382438
origin_lon                    382438
destination_lat               382775
destination_lon               382775
dtype: int64

In [29]:
382438 / df.shape[0]

0.06994022935701799

In [30]:
df_new = df.dropna().copy()

In [31]:
df_new.isnull().sum()

origin_airport                0
destination_airport           0
departure_delay               0
arrival_delay                 0
cancelled                     0
station_x                     0
hourlydrybulbtemperature_x    0
hourlyprecipitation_x         0
hourlystationpressure_x       0
hourlyvisibility_x            0
hourlywindspeed_x             0
station_y                     0
hourlydrybulbtemperature_y    0
hourlyprecipitation_y         0
hourlystationpressure_y       0
hourlyvisibility_y            0
hourlywindspeed_y             0
origin_lat                    0
origin_lon                    0
destination_lat               0
destination_lon               0
dtype: int64

In [32]:
df_new['cancelled'].value_counts()

False    4674943
True       33957
Name: cancelled, dtype: int64

# Do Undersampling due to imbalance in data

In [33]:
df_new_f = df_new[~df_new['cancelled']].sample(33957)
df_new_t = df_new[df_new['cancelled']]

In [34]:
df_new_f.shape

(33957, 21)

In [35]:
33957 * 2

67914

In [36]:
df_new_t.shape

(33957, 21)

In [37]:
df_final = pd.concat([df_new_f, df_new_t], axis = 0)# .reset_index(drop = True)

In [38]:
df_final.shape

(67914, 21)

In [39]:
df_final

Unnamed: 0_level_0,origin_airport,destination_airport,departure_delay,arrival_delay,cancelled,station_x,hourlydrybulbtemperature_x,hourlyprecipitation_x,hourlystationpressure_x,hourlyvisibility_x,...,station_y,hourlydrybulbtemperature_y,hourlyprecipitation_y,hourlystationpressure_y,hourlyvisibility_y,hourlywindspeed_y,origin_lat,origin_lon,destination_lat,destination_lon
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2893899,EWR,LAX,-10,-19,False,7.250201e+10,80.0,0.00,29.90,10.00,...,7.229502e+10,84.0,0.00,29.49,10.0,13.0,40.692,-74.169,33.942,-118.408
595660,OAK,PDX,-3,-4,False,7.249302e+10,58.0,0.00,29.97,10.00,...,7.269802e+10,66.0,0.00,29.92,10.0,7.0,37.721,-122.221,45.589,-122.597
3690651,DEN,RNO,26,74,False,7.256500e+10,19.0,0.05,24.71,0.25,...,7.248802e+10,43.0,0.00,25.82,10.0,8.0,39.858,-104.667,39.498,-119.768
686524,SEA,DEN,8,15,False,7.279302e+10,74.0,0.00,29.57,10.00,...,7.256500e+10,59.0,0.00,24.65,10.0,8.0,47.449,-122.309,39.858,-104.667
5397523,PHX,ONT,108,109,False,7.227802e+10,49.0,0.04,28.69,10.00,...,7.470400e+10,44.0,0.00,28.84,10.0,5.0,33.434,-112.008,34.056,-117.601
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5506834,MEM,ORD,0,0,True,7.233401e+10,49.0,0.00,29.71,10.00,...,7.253009e+10,28.0,0.00,29.07,10.0,14.0,35.042,-89.977,41.979,-87.904
5509241,SAN,BWI,0,0,True,7.229002e+10,74.0,0.00,29.95,10.00,...,7.240609e+10,47.0,0.00,29.54,10.0,14.0,32.733,-117.189,39.175,-76.668
5510177,ORD,MQT,0,0,True,7.253009e+10,28.0,0.00,29.08,10.00,...,7.262849e+10,21.0,0.01,28.28,10.0,8.0,41.979,-87.904,46.534,-87.562
5510224,SJC,MCO,0,0,True,7.249452e+10,62.0,0.00,30.10,10.00,...,7.220501e+10,58.0,0.00,29.98,10.0,5.0,37.362,-121.929,28.429,-81.316


In [40]:
df_final = df_final.sample(frac=1).reset_index(drop = True)

In [41]:
df_final

Unnamed: 0,origin_airport,destination_airport,departure_delay,arrival_delay,cancelled,station_x,hourlydrybulbtemperature_x,hourlyprecipitation_x,hourlystationpressure_x,hourlyvisibility_x,...,station_y,hourlydrybulbtemperature_y,hourlyprecipitation_y,hourlystationpressure_y,hourlyvisibility_y,hourlywindspeed_y,origin_lat,origin_lon,destination_lat,destination_lon
0,DFW,MFE,0,0,True,7.225900e+10,75.0,0.00,29.37,10.0,...,7.225061e+10,82.0,0.00,29.84,10.0,7.0,32.896,-97.037,26.176,-98.239
1,LAS,LAX,0,0,True,7.238602e+10,59.0,0.00,27.37,10.0,...,7.229502e+10,54.0,0.02,29.54,2.5,11.0,36.080,-115.152,33.942,-118.408
2,DEN,DAL,0,0,True,7.256500e+10,66.0,0.00,24.78,10.0,...,7.225801e+10,89.0,0.00,29.47,10.0,6.0,39.858,-104.667,32.847,-96.852
3,FLL,BNA,2,14,False,7.478301e+10,68.0,0.00,29.90,10.0,...,7.232701e+10,37.0,0.00,29.48,10.0,10.0,26.072,-80.153,36.124,-86.678
4,PBI,DCA,0,0,True,7.220301e+10,88.0,0.00,29.82,10.0,...,7.240501e+10,94.0,0.00,29.82,10.0,10.0,26.683,-80.096,38.852,-77.037
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
67909,DCA,DAL,0,0,True,7.240501e+10,55.0,0.00,29.98,10.0,...,7.225801e+10,64.0,0.00,29.39,10.0,15.0,38.852,-77.037,32.847,-96.852
67910,IAH,PNS,-3,-1,False,7.224301e+10,87.0,0.00,29.87,10.0,...,7.222231e+10,88.0,0.00,29.79,10.0,5.0,29.980,-95.340,30.473,-87.187
67911,CLT,TPA,0,0,True,7.231401e+10,68.0,0.00,28.96,8.0,...,7.221101e+10,83.0,0.00,29.94,10.0,7.0,35.214,-80.943,27.975,-82.533
67912,MSP,ATL,90,114,False,7.265801e+10,20.0,0.03,28.75,9.0,...,7.221901e+10,54.0,0.00,28.88,8.0,5.0,44.880,-93.217,33.640,-84.427


In [42]:
df_final['cancelled'].value_counts()

True     33957
False    33957
Name: cancelled, dtype: int64

In [43]:
df_final.sample()

Unnamed: 0,origin_airport,destination_airport,departure_delay,arrival_delay,cancelled,station_x,hourlydrybulbtemperature_x,hourlyprecipitation_x,hourlystationpressure_x,hourlyvisibility_x,...,station_y,hourlydrybulbtemperature_y,hourlyprecipitation_y,hourlystationpressure_y,hourlyvisibility_y,hourlywindspeed_y,origin_lat,origin_lon,destination_lat,destination_lon
30420,AUS,JFK,-11,-18,False,72254010000.0,81.0,0.0,29.48,10.0,...,74486090000.0,75.0,0.0,30.03,10.0,11.0,30.194,-97.67,40.64,-73.779


In [44]:
df_final.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 67914 entries, 0 to 67913
Data columns (total 21 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   origin_airport              67914 non-null  object 
 1   destination_airport         67914 non-null  object 
 2   departure_delay             67914 non-null  int64  
 3   arrival_delay               67914 non-null  int64  
 4   cancelled                   67914 non-null  bool   
 5   station_x                   67914 non-null  float64
 6   hourlydrybulbtemperature_x  67914 non-null  float64
 7   hourlyprecipitation_x       67914 non-null  float64
 8   hourlystationpressure_x     67914 non-null  float64
 9   hourlyvisibility_x          67914 non-null  float64
 10  hourlywindspeed_x           67914 non-null  float64
 11  station_y                   67914 non-null  float64
 12  hourlydrybulbtemperature_y  67914 non-null  float64
 13  hourlyprecipitation_y       679

In [45]:
df_final['origin_airport'].value_counts()

DFW    4584
ORD    3785
DEN    2591
ATL    2256
EWR    2227
       ... 
SPN       2
PPG       1
AKN       1
CDV       1
DLG       1
Name: origin_airport, Length: 205, dtype: int64

In [46]:
origin_max_flights = df_final['origin_airport'].value_counts().copy()[0]

origin_max_flights

4584

In [47]:
thresh = 0.05

In [48]:
origin_max_flights*thresh

229.20000000000002

In [49]:
main_airport_list = df_final['origin_airport'].value_counts()[df_final['origin_airport'].value_counts() > origin_max_flights*thresh].index
main_airport_list

Index(['DFW', 'ORD', 'DEN', 'ATL', 'EWR', 'CLT', 'LGA', 'IAH', 'MCO', 'PHL',
       'DCA', 'BOS', 'LAX', 'SFO', 'DTW', 'MDW', 'PHX', 'SEA', 'LAS', 'BWI',
       'MSP', 'DAL', 'JFK', 'FLL', 'MIA', 'BNA', 'MSY', 'HOU', 'SAN', 'STL',
       'RDU', 'IAD', 'AUS', 'SLC', 'TPA', 'MCI', 'CLE', 'PDX', 'CMH', 'CVG',
       'SJC', 'CHS', 'IND', 'PIT', 'SAT', 'JAX', 'SNA', 'SMF', 'ORF', 'BUF',
       'MKE', 'OKC', 'HNL', 'OAK', 'RIC', 'PBI', 'MEM', 'OMA', 'BDL', 'SJU',
       'GRR', 'SAV', 'PVD', 'BHM', 'BUR', 'ABQ'],
      dtype='object')

In [50]:
len(main_airport_list)

66

In [51]:
df_final['origin_airport'] = df_final['origin_airport'].apply(lambda x : x if x in main_airport_list else 'other')

In [52]:
df_final['origin_airport'].value_counts()

other    8921
DFW      4584
ORD      3785
DEN      2591
ATL      2256
         ... 
SAV       244
PVD       243
BHM       239
BUR       237
ABQ       234
Name: origin_airport, Length: 67, dtype: int64

In [53]:
df_final['destination_airport'].value_counts()

DFW    4555
ORD    3869
DEN    2728
ATL    2315
EWR    2273
       ... 
YAK       3
GUM       3
IAG       2
PPG       2
AKN       1
Name: destination_airport, Length: 204, dtype: int64

In [54]:
destination_max_flights = df_final['destination_airport'].value_counts().copy()[0]

destination_max_flights

4555

In [55]:
destination_max_flights * thresh

227.75

In [56]:
main_destination_airport_list = df_final['destination_airport'].value_counts()[df_final['destination_airport'].value_counts() > destination_max_flights * thresh].index
main_destination_airport_list

Index(['DFW', 'ORD', 'DEN', 'ATL', 'EWR', 'CLT', 'IAH', 'LGA', 'MCO', 'PHL',
       'DCA', 'LAX', 'BOS', 'SFO', 'LAS', 'MDW', 'DTW', 'PHX', 'BWI', 'SEA',
       'JFK', 'MSP', 'DAL', 'FLL', 'MIA', 'HOU', 'IAD', 'RDU', 'BNA', 'MSY',
       'SLC', 'STL', 'AUS', 'SAN', 'TPA', 'MCI', 'PDX', 'CLE', 'PIT', 'CVG',
       'CMH', 'SJC', 'CHS', 'IND', 'JAX', 'SAT', 'SMF', 'MKE', 'SNA', 'ORF',
       'OAK', 'OKC', 'RIC', 'BUF', 'PBI', 'BDL', 'OMA', 'HNL', 'SAV', 'MEM',
       'ABQ', 'GRR', 'SJU', 'BHM', 'SYR'],
      dtype='object')

In [57]:
len(main_destination_airport_list)

65

In [58]:
df_final['destination_airport'] = df_final['destination_airport'].apply(lambda x : x if x in main_destination_airport_list else 'other')

In [59]:
df_final['destination_airport'].value_counts()

other    9083
DFW      4555
ORD      3869
DEN      2728
ATL      2315
         ... 
ABQ       257
GRR       241
SJU       233
BHM       230
SYR       228
Name: destination_airport, Length: 66, dtype: int64

In [60]:
# Separate the features (X) from the target (y)
y = df_final["cancelled"]
X = df_final.drop(columns="cancelled")

In [61]:
X

Unnamed: 0,origin_airport,destination_airport,departure_delay,arrival_delay,station_x,hourlydrybulbtemperature_x,hourlyprecipitation_x,hourlystationpressure_x,hourlyvisibility_x,hourlywindspeed_x,station_y,hourlydrybulbtemperature_y,hourlyprecipitation_y,hourlystationpressure_y,hourlyvisibility_y,hourlywindspeed_y,origin_lat,origin_lon,destination_lat,destination_lon
0,DFW,other,0,0,7.225900e+10,75.0,0.00,29.37,10.0,3.0,7.225061e+10,82.0,0.00,29.84,10.0,7.0,32.896,-97.037,26.176,-98.239
1,LAS,LAX,0,0,7.238602e+10,59.0,0.00,27.37,10.0,16.0,7.229502e+10,54.0,0.02,29.54,2.5,11.0,36.080,-115.152,33.942,-118.408
2,DEN,DAL,0,0,7.256500e+10,66.0,0.00,24.78,10.0,8.0,7.225801e+10,89.0,0.00,29.47,10.0,6.0,39.858,-104.667,32.847,-96.852
3,FLL,BNA,2,14,7.478301e+10,68.0,0.00,29.90,10.0,9.0,7.232701e+10,37.0,0.00,29.48,10.0,10.0,26.072,-80.153,36.124,-86.678
4,PBI,DCA,0,0,7.220301e+10,88.0,0.00,29.82,10.0,18.0,7.240501e+10,94.0,0.00,29.82,10.0,10.0,26.683,-80.096,38.852,-77.037
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
67909,DCA,DAL,0,0,7.240501e+10,55.0,0.00,29.98,10.0,13.0,7.225801e+10,64.0,0.00,29.39,10.0,15.0,38.852,-77.037,32.847,-96.852
67910,IAH,other,-3,-1,7.224301e+10,87.0,0.00,29.87,10.0,7.0,7.222231e+10,88.0,0.00,29.79,10.0,5.0,29.980,-95.340,30.473,-87.187
67911,CLT,TPA,0,0,7.231401e+10,68.0,0.00,28.96,8.0,13.0,7.221101e+10,83.0,0.00,29.94,10.0,7.0,35.214,-80.943,27.975,-82.533
67912,MSP,ATL,90,114,7.265801e+10,20.0,0.03,28.75,9.0,16.0,7.221901e+10,54.0,0.00,28.88,8.0,5.0,44.880,-93.217,33.640,-84.427


In [62]:
# Use the train_test_split function to create training and testing subsets
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,
   y,  random_state=1, stratify=y, test_size=0.1)
X_train.shape

(61122, 20)

In [63]:
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((61122, 20), (6792, 20), (61122,), (6792,))

In [64]:
X_train

Unnamed: 0,origin_airport,destination_airport,departure_delay,arrival_delay,station_x,hourlydrybulbtemperature_x,hourlyprecipitation_x,hourlystationpressure_x,hourlyvisibility_x,hourlywindspeed_x,station_y,hourlydrybulbtemperature_y,hourlyprecipitation_y,hourlystationpressure_y,hourlyvisibility_y,hourlywindspeed_y,origin_lat,origin_lon,destination_lat,destination_lon
22567,other,MSP,-3,-53,7.277502e+10,33.0,0.00,26.17,10.0,22.0,7.265801e+10,0.0,0.0,29.41,10.0,6.0,47.482,-111.371,44.880,-93.217
47845,MCO,ORD,0,0,7.220501e+10,75.0,0.00,29.93,10.0,8.0,7.253009e+10,17.0,0.0,29.62,10.0,13.0,28.429,-81.316,41.979,-87.904
14508,other,MSP,0,0,7.240951e+10,58.0,0.00,30.03,2.0,6.0,7.265801e+10,26.0,0.0,29.46,10.0,3.0,40.277,-74.813,44.880,-93.217
11466,PHL,STL,9,8,7.240801e+10,88.0,0.00,29.72,10.0,16.0,7.243401e+10,86.0,0.0,29.18,10.0,9.0,39.872,-75.241,38.748,-90.360
3893,SEA,LAX,13,24,7.279302e+10,48.0,0.00,29.35,10.0,8.0,7.229502e+10,69.0,0.0,29.47,10.0,5.0,47.449,-122.309,33.942,-118.408
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10681,DTW,MSY,-4,-12,7.253709e+10,74.0,0.00,29.44,10.0,3.0,7.223101e+10,86.0,0.0,29.95,10.0,6.0,42.212,-83.353,29.993,-90.258
14366,SAV,PHL,-2,-19,7.220700e+10,79.0,0.00,30.08,10.0,6.0,7.240801e+10,86.0,0.0,30.07,10.0,6.0,32.127,-81.202,39.872,-75.241
63571,other,DFW,0,0,7.245000e+10,61.0,0.00,28.34,10.0,26.0,7.225900e+10,69.0,0.0,29.28,10.0,14.0,37.650,-97.433,32.896,-97.037
53115,MDW,STL,0,0,7.253401e+10,64.0,0.06,29.04,8.0,15.0,7.243401e+10,91.0,0.0,28.97,10.0,21.0,41.786,-87.752,38.748,-90.360


In [65]:
X_train['station_x'].max(), X_train['station_x'].min()

(91765061705.0, 70026027502.0)

In [66]:
y_test

12026     True
28304    False
18911    False
46387     True
10788     True
         ...  
38919    False
64006    False
23047    False
16034    False
58277    False
Name: cancelled, Length: 6792, dtype: bool

In [67]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, StandardScaler

In [68]:
Le = LabelEncoder()

In [69]:
y_train[:5]

22567    False
47845     True
14508     True
11466    False
3893     False
Name: cancelled, dtype: bool

In [70]:
y_test[:5]

12026     True
28304    False
18911    False
46387     True
10788     True
Name: cancelled, dtype: bool

In [71]:
y_train_cln = Le.fit_transform(y_train)

In [72]:
y_train_cln

array([0, 1, 1, ..., 1, 1, 0], dtype=int64)

In [73]:
Le.classes_

array([False,  True])

In [74]:
y_test_cln = Le.transform(y_test)

In [75]:
y_test_cln

array([1, 0, 0, ..., 0, 0, 0], dtype=int64)

In [76]:
X_train.dtypes

origin_airport                 object
destination_airport            object
departure_delay                 int64
arrival_delay                   int64
station_x                     float64
hourlydrybulbtemperature_x    float64
hourlyprecipitation_x         float64
hourlystationpressure_x       float64
hourlyvisibility_x            float64
hourlywindspeed_x             float64
station_y                     float64
hourlydrybulbtemperature_y    float64
hourlyprecipitation_y         float64
hourlystationpressure_y       float64
hourlyvisibility_y            float64
hourlywindspeed_y             float64
origin_lat                    float64
origin_lon                    float64
destination_lat               float64
destination_lon               float64
dtype: object

In [77]:
obj_col = X_train.dtypes[X.dtypes == 'object'].index.values
num_col = X_train.dtypes[X.dtypes != 'object'].index.values

In [78]:
len(obj_col), len(num_col)

(2, 18)

In [79]:
obj_col

array(['origin_airport', 'destination_airport'], dtype=object)

In [80]:
num_col

array(['departure_delay', 'arrival_delay', 'station_x',
       'hourlydrybulbtemperature_x', 'hourlyprecipitation_x',
       'hourlystationpressure_x', 'hourlyvisibility_x',
       'hourlywindspeed_x', 'station_y', 'hourlydrybulbtemperature_y',
       'hourlyprecipitation_y', 'hourlystationpressure_y',
       'hourlyvisibility_y', 'hourlywindspeed_y', 'origin_lat',
       'origin_lon', 'destination_lat', 'destination_lon'], dtype=object)

In [81]:
X.shape

(67914, 20)

In [82]:
clf = ColumnTransformer([
    ('ohe', OneHotEncoder(sparse=False, handle_unknown = 'ignore'), obj_col),
    ('ss', StandardScaler(), num_col)
], remainder='passthrough')

In [83]:
X_train

Unnamed: 0,origin_airport,destination_airport,departure_delay,arrival_delay,station_x,hourlydrybulbtemperature_x,hourlyprecipitation_x,hourlystationpressure_x,hourlyvisibility_x,hourlywindspeed_x,station_y,hourlydrybulbtemperature_y,hourlyprecipitation_y,hourlystationpressure_y,hourlyvisibility_y,hourlywindspeed_y,origin_lat,origin_lon,destination_lat,destination_lon
22567,other,MSP,-3,-53,7.277502e+10,33.0,0.00,26.17,10.0,22.0,7.265801e+10,0.0,0.0,29.41,10.0,6.0,47.482,-111.371,44.880,-93.217
47845,MCO,ORD,0,0,7.220501e+10,75.0,0.00,29.93,10.0,8.0,7.253009e+10,17.0,0.0,29.62,10.0,13.0,28.429,-81.316,41.979,-87.904
14508,other,MSP,0,0,7.240951e+10,58.0,0.00,30.03,2.0,6.0,7.265801e+10,26.0,0.0,29.46,10.0,3.0,40.277,-74.813,44.880,-93.217
11466,PHL,STL,9,8,7.240801e+10,88.0,0.00,29.72,10.0,16.0,7.243401e+10,86.0,0.0,29.18,10.0,9.0,39.872,-75.241,38.748,-90.360
3893,SEA,LAX,13,24,7.279302e+10,48.0,0.00,29.35,10.0,8.0,7.229502e+10,69.0,0.0,29.47,10.0,5.0,47.449,-122.309,33.942,-118.408
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10681,DTW,MSY,-4,-12,7.253709e+10,74.0,0.00,29.44,10.0,3.0,7.223101e+10,86.0,0.0,29.95,10.0,6.0,42.212,-83.353,29.993,-90.258
14366,SAV,PHL,-2,-19,7.220700e+10,79.0,0.00,30.08,10.0,6.0,7.240801e+10,86.0,0.0,30.07,10.0,6.0,32.127,-81.202,39.872,-75.241
63571,other,DFW,0,0,7.245000e+10,61.0,0.00,28.34,10.0,26.0,7.225900e+10,69.0,0.0,29.28,10.0,14.0,37.650,-97.433,32.896,-97.037
53115,MDW,STL,0,0,7.253401e+10,64.0,0.06,29.04,8.0,15.0,7.243401e+10,91.0,0.0,28.97,10.0,21.0,41.786,-87.752,38.748,-90.360


In [84]:
X_train_cln = clf.fit_transform(X_train)
X_test_cln = clf.transform(X_test)

In [85]:
X_train_cln

array([[ 0.        ,  0.        ,  0.        , ..., -1.15923121,
         1.42863105, -0.07265365],
       [ 0.        ,  0.        ,  0.        , ...,  0.63279066,
         0.91735072,  0.24472368],
       [ 0.        ,  0.        ,  0.        , ...,  1.02053042,
         1.42863105, -0.07265365],
       ...,
       [ 0.        ,  0.        ,  0.        , ..., -0.32818144,
        -0.68346252, -0.30084513],
       [ 0.        ,  0.        ,  0.        , ...,  0.24904577,
         0.34791028,  0.09801208],
       [ 0.        ,  0.        ,  0.        , ...,  0.51133505,
         1.88139877, -1.81049311]])

In [86]:
# X_train => X_train_cln
# X_train => X_tes_cln

In [87]:
X_train_cln.shape, X_test_cln.shape

((61122, 151), (6792, 151))

In [88]:
# Create Logistic Regression Model
from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression(solver='lbfgs', random_state=1)
classifier

LogisticRegression(random_state=1)

In [89]:
# Fit(Train) the data
classifier.fit(X_train_cln, y_train_cln)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


LogisticRegression(random_state=1)

In [90]:
# Make predictions using the test data
y_pred = classifier.predict(X_test_cln)
results = pd.DataFrame({
    "Prediction": y_pred, 
    "Cancelled": y_test_cln
}).reset_index(drop=True)
results.head()

Unnamed: 0,Prediction,Cancelled
0,1,1
1,1,0
2,0,0
3,1,1
4,1,1


In [91]:
# Validate using test data
from sklearn.metrics import accuracy_score
accuracy_score(y_test_cln, y_pred)

0.7137809187279152

In [92]:
from sklearn.metrics import confusion_matrix

In [93]:
# Generate confusion_matrix
cm_arr = confusion_matrix(y_test_cln, y_pred)

In [94]:
import dataframe_image as dfi

In [95]:
mult_ix = pd.MultiIndex.from_tuples([
    ('actual','cancelled'),
    ('actual','not cancelled')
])

mult_cols = pd.MultiIndex.from_tuples([
    ('predicted','cancelled'),
    ('predicted','not cancelled')
])

cm_df = pd.DataFrame(
    data=cm_arr,
    index=mult_ix,
    columns=mult_cols
)


cm_df

Unnamed: 0_level_0,Unnamed: 1_level_0,predicted,predicted
Unnamed: 0_level_1,Unnamed: 1_level_1,cancelled,not cancelled
actual,cancelled,2437,959
actual,not cancelled,985,2411


In [96]:
dfi.export(cm_df,os.path.join('.','images','mlm_t01_logistic_confusion_matrix.png'))

In [97]:
# Generate classification report
from sklearn.metrics import classification_report

In [98]:
print(classification_report(y_test_cln, y_pred))

              precision    recall  f1-score   support

           0       0.71      0.72      0.71      3396
           1       0.72      0.71      0.71      3396

    accuracy                           0.71      6792
   macro avg       0.71      0.71      0.71      6792
weighted avg       0.71      0.71      0.71      6792

