# Retrieve Data from Database

In order to connect to the database, first, make sure:
1. …you have a local (running) database with the `hostname`, `database` name, `username`, and `port` number as specified in `/resources/config/gadb_pg_config.py`.
2. …the `airports` and `flights_and_weather` tables have data.

If the `airports` and `flights_and_weather` tables do not have data, you must first run `/download_and_prepare_datasets/02_prepare_and_store_data.ipynb` to populate the database.

In [1]:
import os
import time

In [2]:
os.chdir(os.path.join('..','resources'))

## Connect to Database

In [3]:
# Database configuration details
from config import gadb_pg_config as cfg

# To connect to SQL database
import sqlalchemy as db
# from sqlalchemy import create_engine, MetaData, Table, text, types

# To enter passwords without exposing them
from getpass import getpass

Most of the database information is already in `cfg` (above). However, you will have to enter your password below.

In [4]:
password = getpass('Enter database password')

Enter database password········


In [5]:
start_time = time.time()

In [6]:
db_string = f"postgresql+psycopg2://{cfg.username}:{password}@{cfg.hostname}:{cfg.port}/{cfg.database}"

In [7]:
engine = db.create_engine(
    future=True,
#     echo=True,
    url=db_string
)

## Access Database Tables and Create Table Aliases

In [8]:
db_meta = db.MetaData()

In [9]:
# Access the `airports` and `flights_and_weather` tables and assign them to variables
airports_table = db.Table('airports', db_meta, autoload_with=engine)
faw_table = db.Table('flights_and_weather', db_meta, autoload_with=engine)
faw = faw_table.alias('faw')
orig = airports_table.alias('orig')
dest = airports_table.alias('dest')

## Extract Full, Joined Dataset to Pandas Dataframe

In [10]:
from io import StringIO
import pandas as pd

In [11]:
# Function to extract from database query to Pandas dataframe
# Adapted from <https://towardsdatascience.com/optimizing-pandas-read-sql-for-postgres-f31cd7f707ab>
def read_sql_inmem(query, db_engine, **kwargs):
    copy_sql = "COPY ({query}) TO STDOUT WITH CSV {head}".format(
       query=query, head="HEADER"
    )
    conn = db_engine.raw_connection()
    cur = conn.cursor()
    store = StringIO()
    cur.copy_expert(copy_sql, store)
    store.seek(0)
    df = pd.read_csv(store, **kwargs)
    return df

In [12]:
# The query composed as an SQLAlchemy table object
get_all = (
    db.select([
        faw,
        orig.c.lat_decimal.label('origin_lat'),
        orig.c.lon_decimal.label('origin_lon'),
        dest.c.lat_decimal.label('destination_lat'),
        dest.c.lon_decimal.label('destination_lon')
    ])
    .select_from(
        faw
        .join(orig, orig.c.iata_code == faw.c.origin_airport, isouter=True)
        .join(dest, dest.c.iata_code == faw.c.destination_airport, isouter=True)
    )
    .order_by(faw.c.id)
)

In [13]:
def print_shape(df):
    df_rows, df_cols = df.shape
    print(f"{df_rows:,} rows × {df_cols:,} columns")

In [14]:
def df_details(df):
    return pd.concat(
        [
            df.dtypes,
            df.isna().sum(),
            pd.Series(
                data=[df[col].dropna().is_unique for col in df.columns],
                index=df.columns
            )
        ],
        axis=1,
        keys=['data_type','null_count','unique']
    )

In [15]:
# Run the query, and save the result to a dataframe
combined_df = read_sql_inmem(
    query=get_all,
#     query=get_all.limit(10).compile(engine, compile_kwargs={"literal_binds": True}),
    db_engine=engine,
    index_col='id',
    converters={'cancelled':(lambda x: True if x == 't' else False)}
)
# .sort_index(kind='mergesort')

print_shape(combined_df)

5,468,069 rows × 35 columns


In [16]:
with pd.option_context('display.max_columns',None):
    display(combined_df.head())

Unnamed: 0_level_0,carrier_code,flight_number,origin_airport,destination_airport,flight_date,scheduled_elapsed_time,tail_number,departure_delay,arrival_delay,delay_carrier,delay_weather,delay_national_aviation_system,delay_security,delay_late_aircarft_arrival,cancelled,scheduled_departure_dt,scheduled_arrival_dt,actual_departure_dt,actual_arrival_dt,station_x,hourlydrybulbtemperature_x,hourlyprecipitation_x,hourlystationpressure_x,hourlyvisibility_x,hourlywindspeed_x,station_y,hourlydrybulbtemperature_y,hourlyprecipitation_y,hourlystationpressure_y,hourlyvisibility_y,hourlywindspeed_y,origin_lat,origin_lon,destination_lat,destination_lon
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1
0,AS,121,SEA,ANC,2019-07-01,215,N611AS,0,-16,0,0,0,0,0,False,2019-07-01 00:40:00,2019-07-01 03:15:00,2019-07-01 00:40:00,2019-07-01 02:59:00,72793020000.0,65.0,0.0,29.6,10.0,6.0,70272530000.0,57.0,0.0,30.04,10.0,3.0,47.449,-122.309,61.174,-149.996
1,B6,1092,BQN,FLL,2019-07-01,155,N589JB,-19,-40,0,0,0,0,0,False,2019-07-01 01:26:00,2019-07-01 04:01:00,2019-07-01 01:07:00,2019-07-01 03:21:00,78514010000.0,79.0,,29.81,10.0,9.0,74783010000.0,78.0,0.0,30.01,10.0,0.0,18.495,-67.129,26.072,-80.153
2,B6,668,PSE,MCO,2019-07-01,175,N662JB,73,69,0,0,0,0,69,False,2019-07-01 01:15:00,2019-07-01 04:10:00,2019-07-01 02:28:00,2019-07-01 05:19:00,,,,,,,72205010000.0,76.0,0.0,29.93,10.0,3.0,18.008,-66.563,28.429,-81.316
3,F9,402,LAX,DEN,2019-07-01,143,N706FR,-1,-7,0,0,0,0,0,False,2019-07-01 00:59:00,2019-07-01 04:22:00,2019-07-01 00:58:00,2019-07-01 04:15:00,72295020000.0,65.0,0.0,29.6,10.0,6.0,72565000000.0,62.0,0.0,24.69,10.0,8.0,33.942,-118.408,39.858,-104.667
4,F9,790,PDX,DEN,2019-07-01,156,N350FR,21,10,0,0,0,0,0,False,2019-07-01 00:55:00,2019-07-01 04:31:00,2019-07-01 01:16:00,2019-07-01 04:41:00,72698020000.0,66.0,0.0,29.92,10.0,6.0,72565000000.0,62.0,0.0,24.7,8.0,10.0,45.589,-122.597,39.858,-104.667


In [17]:
df_details(combined_df)

Unnamed: 0,data_type,null_count,unique
carrier_code,object,0,False
flight_number,int64,0,False
origin_airport,object,0,False
destination_airport,object,0,False
flight_date,object,0,False
scheduled_elapsed_time,int64,0,False
tail_number,object,6973,False
departure_delay,int64,0,False
arrival_delay,int64,0,False
delay_carrier,int64,0,False


In [18]:
print(f'Elapsed time: {(time.time() - start_time):,.3f} seconds.')

Elapsed time: 382.805 seconds.


In [3]:
# from path import Path
import numpy as np
import pandas as pd

In [4]:
# Read in the data
ddir = r"/Users/neesha/Desktop/dataexport"
ddir
pt = os.path.join(ddir,'combined.csv')
pt
df = pd.read_csv(pt)
df.head()

Unnamed: 0,id,carrier_code,flight_number,origin_airport,destination_airport,flight_date,scheduled_elapsed_time,tail_number,departure_delay,arrival_delay,...,station_y,hourlydrybulbtemperature_y,hourlyprecipitation_y,hourlystationpressure_y,hourlyvisibility_y,hourlywindspeed_y,origin_lat,origin_lon,destination_lat,destination_lon
0,0,AS,121,SEA,ANC,2019-07-01,215,N611AS,0,-16,...,70272530000.0,57.0,0.0,30.04,10.0,3.0,47.449,-122.309,61.174,-149.996
1,1,B6,1092,BQN,FLL,2019-07-01,155,N589JB,-19,-40,...,74783010000.0,78.0,0.0,30.01,10.0,0.0,18.495,-67.129,26.072,-80.153
2,2,B6,668,PSE,MCO,2019-07-01,175,N662JB,73,69,...,72205010000.0,76.0,0.0,29.93,10.0,3.0,18.008,-66.563,28.429,-81.316
3,3,F9,402,LAX,DEN,2019-07-01,143,N706FR,-1,-7,...,72565000000.0,62.0,0.0,24.69,10.0,8.0,33.942,-118.408,39.858,-104.667
4,4,F9,790,PDX,DEN,2019-07-01,156,N350FR,21,10,...,72565000000.0,62.0,0.0,24.7,8.0,10.0,45.589,-122.597,39.858,-104.667


In [5]:
df.columns

Index(['id', 'carrier_code', 'flight_number', 'origin_airport',
       'destination_airport', 'flight_date', 'scheduled_elapsed_time',
       'tail_number', 'departure_delay', 'arrival_delay', 'delay_carrier',
       'delay_weather', 'delay_national_aviation_system', 'delay_security',
       'delay_late_aircarft_arrival', 'cancelled', 'scheduled_departure_dt',
       'scheduled_arrival_dt', 'actual_departure_dt', 'actual_arrival_dt',
       'station_x', 'hourlydrybulbtemperature_x', 'hourlyprecipitation_x',
       'hourlystationpressure_x', 'hourlyvisibility_x', 'hourlywindspeed_x',
       'station_y', 'hourlydrybulbtemperature_y', 'hourlyprecipitation_y',
       'hourlystationpressure_y', 'hourlyvisibility_y', 'hourlywindspeed_y',
       'origin_lat', 'origin_lon', 'destination_lat', 'destination_lon'],
      dtype='object')

In [6]:
df['cancelled'].value_counts()

f    5426150
t      41919
Name: cancelled, dtype: int64

In [7]:
df

Unnamed: 0,id,carrier_code,flight_number,origin_airport,destination_airport,flight_date,scheduled_elapsed_time,tail_number,departure_delay,arrival_delay,...,station_y,hourlydrybulbtemperature_y,hourlyprecipitation_y,hourlystationpressure_y,hourlyvisibility_y,hourlywindspeed_y,origin_lat,origin_lon,destination_lat,destination_lon
0,0,AS,121,SEA,ANC,2019-07-01,215,N611AS,0,-16,...,7.027253e+10,57.0,0.0,30.04,10.0,3.0,47.449,-122.309,61.174,-149.996
1,1,B6,1092,BQN,FLL,2019-07-01,155,N589JB,-19,-40,...,7.478301e+10,78.0,0.0,30.01,10.0,0.0,18.495,-67.129,26.072,-80.153
2,2,B6,668,PSE,MCO,2019-07-01,175,N662JB,73,69,...,7.220501e+10,76.0,0.0,29.93,10.0,3.0,18.008,-66.563,28.429,-81.316
3,3,F9,402,LAX,DEN,2019-07-01,143,N706FR,-1,-7,...,7.256500e+10,62.0,0.0,24.69,10.0,8.0,33.942,-118.408,39.858,-104.667
4,4,F9,790,PDX,DEN,2019-07-01,156,N350FR,21,10,...,7.256500e+10,62.0,0.0,24.70,8.0,10.0,45.589,-122.597,39.858,-104.667
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5468064,5512898,DL,3526,ATL,MGM,2019-05-31,59,N8923A,-1,-18,...,7.222601e+10,74.0,0.0,29.67,10.0,6.0,33.640,-84.427,,
5468065,5512899,DL,508,ATL,BHM,2019-05-31,55,N978DL,-3,-6,...,7.222801e+10,74.0,0.0,29.26,10.0,0.0,33.640,-84.427,33.563,-86.753
5468066,5512900,DL,1075,ATL,HSV,2019-05-31,57,N952AT,31,18,...,7.232300e+10,71.0,0.0,29.20,10.0,3.0,33.640,-84.427,,
5468067,5512901,DL,2436,ATL,BHM,2019-05-31,56,N916DL,28,13,...,7.222801e+10,67.0,0.0,29.24,10.0,0.0,33.640,-84.427,33.563,-86.753


In [8]:
df['scheduled_departure_dt']

0          2019-07-01 00:40:00
1          2019-07-01 01:26:00
2          2019-07-01 01:15:00
3          2019-07-01 00:59:00
4          2019-07-01 00:55:00
                  ...         
5468064    2019-05-31 21:06:00
5468065    2019-05-31 21:26:00
5468066    2019-05-31 22:44:00
5468067    2019-05-31 23:30:00
5468068    2019-05-31 23:38:00
Name: scheduled_departure_dt, Length: 5468069, dtype: object

In [9]:
df.columns

Index(['id', 'carrier_code', 'flight_number', 'origin_airport',
       'destination_airport', 'flight_date', 'scheduled_elapsed_time',
       'tail_number', 'departure_delay', 'arrival_delay', 'delay_carrier',
       'delay_weather', 'delay_national_aviation_system', 'delay_security',
       'delay_late_aircarft_arrival', 'cancelled', 'scheduled_departure_dt',
       'scheduled_arrival_dt', 'actual_departure_dt', 'actual_arrival_dt',
       'station_x', 'hourlydrybulbtemperature_x', 'hourlyprecipitation_x',
       'hourlystationpressure_x', 'hourlyvisibility_x', 'hourlywindspeed_x',
       'station_y', 'hourlydrybulbtemperature_y', 'hourlyprecipitation_y',
       'hourlystationpressure_y', 'hourlyvisibility_y', 'hourlywindspeed_y',
       'origin_lat', 'origin_lon', 'destination_lat', 'destination_lon'],
      dtype='object')

### 1.) origin_airport
### 2.) destination_airport
### 3.) departure_delay
### 4.) arrival_delay
### 5.) 'cancelled'
### 6.) station_x
### 7.) hourlydrybulbtemperature_x
### 8.) hourlyprecipitation_x
### 9.) hourlystationpressure_x
### 10.) 'hourlyvisibility_x', 
### 11.) 'hourlywindspeed_x',
### 12.) 'station_y', 
### 13.) 'hourlydrybulbtemperature_y', 
### 15.) 'hourlyprecipitation_y',
###   16.)  'hourlystationpressure_y',
### 17.) 'hourlyvisibility_y',
### 18.) 'hourlywindspeed_y',
### 19.) 'origin_lat', 
### 20 .) 'origin_lon', 
### 21.) 'destination_lat',
### 22.) 'destination_lon'

In [10]:
columns = ['origin_airport',
            'destination_airport',
            'departure_delay',
            'arrival_delay',
            'cancelled',
            'station_x',
            'hourlydrybulbtemperature_x',
            'hourlyprecipitation_x',
            'hourlystationpressure_x',
            'hourlyvisibility_x', 
            'hourlywindspeed_x',
            'station_y', 
            'hourlydrybulbtemperature_y', 
            'hourlyprecipitation_y',
            'hourlystationpressure_y',
            'hourlyvisibility_y',
            'hourlywindspeed_y',
            'origin_lat', 
            'origin_lon', 
            'destination_lat',
            'destination_lon']
columns

['origin_airport',
 'destination_airport',
 'departure_delay',
 'arrival_delay',
 'cancelled',
 'station_x',
 'hourlydrybulbtemperature_x',
 'hourlyprecipitation_x',
 'hourlystationpressure_x',
 'hourlyvisibility_x',
 'hourlywindspeed_x',
 'station_y',
 'hourlydrybulbtemperature_y',
 'hourlyprecipitation_y',
 'hourlystationpressure_y',
 'hourlyvisibility_y',
 'hourlywindspeed_y',
 'origin_lat',
 'origin_lon',
 'destination_lat',
 'destination_lon']

In [11]:
len(columns)

21

In [12]:
df = df[columns]

In [13]:
df

Unnamed: 0,origin_airport,destination_airport,departure_delay,arrival_delay,cancelled,station_x,hourlydrybulbtemperature_x,hourlyprecipitation_x,hourlystationpressure_x,hourlyvisibility_x,...,station_y,hourlydrybulbtemperature_y,hourlyprecipitation_y,hourlystationpressure_y,hourlyvisibility_y,hourlywindspeed_y,origin_lat,origin_lon,destination_lat,destination_lon
0,SEA,ANC,0,-16,f,7.279302e+10,65.0,0.0,29.60,10.0,...,7.027253e+10,57.0,0.0,30.04,10.0,3.0,47.449,-122.309,61.174,-149.996
1,BQN,FLL,-19,-40,f,7.851401e+10,79.0,,29.81,10.0,...,7.478301e+10,78.0,0.0,30.01,10.0,0.0,18.495,-67.129,26.072,-80.153
2,PSE,MCO,73,69,f,,,,,,...,7.220501e+10,76.0,0.0,29.93,10.0,3.0,18.008,-66.563,28.429,-81.316
3,LAX,DEN,-1,-7,f,7.229502e+10,65.0,0.0,29.60,10.0,...,7.256500e+10,62.0,0.0,24.69,10.0,8.0,33.942,-118.408,39.858,-104.667
4,PDX,DEN,21,10,f,7.269802e+10,66.0,0.0,29.92,10.0,...,7.256500e+10,62.0,0.0,24.70,8.0,10.0,45.589,-122.597,39.858,-104.667
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5468064,ATL,MGM,-1,-18,f,7.221901e+10,78.0,0.0,28.84,10.0,...,7.222601e+10,74.0,0.0,29.67,10.0,6.0,33.640,-84.427,,
5468065,ATL,BHM,-3,-6,f,7.221901e+10,77.0,0.0,28.84,10.0,...,7.222801e+10,74.0,0.0,29.26,10.0,0.0,33.640,-84.427,33.563,-86.753
5468066,ATL,HSV,31,18,f,7.221901e+10,74.0,0.0,28.84,10.0,...,7.232300e+10,71.0,0.0,29.20,10.0,3.0,33.640,-84.427,,
5468067,ATL,BHM,28,13,f,7.221901e+10,73.0,0.0,28.84,10.0,...,7.222801e+10,67.0,0.0,29.24,10.0,0.0,33.640,-84.427,33.563,-86.753


In [14]:
df['cancelled'].value_counts()

f    5426150
t      41919
Name: cancelled, dtype: int64

In [15]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5468069 entries, 0 to 5468068
Data columns (total 21 columns):
 #   Column                      Dtype  
---  ------                      -----  
 0   origin_airport              object 
 1   destination_airport         object 
 2   departure_delay             int64  
 3   arrival_delay               int64  
 4   cancelled                   object 
 5   station_x                   float64
 6   hourlydrybulbtemperature_x  float64
 7   hourlyprecipitation_x       float64
 8   hourlystationpressure_x     float64
 9   hourlyvisibility_x          float64
 10  hourlywindspeed_x           float64
 11  station_y                   float64
 12  hourlydrybulbtemperature_y  float64
 13  hourlyprecipitation_y       float64
 14  hourlystationpressure_y     float64
 15  hourlyvisibility_y          float64
 16  hourlywindspeed_y           float64
 17  origin_lat                  float64
 18  origin_lon                  float64
 19  destination_lat      

In [16]:
df.isnull().sum()

origin_airport                     0
destination_airport                0
departure_delay                    0
arrival_delay                      0
cancelled                          0
station_x                       2073
hourlydrybulbtemperature_x      2073
hourlyprecipitation_x           9881
hourlystationpressure_x         2073
hourlyvisibility_x              2073
hourlywindspeed_x               2073
station_y                       2078
hourlydrybulbtemperature_y      2078
hourlyprecipitation_y           9896
hourlystationpressure_y         2078
hourlyvisibility_y              2078
hourlywindspeed_y               2078
origin_lat                    382438
origin_lon                    382438
destination_lat               382775
destination_lon               382775
dtype: int64

In [17]:
382438 / df.shape[0]

0.06994022935701799

In [18]:
df_new = df.dropna().copy()

In [19]:
df_new.isnull().sum()

origin_airport                0
destination_airport           0
departure_delay               0
arrival_delay                 0
cancelled                     0
station_x                     0
hourlydrybulbtemperature_x    0
hourlyprecipitation_x         0
hourlystationpressure_x       0
hourlyvisibility_x            0
hourlywindspeed_x             0
station_y                     0
hourlydrybulbtemperature_y    0
hourlyprecipitation_y         0
hourlystationpressure_y       0
hourlyvisibility_y            0
hourlywindspeed_y             0
origin_lat                    0
origin_lon                    0
destination_lat               0
destination_lon               0
dtype: int64

In [20]:
df_new['cancelled'].value_counts()

f    4674943
t      33957
Name: cancelled, dtype: int64

# Do Undersampling due to imbalance in data

In [21]:
df_new_f = df_new[df_new['cancelled'] == 'f'].sample(33957)
df_new_t = df_new[df_new['cancelled'] == 't']

In [22]:
df_new_f.shape

(33957, 21)

In [23]:
33957 * 2

67914

In [24]:
df_new_t.shape

(33957, 21)

In [25]:
df_final = pd.concat([df_new_f, df_new_t], axis = 0)# .reset_index(drop = True)

In [26]:
df_final.shape

(67914, 21)

In [27]:
df_final

Unnamed: 0,origin_airport,destination_airport,departure_delay,arrival_delay,cancelled,station_x,hourlydrybulbtemperature_x,hourlyprecipitation_x,hourlystationpressure_x,hourlyvisibility_x,...,station_y,hourlydrybulbtemperature_y,hourlyprecipitation_y,hourlystationpressure_y,hourlyvisibility_y,hourlywindspeed_y,origin_lat,origin_lon,destination_lat,destination_lon
223140,ATL,MLB,-4,-11,f,7.221901e+10,80.0,0.00,28.91,10.0,...,7.220401e+10,91.0,0.0,30.00,10.0,11.0,33.640,-84.427,28.103,-80.645
1603451,IAH,LAX,-4,-11,f,7.224301e+10,87.0,0.00,29.81,10.0,...,7.229502e+10,75.0,0.0,29.57,10.0,3.0,29.980,-95.340,33.942,-118.408
362221,OAK,SNA,0,-3,f,7.249302e+10,67.0,0.00,29.83,10.0,...,7.229779e+10,73.0,0.0,29.90,10.0,10.0,37.721,-122.221,33.676,-117.868
1964037,HOU,DAL,-2,-13,f,7.224401e+10,54.0,0.00,29.92,10.0,...,7.225801e+10,60.0,0.0,29.38,10.0,13.0,29.645,-95.279,32.847,-96.852
2286403,CVG,FLL,1,-11,f,7.242109e+10,49.0,0.00,29.09,10.0,...,7.478301e+10,80.0,0.0,30.05,10.0,14.0,39.046,-84.662,26.072,-80.153
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5463047,CAE,CLT,0,0,t,7.231001e+10,86.0,0.00,29.56,10.0,...,7.231401e+10,71.0,0.0,29.05,10.0,0.0,33.939,-81.119,35.214,-80.943
5463530,RDU,EWR,0,0,t,7.230601e+10,70.0,0.04,29.30,10.0,...,7.250201e+10,75.0,0.0,29.77,10.0,9.0,35.877,-78.787,40.692,-74.169
5464231,CAE,DCA,0,0,t,7.231001e+10,86.0,0.00,29.56,10.0,...,7.240501e+10,77.0,0.0,29.76,10.0,6.0,33.939,-81.119,38.852,-77.037
5464827,TYS,CLT,0,0,t,7.232601e+10,72.0,0.00,28.88,10.0,...,7.231401e+10,70.0,0.0,29.08,10.0,3.0,35.812,-83.993,35.214,-80.943


In [28]:
df_final = df_final.sample(frac=1).reset_index(drop = True)

In [29]:
df_final

Unnamed: 0,origin_airport,destination_airport,departure_delay,arrival_delay,cancelled,station_x,hourlydrybulbtemperature_x,hourlyprecipitation_x,hourlystationpressure_x,hourlyvisibility_x,...,station_y,hourlydrybulbtemperature_y,hourlyprecipitation_y,hourlystationpressure_y,hourlyvisibility_y,hourlywindspeed_y,origin_lat,origin_lon,destination_lat,destination_lon
0,MYR,DTW,0,0,t,7.479101e+10,74.0,0.19,29.19,2.0,...,7.253709e+10,66.0,0.0,29.37,10.0,3.0,33.680,-78.928,42.212,-83.353
1,MCO,BUF,0,0,t,7.220501e+10,84.0,0.00,29.66,10.0,...,7.252801e+10,70.0,0.0,29.25,10.0,13.0,28.429,-81.316,42.940,-78.732
2,EWR,RIC,0,0,t,7.250201e+10,75.0,0.09,29.72,10.0,...,7.240101e+10,80.0,0.0,29.62,10.0,18.0,40.692,-74.169,37.505,-77.319
3,JFK,IND,14,24,f,7.448609e+10,61.0,0.00,29.75,10.0,...,7.243809e+10,68.0,0.0,29.13,10.0,11.0,40.640,-73.779,39.717,-86.294
4,MSP,MSY,0,0,t,7.265801e+10,77.0,0.00,29.24,10.0,...,7.223101e+10,93.0,0.0,29.94,10.0,9.0,44.880,-93.217,29.993,-90.258
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
67909,IAH,JAN,0,0,t,7.224301e+10,83.0,0.00,29.64,10.0,...,7.223500e+10,83.0,0.0,29.52,10.0,9.0,29.980,-95.340,32.311,-90.076
67910,BNA,LAX,-5,-5,f,7.232701e+10,67.0,0.00,29.45,10.0,...,7.229502e+10,64.0,0.0,29.60,7.0,7.0,36.124,-86.678,33.942,-118.408
67911,PHL,DTW,0,5,f,7.240801e+10,83.0,0.00,29.81,10.0,...,7.253709e+10,74.0,0.0,29.25,10.0,0.0,39.872,-75.241,42.212,-83.353
67912,DTW,IAH,0,0,t,7.253709e+10,50.0,0.00,29.56,10.0,...,7.224301e+10,83.0,0.0,29.64,10.0,10.0,42.212,-83.353,29.980,-95.340


In [30]:
df_final['cancelled'].value_counts()

t    33957
f    33957
Name: cancelled, dtype: int64

In [31]:
df_final.sample()

Unnamed: 0,origin_airport,destination_airport,departure_delay,arrival_delay,cancelled,station_x,hourlydrybulbtemperature_x,hourlyprecipitation_x,hourlystationpressure_x,hourlyvisibility_x,...,station_y,hourlydrybulbtemperature_y,hourlyprecipitation_y,hourlystationpressure_y,hourlyvisibility_y,hourlywindspeed_y,origin_lat,origin_lon,destination_lat,destination_lon
1603,ATL,LNK,17,2,f,72219010000.0,43.0,0.12,28.94,2.0,...,72551010000.0,38.0,0.0,28.74,10.0,7.0,33.64,-84.427,40.851,-96.759


In [32]:
df_final.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 67914 entries, 0 to 67913
Data columns (total 21 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   origin_airport              67914 non-null  object 
 1   destination_airport         67914 non-null  object 
 2   departure_delay             67914 non-null  int64  
 3   arrival_delay               67914 non-null  int64  
 4   cancelled                   67914 non-null  object 
 5   station_x                   67914 non-null  float64
 6   hourlydrybulbtemperature_x  67914 non-null  float64
 7   hourlyprecipitation_x       67914 non-null  float64
 8   hourlystationpressure_x     67914 non-null  float64
 9   hourlyvisibility_x          67914 non-null  float64
 10  hourlywindspeed_x           67914 non-null  float64
 11  station_y                   67914 non-null  float64
 12  hourlydrybulbtemperature_y  67914 non-null  float64
 13  hourlyprecipitation_y       679

In [33]:
df_final['origin_airport'].value_counts()

DFW    4487
ORD    3757
DEN    2614
ATL    2250
EWR    2237
       ... 
CDV       2
PPG       1
SPN       1
AKN       1
DLG       1
Name: origin_airport, Length: 205, dtype: int64

In [35]:
4487*0.05

224.35000000000002

In [36]:
main_airport_list = df_final['origin_airport'].value_counts()[df_final['origin_airport'].value_counts() > 4603*0.04].index
main_airport_list

Index(['DFW', 'ORD', 'DEN', 'ATL', 'EWR', 'CLT', 'IAH', 'LGA', 'MCO', 'PHL',
       'LAX', 'DCA', 'BOS', 'SFO', 'DTW', 'PHX', 'MDW', 'LAS', 'BWI', 'SEA',
       'MSP', 'DAL', 'JFK', 'FLL', 'MIA', 'BNA', 'HOU', 'STL', 'MSY', 'RDU',
       'SAN', 'IAD', 'SLC', 'AUS', 'MCI', 'TPA', 'PDX', 'PIT', 'CLE', 'SJC',
       'IND', 'CMH', 'CVG', 'JAX', 'CHS', 'SAT', 'SMF', 'ORF', 'SNA', 'OAK',
       'MKE', 'PBI', 'HNL', 'OKC', 'BUF', 'RIC', 'MEM', 'BDL', 'SAV', 'OMA',
       'ABQ', 'BUR', 'SJU', 'GRR', 'BHM', 'SYR', 'ALB', 'PVD', 'RSW', 'MYR',
       'TUL', 'ROC', 'TYS'],
      dtype='object')

In [37]:
len(main_airport_list)

73

In [38]:
df_final['origin_airport'] = df_final['origin_airport'].apply(lambda x : x if x in main_airport_list else 'other')

In [39]:
df_final['origin_airport'].value_counts()

other    7646
DFW      4487
ORD      3757
DEN      2614
ATL      2250
         ... 
RSW       209
MYR       208
TUL       204
ROC       198
TYS       185
Name: origin_airport, Length: 74, dtype: int64

In [40]:
df_final['destination_airport'].value_counts()

DFW    4606
ORD    3785
DEN    2672
ATL    2371
EWR    2277
       ... 
ADK       3
HIB       2
SCC       2
BFM       2
PPG       1
Name: destination_airport, Length: 203, dtype: int64

In [41]:
4606 * 0.05

230.3

In [42]:
main_destination_airport_list = df_final['destination_airport'].value_counts()[df_final['destination_airport'].value_counts() > 4610 * 0.04].index
main_destination_airport_list

Index(['DFW', 'ORD', 'DEN', 'ATL', 'EWR', 'CLT', 'IAH', 'LGA', 'MCO', 'PHL',
       'LAX', 'DCA', 'BOS', 'SFO', 'PHX', 'LAS', 'DTW', 'MDW', 'SEA', 'BWI',
       'MSP', 'FLL', 'JFK', 'DAL', 'MIA', 'IAD', 'BNA', 'STL', 'MSY', 'HOU',
       'SAN', 'AUS', 'RDU', 'SLC', 'MCI', 'TPA', 'PDX', 'CLE', 'PIT', 'CMH',
       'CHS', 'CVG', 'IND', 'JAX', 'SAT', 'SJC', 'SMF', 'MKE', 'OAK', 'ORF',
       'SNA', 'HNL', 'RIC', 'MEM', 'BUF', 'OKC', 'PBI', 'OMA', 'BDL', 'SAV',
       'ABQ', 'ALB', 'BHM', 'BUR', 'GRR', 'SYR', 'SJU', 'ROC', 'RSW', 'ANC',
       'TUL', 'PVD'],
      dtype='object')

In [43]:
len(main_destination_airport_list)

72

In [44]:
df_final['destination_airport'] = df_final['destination_airport'].apply(lambda x : x if x in main_destination_airport_list else 'other')

In [45]:
df_final['destination_airport'].value_counts()

other    7485
DFW      4606
ORD      3785
DEN      2672
ATL      2371
         ... 
RSW       208
ROC       208
ANC       207
TUL       204
PVD       202
Name: destination_airport, Length: 73, dtype: int64

In [46]:
# Separate the features (X) from the target (y)
y = df_final["cancelled"]
X = df_final.drop(columns="cancelled")

In [50]:
X

Unnamed: 0,origin_airport,destination_airport,departure_delay,arrival_delay,station_x,hourlydrybulbtemperature_x,hourlyprecipitation_x,hourlystationpressure_x,hourlyvisibility_x,hourlywindspeed_x,station_y,hourlydrybulbtemperature_y,hourlyprecipitation_y,hourlystationpressure_y,hourlyvisibility_y,hourlywindspeed_y,origin_lat,origin_lon,destination_lat,destination_lon
0,DFW,MSY,-7,-17,7.225900e+10,60.0,0.00,29.25,10.0,7.0,7.223101e+10,64.0,0.0,30.00,10.0,3.0,32.896,-97.037,29.993,-90.258
1,MCI,BNA,-2,-14,7.244600e+10,72.0,0.00,29.00,10.0,9.0,7.232701e+10,89.0,0.0,29.49,10.0,8.0,39.297,-94.714,36.124,-86.678
2,BOS,MKE,0,0,7.250901e+10,73.0,0.00,29.79,10.0,14.0,7.264001e+10,67.0,0.0,29.34,10.0,0.0,42.364,-71.005,42.947,-87.896
3,LAS,SNA,-3,-8,7.238602e+10,71.0,0.00,27.78,10.0,7.0,7.229779e+10,70.0,0.0,29.99,8.0,5.0,36.080,-115.152,33.676,-117.868
4,CLT,other,-3,-3,7.231401e+10,43.0,0.00,29.46,10.0,13.0,7.221409e+10,61.0,0.0,30.08,10.0,5.0,35.214,-80.943,30.396,-84.350
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
67909,SAN,SEA,1,1,7.229002e+10,63.0,0.00,29.93,10.0,5.0,7.279302e+10,57.0,0.0,29.53,10.0,5.0,32.733,-117.189,47.449,-122.309
67910,other,DEN,-2,-5,7.246609e+10,44.0,0.00,23.94,10.0,5.0,7.256500e+10,46.0,0.0,24.55,10.0,0.0,38.806,-104.700,39.858,-104.667
67911,other,ORD,2,17,7.255701e+10,44.0,0.00,28.82,10.0,9.0,7.253009e+10,51.0,0.0,29.41,10.0,14.0,42.402,-96.384,41.979,-87.904
67912,ORD,BDL,-1,0,7.253009e+10,73.0,0.42,29.19,8.0,13.0,7.250801e+10,75.0,0.0,29.75,10.0,10.0,41.979,-87.904,41.939,-72.683


In [47]:
# Use the train_test_split function to create training and testing subsets
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,
   y,  random_state=1, stratify=y, test_size=0.1)
X_train.shape

(61122, 20)

In [48]:
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((61122, 20), (6792, 20), (61122,), (6792,))

In [49]:
X_train

Unnamed: 0,origin_airport,destination_airport,departure_delay,arrival_delay,station_x,hourlydrybulbtemperature_x,hourlyprecipitation_x,hourlystationpressure_x,hourlyvisibility_x,hourlywindspeed_x,station_y,hourlydrybulbtemperature_y,hourlyprecipitation_y,hourlystationpressure_y,hourlyvisibility_y,hourlywindspeed_y,origin_lat,origin_lon,destination_lat,destination_lon
22488,SAN,SAT,2,-6,7.229002e+10,75.0,0.0,29.93,10.0,11.0,7.225301e+10,100.0,0.00,28.99,10.0,9.0,32.733,-117.189,29.534,-98.470
47918,DFW,other,0,0,7.225900e+10,73.0,0.0,29.07,10.0,10.0,7.268102e+10,73.0,0.00,26.91,10.0,8.0,32.896,-97.037,43.564,-116.223
14565,ORD,MKE,0,0,7.253009e+10,71.0,0.0,29.24,10.0,11.0,7.264001e+10,67.0,0.00,29.27,10.0,7.0,41.979,-87.904,42.947,-87.896
11452,PIT,other,-6,-7,7.252009e+10,65.0,0.0,28.79,10.0,7.0,7.479101e+10,70.0,0.09,29.93,10.0,11.0,40.491,-80.233,33.680,-78.928
3805,MDW,MCO,1,5,7.253401e+10,64.0,0.0,29.17,10.0,7.0,7.220501e+10,82.0,0.00,29.85,10.0,8.0,41.786,-87.752,28.429,-81.316
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10671,SAN,SLC,-4,-15,7.229002e+10,68.0,0.0,29.97,10.0,5.0,7.257202e+10,83.0,0.00,25.73,10.0,10.0,32.733,-117.189,40.788,-111.978
14299,SJC,other,14,17,7.249452e+10,68.0,0.0,29.81,10.0,0.0,7.229702e+10,65.0,0.00,29.89,10.0,6.0,37.362,-121.929,33.818,-118.151
63645,LAX,ORD,0,0,7.229502e+10,73.0,0.0,29.60,10.0,10.0,7.253009e+10,79.0,0.00,29.12,10.0,10.0,33.942,-118.408,41.979,-87.904
53468,DCA,PBI,0,0,7.240501e+10,86.0,0.0,29.96,10.0,5.0,7.220301e+10,82.0,0.00,29.58,10.0,23.0,38.852,-77.037,26.683,-80.096


In [50]:
X_train['station_x'].max(), X_train['station_x'].min()

(91765061705.0, 70026027502.0)

In [51]:
y_test

12044    t
28012    f
18861    f
46511    t
10771    t
        ..
38680    f
63939    f
22940    f
15942    f
58157    f
Name: cancelled, Length: 6792, dtype: object

In [52]:
from sklearn.compose import ColumnTransformer

In [53]:
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, StandardScaler

In [54]:
Le = LabelEncoder()

In [55]:
y_train[:5]

22488    f
47918    t
14565    t
11452    f
3805     f
Name: cancelled, dtype: object

In [56]:
y_test[:5]

12044    t
28012    f
18861    f
46511    t
10771    t
Name: cancelled, dtype: object

In [57]:
y_train_cln = Le.fit_transform(y_train)

In [58]:
y_train_cln

array([0, 1, 1, ..., 1, 1, 0])

In [59]:
Le.classes_

array(['f', 't'], dtype=object)

In [60]:
y_test_cln = Le.transform(y_test)

In [61]:
y_test_cln

array([1, 0, 0, ..., 0, 0, 0])

In [62]:
X_train.dtypes

origin_airport                 object
destination_airport            object
departure_delay                 int64
arrival_delay                   int64
station_x                     float64
hourlydrybulbtemperature_x    float64
hourlyprecipitation_x         float64
hourlystationpressure_x       float64
hourlyvisibility_x            float64
hourlywindspeed_x             float64
station_y                     float64
hourlydrybulbtemperature_y    float64
hourlyprecipitation_y         float64
hourlystationpressure_y       float64
hourlyvisibility_y            float64
hourlywindspeed_y             float64
origin_lat                    float64
origin_lon                    float64
destination_lat               float64
destination_lon               float64
dtype: object

In [63]:
obj_col = X_train.dtypes[X.dtypes == 'object'].index.values
num_col = X_train.dtypes[X.dtypes != 'object'].index.values

In [64]:
len(obj_col), len(num_col)

(2, 18)

In [65]:
obj_col

array(['origin_airport', 'destination_airport'], dtype=object)

In [66]:
num_col

array(['departure_delay', 'arrival_delay', 'station_x',
       'hourlydrybulbtemperature_x', 'hourlyprecipitation_x',
       'hourlystationpressure_x', 'hourlyvisibility_x',
       'hourlywindspeed_x', 'station_y', 'hourlydrybulbtemperature_y',
       'hourlyprecipitation_y', 'hourlystationpressure_y',
       'hourlyvisibility_y', 'hourlywindspeed_y', 'origin_lat',
       'origin_lon', 'destination_lat', 'destination_lon'], dtype=object)

In [67]:
X.shape

(67914, 20)

In [69]:
clf = ColumnTransformer([
    ('ohe', OneHotEncoder(sparse=False, handle_unknown = 'ignore'), obj_col),
    ('ss', StandardScaler(), num_col)
], remainder='passthrough')

In [73]:
X_train

Unnamed: 0,origin_airport,destination_airport,departure_delay,arrival_delay,station_x,hourlydrybulbtemperature_x,hourlyprecipitation_x,hourlystationpressure_x,hourlyvisibility_x,hourlywindspeed_x,station_y,hourlydrybulbtemperature_y,hourlyprecipitation_y,hourlystationpressure_y,hourlyvisibility_y,hourlywindspeed_y,origin_lat,origin_lon,destination_lat,destination_lon
22488,SAN,SAT,2,-6,7.229002e+10,75.0,0.0,29.93,10.0,11.0,7.225301e+10,100.0,0.00,28.99,10.0,9.0,32.733,-117.189,29.534,-98.470
47918,DFW,other,0,0,7.225900e+10,73.0,0.0,29.07,10.0,10.0,7.268102e+10,73.0,0.00,26.91,10.0,8.0,32.896,-97.037,43.564,-116.223
14565,ORD,MKE,0,0,7.253009e+10,71.0,0.0,29.24,10.0,11.0,7.264001e+10,67.0,0.00,29.27,10.0,7.0,41.979,-87.904,42.947,-87.896
11452,PIT,other,-6,-7,7.252009e+10,65.0,0.0,28.79,10.0,7.0,7.479101e+10,70.0,0.09,29.93,10.0,11.0,40.491,-80.233,33.680,-78.928
3805,MDW,MCO,1,5,7.253401e+10,64.0,0.0,29.17,10.0,7.0,7.220501e+10,82.0,0.00,29.85,10.0,8.0,41.786,-87.752,28.429,-81.316
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10671,SAN,SLC,-4,-15,7.229002e+10,68.0,0.0,29.97,10.0,5.0,7.257202e+10,83.0,0.00,25.73,10.0,10.0,32.733,-117.189,40.788,-111.978
14299,SJC,other,14,17,7.249452e+10,68.0,0.0,29.81,10.0,0.0,7.229702e+10,65.0,0.00,29.89,10.0,6.0,37.362,-121.929,33.818,-118.151
63645,LAX,ORD,0,0,7.229502e+10,73.0,0.0,29.60,10.0,10.0,7.253009e+10,79.0,0.00,29.12,10.0,10.0,33.942,-118.408,41.979,-87.904
53468,DCA,PBI,0,0,7.240501e+10,86.0,0.0,29.96,10.0,5.0,7.220301e+10,82.0,0.00,29.58,10.0,23.0,38.852,-77.037,26.683,-80.096


In [74]:
X_train_cln = clf.fit_transform(X_train)
X_test_cln = clf.transform(X_test)

In [75]:
X_train_cln

array([[ 0.        ,  0.        ,  0.        , ..., -1.47694024,
        -1.26551277, -0.37827086],
       [ 0.        ,  0.        ,  0.        , ..., -0.28838617,
         1.2004544 , -1.43207941],
       [ 0.        ,  0.        ,  0.        , ...,  0.25027323,
         1.09200809,  0.24939609],
       ...,
       [ 0.        ,  0.        ,  0.        , ..., -1.5488362 ,
         0.92186865,  0.24892122],
       [ 0.        ,  0.        ,  0.        , ...,  0.89120302,
        -1.76661557,  0.7123999 ],
       [ 0.        ,  0.        ,  0.        , ..., -1.51698729,
         0.34466009, -1.750721  ]])

In [76]:
# X_train => X_train_cln
# X_train => X_tes_cln

In [77]:
X_train_cln.shape, X_test_cln.shape

((61122, 165), (6792, 165))

In [78]:
# Create Logistic Regression Model
from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression(solver='lbfgs', random_state=1)
classifier

LogisticRegression(random_state=1)

In [79]:
# Fit(Train) the data
classifier.fit(X_train_cln, y_train_cln)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


LogisticRegression(random_state=1)

In [80]:
# Make predictions using the test data
y_pred = classifier.predict(X_test_cln)
results = pd.DataFrame({
    "Prediction": y_pred, 
    "Cancelled": y_test_cln
}).reset_index(drop=True)
results.head()

Unnamed: 0,Prediction,Cancelled
0,0,1
1,1,0
2,0,0
3,1,1
4,1,1


In [81]:
# Validate using test data
from sklearn.metrics import accuracy_score
accuracy_score(y_test_cln, y_pred)

0.71849234393404

In [82]:
# Generate confusion_matrix
from sklearn.metrics import confusion_matrix
confusion_matrix(y_test_cln, y_pred)

array([[2416,  980],
       [ 932, 2464]])

In [83]:
# Generate classification report
from sklearn.metrics import classification_report
print(classification_report(y_test_cln, y_pred))

              precision    recall  f1-score   support

           0       0.72      0.71      0.72      3396
           1       0.72      0.73      0.72      3396

    accuracy                           0.72      6792
   macro avg       0.72      0.72      0.72      6792
weighted avg       0.72      0.72      0.72      6792

