# Retrieve Data from Database

In order to connect to the database, first, make sure:
1. …you have a local (running) database with the `hostname`, `database` name, `username`, and `port` number as specified in `/resources/config/gadb_pg_config.py`.
2. …the `airports` and `flights_and_weather` tables have data.

If the `airports` and `flights_and_weather` tables do not have data, you must first run `/download_and_prepare_datasets/02_prepare_and_store_data.ipynb` to populate the database.

In [10]:
import os
import time

In [11]:
os.chdir(os.path.join('..','resources'))

## Connect to Database

In [14]:
# Database configuration details
from config import gadb_pg_config as cfg

# To connect to SQL database
import sqlalchemy as db
# from sqlalchemy import create_engine, MetaData, Table, text, types

# To enter passwords without exposing them
from getpass import getpass

Most of the database information is already in `cfg` (above). However, you will have to enter your password below.

In [15]:
password = getpass('Enter database password')

Enter database password········


In [16]:
start_time = time.time()

In [17]:
db_string = f"postgresql+psycopg2://{cfg.username}:{password}@{cfg.hostname}:{cfg.port}/{cfg.database}"

In [18]:
engine = db.create_engine(
    future=True,
#     echo=True,
    url=db_string
)

## Access Database Tables and Create Table Aliases

In [19]:
db_meta = db.MetaData()

In [20]:
# Access the `airports` and `flights_and_weather` tables and assign them to variables
airports_table = db.Table('airports', db_meta, autoload_with=engine)
faw_table = db.Table('flights_and_weather', db_meta, autoload_with=engine)
faw = faw_table.alias('faw')
orig = airports_table.alias('orig')
dest = airports_table.alias('dest')

## Extract Full, Joined Dataset to Pandas Dataframe

In [21]:
from io import StringIO
import pandas as pd

In [22]:
# Function to extract from database query to Pandas dataframe
# Adapted from <https://towardsdatascience.com/optimizing-pandas-read-sql-for-postgres-f31cd7f707ab>
def read_sql_inmem(query, db_engine, **kwargs):
    copy_sql = "COPY ({query}) TO STDOUT WITH CSV {head}".format(
       query=query, head="HEADER"
    )
    conn = db_engine.raw_connection()
    cur = conn.cursor()
    store = StringIO()
    cur.copy_expert(copy_sql, store)
    store.seek(0)
    df = pd.read_csv(store, **kwargs)
    return df

In [23]:
# The query composed as an SQLAlchemy table object
get_all = (
    db.select([
        faw,
        orig.c.lat_decimal.label('origin_lat'),
        orig.c.lon_decimal.label('origin_lon'),
        dest.c.lat_decimal.label('destination_lat'),
        dest.c.lon_decimal.label('destination_lon')
    ])
    .select_from(
        faw
        .join(orig, orig.c.iata_code == faw.c.origin_airport, isouter=True)
        .join(dest, dest.c.iata_code == faw.c.destination_airport, isouter=True)
    )
    .order_by(faw.c.id)
)

In [24]:
def print_shape(df):
    df_rows, df_cols = df.shape
    print(f"{df_rows:,} rows × {df_cols:,} columns")

In [25]:
def df_details(df):
    return pd.concat(
        [
            df.dtypes,
            df.isna().sum(),
            pd.Series(
                data=[df[col].dropna().is_unique for col in df.columns],
                index=df.columns
            )
        ],
        axis=1,
        keys=['data_type','null_count','unique']
    )

In [26]:
# Run the query, and save the result to a dataframe
combined_df = read_sql_inmem(
    query=get_all,
#     query=get_all.limit(10).compile(engine, compile_kwargs={"literal_binds": True}),
    db_engine=engine,
    index_col='id',
    converters={'cancelled':(lambda x: True if x == 't' else False)}
)
# .sort_index(kind='mergesort')

print_shape(combined_df)

5,468,069 rows × 35 columns


In [39]:
# "Full-width display" function to display all columns of a dataframe
def fw_disp(df):
    with pd.option_context('display.max_columns',None):
        display(df)

In [40]:
fw_disp(combined_df.head())

Unnamed: 0_level_0,carrier_code,flight_number,origin_airport,destination_airport,flight_date,scheduled_elapsed_time,tail_number,departure_delay,arrival_delay,delay_carrier,delay_weather,delay_national_aviation_system,delay_security,delay_late_aircarft_arrival,cancelled,scheduled_departure_dt,scheduled_arrival_dt,actual_departure_dt,actual_arrival_dt,station_x,hourlydrybulbtemperature_x,hourlyprecipitation_x,hourlystationpressure_x,hourlyvisibility_x,hourlywindspeed_x,station_y,hourlydrybulbtemperature_y,hourlyprecipitation_y,hourlystationpressure_y,hourlyvisibility_y,hourlywindspeed_y,origin_lat,origin_lon,destination_lat,destination_lon
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1
0,AS,121,SEA,ANC,2019-07-01,215,N611AS,0,-16,0,0,0,0,0,False,2019-07-01 00:40:00,2019-07-01 03:15:00,2019-07-01 00:40:00,2019-07-01 02:59:00,72793020000.0,65.0,0.0,29.6,10.0,6.0,70272530000.0,57.0,0.0,30.04,10.0,3.0,47.449,-122.309,61.174,-149.996
1,B6,1092,BQN,FLL,2019-07-01,155,N589JB,-19,-40,0,0,0,0,0,False,2019-07-01 01:26:00,2019-07-01 04:01:00,2019-07-01 01:07:00,2019-07-01 03:21:00,78514010000.0,79.0,,29.81,10.0,9.0,74783010000.0,78.0,0.0,30.01,10.0,0.0,18.495,-67.129,26.072,-80.153
2,B6,668,PSE,MCO,2019-07-01,175,N662JB,73,69,0,0,0,0,69,False,2019-07-01 01:15:00,2019-07-01 04:10:00,2019-07-01 02:28:00,2019-07-01 05:19:00,,,,,,,72205010000.0,76.0,0.0,29.93,10.0,3.0,18.008,-66.563,28.429,-81.316
3,F9,402,LAX,DEN,2019-07-01,143,N706FR,-1,-7,0,0,0,0,0,False,2019-07-01 00:59:00,2019-07-01 04:22:00,2019-07-01 00:58:00,2019-07-01 04:15:00,72295020000.0,65.0,0.0,29.6,10.0,6.0,72565000000.0,62.0,0.0,24.69,10.0,8.0,33.942,-118.408,39.858,-104.667
4,F9,790,PDX,DEN,2019-07-01,156,N350FR,21,10,0,0,0,0,0,False,2019-07-01 00:55:00,2019-07-01 04:31:00,2019-07-01 01:16:00,2019-07-01 04:41:00,72698020000.0,66.0,0.0,29.92,10.0,6.0,72565000000.0,62.0,0.0,24.7,8.0,10.0,45.589,-122.597,39.858,-104.667


In [41]:
df_details(combined_df)

Unnamed: 0,data_type,null_count,unique
carrier_code,object,0,False
flight_number,int64,0,False
origin_airport,object,0,False
destination_airport,object,0,False
flight_date,object,0,False
scheduled_elapsed_time,int64,0,False
tail_number,object,6973,False
departure_delay,int64,0,False
arrival_delay,int64,0,False
delay_carrier,int64,0,False


In [42]:
print(f'Elapsed time: {(time.time() - start_time):,.3f} seconds.')

Elapsed time: 1,918.245 seconds.


# Prepare Data for Machine Learning

In [43]:
combined_df[['carrier_code','tail_number']].nunique(dropna=True)

carrier_code      10
tail_number     6111
dtype: int64

In [44]:
# Read in the data
ddir = r"/Users/neesha/Desktop/dataexport"
ddir
pt = os.path.join(ddir,'combined.csv')
pt
df = pd.read_csv(pt)
df.head()

Unnamed: 0,id,carrier_code,flight_number,origin_airport,destination_airport,flight_date,scheduled_elapsed_time,tail_number,departure_delay,arrival_delay,...,station_y,hourlydrybulbtemperature_y,hourlyprecipitation_y,hourlystationpressure_y,hourlyvisibility_y,hourlywindspeed_y,origin_lat,origin_lon,destination_lat,destination_lon
0,0,AS,121,SEA,ANC,2019-07-01,215,N611AS,0,-16,...,70272530000.0,57.0,0.0,30.04,10.0,3.0,47.449,-122.309,61.174,-149.996
1,1,B6,1092,BQN,FLL,2019-07-01,155,N589JB,-19,-40,...,74783010000.0,78.0,0.0,30.01,10.0,0.0,18.495,-67.129,26.072,-80.153
2,2,B6,668,PSE,MCO,2019-07-01,175,N662JB,73,69,...,72205010000.0,76.0,0.0,29.93,10.0,3.0,18.008,-66.563,28.429,-81.316
3,3,F9,402,LAX,DEN,2019-07-01,143,N706FR,-1,-7,...,72565000000.0,62.0,0.0,24.69,10.0,8.0,33.942,-118.408,39.858,-104.667
4,4,F9,790,PDX,DEN,2019-07-01,156,N350FR,21,10,...,72565000000.0,62.0,0.0,24.7,8.0,10.0,45.589,-122.597,39.858,-104.667


In [45]:
df.columns

Index(['id', 'carrier_code', 'flight_number', 'origin_airport',
       'destination_airport', 'flight_date', 'scheduled_elapsed_time',
       'tail_number', 'departure_delay', 'arrival_delay', 'delay_carrier',
       'delay_weather', 'delay_national_aviation_system', 'delay_security',
       'delay_late_aircarft_arrival', 'cancelled', 'scheduled_departure_dt',
       'scheduled_arrival_dt', 'actual_departure_dt', 'actual_arrival_dt',
       'station_x', 'hourlydrybulbtemperature_x', 'hourlyprecipitation_x',
       'hourlystationpressure_x', 'hourlyvisibility_x', 'hourlywindspeed_x',
       'station_y', 'hourlydrybulbtemperature_y', 'hourlyprecipitation_y',
       'hourlystationpressure_y', 'hourlyvisibility_y', 'hourlywindspeed_y',
       'origin_lat', 'origin_lon', 'destination_lat', 'destination_lon'],
      dtype='object')

In [46]:
df['cancelled'].value_counts()

f    5426150
t      41919
Name: cancelled, dtype: int64

In [47]:
# Select columns to drop
drop_cols = [
    'flight_number',
    'origin_airport',
    'destination_airport',
    'tail_number',
    'departure_delay',
    'arrival_delay',
    'delay_carrier',
    'delay_weather',
    'delay_national_aviation_system',
    'delay_security',
    'delay_late_aircarft_arrival',
    'actual_departure_dt',
    'actual_arrival_dt'
]

### 1.) origin_airport
### 2.) destination_airport
### 3.) departure_delay
### 4.) arrival_delay
### 5.) 'cancelled'
### 6.) station_x
### 7.) hourlydrybulbtemperature_x
### 8.) hourlyprecipitation_x
### 9.) hourlystationpressure_x
### 10.) 'hourlyvisibility_x', 
### 11.) 'hourlywindspeed_x',
### 12.) 'station_y', 
### 13.) 'hourlydrybulbtemperature_y', 
### 15.) 'hourlyprecipitation_y',
###   16.)  'hourlystationpressure_y',
### 17.) 'hourlyvisibility_y',
### 18.) 'hourlywindspeed_y',
### 19.) 'origin_lat', 
### 20 .) 'origin_lon', 
### 21.) 'destination_lat',
### 22.) 'destination_lon'

In [48]:
df = combined_df.drop(columns=drop_cols)

In [50]:
fw_disp(df.head())

Unnamed: 0_level_0,carrier_code,flight_date,scheduled_elapsed_time,cancelled,scheduled_departure_dt,scheduled_arrival_dt,station_x,hourlydrybulbtemperature_x,hourlyprecipitation_x,hourlystationpressure_x,hourlyvisibility_x,hourlywindspeed_x,station_y,hourlydrybulbtemperature_y,hourlyprecipitation_y,hourlystationpressure_y,hourlyvisibility_y,hourlywindspeed_y,origin_lat,origin_lon,destination_lat,destination_lon
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
0,AS,2019-07-01,215,False,2019-07-01 00:40:00,2019-07-01 03:15:00,72793020000.0,65.0,0.0,29.6,10.0,6.0,70272530000.0,57.0,0.0,30.04,10.0,3.0,47.449,-122.309,61.174,-149.996
1,B6,2019-07-01,155,False,2019-07-01 01:26:00,2019-07-01 04:01:00,78514010000.0,79.0,,29.81,10.0,9.0,74783010000.0,78.0,0.0,30.01,10.0,0.0,18.495,-67.129,26.072,-80.153
2,B6,2019-07-01,175,False,2019-07-01 01:15:00,2019-07-01 04:10:00,,,,,,,72205010000.0,76.0,0.0,29.93,10.0,3.0,18.008,-66.563,28.429,-81.316
3,F9,2019-07-01,143,False,2019-07-01 00:59:00,2019-07-01 04:22:00,72295020000.0,65.0,0.0,29.6,10.0,6.0,72565000000.0,62.0,0.0,24.69,10.0,8.0,33.942,-118.408,39.858,-104.667
4,F9,2019-07-01,156,False,2019-07-01 00:55:00,2019-07-01 04:31:00,72698020000.0,66.0,0.0,29.92,10.0,6.0,72565000000.0,62.0,0.0,24.7,8.0,10.0,45.589,-122.597,39.858,-104.667


In [53]:
# Date and time columns to be converted
dt_cols = [
    'flight_date',
    'scheduled_departure_dt',
    'scheduled_arrival_dt'
]

In [54]:
df[dt_cols].head()

Unnamed: 0_level_0,flight_date,scheduled_departure_dt,scheduled_arrival_dt
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,2019-07-01,2019-07-01 00:40:00,2019-07-01 03:15:00
1,2019-07-01,2019-07-01 01:26:00,2019-07-01 04:01:00
2,2019-07-01,2019-07-01 01:15:00,2019-07-01 04:10:00
3,2019-07-01,2019-07-01 00:59:00,2019-07-01 04:22:00
4,2019-07-01,2019-07-01 00:55:00,2019-07-01 04:31:00


In [55]:
from datetime import datetime
import numpy as np

In [56]:
# Convert the date and time columns to Unix timestamps (integers)
for col in dt_cols:
    df[col] = pd.to_datetime(df[col]).view(np.int64)

In [57]:
df[dt_cols].dtypes

flight_date               int64
scheduled_departure_dt    int64
scheduled_arrival_dt      int64
dtype: object

In [58]:
fw_disp(df.head())

Unnamed: 0_level_0,carrier_code,flight_date,scheduled_elapsed_time,cancelled,scheduled_departure_dt,scheduled_arrival_dt,station_x,hourlydrybulbtemperature_x,hourlyprecipitation_x,hourlystationpressure_x,hourlyvisibility_x,hourlywindspeed_x,station_y,hourlydrybulbtemperature_y,hourlyprecipitation_y,hourlystationpressure_y,hourlyvisibility_y,hourlywindspeed_y,origin_lat,origin_lon,destination_lat,destination_lon
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
0,AS,1561939200000000000,215,False,1561941600000000000,1561950900000000000,72793020000.0,65.0,0.0,29.6,10.0,6.0,70272530000.0,57.0,0.0,30.04,10.0,3.0,47.449,-122.309,61.174,-149.996
1,B6,1561939200000000000,155,False,1561944360000000000,1561953660000000000,78514010000.0,79.0,,29.81,10.0,9.0,74783010000.0,78.0,0.0,30.01,10.0,0.0,18.495,-67.129,26.072,-80.153
2,B6,1561939200000000000,175,False,1561943700000000000,1561954200000000000,,,,,,,72205010000.0,76.0,0.0,29.93,10.0,3.0,18.008,-66.563,28.429,-81.316
3,F9,1561939200000000000,143,False,1561942740000000000,1561954920000000000,72295020000.0,65.0,0.0,29.6,10.0,6.0,72565000000.0,62.0,0.0,24.69,10.0,8.0,33.942,-118.408,39.858,-104.667
4,F9,1561939200000000000,156,False,1561942500000000000,1561955460000000000,72698020000.0,66.0,0.0,29.92,10.0,6.0,72565000000.0,62.0,0.0,24.7,8.0,10.0,45.589,-122.597,39.858,-104.667


In [59]:
# Drop rows with `NULL` values
df.dropna(how='any',inplace=True)

print_shape(df)

4,708,900 rows × 22 columns


In [71]:
# Verify that no `NULL` values remain
df_new = df.dropna().copy()

In [72]:
df_new.isnull().sum()

carrier_code                  0
flight_date                   0
scheduled_elapsed_time        0
cancelled                     0
scheduled_departure_dt        0
scheduled_arrival_dt          0
station_x                     0
hourlydrybulbtemperature_x    0
hourlyprecipitation_x         0
hourlystationpressure_x       0
hourlyvisibility_x            0
hourlywindspeed_x             0
station_y                     0
hourlydrybulbtemperature_y    0
hourlyprecipitation_y         0
hourlystationpressure_y       0
hourlyvisibility_y            0
hourlywindspeed_y             0
origin_lat                    0
origin_lon                    0
destination_lat               0
destination_lon               0
dtype: int64

In [73]:
df_new['cancelled'].value_counts()

False    4674943
True       33957
Name: cancelled, dtype: int64

# Test Random Forest

In [79]:
# Initial imports.
import pandas as pd
from path import Path
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

In [123]:
# Define the features set.
X = df_new.copy()
X = X.drop("cancelled", axis=1)
X.head()

Unnamed: 0_level_0,carrier_code,flight_date,scheduled_elapsed_time,scheduled_departure_dt,scheduled_arrival_dt,station_x,hourlydrybulbtemperature_x,hourlyprecipitation_x,hourlystationpressure_x,hourlyvisibility_x,...,station_y,hourlydrybulbtemperature_y,hourlyprecipitation_y,hourlystationpressure_y,hourlyvisibility_y,hourlywindspeed_y,origin_lat,origin_lon,destination_lat,destination_lon
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,AS,1561939200000000000,215,1561941600000000000,1561950900000000000,72793020000.0,65.0,0.0,29.6,10.0,...,70272530000.0,57.0,0.0,30.04,10.0,3.0,47.449,-122.309,61.174,-149.996
3,F9,1561939200000000000,143,1561942740000000000,1561954920000000000,72295020000.0,65.0,0.0,29.6,10.0,...,72565000000.0,62.0,0.0,24.69,10.0,8.0,33.942,-118.408,39.858,-104.667
4,F9,1561939200000000000,156,1561942500000000000,1561955460000000000,72698020000.0,66.0,0.0,29.92,10.0,...,72565000000.0,62.0,0.0,24.7,8.0,10.0,45.589,-122.597,39.858,-104.667
5,F9,1561939200000000000,158,1561942500000000000,1561955580000000000,72494020000.0,58.0,0.0,30.03,10.0,...,72565000000.0,62.0,0.0,24.7,8.0,10.0,37.619,-122.375,39.858,-104.667
6,AS,1561939200000000000,400,1561942500000000000,1561955700000000000,72530090000.0,71.0,0.0,29.24,10.0,...,70272530000.0,57.0,0.0,30.06,10.0,0.0,41.979,-87.904,61.174,-149.996


In [125]:
# Define the target set.
y = df_new["cancelled"]

In [128]:
# Use the train_test_split function to create training and testing subsets
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,
   y,  random_state=1, stratify=y, test_size=0.1)

In [129]:
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((4238010, 21), (470890, 21), (4238010,), (470890,))

In [130]:
from sklearn.compose import ColumnTransformer

In [131]:
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, StandardScaler

In [132]:
Le = LabelEncoder()

In [133]:
y_train[:5]

id
4248935    False
2850481    False
2146498    False
1530087    False
3241252    False
Name: cancelled, dtype: bool

In [134]:
y_test[:5]

id
3845853    False
3524639    False
3543535    False
1269636    False
1680588    False
Name: cancelled, dtype: bool

In [135]:
y_train_cln = Le.fit_transform(y_train)

In [136]:
y_train_cln

array([0, 0, 0, ..., 0, 0, 0])

In [137]:
Le.classes_

array([False,  True])

In [138]:
y_test_cln = Le.transform(y_test)

In [139]:
y_test_cln

array([0, 0, 0, ..., 0, 0, 0])

In [109]:
X_train.dtypes

carrier_code                   object
flight_date                     int64
scheduled_elapsed_time          int64
scheduled_departure_dt          int64
scheduled_arrival_dt            int64
station_x                     float64
hourlydrybulbtemperature_x    float64
hourlyprecipitation_x         float64
hourlystationpressure_x       float64
hourlyvisibility_x            float64
hourlywindspeed_x             float64
station_y                     float64
hourlydrybulbtemperature_y    float64
hourlyprecipitation_y         float64
hourlystationpressure_y       float64
hourlyvisibility_y            float64
hourlywindspeed_y             float64
origin_lat                    float64
origin_lon                    float64
destination_lat               float64
destination_lon               float64
dtype: object

In [140]:
obj_col = X_train.dtypes[X.dtypes == 'object'].index.values
num_col = X_train.dtypes[X.dtypes != 'object'].index.values

In [141]:
len(obj_col), len(num_col)

(1, 20)

In [142]:
obj_col

array(['carrier_code'], dtype=object)

In [143]:
num_col

array(['flight_date', 'scheduled_elapsed_time', 'scheduled_departure_dt',
       'scheduled_arrival_dt', 'station_x', 'hourlydrybulbtemperature_x',
       'hourlyprecipitation_x', 'hourlystationpressure_x',
       'hourlyvisibility_x', 'hourlywindspeed_x', 'station_y',
       'hourlydrybulbtemperature_y', 'hourlyprecipitation_y',
       'hourlystationpressure_y', 'hourlyvisibility_y',
       'hourlywindspeed_y', 'origin_lat', 'origin_lon', 'destination_lat',
       'destination_lon'], dtype=object)

In [144]:
X.shape

(4708900, 21)

In [145]:
clf = ColumnTransformer([
    ('ohe', OneHotEncoder(sparse=False, handle_unknown = 'ignore'), obj_col),
    ('ss', StandardScaler(), num_col)
], remainder='passthrough')

In [146]:
X_train

Unnamed: 0_level_0,carrier_code,flight_date,scheduled_elapsed_time,scheduled_departure_dt,scheduled_arrival_dt,station_x,hourlydrybulbtemperature_x,hourlyprecipitation_x,hourlystationpressure_x,hourlyvisibility_x,...,station_y,hourlydrybulbtemperature_y,hourlyprecipitation_y,hourlystationpressure_y,hourlyvisibility_y,hourlywindspeed_y,origin_lat,origin_lon,destination_lat,destination_lon
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
4248935,NK,1575504000000000000,285,1575570300000000000,1575576600000000000,7.252401e+10,34.0,0.00,29.24,10.00,...,7.238602e+10,53.0,0.00,27.91,10.0,6.0,41.412,-81.850,36.080,-115.152
2850481,AA,1567641600000000000,162,1567680900000000000,1567683420000000000,7.225900e+10,92.0,0.00,29.48,10.00,...,7.227802e+10,103.0,0.00,28.74,10.0,3.0,32.896,-97.037,33.434,-112.008
2146498,F9,1572739200000000000,229,1572760860000000000,1572767400000000000,7.231401e+10,32.0,0.00,29.40,10.00,...,7.256500e+10,39.0,0.00,24.55,10.0,6.0,35.214,-80.943,39.858,-104.667
1530087,UA,1570320000000000000,151,1570353240000000000,1570365900000000000,7.238909e+10,70.0,0.00,29.72,10.00,...,7.256500e+10,56.0,0.00,24.86,10.0,5.0,36.776,-119.718,39.858,-104.667
3241252,DL,1569110400000000000,107,1569190800000000000,1569197220000000000,7.253709e+10,78.0,0.00,29.14,10.00,...,7.250701e+10,65.0,0.00,29.91,10.0,3.0,42.212,-83.353,41.724,-71.428
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3606642,WN,1565222400000000000,180,1565291400000000000,1565298600000000000,7.478301e+10,78.0,0.00,29.99,10.00,...,7.225801e+10,90.0,0.00,29.35,10.0,8.0,26.072,-80.153,32.847,-96.852
1320101,DL,1561507200000000000,309,1561582500000000000,1561611840000000000,9.119002e+10,81.0,0.00,29.86,10.00,...,7.229502e+10,63.0,0.00,29.63,9.0,0.0,20.899,-156.431,33.942,-118.408
1960516,AA,1571961600000000000,96,1571995800000000000,1572001560000000000,7.240501e+10,58.0,0.00,30.23,10.00,...,7.250901e+10,63.0,0.00,30.26,10.0,3.0,38.852,-77.037,42.364,-71.005
1442810,B6,1569974400000000000,159,1570013340000000000,1570022880000000000,7.220501e+10,89.0,0.00,29.87,10.00,...,7.250379e+10,77.0,0.18,29.34,1.5,5.0,28.429,-81.316,41.067,-73.707


In [147]:
X_train_cln = clf.fit_transform(X_train)
X_test_cln = clf.transform(X_test)

In [148]:
X_train_cln

array([[ 0.        ,  0.        ,  0.        , ...,  0.67747846,
        -0.12727176, -1.0837529 ],
       [ 1.        ,  0.        ,  0.        , ..., -0.12578182,
        -0.57356813, -0.91748761],
       [ 0.        ,  0.        ,  0.        , ...,  0.72545087,
         0.50995715, -0.52927086],
       ...,
       [ 1.        ,  0.        ,  0.        , ...,  0.93204432,
         0.93263996,  1.25088884],
       [ 0.        ,  0.        ,  1.        , ...,  0.70572242,
         0.71387715,  1.107998  ],
       [ 0.        ,  0.        ,  0.        , ..., -0.52934249,
         0.32273532,  0.22733139]])

In [149]:
X_train_cln.shape, X_test_cln.shape

((4238010, 30), (470890, 30))

In [150]:
# Create a random forest classifier.
rf_model = RandomForestClassifier(n_estimators=128, random_state=1) 

In [151]:
# Fitting the model
rf_model = rf_model.fit(X_train_cln, y_train_cln)

In [152]:
# Making predictions using the testing data.
predictions = rf_model.predict(X_test_cln)

In [153]:
# Calculating the confusion matrix.
cm = confusion_matrix(y_test_cln, predictions)

In [158]:
# Create a DataFrame from the confusion matrix.
# Generate confusion_matrix
cm_arr = confusion_matrix(y_test_cln, predictions)

mult_ix = pd.MultiIndex.from_tuples([
    ('actual','cancelled'),
    ('actual','not cancelled')
])

mult_cols = pd.MultiIndex.from_tuples([
    ('predicted','cancelled'),
    ('predicted','not cancelled')
])

cm_df = pd.DataFrame(
    data=cm_arr,
    index=mult_ix,
    columns=mult_cols
)

cm_df

Unnamed: 0_level_0,Unnamed: 1_level_0,predicted,predicted
Unnamed: 0_level_1,Unnamed: 1_level_1,cancelled,not cancelled
actual,cancelled,467233,261
actual,not cancelled,2616,780


In [155]:
# Calculating the accuracy score.
acc_score = accuracy_score(y_test_cln, predictions)

In [160]:
import dataframe_image as dfi

In [161]:
dfi.export(cm_df,os.path.join('.','images','mlm_t05_random_forest_r_confusion_matrix.png'))

[0911/202818.236206:INFO:headless_shell.cc(660)] Written to file /var/folders/2r/t55htrbs3ynckzlh1s23dyqr0000gn/T/tmpt06tfjrz/temp.png.


In [162]:
# Displaying results
print(f"Accuracy Score : {acc_score}\n\n")
print("Classification Report\n")
print(classification_report(y_test, predictions))

Accuracy Score : 0.9938902928497101


Classification Report

              precision    recall  f1-score   support

       False       0.99      1.00      1.00    467494
        True       0.75      0.23      0.35      3396

    accuracy                           0.99    470890
   macro avg       0.87      0.61      0.67    470890
weighted avg       0.99      0.99      0.99    470890

