# Retrieve Data from Database

In order to connect to the database, first, make sure:
1. …you have a local (running) database with the `hostname`, `database` name, `username`, and `port` number as specified in `/resources/config/gadb_pg_config.py`.
2. …the `airports` and `flights_and_weather` tables have data.

If the `airports` and `flights_and_weather` tables do not have data, you must first run `/download_and_prepare_datasets/02_prepare_and_store_data.ipynb` to populate the database.

In [1]:
import os
import time

In [2]:
os.chdir(os.path.join('..','resources'))

## Connect to Database

In [3]:
# Database configuration details
from config import gadb_pg_config as cfg

# To connect to SQL database
import sqlalchemy as db
# from sqlalchemy import create_engine, MetaData, Table, text, types

# To enter passwords without exposing them
from getpass import getpass

Most of the database information is already in `cfg` (above). However, you will have to enter your password below.

In [4]:
password = getpass('Enter database password')

Enter database password········


In [5]:
start_time = time.time()

In [6]:
db_string = f"postgresql+psycopg2://{cfg.username}:{password}@{cfg.hostname}:{cfg.port}/{cfg.database}"

In [7]:
engine = db.create_engine(
    future=True,
#     echo=True,
    url=db_string
)

## Access Database Tables and Create Table Aliases

In [8]:
db_meta = db.MetaData()

In [9]:
# Access the `airports` and `flights_and_weather` tables and assign them to variables
airports_table = db.Table('airports', db_meta, autoload_with=engine)
faw_table = db.Table('flights_and_weather', db_meta, autoload_with=engine)
faw = faw_table.alias('faw')
orig = airports_table.alias('orig')
dest = airports_table.alias('dest')

## Extract Full, Joined Dataset to Pandas Dataframe

In [10]:
from io import StringIO
import pandas as pd

In [11]:
# Function to extract from database query to Pandas dataframe
# Adapted from <https://towardsdatascience.com/optimizing-pandas-read-sql-for-postgres-f31cd7f707ab>
def read_sql_inmem(query, db_engine, **kwargs):
    copy_sql = "COPY ({query}) TO STDOUT WITH CSV {head}".format(
       query=query, head="HEADER"
    )
    conn = db_engine.raw_connection()
    cur = conn.cursor()
    store = StringIO()
    cur.copy_expert(copy_sql, store)
    store.seek(0)
    df = pd.read_csv(store, **kwargs)
    return df

In [12]:
# The query composed as an SQLAlchemy table object
get_all = (
    db.select([
        faw,
        orig.c.lat_decimal.label('origin_lat'),
        orig.c.lon_decimal.label('origin_lon'),
        dest.c.lat_decimal.label('destination_lat'),
        dest.c.lon_decimal.label('destination_lon')
    ])
    .select_from(
        faw
        .join(orig, orig.c.iata_code == faw.c.origin_airport, isouter=True)
        .join(dest, dest.c.iata_code == faw.c.destination_airport, isouter=True)
    )
    .order_by(faw.c.id)
)

In [13]:
def print_shape(df):
    df_rows, df_cols = df.shape
    print(f"{df_rows:,} rows × {df_cols:,} columns")

In [14]:
def df_details(df):
    return pd.concat(
        [
            df.dtypes,
            df.isna().sum(),
            pd.Series(
                data=[df[col].dropna().is_unique for col in df.columns],
                index=df.columns
            )
        ],
        axis=1,
        keys=['data_type','null_count','unique']
    )

In [15]:
# Run the query, and save the result to a dataframe
combined_df = read_sql_inmem(
    query=get_all,
#     query=get_all.limit(10).compile(engine, compile_kwargs={"literal_binds": True}),
    db_engine=engine,
    index_col='id',
    converters={'cancelled':(lambda x: True if x == 't' else False)}
)
# .sort_index(kind='mergesort')

print_shape(combined_df)

5,468,069 rows × 35 columns


In [16]:
# "Full-width display" function to display all columns of a dataframe
def fw_disp(df):
    with pd.option_context('display.max_columns',None):
        display(df)

In [17]:
fw_disp(combined_df.head())

Unnamed: 0_level_0,carrier_code,flight_number,origin_airport,destination_airport,flight_date,scheduled_elapsed_time,tail_number,departure_delay,arrival_delay,delay_carrier,delay_weather,delay_national_aviation_system,delay_security,delay_late_aircarft_arrival,cancelled,scheduled_departure_dt,scheduled_arrival_dt,actual_departure_dt,actual_arrival_dt,station_x,hourlydrybulbtemperature_x,hourlyprecipitation_x,hourlystationpressure_x,hourlyvisibility_x,hourlywindspeed_x,station_y,hourlydrybulbtemperature_y,hourlyprecipitation_y,hourlystationpressure_y,hourlyvisibility_y,hourlywindspeed_y,origin_lat,origin_lon,destination_lat,destination_lon
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1
0,AS,121,SEA,ANC,2019-05-01,215,N615AS,-8,-16,0,0,0,0,0,False,2019-05-01 00:40:00,2019-05-01 03:15:00,2019-05-01 00:32:00,2019-05-01 02:59:00,72793020000.0,49.0,0.0,29.59,10.0,8.0,70272530000.0,42.0,0.0,30.16,10.0,3.0,47.449,-122.309,61.174,-149.996
1,F9,402,LAX,DEN,2019-05-01,147,N701FR,17,-4,0,0,0,0,0,False,2019-05-01 00:59:00,2019-05-01 04:26:00,2019-05-01 01:16:00,2019-05-01 04:22:00,72295020000.0,58.0,0.0,29.65,10.0,3.0,72565000000.0,34.0,0.0,24.43,4.0,0.0,33.942,-118.408,39.858,-104.667
2,F9,662,SFO,DEN,2019-05-01,158,N346FR,44,27,0,0,0,0,27,False,2019-05-01 00:50:00,2019-05-01 04:28:00,2019-05-01 01:34:00,2019-05-01 04:55:00,72494020000.0,51.0,0.0,29.98,10.0,6.0,72565000000.0,34.0,0.0,24.43,4.0,0.0,37.619,-122.375,39.858,-104.667
3,F9,790,PDX,DEN,2019-05-01,156,N332FR,24,10,0,0,0,0,0,False,2019-05-01 00:55:00,2019-05-01 04:31:00,2019-05-01 01:19:00,2019-05-01 04:41:00,72698020000.0,48.0,0.0,29.98,10.0,0.0,72565000000.0,34.0,0.0,24.43,4.0,0.0,45.589,-122.597,39.858,-104.667
4,AS,108,ANC,SEA,2019-05-01,210,N548AS,-9,-31,0,0,0,0,0,False,2019-05-01 00:10:00,2019-05-01 04:40:00,2019-05-01 00:01:00,2019-05-01 04:09:00,70272530000.0,43.0,0.0,30.18,10.0,5.0,72793020000.0,44.0,0.0,29.58,10.0,7.0,61.174,-149.996,47.449,-122.309


In [18]:
df_details(combined_df)

Unnamed: 0,data_type,null_count,unique
carrier_code,object,0,False
flight_number,int64,0,False
origin_airport,object,0,False
destination_airport,object,0,False
flight_date,object,0,False
scheduled_elapsed_time,int64,0,False
tail_number,object,6973,False
departure_delay,int64,0,False
arrival_delay,int64,0,False
delay_carrier,int64,0,False


In [19]:
print(f'Elapsed time: {(time.time() - start_time):,.3f} seconds.')

Elapsed time: 252.725 seconds.


## Prepare Data for Machine Learning

### Ensure Proper Column Structure

In [20]:
combined_df[['carrier_code','tail_number']].nunique(dropna=True)

carrier_code      10
tail_number     6111
dtype: int64

Drop the `…_airport` columns because their coordinates will serve in their stead.

Drop the `…delay…` columns because they could not be known prior to their occurrence (and because we're trying to predict cancellations based on weather, specifically).

Drop the `actual_…_dt` columns because their absence *defines* a cancellation (which is what we're trying to predict).

<s>Leaving `tail_number` in because the model may be able to make inferences based on a particular plane's earlier or later schedule.</s>  
\[Because of the large number of distinct `tail_number` values, allowing the column to remain causes memory overruns when one-hot encoding with oversampling or feature mismatches with undersampling. Drop it.\]

In [21]:
# Select columns to drop
drop_cols = [
    'flight_number',
    'origin_airport',
    'destination_airport',
    'tail_number',
    'departure_delay',
    'arrival_delay',
    'delay_carrier',
    'delay_weather',
    'delay_national_aviation_system',
    'delay_security',
    'delay_late_aircarft_arrival',
    'actual_departure_dt',
    'actual_arrival_dt'
]

In [22]:
df = combined_df.drop(columns=drop_cols)

In [23]:
fw_disp(df.head())

Unnamed: 0_level_0,carrier_code,flight_date,scheduled_elapsed_time,cancelled,scheduled_departure_dt,scheduled_arrival_dt,station_x,hourlydrybulbtemperature_x,hourlyprecipitation_x,hourlystationpressure_x,hourlyvisibility_x,hourlywindspeed_x,station_y,hourlydrybulbtemperature_y,hourlyprecipitation_y,hourlystationpressure_y,hourlyvisibility_y,hourlywindspeed_y,origin_lat,origin_lon,destination_lat,destination_lon
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
0,AS,2019-05-01,215,False,2019-05-01 00:40:00,2019-05-01 03:15:00,72793020000.0,49.0,0.0,29.59,10.0,8.0,70272530000.0,42.0,0.0,30.16,10.0,3.0,47.449,-122.309,61.174,-149.996
1,F9,2019-05-01,147,False,2019-05-01 00:59:00,2019-05-01 04:26:00,72295020000.0,58.0,0.0,29.65,10.0,3.0,72565000000.0,34.0,0.0,24.43,4.0,0.0,33.942,-118.408,39.858,-104.667
2,F9,2019-05-01,158,False,2019-05-01 00:50:00,2019-05-01 04:28:00,72494020000.0,51.0,0.0,29.98,10.0,6.0,72565000000.0,34.0,0.0,24.43,4.0,0.0,37.619,-122.375,39.858,-104.667
3,F9,2019-05-01,156,False,2019-05-01 00:55:00,2019-05-01 04:31:00,72698020000.0,48.0,0.0,29.98,10.0,0.0,72565000000.0,34.0,0.0,24.43,4.0,0.0,45.589,-122.597,39.858,-104.667
4,AS,2019-05-01,210,False,2019-05-01 00:10:00,2019-05-01 04:40:00,70272530000.0,43.0,0.0,30.18,10.0,5.0,72793020000.0,44.0,0.0,29.58,10.0,7.0,61.174,-149.996,47.449,-122.309


In [24]:
# Date and time columns to be converted
dt_cols = [
    'flight_date',
    'scheduled_departure_dt',
    'scheduled_arrival_dt'
]

In [25]:
df[dt_cols].head()

Unnamed: 0_level_0,flight_date,scheduled_departure_dt,scheduled_arrival_dt
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,2019-05-01,2019-05-01 00:40:00,2019-05-01 03:15:00
1,2019-05-01,2019-05-01 00:59:00,2019-05-01 04:26:00
2,2019-05-01,2019-05-01 00:50:00,2019-05-01 04:28:00
3,2019-05-01,2019-05-01 00:55:00,2019-05-01 04:31:00
4,2019-05-01,2019-05-01 00:10:00,2019-05-01 04:40:00


In [26]:
from datetime import datetime
import numpy as np

In [27]:
# Convert the date and time columns to Unix timestamps (integers)
for col in dt_cols:
    df[col] = pd.to_datetime(df[col]).view(np.int64)

In [28]:
df[dt_cols].dtypes

flight_date               int64
scheduled_departure_dt    int64
scheduled_arrival_dt      int64
dtype: object

In [29]:
fw_disp(df.head())

Unnamed: 0_level_0,carrier_code,flight_date,scheduled_elapsed_time,cancelled,scheduled_departure_dt,scheduled_arrival_dt,station_x,hourlydrybulbtemperature_x,hourlyprecipitation_x,hourlystationpressure_x,hourlyvisibility_x,hourlywindspeed_x,station_y,hourlydrybulbtemperature_y,hourlyprecipitation_y,hourlystationpressure_y,hourlyvisibility_y,hourlywindspeed_y,origin_lat,origin_lon,destination_lat,destination_lon
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
0,AS,1556668800000000000,215,False,1556671200000000000,1556680500000000000,72793020000.0,49.0,0.0,29.59,10.0,8.0,70272530000.0,42.0,0.0,30.16,10.0,3.0,47.449,-122.309,61.174,-149.996
1,F9,1556668800000000000,147,False,1556672340000000000,1556684760000000000,72295020000.0,58.0,0.0,29.65,10.0,3.0,72565000000.0,34.0,0.0,24.43,4.0,0.0,33.942,-118.408,39.858,-104.667
2,F9,1556668800000000000,158,False,1556671800000000000,1556684880000000000,72494020000.0,51.0,0.0,29.98,10.0,6.0,72565000000.0,34.0,0.0,24.43,4.0,0.0,37.619,-122.375,39.858,-104.667
3,F9,1556668800000000000,156,False,1556672100000000000,1556685060000000000,72698020000.0,48.0,0.0,29.98,10.0,0.0,72565000000.0,34.0,0.0,24.43,4.0,0.0,45.589,-122.597,39.858,-104.667
4,AS,1556668800000000000,210,False,1556669400000000000,1556685600000000000,70272530000.0,43.0,0.0,30.18,10.0,5.0,72793020000.0,44.0,0.0,29.58,10.0,7.0,61.174,-149.996,47.449,-122.309


In [30]:
# Drop rows with `NULL` values
df.dropna(how='any',inplace=True)

print_shape(df)

4,708,900 rows × 22 columns


In [31]:
# Verify that no `NULL` values remain
df.isna().sum()

carrier_code                  0
flight_date                   0
scheduled_elapsed_time        0
cancelled                     0
scheduled_departure_dt        0
scheduled_arrival_dt          0
station_x                     0
hourlydrybulbtemperature_x    0
hourlyprecipitation_x         0
hourlystationpressure_x       0
hourlyvisibility_x            0
hourlywindspeed_x             0
station_y                     0
hourlydrybulbtemperature_y    0
hourlyprecipitation_y         0
hourlystationpressure_y       0
hourlyvisibility_y            0
hourlywindspeed_y             0
origin_lat                    0
origin_lon                    0
destination_lat               0
destination_lon               0
dtype: int64

### Undersample Cancelled Flights

In [32]:
# Identify target sample size (i.e., the number of cancelled flights in the dataset)
canc_flights = df.cancelled.value_counts()[True]
canc_flights

33957

In [33]:
# Create undersampled dataframe with rows shuffled in random order
df_undersamp = pd.concat([
    df.loc[~df.cancelled].sample(canc_flights,replace=True),
    df.loc[df.cancelled]
]).sample(frac=1)

In [35]:
# Verify that df_undersamp has the twice the numer of rows as the non-cancelled flights in df
df_undersamp.index.size == 2 * canc_flights

True

In [36]:
print_shape(df_undersamp)

67,914 rows × 22 columns


### Separate the features (X) from the target (y)

In [37]:
y = df_undersamp.cancelled
X = df_undersamp.drop(columns="cancelled")

In [38]:
fw_disp(X.head())

Unnamed: 0_level_0,carrier_code,flight_date,scheduled_elapsed_time,scheduled_departure_dt,scheduled_arrival_dt,station_x,hourlydrybulbtemperature_x,hourlyprecipitation_x,hourlystationpressure_x,hourlyvisibility_x,hourlywindspeed_x,station_y,hourlydrybulbtemperature_y,hourlyprecipitation_y,hourlystationpressure_y,hourlyvisibility_y,hourlywindspeed_y,origin_lat,origin_lon,destination_lat,destination_lon
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
4279788,WN,1572912000000000000,215,1572961500000000000,1572981600000000000,72386020000.0,77.0,0.0,27.76,10.0,3.0,72327010000.0,52.0,0.0,29.7,10.0,8.0,36.08,-115.152,36.124,-86.678
1606917,AA,1562716800000000000,185,1562779800000000000,1562798100000000000,72297790000.0,73.0,0.0,29.93,10.0,6.0,72259000000.0,75.0,0.0,29.37,10.0,3.0,33.676,-117.868,32.896,-97.037
182317,AA,1557360000000000000,87,1557384900000000000,1557393720000000000,72530090000.0,63.0,0.0,28.93,4.0,10.0,72520090000.0,75.0,0.0,28.7,10.0,10.0,41.979,-87.904,40.491,-80.233
893756,AA,1560038400000000000,170,1560114000000000000,1560117000000000000,72259000000.0,78.0,0.0,29.42,10.0,8.0,72386020000.0,80.0,0.0,27.78,10.0,7.0,32.896,-97.037,36.08,-115.152
3916512,WN,1571529600000000000,145,1571553000000000000,1571568900000000000,72278020000.0,59.0,0.0,28.65,10.0,6.0,72446000000.0,61.0,0.0,28.74,10.0,20.0,33.434,-112.008,39.297,-94.714


### Create training and testing subsets

In [39]:
from sklearn.model_selection import train_test_split

In [40]:
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=1,
    test_size=0.1
)

In [41]:
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((61122, 21), (6792, 21), (61122,), (6792,))

### Encode Target, and Encode and Scale Features

In [42]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, MinMaxScaler, StandardScaler

In [43]:
le = LabelEncoder()

In [44]:
y_train_enc = le.fit_transform(y_train)
y_test_enc = le.fit_transform(y_test)

In [45]:
le.classes_

array([False,  True])

In [46]:
y_train_enc

array([0, 0, 0, ..., 0, 0, 1], dtype=int64)

In [47]:
y_test_enc

array([1, 0, 1, ..., 0, 1, 0], dtype=int64)

In [48]:
fw_disp(X_train.head())

Unnamed: 0_level_0,carrier_code,flight_date,scheduled_elapsed_time,scheduled_departure_dt,scheduled_arrival_dt,station_x,hourlydrybulbtemperature_x,hourlyprecipitation_x,hourlystationpressure_x,hourlyvisibility_x,hourlywindspeed_x,station_y,hourlydrybulbtemperature_y,hourlyprecipitation_y,hourlystationpressure_y,hourlyvisibility_y,hourlywindspeed_y,origin_lat,origin_lon,destination_lat,destination_lon
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
3390545,AA,1569456000000000000,97,1569519780000000000,1569525600000000000,72438090000.0,71.0,0.0,29.1,10.0,6.0,72405010000.0,80.0,0.0,29.8,10.0,5.0,39.717,-86.294,38.852,-77.037
5061225,AA,1576022400000000000,370,1576053120000000000,1576064520000000000,72408010000.0,35.0,0.0,30.3,7.0,9.0,72295020000.0,64.0,0.0,29.84,10.0,9.0,39.872,-75.241,33.942,-118.408
4711990,AS,1574640000000000000,140,1574661600000000000,1574670000000000000,72494020000.0,50.0,0.0,30.05,10.0,10.0,72793020000.0,43.0,0.01,29.67,8.0,3.0,37.619,-122.375,47.449,-122.309
3992335,AA,1571788800000000000,107,1571838000000000000,1571844420000000000,72503010000.0,65.0,0.0,29.91,10.0,17.0,72428010000.0,62.0,0.0,29.22,10.0,11.0,40.777,-73.872,39.998,-82.892
2874597,WN,1567468800000000000,110,1567515600000000000,1567522200000000000,72203010000.0,79.0,0.0,29.6,2.5,21.0,72219010000.0,93.0,0.0,28.89,10.0,8.0,26.683,-80.096,33.64,-84.427


In [49]:
# Identify columns for one-hot encoding, min-max scaling, and standard scaling

# One-hot columns
oh_cols = [
#     'tail_number',
    'carrier_code'
]

# Min-max columns. These features are expected
# to have a somewhat even distribution
mm_cols = [
    'flight_date',
    'scheduled_departure_dt',
    'scheduled_arrival_dt',
    'origin_lat',
    'origin_lon',
    'destination_lat',
    'destination_lon'
]

# Standard-scale columns. These features are expected
# to have distributions with notable central tendencies
ss_cols = [
    'scheduled_elapsed_time',
    'station_x',
    'hourlydrybulbtemperature_x',
    'hourlyprecipitation_x',
    'hourlystationpressure_x',
    'hourlyvisibility_x',
    'hourlywindspeed_x',
    'station_y',
    'hourlydrybulbtemperature_y',
    'hourlyprecipitation_y',
    'hourlystationpressure_y',
    'hourlyvisibility_y',
    'hourlywindspeed_y'
]

In [50]:
col_xf = ColumnTransformer([
    (
        'ohe',
        OneHotEncoder(
            sparse=False,
            dtype=bool,
            handle_unknown='ignore'
        ),
        oh_cols
    ),
    (
        'mm',
        MinMaxScaler(),
        mm_cols
    ),
    (
        'ss',
        StandardScaler(),
        ss_cols
    )
], remainder='passthrough')

In [51]:
%%time
X_train_xf = col_xf.fit_transform(X_train)
X_test_xf = col_xf.fit_transform(X_test)

Wall time: 109 ms


In [52]:
X_train_xf

array([[ 1.        ,  0.        ,  0.        , ...,  0.50810999,
         0.3776767 , -0.76791983],
       [ 1.        ,  0.        ,  0.        , ...,  0.54008621,
         0.3776767 , -0.05604438],
       [ 0.        ,  1.        ,  0.        , ...,  0.40418729,
        -0.52244678, -1.12385755],
       ...,
       [ 1.        ,  0.        ,  0.        , ...,  0.65999701,
         0.3776767 ,  0.65583107],
       [ 0.        ,  0.        ,  0.        , ...,  0.42816945,
         0.3776767 ,  0.29989334],
       [ 1.        ,  0.        ,  0.        , ...,  0.21233   ,
         0.3776767 , -0.41198211]])

In [53]:
X_train_xf.shape, X_test_xf.shape

((61122, 30), (6792, 30))

### Create Random Forest Model

In [54]:
from sklearn.ensemble import RandomForestClassifier

In [55]:
rf_model = RandomForestClassifier(n_estimators=128, random_state=1)

In [56]:
%%time
rf_model.fit(X_train_xf,y_train_enc)

Wall time: 17.7 s


RandomForestClassifier(n_estimators=128, random_state=1)

In [57]:
# Make predictions using the test data
y_pred = rf_model.predict(X_test_xf)

pd.DataFrame({
    "Prediction": y_pred, 
    "Cancelled": y_test_enc
}).head()
# .reset_index(drop=True)

Unnamed: 0,Prediction,Cancelled
0,0,1
1,0,0
2,1,1
3,0,0
4,0,0


### Validate the Model

In [58]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [59]:
accuracy_score(y_test_enc,y_pred)

0.8724970553592462

In [60]:
# Generate confusion_matrix
cm_arr = confusion_matrix(y_test_enc, y_pred)

mult_ix = pd.MultiIndex.from_tuples([
    ('actual','cancelled'),
    ('actual','not cancelled')
])

mult_cols = pd.MultiIndex.from_tuples([
    ('predicted','cancelled'),
    ('predicted','not cancelled')
])

cm_df = pd.DataFrame(
    data=cm_arr,
    index=mult_ix,
    columns=mult_cols
)

cm_df

Unnamed: 0_level_0,Unnamed: 1_level_0,predicted,predicted
Unnamed: 0_level_1,Unnamed: 1_level_1,cancelled,not cancelled
actual,cancelled,3033,338
actual,not cancelled,528,2893


In [61]:
import dataframe_image as dfi

In [62]:
dfi.export(cm_df,os.path.join('.','images','mlm_t03_random_forest_u_confusion_matrix.png'))

In [63]:
# Generate classification report
print(classification_report(y_test_enc, y_pred))

              precision    recall  f1-score   support

           0       0.85      0.90      0.88      3371
           1       0.90      0.85      0.87      3421

    accuracy                           0.87      6792
   macro avg       0.87      0.87      0.87      6792
weighted avg       0.87      0.87      0.87      6792

