# Retrieve Data from Database

In order to connect to the database, first, make sure:
1. …you have a local (running) database with the `hostname`, `database` name, `username`, and `port` number as specified in `/resources/config/gadb_pg_config.py`.
2. …the `airports` and `flights_and_weather` tables have data.

If the `airports` and `flights_and_weather` tables do not have data, you must first run `/download_and_prepare_datasets/02_prepare_and_store_data.ipynb` to populate the database.

In [1]:
import os
import time

In [2]:
os.chdir(os.path.join('..','resources'))

## Connect to Database

In [3]:
# Database configuration details
from config import gadb_pg_config as cfg

# To connect to SQL database
import sqlalchemy as db
# from sqlalchemy import create_engine, MetaData, Table, text, types

# To enter passwords without exposing them
from getpass import getpass

Most of the database information is already in `cfg` (above). However, you will have to enter your password below.

In [4]:
password = getpass('Enter database password')

Enter database password········


In [5]:
start_time = time.time()

In [6]:
db_string = f"postgresql+psycopg2://{cfg.username}:{password}@{cfg.hostname}:{cfg.port}/{cfg.database}"

In [7]:
engine = db.create_engine(
    future=True,
#     echo=True,
    url=db_string
)

## Access Database Tables and Create Table Aliases

In [8]:
db_meta = db.MetaData()

In [9]:
# Access the `airports` and `flights_and_weather` tables and assign them to variables
airports_table = db.Table('airports', db_meta, autoload_with=engine)
faw_table = db.Table('flights_and_weather', db_meta, autoload_with=engine)
faw = faw_table.alias('faw')
orig = airports_table.alias('orig')
dest = airports_table.alias('dest')

## Extract Full, Joined Dataset to Pandas Dataframe

In [10]:
from io import StringIO
import pandas as pd

In [11]:
# Function to extract from database query to Pandas dataframe
# Adapted from <https://towardsdatascience.com/optimizing-pandas-read-sql-for-postgres-f31cd7f707ab>
def read_sql_inmem(query, db_engine, **kwargs):
    copy_sql = "COPY ({query}) TO STDOUT WITH CSV {head}".format(
       query=query, head="HEADER"
    )
    conn = db_engine.raw_connection()
    cur = conn.cursor()
    store = StringIO()
    cur.copy_expert(copy_sql, store)
    store.seek(0)
    df = pd.read_csv(store, **kwargs)
    return df

In [12]:
# The query composed as an SQLAlchemy table object
get_all = (
    db.select([
        faw,
        orig.c.lat_decimal.label('origin_lat'),
        orig.c.lon_decimal.label('origin_lon'),
        dest.c.lat_decimal.label('destination_lat'),
        dest.c.lon_decimal.label('destination_lon')
    ])
    .select_from(
        faw
        .join(orig, orig.c.iata_code == faw.c.origin_airport, isouter=True)
        .join(dest, dest.c.iata_code == faw.c.destination_airport, isouter=True)
    )
    .order_by(faw.c.id)
)

In [13]:
def print_shape(df):
    df_rows, df_cols = df.shape
    print(f"{df_rows:,} rows × {df_cols:,} columns")

In [14]:
def df_details(df):
    return pd.concat(
        [
            df.dtypes,
            df.isna().sum(),
            pd.Series(
                data=[df[col].dropna().is_unique for col in df.columns],
                index=df.columns
            )
        ],
        axis=1,
        keys=['data_type','null_count','unique']
    )

In [15]:
# Run the query, and save the result to a dataframe
combined_df = read_sql_inmem(
    query=get_all,
#     query=get_all.limit(10).compile(engine, compile_kwargs={"literal_binds": True}),
    db_engine=engine,
    index_col='id',
    converters={'cancelled':(lambda x: True if x == 't' else False)}
)
# .sort_index(kind='mergesort')

print_shape(combined_df)

5,468,069 rows × 35 columns


In [16]:
# "Full-width display" function to display all columns of a dataframe
def fw_disp(df):
    with pd.option_context('display.max_columns',None):
        display(df)

In [17]:
fw_disp(combined_df.head())

Unnamed: 0_level_0,carrier_code,flight_number,origin_airport,destination_airport,flight_date,scheduled_elapsed_time,tail_number,departure_delay,arrival_delay,delay_carrier,delay_weather,delay_national_aviation_system,delay_security,delay_late_aircarft_arrival,cancelled,scheduled_departure_dt,scheduled_arrival_dt,actual_departure_dt,actual_arrival_dt,station_x,hourlydrybulbtemperature_x,hourlyprecipitation_x,hourlystationpressure_x,hourlyvisibility_x,hourlywindspeed_x,station_y,hourlydrybulbtemperature_y,hourlyprecipitation_y,hourlystationpressure_y,hourlyvisibility_y,hourlywindspeed_y,origin_lat,origin_lon,destination_lat,destination_lon
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1
0,AS,121,SEA,ANC,2019-05-01,215,N615AS,-8,-16,0,0,0,0,0,False,2019-05-01 00:40:00,2019-05-01 03:15:00,2019-05-01 00:32:00,2019-05-01 02:59:00,72793020000.0,49.0,0.0,29.59,10.0,8.0,70272530000.0,42.0,0.0,30.16,10.0,3.0,47.449,-122.309,61.174,-149.996
1,F9,402,LAX,DEN,2019-05-01,147,N701FR,17,-4,0,0,0,0,0,False,2019-05-01 00:59:00,2019-05-01 04:26:00,2019-05-01 01:16:00,2019-05-01 04:22:00,72295020000.0,58.0,0.0,29.65,10.0,3.0,72565000000.0,34.0,0.0,24.43,4.0,0.0,33.942,-118.408,39.858,-104.667
2,F9,662,SFO,DEN,2019-05-01,158,N346FR,44,27,0,0,0,0,27,False,2019-05-01 00:50:00,2019-05-01 04:28:00,2019-05-01 01:34:00,2019-05-01 04:55:00,72494020000.0,51.0,0.0,29.98,10.0,6.0,72565000000.0,34.0,0.0,24.43,4.0,0.0,37.619,-122.375,39.858,-104.667
3,F9,790,PDX,DEN,2019-05-01,156,N332FR,24,10,0,0,0,0,0,False,2019-05-01 00:55:00,2019-05-01 04:31:00,2019-05-01 01:19:00,2019-05-01 04:41:00,72698020000.0,48.0,0.0,29.98,10.0,0.0,72565000000.0,34.0,0.0,24.43,4.0,0.0,45.589,-122.597,39.858,-104.667
4,AS,108,ANC,SEA,2019-05-01,210,N548AS,-9,-31,0,0,0,0,0,False,2019-05-01 00:10:00,2019-05-01 04:40:00,2019-05-01 00:01:00,2019-05-01 04:09:00,70272530000.0,43.0,0.0,30.18,10.0,5.0,72793020000.0,44.0,0.0,29.58,10.0,7.0,61.174,-149.996,47.449,-122.309


In [18]:
df_details(combined_df)

Unnamed: 0,data_type,null_count,unique
carrier_code,object,0,False
flight_number,int64,0,False
origin_airport,object,0,False
destination_airport,object,0,False
flight_date,object,0,False
scheduled_elapsed_time,int64,0,False
tail_number,object,6973,False
departure_delay,int64,0,False
arrival_delay,int64,0,False
delay_carrier,int64,0,False


In [19]:
print(f'Elapsed time: {(time.time() - start_time):,.3f} seconds.')

Elapsed time: 154.969 seconds.


## Prepare Data for Machine Learning

### Ensure Proper Column Structure

In [20]:
# Select columns to drop
drop_cols = [
    'flight_number',
    'origin_airport',
    'destination_airport',
#     'tail_number',
    'departure_delay',
    'arrival_delay',
    'delay_carrier',
    'delay_weather',
    'delay_national_aviation_system',
    'delay_security',
    'delay_late_aircarft_arrival',
    'actual_departure_dt',
    'actual_arrival_dt'
]

Drop the `…_airport` columns because their coordinates will serve in their stead.

Drop the `…delay…` columns because they could not be known prior to their occurrence (and because we're trying to predict cancellations based on weather, specifically).

Drop the `actual_…_dt` columns because their absence *defines* a cancellation (which is what we're trying to predict).

Leaving `tail_number` in because the model may be able to make inferences based on a particular plane's earlier or later schedule.

In [21]:
df = combined_df.drop(columns=drop_cols)

In [22]:
fw_disp(df.head())

Unnamed: 0_level_0,carrier_code,flight_date,scheduled_elapsed_time,tail_number,cancelled,scheduled_departure_dt,scheduled_arrival_dt,station_x,hourlydrybulbtemperature_x,hourlyprecipitation_x,hourlystationpressure_x,hourlyvisibility_x,hourlywindspeed_x,station_y,hourlydrybulbtemperature_y,hourlyprecipitation_y,hourlystationpressure_y,hourlyvisibility_y,hourlywindspeed_y,origin_lat,origin_lon,destination_lat,destination_lon
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1
0,AS,2019-05-01,215,N615AS,False,2019-05-01 00:40:00,2019-05-01 03:15:00,72793020000.0,49.0,0.0,29.59,10.0,8.0,70272530000.0,42.0,0.0,30.16,10.0,3.0,47.449,-122.309,61.174,-149.996
1,F9,2019-05-01,147,N701FR,False,2019-05-01 00:59:00,2019-05-01 04:26:00,72295020000.0,58.0,0.0,29.65,10.0,3.0,72565000000.0,34.0,0.0,24.43,4.0,0.0,33.942,-118.408,39.858,-104.667
2,F9,2019-05-01,158,N346FR,False,2019-05-01 00:50:00,2019-05-01 04:28:00,72494020000.0,51.0,0.0,29.98,10.0,6.0,72565000000.0,34.0,0.0,24.43,4.0,0.0,37.619,-122.375,39.858,-104.667
3,F9,2019-05-01,156,N332FR,False,2019-05-01 00:55:00,2019-05-01 04:31:00,72698020000.0,48.0,0.0,29.98,10.0,0.0,72565000000.0,34.0,0.0,24.43,4.0,0.0,45.589,-122.597,39.858,-104.667
4,AS,2019-05-01,210,N548AS,False,2019-05-01 00:10:00,2019-05-01 04:40:00,70272530000.0,43.0,0.0,30.18,10.0,5.0,72793020000.0,44.0,0.0,29.58,10.0,7.0,61.174,-149.996,47.449,-122.309


In [23]:
# Date and time columns to be converted
dt_cols = [
    'flight_date',
    'scheduled_departure_dt',
    'scheduled_arrival_dt'
]

In [24]:
df[dt_cols].head()

Unnamed: 0_level_0,flight_date,scheduled_departure_dt,scheduled_arrival_dt
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,2019-05-01,2019-05-01 00:40:00,2019-05-01 03:15:00
1,2019-05-01,2019-05-01 00:59:00,2019-05-01 04:26:00
2,2019-05-01,2019-05-01 00:50:00,2019-05-01 04:28:00
3,2019-05-01,2019-05-01 00:55:00,2019-05-01 04:31:00
4,2019-05-01,2019-05-01 00:10:00,2019-05-01 04:40:00


In [25]:
from datetime import datetime
import numpy as np

In [26]:
# Convert the date and time columns to Unix timestamps (integers)
for col in dt_cols:
    df[col] = pd.to_datetime(df[col]).view(np.int64)

In [27]:
df[dt_cols].dtypes

flight_date               int64
scheduled_departure_dt    int64
scheduled_arrival_dt      int64
dtype: object

In [28]:
fw_disp(df.head())

Unnamed: 0_level_0,carrier_code,flight_date,scheduled_elapsed_time,tail_number,cancelled,scheduled_departure_dt,scheduled_arrival_dt,station_x,hourlydrybulbtemperature_x,hourlyprecipitation_x,hourlystationpressure_x,hourlyvisibility_x,hourlywindspeed_x,station_y,hourlydrybulbtemperature_y,hourlyprecipitation_y,hourlystationpressure_y,hourlyvisibility_y,hourlywindspeed_y,origin_lat,origin_lon,destination_lat,destination_lon
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1
0,AS,1556668800000000000,215,N615AS,False,1556671200000000000,1556680500000000000,72793020000.0,49.0,0.0,29.59,10.0,8.0,70272530000.0,42.0,0.0,30.16,10.0,3.0,47.449,-122.309,61.174,-149.996
1,F9,1556668800000000000,147,N701FR,False,1556672340000000000,1556684760000000000,72295020000.0,58.0,0.0,29.65,10.0,3.0,72565000000.0,34.0,0.0,24.43,4.0,0.0,33.942,-118.408,39.858,-104.667
2,F9,1556668800000000000,158,N346FR,False,1556671800000000000,1556684880000000000,72494020000.0,51.0,0.0,29.98,10.0,6.0,72565000000.0,34.0,0.0,24.43,4.0,0.0,37.619,-122.375,39.858,-104.667
3,F9,1556668800000000000,156,N332FR,False,1556672100000000000,1556685060000000000,72698020000.0,48.0,0.0,29.98,10.0,0.0,72565000000.0,34.0,0.0,24.43,4.0,0.0,45.589,-122.597,39.858,-104.667
4,AS,1556668800000000000,210,N548AS,False,1556669400000000000,1556685600000000000,70272530000.0,43.0,0.0,30.18,10.0,5.0,72793020000.0,44.0,0.0,29.58,10.0,7.0,61.174,-149.996,47.449,-122.309


In [29]:
combined_df[['carrier_code','tail_number']].nunique(dropna=True)

carrier_code      10
tail_number     6111
dtype: int64

In [30]:
# Drop rows with `NULL` values
df.dropna(how='any',inplace=True)

print_shape(df)

4,703,316 rows × 23 columns


In [31]:
# Verify that no `NULL` values remain
df.isna().sum()

carrier_code                  0
flight_date                   0
scheduled_elapsed_time        0
tail_number                   0
cancelled                     0
scheduled_departure_dt        0
scheduled_arrival_dt          0
station_x                     0
hourlydrybulbtemperature_x    0
hourlyprecipitation_x         0
hourlystationpressure_x       0
hourlyvisibility_x            0
hourlywindspeed_x             0
station_y                     0
hourlydrybulbtemperature_y    0
hourlyprecipitation_y         0
hourlystationpressure_y       0
hourlyvisibility_y            0
hourlywindspeed_y             0
origin_lat                    0
origin_lon                    0
destination_lat               0
destination_lon               0
dtype: int64

### Oversample Cancelled Flights

In [32]:
# Identify target sample size (i.e., the number of non-cancelled flights in the dataset)
nc_flights = df.cancelled.value_counts()[False]
nc_flights

4674943

In [33]:
# Create oversampled dataframe with rows shuffled in random order
df_oversamp = pd.concat([
    df.loc[~df.cancelled],
    df.loc[df.cancelled].sample(nc_flights,replace=True)
]).sample(frac=1)

In [34]:
# Verify that df_oversamp has the twice the numer of rows as the non-cancelled flights in df
df_oversamp.index.size == 2 * nc_flights

True

In [35]:
print_shape(df_oversamp)

9,349,886 rows × 23 columns


### Separate the features (X) from the target (y)

In [36]:
y = df_oversamp.cancelled
X = df_oversamp.drop(columns="cancelled")

In [37]:
fw_disp(X.head())

Unnamed: 0_level_0,carrier_code,flight_date,scheduled_elapsed_time,tail_number,scheduled_departure_dt,scheduled_arrival_dt,station_x,hourlydrybulbtemperature_x,hourlyprecipitation_x,hourlystationpressure_x,hourlyvisibility_x,hourlywindspeed_x,station_y,hourlydrybulbtemperature_y,hourlyprecipitation_y,hourlystationpressure_y,hourlyvisibility_y,hourlywindspeed_y,origin_lat,origin_lon,destination_lat,destination_lon
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
2135401,AA,1564704000000000000,117,N521UW,1564734120000000000,1564741140000000000,72537090000.0,69.0,0.0,29.47,10.0,6.0,72314010000.0,77.0,0.0,29.3,7.0,0.0,42.212,-83.353,35.214,-80.943
1513371,AA,1562371200000000000,237,N194UW,1562454420000000000,1562479440000000000,72278020000.0,95.0,0.0,28.66,10.0,7.0,72314010000.0,75.0,0.0,29.14,10.0,5.0,33.434,-112.008,35.214,-80.943
3710902,WN,1570665600000000000,170,N234WN,1570740300000000000,1570754100000000000,72258010000.0,66.0,0.32,29.33,6.0,21.0,72406090000.0,61.0,0.0,29.98,10.0,6.0,32.847,-96.852,39.175,-76.668
1456240,WN,1562198400000000000,85,N929WN,1562224800000000000,1562229900000000000,72231010000.0,82.0,0.0,30.01,8.0,3.0,72258010000.0,84.0,0.0,29.42,10.0,11.0,29.993,-90.258,32.847,-96.852
839013,WN,1559865600000000000,115,N8640D,1559912400000000000,1559915700000000000,72565000000.0,74.0,0.0,24.48,8.0,9.0,72386020000.0,96.0,0.0,27.44,10.0,15.0,39.858,-104.667,36.08,-115.152


### Create training and testing subsets

In [38]:
from sklearn.model_selection import train_test_split

In [39]:
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=1,
    test_size=0.1
)

In [41]:
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((8414897, 22), (934989, 22), (8414897,), (934989,))

### Encode Target, and Encode and Scale Features

In [42]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, MinMaxScaler, StandardScaler

In [43]:
le = LabelEncoder()

In [44]:
y_train_enc = le.fit_transform(y_train)
y_test_enc = le.fit_transform(y_test)

In [45]:
le.classes_

array([False,  True])

In [46]:
y_train_enc

array([0, 0, 0, ..., 0, 1, 0], dtype=int64)

In [47]:
y_test_enc

array([1, 1, 0, ..., 0, 0, 0], dtype=int64)

In [50]:
X_train.columns

Index(['carrier_code', 'flight_date', 'scheduled_elapsed_time', 'tail_number',
       'scheduled_departure_dt', 'scheduled_arrival_dt', 'station_x',
       'hourlydrybulbtemperature_x', 'hourlyprecipitation_x',
       'hourlystationpressure_x', 'hourlyvisibility_x', 'hourlywindspeed_x',
       'station_y', 'hourlydrybulbtemperature_y', 'hourlyprecipitation_y',
       'hourlystationpressure_y', 'hourlyvisibility_y', 'hourlywindspeed_y',
       'origin_lat', 'origin_lon', 'destination_lat', 'destination_lon'],
      dtype='object')

In [51]:
fw_disp(X_train.head())

Unnamed: 0_level_0,carrier_code,flight_date,scheduled_elapsed_time,tail_number,scheduled_departure_dt,scheduled_arrival_dt,station_x,hourlydrybulbtemperature_x,hourlyprecipitation_x,hourlystationpressure_x,hourlyvisibility_x,hourlywindspeed_x,station_y,hourlydrybulbtemperature_y,hourlyprecipitation_y,hourlystationpressure_y,hourlyvisibility_y,hourlywindspeed_y,origin_lat,origin_lon,destination_lat,destination_lon
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
4831673,WN,1575072000000000000,125,N400WN,1575150300000000000,1575157800000000000,72258010000.0,60.0,0.0,29.36,10.0,14.0,72534010000.0,41.0,0.02,28.75,9.0,7.0,32.847,-96.852,41.786,-87.752
4743207,WN,1574726400000000000,75,N220WN,1574776500000000000,1574781000000000000,72258010000.0,72.0,0.0,28.99,9.0,8.0,72363020000.0,52.0,0.0,25.85,10.0,38.0,32.847,-96.852,35.219,-101.706
2348103,AA,1565481600000000000,58,N595NN,1565522400000000000,1565525880000000000,72314010000.0,84.0,0.0,29.25,9.0,3.0,72302010000.0,86.0,0.0,29.96,10.0,13.0,35.214,-80.943,34.271,-77.903
67425,WN,1556841600000000000,85,N786SW,1556919600000000000,1556924700000000000,72386020000.0,77.0,0.0,27.56,10.0,0.0,72493020000.0,53.0,0.0,29.86,10.0,8.0,36.08,-115.152,37.721,-122.221
4836652,UA,1575158400000000000,245,N160SY,1575186600000000000,1575194100000000000,72353010000.0,34.0,0.0,28.67,10.0,20.0,72494020000.0,56.0,0.0,29.87,8.0,17.0,35.393,-97.601,37.619,-122.375


In [52]:
# Identify columns for one-hot encoding, min-max scaling, and standard scaling

# One-hot columns
oh_cols = ['carrier_code','tail_number']

# Min-max columns. These features are expected
# to have a somewhat even distribution
mm_cols = [
    'flight_date',
    'scheduled_departure_dt',
    'scheduled_arrival_dt',
    'origin_lat',
    'origin_lon',
    'destination_lat',
    'destination_lon'
]

# Standard-scale columns. These features are expected
# to have distributions with notable central tendencies
ss_cols = [
    'scheduled_elapsed_time',
    'station_x',
    'hourlydrybulbtemperature_x',
    'hourlyprecipitation_x',
    'hourlystationpressure_x',
    'hourlyvisibility_x',
    'hourlywindspeed_x',
    'station_y',
    'hourlydrybulbtemperature_y',
    'hourlyprecipitation_y',
    'hourlystationpressure_y',
    'hourlyvisibility_y',
    'hourlywindspeed_y'
]

In [55]:
col_xf = ColumnTransformer([
    (
        'ohe',
        OneHotEncoder(
            sparse=False,
            dtype=int,
            handle_unknown='ignore'
        ),
        oh_cols
    ),
    (
        'mm',
        MinMaxScaler(),
        mm_cols
    ),
    (
        'ss',
        StandardScaler(),
        ss_cols
    )
], remainder='passthrough')

In [59]:
# Free up memory
for _var in [
    'df_oversamp',
    'nc_flights',
    'dt_cols',
    'df',
    'drop_cols',
    'combined_df'
]:
    if _var in locals() or _var in globals():
        exec(f'del {_var}')

In [60]:
X_train_xf = col_xf.fit_transform(X_train)
X_test_xf = col_xf.fit_transform(X_test)

MemoryError: Unable to allocate 192. GiB for an array with shape (8414897, 6118) and data type int32