In [1]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from pandas.api.types import is_string_dtype, is_object_dtype, is_categorical_dtype, is_bool_dtype

def df_string_to_cat(df:pd.DataFrame) -> dict:
    catencoders = {}
    for colname in df.columns:
        if is_string_dtype(df[colname]) or is_object_dtype(df[colname]):
            df[colname] = df[colname].astype('category').cat.as_ordered()
            catencoders[colname] = df[colname].cat.categories
    return catencoders


def df_cat_to_catcode(df):
    for col in df.columns:
        if is_categorical_dtype(df[col]):
            df[col] = df[col].cat.codes + 1

In [2]:
dir = "/Users/parrt/data/flight-delays"

In [4]:
df_flights = pd.read_feather(dir+"/flights.feather")
df_flights.columns

Index(['YEAR', 'MONTH', 'DAY', 'DAY_OF_WEEK', 'AIRLINE', 'FLIGHT_NUMBER',
       'TAIL_NUMBER', 'ORIGIN_AIRPORT', 'DESTINATION_AIRPORT',
       'SCHEDULED_DEPARTURE', 'DEPARTURE_TIME', 'DEPARTURE_DELAY', 'TAXI_OUT',
       'WHEELS_OFF', 'SCHEDULED_TIME', 'ELAPSED_TIME', 'AIR_TIME', 'DISTANCE',
       'WHEELS_ON', 'TAXI_IN', 'SCHEDULED_ARRIVAL', 'ARRIVAL_TIME',
       'ARRIVAL_DELAY', 'DIVERTED', 'CANCELLED', 'CANCELLATION_REASON',
       'AIR_SYSTEM_DELAY', 'SECURITY_DELAY', 'AIRLINE_DELAY',
       'LATE_AIRCRAFT_DELAY', 'WEATHER_DELAY'],
      dtype='object')

In [5]:
df_flights.head(5)

Unnamed: 0,YEAR,MONTH,DAY,DAY_OF_WEEK,AIRLINE,FLIGHT_NUMBER,TAIL_NUMBER,ORIGIN_AIRPORT,DESTINATION_AIRPORT,SCHEDULED_DEPARTURE,...,ARRIVAL_TIME,ARRIVAL_DELAY,DIVERTED,CANCELLED,CANCELLATION_REASON,AIR_SYSTEM_DELAY,SECURITY_DELAY,AIRLINE_DELAY,LATE_AIRCRAFT_DELAY,WEATHER_DELAY
0,2015,1,1,4,AS,98,N407AS,ANC,SEA,5,...,408.0,-22.0,0,0,,,,,,
1,2015,1,1,4,AA,2336,N3KUAA,LAX,PBI,10,...,741.0,-9.0,0,0,,,,,,
2,2015,1,1,4,US,840,N171US,SFO,CLT,20,...,811.0,5.0,0,0,,,,,,
3,2015,1,1,4,AA,258,N3HYAA,LAX,MIA,20,...,756.0,-9.0,0,0,,,,,,
4,2015,1,1,4,AS,135,N527AS,SEA,ANC,25,...,259.0,-21.0,0,0,,,,,,


In [6]:
# for reason in ['AIR_SYSTEM_DELAY', 'WEATHER_DELAY', 'SECURITY_DELAY']:
#     df_flights[reason] = df_flights[reason].fillna(False)

In [7]:
df_flights['dayofyear'] = pd.to_datetime(df_flights[['YEAR','MONTH', 'DAY']]).dt.dayofyear
df_flights = df_flights[(df_flights['CANCELLED']==0) & (df_flights['DIVERTED']==0)]
df_flights['DELAY'] = df_flights['ARRIVAL_DELAY']
# + \
#                       df_flights['DEPARTURE_DELAY'] + \
#                       df_flights['AIRLINE_DELAY'] +\
#                         df_flights['SECURITY_DELAY'] + \
#                         df_flights['AIR_SYSTEM_DELAY'] + \
#                         df_flights['LATE_AIRCRAFT_DELAY'] + \
#                         df_flights['ARRIVAL_DELAY']

In [8]:
features = ['YEAR', 'MONTH', 'DAY', 'DAY_OF_WEEK', 'dayofyear',
            'AIRLINE', 'ORIGIN_AIRPORT', 'DESTINATION_AIRPORT',
            'SCHEDULED_DEPARTURE', 'FLIGHT_NUMBER','TAIL_NUMBER',
            'AIR_TIME', 'DISTANCE',
            'TAXI_IN', 'TAXI_OUT',
            'DEPARTURE_TIME',
            'SCHEDULED_ARRIVAL',
            #'ARRIVAL_TIME',
            'SCHEDULED_TIME',
            #'ELAPSED_TIME',
            'DELAY'] # target

In [9]:
n = 80_000
df_flights = df_flights[features]
df_flights = df_flights.dropna() # ignore missing stuff for ease and reduce size
df_flights = df_flights.sample(n)

In [10]:
len(df_flights)

80000

In [11]:
df_string_to_cat(df_flights)
df_cat_to_catcode(df_flights)

X, y = df_flights.drop('DELAY', axis=1), df_flights['DELAY']
X.head(5)

Unnamed: 0,YEAR,MONTH,DAY,DAY_OF_WEEK,dayofyear,AIRLINE,ORIGIN_AIRPORT,DESTINATION_AIRPORT,SCHEDULED_DEPARTURE,FLIGHT_NUMBER,TAIL_NUMBER,AIR_TIME,DISTANCE,TAXI_IN,TAXI_OUT,DEPARTURE_TIME,SCHEDULED_ARRIVAL,SCHEDULED_TIME
4153525,2015,9,16,3,259,14,535,425,850,710,3097,48.0,308,10.0,8.0,852.0,1005,75.0
567855,2015,2,7,6,38,2,394,485,1225,834,2316,282.0,2603,4.0,20.0,1313.0,1959,334.0
3381665,2015,7,30,4,211,14,541,532,1105,3586,697,53.0,342,3.0,14.0,1122.0,1215,70.0
3009644,2015,7,8,3,189,11,455,401,1153,1272,2816,120.0,964,5.0,11.0,1404.0,1341,168.0
3048794,2015,7,10,5,191,1,468,487,1612,1929,3072,131.0,980,3.0,11.0,1607.0,1950,158.0


In [12]:
rf = RandomForestRegressor(n_estimators=40, oob_score=True, n_jobs=-1)
rf.fit(X, y)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
                      max_features='auto', max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, n_estimators=40, n_jobs=-1,
                      oob_score=True, random_state=None, verbose=0,
                      warm_start=False)

In [13]:
rf.oob_score_

0.8416094765587485