In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.metrics import mean_squared_error, mean_absolute_error, median_absolute_error, max_error, r2_score
from sklearn.ensemble import RandomForestRegressor, HistGradientBoostingRegressor, GradientBoostingRegressor
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer

%config InlineBackend.figure_format = 'svg'

In [2]:
numerical_features = ['Month',
                      'DayofMonth',
                      'DayOfWeek',
                      'DepTime',
                      'ArrTime',
                      'FlightNum',
                      'ActualElapsedTime']
target_feature = ['DepDelay']

categorical_features = ["Origin", "Dest", "TailNum"]

### Use year 2003 for train

In [3]:
data_train = pd.read_csv("2003.csv", usecols=numerical_features + categorical_features + target_feature)
data_train.shape

(6488540, 11)

In [5]:
data_train.head()

Unnamed: 0,Month,DayofMonth,DayOfWeek,DepTime,ArrTime,FlightNum,TailNum,ActualElapsedTime,DepDelay,Origin,Dest
0,1,29,3,1651.0,1912.0,1017,N202UA,141.0,-4.0,ORD,MSY
1,1,30,4,1654.0,1910.0,1017,N311UA,136.0,-1.0,ORD,MSY
2,1,31,5,1724.0,1936.0,1017,N317UA,132.0,29.0,ORD,MSY
3,1,1,3,1033.0,1625.0,1018,N409UA,232.0,-2.0,OAK,ORD
4,1,2,4,1053.0,1726.0,1018,N496UA,273.0,18.0,OAK,ORD


In [6]:
data_train.dropna(inplace=True)
data_train.info(show_counts=True)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 6375631 entries, 0 to 6488539
Data columns (total 11 columns):
 #   Column             Non-Null Count    Dtype  
---  ------             --------------    -----  
 0   Month              6375631 non-null  int64  
 1   DayofMonth         6375631 non-null  int64  
 2   DayOfWeek          6375631 non-null  int64  
 3   DepTime            6375631 non-null  float64
 4   ArrTime            6375631 non-null  float64
 5   FlightNum          6375631 non-null  int64  
 6   TailNum            6375631 non-null  object 
 7   ActualElapsedTime  6375631 non-null  float64
 8   DepDelay           6375631 non-null  float64
 9   Origin             6375631 non-null  object 
 10  Dest               6375631 non-null  object 
dtypes: float64(4), int64(4), object(3)
memory usage: 583.7+ MB


#### Drop target column

In [7]:
y_train = data_train['DepDelay']
data_train.drop(['DepDelay'], axis=1, inplace=True)
data_train.shape, y_train.shape

((6375631, 10), (6375631,))

### Use year `2004` for test

In [12]:
data_test = pd.read_csv("2004.csv", usecols=numerical_features + categorical_features + target_feature)
data_test.shape

(7129270, 11)

In [13]:
data_test.head()

Unnamed: 0,Month,DayofMonth,DayOfWeek,DepTime,ArrTime,FlightNum,TailNum,ActualElapsedTime,DepDelay,Origin,Dest
0,1,12,1,623.0,901.0,462,N805UA,98.0,-7.0,ORD,CLT
1,1,13,2,621.0,911.0,462,N851UA,110.0,-9.0,ORD,CLT
2,1,14,3,633.0,920.0,462,N436UA,107.0,3.0,ORD,CLT
3,1,15,4,627.0,859.0,462,N828UA,92.0,-3.0,ORD,CLT
4,1,16,5,635.0,918.0,462,N831UA,103.0,5.0,ORD,CLT


In [14]:
data_test.dropna(inplace=True)
data_test.info(show_counts=True)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 6987729 entries, 0 to 7129269
Data columns (total 11 columns):
 #   Column             Non-Null Count    Dtype  
---  ------             --------------    -----  
 0   Month              6987729 non-null  int64  
 1   DayofMonth         6987729 non-null  int64  
 2   DayOfWeek          6987729 non-null  int64  
 3   DepTime            6987729 non-null  float64
 4   ArrTime            6987729 non-null  float64
 5   FlightNum          6987729 non-null  int64  
 6   TailNum            6987729 non-null  object 
 7   ActualElapsedTime  6987729 non-null  float64
 8   DepDelay           6987729 non-null  float64
 9   Origin             6987729 non-null  object 
 10  Dest               6987729 non-null  object 
dtypes: float64(4), int64(4), object(3)
memory usage: 639.7+ MB


#### Drop target column

In [15]:
y_test = data_test['DepDelay']
data_test.drop('DepDelay', axis=1, inplace=True)
data_test.shape, y_test.shape

((6987729, 10), (6987729,))

### Setup pipelines and transformers

In [16]:
numerical_transformer = Pipeline(steps=[('scaler', StandardScaler())])
categorical_transformer = Pipeline(steps=[('onehot', OneHotEncoder(handle_unknown='ignore'))])
data_transformer = ColumnTransformer(
    transformers=[
        ('numerical', numerical_transformer, numerical_features),
        ('categorical', categorical_transformer, categorical_features)])

### Metrics

[Zoo](https://scikit-learn.org/stable/modules/model_evaluation.html#regression-metrics) of regression metrics

In [17]:
def print_metrics(y_true, y_hat):
    print("MSE:", mean_squared_error(y_true, y_hat))
    print("MAE:", mean_absolute_error(y_true, y_hat))
    print("Median AE:", median_absolute_error(y_true, y_hat))
    print("Max error:", max_error(y_true, y_hat))
    print("r2 score:", r2_score(y_true, y_hat))

### Linear regression

In [18]:
%%time
pipe_lr = Pipeline(steps=[('data_transformer', data_transformer),
                      ('pipe_lr', LinearRegression())])
pipe_lr.fit(data_train, y_train)

CPU times: user 4min 32s, sys: 52.7 s, total: 5min 25s
Wall time: 6min 38s


In [19]:
print_metrics(y_test, pipe_lr.predict(data_test))

MSE: 845.7965697561458
MAE: 13.546633715036355
Median AE: 7.478594157593965
Max error: 1848.3688303016118
r2 score: 0.046895543802038975


### Ridge regression

In [20]:
%%time
pipe_ridge = Pipeline(steps=[('data_transformer', data_transformer),
                           ('pipe_lr', Ridge())])
pipe_ridge.fit(data_train, y_train)

CPU times: user 2min 40s, sys: 24.5 s, total: 3min 4s
Wall time: 3min 52s


In [21]:
print_metrics(y_test, pipe_ridge.predict(data_test))

MSE: 845.0983858262819
MAE: 13.541295875750798
Median AE: 7.4744785814727175
Max error: 1848.32677032818
r2 score: 0.047682307710281036


### Lasso

In [23]:
%%time
pipe_lasso = Pipeline(steps=[('data_transformer', data_transformer),
                           ('pipe_lr', Lasso())])
pipe_lasso.fit(data_train, y_train)

CPU times: user 2min 5s, sys: 9.04 s, total: 2min 14s
Wall time: 2min 40s


In [24]:
print_metrics(y_test, pipe_lasso.predict(data_test))

MSE: 865.4527000880953
MAE: 13.66589282357993
Median AE: 7.8404457997839625
Max error: 1870.903742898918
r2 score: 0.02474560127343506


### Random forest

In [25]:
%%time
pipe_rf = Pipeline(steps=[('data_transformer', data_transformer),
                          ('pipe_lr', RandomForestRegressor(max_depth=5))])
pipe_rf.fit(data_train, y_train)

CPU times: user 51min 24s, sys: 1min 51s, total: 53min 15s
Wall time: 1h 14min 28s


In [26]:
print_metrics(y_test, pipe_rf.predict(data_test))

MSE: 782.3241427380226
MAE: 13.197204080592746
Median AE: 7.157594635583983
Max error: 1879.1856421712325
r2 score: 0.11842084338337311


In [50]:
%%time
RF = RandomForestRegressor(n_estimators=200, max_depth=10)
RF.fit(X_train, y_train)

CPU times: user 10min 29s, sys: 11 s, total: 10min 40s
Wall time: 11min 38s


In [52]:
print_metrics(y_test, RF.predict(X_test))

MSE: 469.5086945872803
MAE: 9.868816288282817
Median AE: 5.564077931470474
Max error: 1430.064558664208
r2 score: 0.2012219532024323


### Gradient boosting

In [40]:
from sklearn.preprocessing import OrdinalEncoder
OrdinalEncoder().fit_transform(data_train[categorical_features])

array([[ 200.,  188.,  637.],
       [ 200.,  188., 1102.],
       [ 200.,  188., 1162.],
       ...,
       [  16.,  154., 1016.],
       [ 239.,  245.,  274.],
       [ 248.,  165.,  274.]])

In [41]:
ordinal_transformer = Pipeline(steps=[('ordinal', OrdinalEncoder())])
data_transformer_1 = ColumnTransformer(
    transformers=[
        ('numerical', numerical_transformer, numerical_features),
        ('categorical', ordinal_transformer, categorical_features)])

In [42]:
%%time
pipe_hgb = Pipeline(steps=[('data_transformer', data_transformer_1),
                          ('pipe_hgm', HistGradientBoostingRegressor(categorical_features=[6, 8, 9]))])
pipe_hgb.fit(data_train, y_train)

ValueError: Categorical feature at index 6 is expected to have a cardinality <= 255

In [22]:
print_metrics(y_test, HGB.predict(X_test))

MSE: 472.577426764951
MAE: 9.78870596423994
Median AE: 5.566169344134824
Max error: 1427.9960183558671
r2 score: 0.19600110016332206


In [44]:
%%time
pipe_gb = Pipeline(steps=[('data_transformer', data_transformer),
                          ('pipe_lr', GradientBoostingRegressor())])
pipe_gb.fit(data_train, y_train)

CPU times: user 36min 58s, sys: 1min 20s, total: 38min 18s
Wall time: 41min 39s


In [45]:
print_metrics(y_test, pipe_gb.predict(data_test))

MSE: 764.7857893740202
MAE: 13.067924351464264
Median AE: 7.0649640551703055
Max error: 1881.5805953969023
r2 score: 0.1381843223845053
