In [2]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.metrics import mean_squared_error, mean_absolute_error, median_absolute_error, max_error, r2_score
from sklearn.ensemble import RandomForestRegressor, HistGradientBoostingRegressor, GradientBoostingRegressor

%config InlineBackend.figure_format = 'svg'

In [3]:
data = pd.read_csv("1987.csv")
data.shape

(1311826, 29)

In [23]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1311826 entries, 0 to 1311825
Data columns (total 29 columns):
 #   Column             Non-Null Count    Dtype  
---  ------             --------------    -----  
 0   Year               1311826 non-null  int64  
 1   Month              1311826 non-null  int64  
 2   DayofMonth         1311826 non-null  int64  
 3   DayOfWeek          1311826 non-null  int64  
 4   DepTime            1292141 non-null  float64
 5   CRSDepTime         1311826 non-null  int64  
 6   ArrTime            1288326 non-null  float64
 7   CRSArrTime         1311826 non-null  int64  
 8   UniqueCarrier      1311826 non-null  object 
 9   FlightNum          1311826 non-null  int64  
 10  TailNum            0 non-null        float64
 11  ActualElapsedTime  1288326 non-null  float64
 12  CRSElapsedTime     1311826 non-null  int64  
 13  AirTime            0 non-null        float64
 14  ArrDelay           1288326 non-null  float64
 15  DepDelay           1292141 non-n

### Take only numeric features

In [4]:
df_numeric = data[['Year', 'Month', 'DayofMonth', 'DayOfWeek', 'CRSDepTime', 'DepTime', 'CRSArrTime', 'ArrTime',
          'FlightNum', 'CRSElapsedTime', 'ActualElapsedTime', 'Distance', 'ArrDelay', 'DepDelay']]

In [5]:
df_numeric.corr()

Unnamed: 0,Year,Month,DayofMonth,DayOfWeek,CRSDepTime,DepTime,CRSArrTime,ArrTime,FlightNum,CRSElapsedTime,ActualElapsedTime,Distance,ArrDelay,DepDelay
Year,,,,,,,,,,,,,,
Month,,1.0,-0.003943,-0.039154,0.001184,0.006865,0.000669,-0.0005,-0.000245,0.014793,0.020025,0.005771,0.126794,0.120225
DayofMonth,,-0.003943,1.0,-0.001904,-0.004541,8e-06,-0.003757,-0.003006,0.000252,0.006444,0.005352,0.004116,0.069229,0.076482
DayOfWeek,,-0.039154,-0.001904,1.0,0.006143,0.003751,0.007445,0.004139,0.000678,0.00928,0.000341,0.009982,-0.038914,-0.018291
CRSDepTime,,0.001184,-0.004541,0.006143,1.0,0.978375,0.814097,0.761478,0.017621,-0.043825,-0.046441,-0.050215,0.088362,0.100831
DepTime,,0.006865,8e-06,0.003751,0.978375,1.0,0.81898,0.77638,0.017926,-0.041456,-0.043208,-0.049142,0.116684,0.132768
CRSArrTime,,0.000669,-0.003757,0.007445,0.814097,0.81898,1.0,0.926906,0.002499,0.056439,0.052915,0.046858,0.086463,0.100121
ArrTime,,-0.0005,-0.003006,0.004139,0.761478,0.77638,0.926906,1.0,0.003181,0.048506,0.049205,0.038306,0.064226,0.068323
FlightNum,,-0.000245,0.000252,0.000678,0.017621,0.017926,0.002499,0.003181,1.0,-0.150226,-0.142422,-0.150437,0.039494,0.029196
CRSElapsedTime,,0.014793,0.006444,0.00928,-0.043825,-0.041456,0.056439,0.048506,-0.150226,1.0,0.983088,0.984491,0.035516,0.059592


### Drop highly correlated features

In [6]:
df_numeric.drop(["Year", "CRSDepTime", "CRSArrTime", "CRSElapsedTime", "Distance", "ArrDelay"], axis=1, inplace=True)
df_numeric.corr()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_numeric.drop(["Year", "CRSDepTime", "CRSArrTime", "CRSElapsedTime", "Distance", "ArrDelay"], axis=1, inplace=True)


Unnamed: 0,Month,DayofMonth,DayOfWeek,DepTime,ArrTime,FlightNum,ActualElapsedTime,DepDelay
Month,1.0,-0.003943,-0.039154,0.006865,-0.0005,-0.000245,0.020025,0.120225
DayofMonth,-0.003943,1.0,-0.001904,8e-06,-0.003006,0.000252,0.005352,0.076482
DayOfWeek,-0.039154,-0.001904,1.0,0.003751,0.004139,0.000678,0.000341,-0.018291
DepTime,0.006865,8e-06,0.003751,1.0,0.77638,0.017926,-0.043208,0.132768
ArrTime,-0.0005,-0.003006,0.004139,0.77638,1.0,0.003181,0.049205,0.068323
FlightNum,-0.000245,0.000252,0.000678,0.017926,0.003181,1.0,-0.142422,0.029196
ActualElapsedTime,0.020025,0.005352,0.000341,-0.043208,0.049205,-0.142422,1.0,0.063992
DepDelay,0.120225,0.076482,-0.018291,0.132768,0.068323,0.029196,0.063992,1.0


### Drop missing values (not always the best choice)

In [7]:
df_numeric.dropna(inplace=True)
df_numeric.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1288326 entries, 0 to 1311825
Data columns (total 8 columns):
 #   Column             Non-Null Count    Dtype  
---  ------             --------------    -----  
 0   Month              1288326 non-null  int64  
 1   DayofMonth         1288326 non-null  int64  
 2   DayOfWeek          1288326 non-null  int64  
 3   DepTime            1288326 non-null  float64
 4   ArrTime            1288326 non-null  float64
 5   FlightNum          1288326 non-null  int64  
 6   ActualElapsedTime  1288326 non-null  float64
 7   DepDelay           1288326 non-null  float64
dtypes: float64(4), int64(4)
memory usage: 88.5 MB


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_numeric.dropna(inplace=True)


### Delays is our target

In [8]:
y = df_numeric['DepDelay']
X = df_numeric.drop('DepDelay', axis=1)

In [9]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1003)
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((1030660, 7), (257666, 7), (1030660,), (257666,))

### Metrics

[Zoo](https://scikit-learn.org/stable/modules/model_evaluation.html#regression-metrics) of regression metrics

In [13]:
def print_metrics(y_true, y_hat):
    print("MSE:", mean_squared_error(y_true, y_hat))
    print("MAE:", mean_absolute_error(y_true, y_hat))
    print("Median AE:", median_absolute_error(y_true, y_hat))
    print("Max error:", max_error(y_true, y_hat))
    print("r2 score:", r2_score(y_true, y_hat))

### Logistic regression

In [10]:
%%time
LR = LinearRegression()
LR.fit(X_train, y_train)

CPU times: user 430 ms, sys: 240 ms, total: 670 ms
Wall time: 1.06 s


In [11]:
y_hat = LR.predict(X_test)
y_hat

array([10.02494659, 11.03639358,  4.65083372, ..., -2.93233498,
        0.32299397,  2.2062454 ])

In [14]:
print_metrics(y_test, y_hat)

MSE: 560.3349913076179
MAE: 10.767753450850934
Median AE: 7.077863637575387
Max error: 1428.4077125983138
r2 score: 0.04669861267962805


### Ridge regression

In [15]:
%%time
RR = Ridge()
RR.fit(X_train, y_train)

CPU times: user 109 ms, sys: 22.8 ms, total: 131 ms
Wall time: 139 ms


In [16]:
print_metrics(y_test, RR.predict(X_test))

MSE: 560.3349910777355
MAE: 10.76775320734124
Median AE: 7.077863582438326
Max error: 1428.4077126475138
r2 score: 0.04669861307072842


### Lasso

In [17]:
%%time
LAS = Lasso(0.1)
LAS.fit(X_train, y_train)

CPU times: user 1.73 s, sys: 98.3 ms, total: 1.82 s
Wall time: 1.62 s


In [18]:
print_metrics(y_test, LAS.predict(X_test))

MSE: 560.3329256416811
MAE: 10.759896578963591
Median AE: 7.077110759558966
Max error: 1428.4264141995284
r2 score: 0.04670212700986587


### Random forest

In [19]:
%%time
RF = RandomForestRegressor(max_depth=5)
RF.fit(X_train, y_train)

CPU times: user 2min 58s, sys: 4.55 s, total: 3min 2s
Wall time: 3min 37s


In [20]:
print_metrics(y_test, RF.predict(X_test))

MSE: 531.7615033144156
MAE: 10.35988135844266
Median AE: 5.989314238470792
Max error: 1427.11841842082
r2 score: 0.09531086457725724


In [50]:
%%time
RF = RandomForestRegressor(n_estimators=200, max_depth=10)
RF.fit(X_train, y_train)

CPU times: user 10min 29s, sys: 11 s, total: 10min 40s
Wall time: 11min 38s


In [52]:
print_metrics(y_test, RF.predict(X_test))

MSE: 469.5086945872803
MAE: 9.868816288282817
Median AE: 5.564077931470474
Max error: 1430.064558664208
r2 score: 0.2012219532024323


### Gradient boosting

In [21]:
%%time
HGB = HistGradientBoostingRegressor()
HGB.fit(X_train, y_train)

CPU times: user 16.8 s, sys: 2.7 s, total: 19.5 s
Wall time: 9.88 s


In [22]:
print_metrics(y_test, HGB.predict(X_test))

MSE: 472.577426764951
MAE: 9.78870596423994
Median AE: 5.566169344134824
Max error: 1427.9960183558671
r2 score: 0.19600110016332206


In [58]:
%%time
GB = GradientBoostingRegressor()
GB.fit(X_train, y_train)

CPU times: user 2min 35s, sys: 2.78 s, total: 2min 38s
Wall time: 2min 47s


In [59]:
print_metrics(y_test, GB.predict(X_test))

MSE: 524.116542173957
MAE: 10.156510664930877
Median AE: 6.031627314447118
Max error: 1426.5718826224845
r2 score: 0.10831728426238518
