In [1]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.metrics import mean_squared_error, accuracy_score
from sklearn.preprocessing import LabelEncoder
from scipy.stats import randint 

Necessary libraries are downloaded. Pandas and numpy are standard data science libraries. Sklearn is for prediction and hyperparameter tuning. Scipy library is used to generate random numbers for RandomizedSearchCV.

In [2]:
df = pd.read_csv('DelayedFlights.csv')

DelayedFlights.csv dataset is brought into a pandas dataframe.

In [3]:
df.describe()

Unnamed: 0.1,Unnamed: 0,Year,Month,DayofMonth,DayOfWeek,DepTime,CRSDepTime,ArrTime,CRSArrTime,FlightNum,...,Distance,TaxiIn,TaxiOut,Cancelled,Diverted,CarrierDelay,WeatherDelay,NASDelay,SecurityDelay,LateAircraftDelay
count,1936758.0,1936758.0,1936758.0,1936758.0,1936758.0,1936758.0,1936758.0,1929648.0,1936758.0,1936758.0,...,1936758.0,1929648.0,1936303.0,1936758.0,1936758.0,1247488.0,1247488.0,1247488.0,1247488.0,1247488.0
mean,3341651.0,2008.0,6.111106,15.75347,3.984827,1518.534,1467.473,1610.141,1634.225,2184.263,...,765.6862,6.812975,18.2322,0.0003268348,0.004003598,19.1794,3.703571,15.02164,0.09013714,25.29647
std,2066065.0,0.0,3.482546,8.776272,1.995966,450.4853,424.7668,548.1781,464.6347,1944.702,...,574.4797,5.273595,14.33853,0.01807562,0.06314722,43.54621,21.4929,33.83305,2.022714,42.05486
min,0.0,2008.0,1.0,1.0,1.0,1.0,0.0,1.0,0.0,1.0,...,11.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,1517452.0,2008.0,3.0,8.0,2.0,1203.0,1135.0,1316.0,1325.0,610.0,...,338.0,4.0,10.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,3242558.0,2008.0,6.0,16.0,4.0,1545.0,1510.0,1715.0,1705.0,1543.0,...,606.0,6.0,14.0,0.0,0.0,2.0,0.0,2.0,0.0,8.0
75%,4972467.0,2008.0,9.0,23.0,6.0,1900.0,1815.0,2030.0,2014.0,3422.0,...,998.0,8.0,21.0,0.0,0.0,21.0,0.0,15.0,0.0,33.0
max,7009727.0,2008.0,12.0,31.0,7.0,2400.0,2359.0,2400.0,2400.0,9742.0,...,4962.0,240.0,422.0,1.0,1.0,2436.0,1352.0,1357.0,392.0,1316.0


Exploratory data analysis. Describe method brings us count, mean, median vs. values of the dataset.

In [4]:
df.shape

(1936758, 30)

Here is the number of rows and columns.

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1936758 entries, 0 to 1936757
Data columns (total 30 columns):
Unnamed: 0           int64
Year                 int64
Month                int64
DayofMonth           int64
DayOfWeek            int64
DepTime              float64
CRSDepTime           int64
ArrTime              float64
CRSArrTime           int64
UniqueCarrier        object
FlightNum            int64
TailNum              object
ActualElapsedTime    float64
CRSElapsedTime       float64
AirTime              float64
ArrDelay             float64
DepDelay             float64
Origin               object
Dest                 object
Distance             int64
TaxiIn               float64
TaxiOut              float64
Cancelled            int64
CancellationCode     object
Diverted             int64
CarrierDelay         float64
WeatherDelay         float64
NASDelay             float64
SecurityDelay        float64
LateAircraftDelay    float64
dtypes: float64(14), int64(11), object(5)
me

Exploratory data analysis. Different data types in the dataset by columns.

In [6]:
Le_UniqueCarrier=LabelEncoder()
Le_Origin=LabelEncoder()
Le_Dest=LabelEncoder()
Le_CancellationCode=LabelEncoder()

We have already detected the object type columns and now we convert those object-type columns into numerical values. Since machine learning algorithms cannot deal with strings.

In [7]:
df['UniqueCarrier_n'] = Le_UniqueCarrier.fit_transform(df['UniqueCarrier'])
df['Origin_n'] = Le_Origin.fit_transform(df['Origin'])
df['Dest_n'] = Le_Dest.fit_transform(df['Dest'])
df['CancellationCode_n'] = Le_CancellationCode.fit_transform(df['CancellationCode'])

Object-type columns are converted into numerical type columns. New numerical columns are inserted into our dataset.

In [8]:
df.shape

(1936758, 34)

4 brand new columns are added. 

In [9]:
df.drop(['UniqueCarrier', 'Origin', 'Dest', 'CancellationCode', 'TailNum', 'Unnamed: 0'], axis = 1, inplace=True)

4 Object-type columns, 1 extra index column and 1 other irrelevant (tail number) column are dropped.

In [10]:
df.shape

(1936758, 28)

The number of columns and rows are checked to see if adding and dropping are successful. So far so good.

In [11]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1936758 entries, 0 to 1936757
Data columns (total 28 columns):
Year                  int64
Month                 int64
DayofMonth            int64
DayOfWeek             int64
DepTime               float64
CRSDepTime            int64
ArrTime               float64
CRSArrTime            int64
FlightNum             int64
ActualElapsedTime     float64
CRSElapsedTime        float64
AirTime               float64
ArrDelay              float64
DepDelay              float64
Distance              int64
TaxiIn                float64
TaxiOut               float64
Cancelled             int64
Diverted              int64
CarrierDelay          float64
WeatherDelay          float64
NASDelay              float64
SecurityDelay         float64
LateAircraftDelay     float64
UniqueCarrier_n       int64
Origin_n              int64
Dest_n                int64
CancellationCode_n    int64
dtypes: float64(14), int64(14)
memory usage: 413.7 MB


The data type of each column is checked.

In [12]:
df.isnull().sum()

Year                       0
Month                      0
DayofMonth                 0
DayOfWeek                  0
DepTime                    0
CRSDepTime                 0
ArrTime                 7110
CRSArrTime                 0
FlightNum                  0
ActualElapsedTime       8387
CRSElapsedTime           198
AirTime                 8387
ArrDelay                8387
DepDelay                   0
Distance                   0
TaxiIn                  7110
TaxiOut                  455
Cancelled                  0
Diverted                   0
CarrierDelay          689270
WeatherDelay          689270
NASDelay              689270
SecurityDelay         689270
LateAircraftDelay     689270
UniqueCarrier_n            0
Origin_n                   0
Dest_n                     0
CancellationCode_n         0
dtype: int64

Null values are checked.

In [13]:
df.dropna(axis=0, how='any', inplace=True)

Rows that have null values are dropped.

In [14]:
df.isnull().sum()

Year                  0
Month                 0
DayofMonth            0
DayOfWeek             0
DepTime               0
CRSDepTime            0
ArrTime               0
CRSArrTime            0
FlightNum             0
ActualElapsedTime     0
CRSElapsedTime        0
AirTime               0
ArrDelay              0
DepDelay              0
Distance              0
TaxiIn                0
TaxiOut               0
Cancelled             0
Diverted              0
CarrierDelay          0
WeatherDelay          0
NASDelay              0
SecurityDelay         0
LateAircraftDelay     0
UniqueCarrier_n       0
Origin_n              0
Dest_n                0
CancellationCode_n    0
dtype: int64

No more null values!

In [15]:
df.reset_index(drop=True, inplace=True)

We reset index to tidy up our data set.

In [16]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1247488 entries, 0 to 1247487
Data columns (total 28 columns):
Year                  1247488 non-null int64
Month                 1247488 non-null int64
DayofMonth            1247488 non-null int64
DayOfWeek             1247488 non-null int64
DepTime               1247488 non-null float64
CRSDepTime            1247488 non-null int64
ArrTime               1247488 non-null float64
CRSArrTime            1247488 non-null int64
FlightNum             1247488 non-null int64
ActualElapsedTime     1247488 non-null float64
CRSElapsedTime        1247488 non-null float64
AirTime               1247488 non-null float64
ArrDelay              1247488 non-null float64
DepDelay              1247488 non-null float64
Distance              1247488 non-null int64
TaxiIn                1247488 non-null float64
TaxiOut               1247488 non-null float64
Cancelled             1247488 non-null int64
Diverted              1247488 non-null int64
CarrierDelay  

Dataset is controlled for last time and now we are ready to train our model.

In [17]:
y=df.loc[:, 'DepDelay']

Target variable is set.

In [18]:
X=df.loc[:, df.columns != 'DepDelay']

The rest of the columns will be used as predictors.

In [19]:
X_train, X_test, y_train, y_test= train_test_split(X,y, test_size=0.2, random_state=12)

Train and test data are splitted. random_state is used to get same result if code is implemented later again. 

In [20]:
param_grid={"n_estimators":randint(1,9),
            "max_depth": (2,7),
           "max_features": randint(1,9),
           "min_samples_leaf": randint(1,9)}

Hyperparameter tuning method is decided to choose parameters such as max_depth, n_estimators, max_features, and min_sample_leaf. 

In [21]:
RFReg=RandomForestRegressor(n_jobs=-1, random_state=42)

RandomForestRegressor method is used. n_job=-1 helps computer to use full capacity CPU. random_state is again for later implementations of the same code.

In [22]:
RFReg_cv= RandomizedSearchCV(RFReg, param_grid, cv=5)

I decided to use RandomizedSearchCV rather than plain hyper parameter tuning (GridSearchCV) to make parameter choosing process shorter.

In [23]:
RFReg_cv.fit(X_train, y_train)

RandomizedSearchCV(cv=5, error_score='raise-deprecating',
                   estimator=RandomForestRegressor(bootstrap=True,
                                                   criterion='mse',
                                                   max_depth=None,
                                                   max_features='auto',
                                                   max_leaf_nodes=None,
                                                   min_impurity_decrease=0.0,
                                                   min_impurity_split=None,
                                                   min_samples_leaf=1,
                                                   min_samples_split=2,
                                                   min_weight_fraction_leaf=0.0,
                                                   n_estimators='warn',
                                                   n_jobs=-1, oob_score=False,
                                                   random_state...


Training is implemented.

In [24]:
RFReg_cv.best_params_

{'max_depth': 7, 'max_features': 6, 'min_samples_leaf': 7, 'n_estimators': 5}

These parameters are chosen by the algorithm as the most optimized ones.

In [25]:
y_pred=RFReg_cv.predict(X_test)

Prediction is implemented on the test data.

In [26]:
mean_squared_error(y_test, y_pred)

337.56074148431287

Calculation of MSE.

In [27]:
rmse=mean_squared_error(y_test, y_pred)**0.5

Calculation of avarage value error.

In [28]:
rmse

18.372826170306865

It is pretty close to the value that I found by implementing linear regression, yet slightly worse than that.  

In [29]:
RFReg_cv.score(X_test, y_test)

0.9052527483794639

Calculation of R2