### Q5. Use the available variables to construct a model that predicts delays.

In [42]:
import pandas as pd

## Load the 2007 dataset from the CSV file in the working directory
flights_csv = "2007.csv"
seven = pd.read_csv(flights_csv)

In [43]:
## Total delay per row = arrival delay + departure delay.
## `tmp` is kept as the intermediate name because a later cell deletes it.
tmp = seven['ArrDelay'] + seven['DepDelay']

## Assign the column by name instead of concatenating a one-column frame:
## re-running this cell then overwrites 'TotalDelay' rather than appending
## a second column with the same label.
seven['TotalDelay'] = tmp

In [44]:
## The intermediate is no longer needed once TotalDelay is stored on `seven`
del tmp

In [45]:
## Columns used for modelling: four predictors plus the TotalDelay target
delay_columns = ['CRSDepTime', 'CRSArrTime', 'CRSElapsedTime',
                 'Distance', 'TotalDelay']

## Take an explicit copy so DelaySet is an independent frame; later edits to
## DelaySet are then unambiguous and cannot be mistaken by pandas for writes
## to a slice of `seven`.
DelaySet = seven[delay_columns].copy()

In [46]:
## Turn off pandas' chained-assignment (SettingWithCopy) warning for this session
pd.set_option('mode.chained_assignment', None)

In [47]:
## Drop every row that has a missing value in any of the selected columns.
## The result is reassigned instead of using inplace=True, so the cell does
## not modify a frame derived from `seven` in place.
DelaySet = DelaySet.dropna(axis=0, how='any')

## Row count, dtypes and memory footprint after dropping incomplete rows
DelaySet.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 7275288 entries, 0 to 7453214
Data columns (total 5 columns):
 #   Column          Dtype  
---  ------          -----  
 0   CRSDepTime      int64  
 1   CRSArrTime      int64  
 2   CRSElapsedTime  float64
 3   Distance        int64  
 4   TotalDelay      float64
dtypes: float64(2), int64(3)
memory usage: 333.0 MB


In [48]:
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV      
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error, accuracy_score

### Creating train and test sets

In [49]:
## Predictors and continuous target for the regression models
features = ['CRSDepTime', 'CRSArrTime', 'CRSElapsedTime', 'Distance']
X = DelaySet.loc[:, features].copy()
y = DelaySet.loc[:, 'TotalDelay'].copy()

## Hold out 30% of the rows for testing; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=1
)

In [50]:
## Ordinary least squares fit on the training split
lrmodel = LinearRegression()
lrmodel.fit(X_train, y_train)

## Coefficients (in `features` order) followed by the intercept
print(lrmodel.coef_, lrmodel.intercept_)

[ 0.01520132  0.00983827  0.14675632 -0.0170109 ] -19.77042512266357


In [51]:
## Evaluate the linear regression on the held-out test set
y_lrpredict = lrmodel.predict(X_test)

mse = mean_squared_error(y_true=y_test, y_pred=y_lrpredict)
rmse = np.sqrt(mse)
print(rmse)

## Test RMSE: 73.21979111318788
## Ridge and Lasso are fitted below to check whether regularisation improves on this

73.21979111318788


### Ridge regression

In [52]:
from sklearn.linear_model import Ridge
from sklearn.model_selection import KFold

In [53]:
## Grid search for the ridge penalty `alpha`, scored by R^2 under 5-fold CV.
## Author's note on runtime: about 3 minutes.
folds = KFold(n_splits=5, shuffle=True, random_state=42)

## Candidate alphas: 0.1-1.9 (step 0.1), 2-4.5 (step 0.5), 5-24 (step 1)
fine_alphas = np.arange(0.1, 2, 0.1)
medium_alphas = np.arange(2, 5, 0.5)
coarse_alphas = np.arange(5, 25, 1)
parameters = {'alpha': np.concatenate((fine_alphas, medium_alphas, coarse_alphas))}

ridgemod = Ridge()

model_cv = GridSearchCV(
    estimator=ridgemod,
    param_grid=parameters,
    scoring='r2',
    cv=folds,
    return_train_score=True,
    verbose=1,
)
model_cv.fit(X_train, y_train)

Fitting 5 folds for each of 45 candidates, totalling 225 fits


GridSearchCV(cv=KFold(n_splits=5, random_state=42, shuffle=True),
             estimator=Ridge(),
             param_grid={'alpha': array([ 0.1,  0.2,  0.3,  0.4,  0.5,  0.6,  0.7,  0.8,  0.9,  1. ,  1.1,
        1.2,  1.3,  1.4,  1.5,  1.6,  1.7,  1.8,  1.9,  2. ,  2.5,  3. ,
        3.5,  4. ,  4.5,  5. ,  6. ,  7. ,  8. ,  9. , 10. , 11. , 12. ,
       13. , 14. , 15. , 16. , 17. , 18. , 19. , 20. , 21. , 22. , 23. ,
       24. ])},
             return_train_score=True, scoring='r2', verbose=1)

In [54]:
## Alpha selected by the ridge grid search.
## NOTE(review): the recorded result (24.0) is the largest candidate in the grid,
## so the searched range may not bracket the best alpha - consider widening it.
model_cv.best_params_

{'alpha': 24.0}

In [55]:
## Refit ridge on the training split with the alpha chosen by the grid search
ridge = Ridge(alpha=24.0)
ridge.fit(X_train, y_train)

## Score on the held-out test set
ridge_pred = ridge.predict(X_test)
ridge_mse = mean_squared_error(y_test, ridge_pred)
RMSE = np.sqrt(ridge_mse)
print(RMSE)

## Ridge test RMSE: 73.21979111319159

73.21979111319159


In [56]:
## Min, mean, median, max for the ridge predictions.
## Stored under ridge_* names so the built-in min() and max() functions are
## not overwritten for the rest of the kernel session.
ridge_min = np.amin(ridge_pred)
ridge_mean = np.mean(ridge_pred)
ridge_median = np.median(ridge_pred)
ridge_max = np.amax(ridge_pred)

## 2 Decimal places
print("Min =", round(ridge_min, 2))
print("Mean =", round(ridge_mean, 2))
print("Median =", round(ridge_median, 2))
print("Max =", round(ridge_max, 2))

## Coefficients, in feature order: CRSDepTime, CRSArrTime, CRSElapsedTime, Distance
print(ridge.coef_)

## First 5 predictions for TotalDelay
print("First 5 ridge model predictions are", ridge_pred[:5])

Min = -32.82
Mean = 21.56
Median = 21.35
Max = 63.16
[ 0.01520132  0.00983827  0.14675631 -0.0170109 ]
First 5 ridge model predictions are [41.03114549  7.5113761  19.1485127  27.13670824 15.25323688]


### Lasso Regression

In [57]:
from sklearn.linear_model import Lasso
from sklearn.model_selection import GridSearchCV, KFold, cross_val_score

In [58]:
## Grid search for the lasso penalty `alpha`, scored by R^2 on the same CV folds.
## A smaller grid (0.1 to 2.1 in steps of 0.4) is used to keep the run time down,
## on the expectation that the best lasso alpha is a low one.
## Author's note on runtime: about 4 minutes.
lasso_alphas = np.arange(0.1, 2.5, 0.4)
parameters = {'alpha': lasso_alphas}

lassomod = Lasso()

model_cv = GridSearchCV(
    estimator=lassomod,
    param_grid=parameters,
    scoring='r2',
    cv=folds,
    return_train_score=True,
    verbose=1,
)
model_cv.fit(X_train, y_train)

Fitting 5 folds for each of 6 candidates, totalling 30 fits


GridSearchCV(cv=KFold(n_splits=5, random_state=42, shuffle=True),
             estimator=Lasso(),
             param_grid={'alpha': array([0.1, 0.5, 0.9, 1.3, 1.7, 2.1])},
             return_train_score=True, scoring='r2', verbose=1)

In [59]:
## Alpha selected by the lasso grid search.
## NOTE(review): the recorded result (0.1) is the smallest candidate in the grid,
## so an even smaller alpha may score better - consider extending the grid downwards.
model_cv.best_params_

{'alpha': 0.1}

In [60]:
## Refit lasso on the training split with the alpha chosen by the grid search
lasso = Lasso(alpha=0.1)
lasso.fit(X_train, y_train)

## Score on the held-out test set
lasso_pred = lasso.predict(X_test)
lasso_mse = mean_squared_error(y_test, lasso_pred)
RMSE = np.sqrt(lasso_mse)
print(RMSE)

## Test RMSE comparison (the models only differ after the digits 73.21979):
##   Linear 73.21979 111318788
##   Ridge  73.21979 111319159
##   Lasso  73.21979 207860282
## Linear < Ridge < Lasso, but all three agree to five decimal places,
## so regularisation gives no practical improvement here.


73.21979207860282


In [61]:
## Min, mean, median, max for the lasso predictions.
## Stored under lasso_* names so the built-in min() and max() functions are
## not overwritten for the rest of the kernel session.
lasso_min = np.amin(lasso_pred)
lasso_mean = np.mean(lasso_pred)
lasso_median = np.median(lasso_pred)
lasso_max = np.amax(lasso_pred)

## 2 Decimal places
print("Min =", round(lasso_min, 2))
print("Mean =", round(lasso_mean, 2))
print("Median =", round(lasso_median, 2))
print("Max =", round(lasso_max, 2))

## Coefficients, in feature order: CRSDepTime, CRSArrTime, CRSElapsedTime, Distance
print(lasso.coef_)

## First 5 predictions for TotalDelay
print("First 5 lasso model predictions are", lasso_pred[:5])

Min = -32.74
Mean = 21.56
Median = 21.35
Max = 63.08
[ 0.01520057  0.0098388   0.14638221 -0.01696526]
First 5 lasso model predictions are [41.03353203  7.51338374 19.16003051 27.12935137 15.24842162]


### Logistic Regression

In [62]:
## Work on a copy: a plain `DelaySetL = DelaySet` only creates a second name for
## the same frame, so binarising the target would also overwrite the continuous
## TotalDelay used by the regression cells, and re-running this cell would turn
## every label into 0 (because 1 > 10 is False).
DelaySetL = DelaySet.copy()

## Binary target: 1 when TotalDelay exceeds the threshold, otherwise 0
## (threshold presumably in minutes - TODO confirm the unit of the delay columns)
DELAY_THRESHOLD = 10
DelaySetL['TotalDelay'] = np.where(DelaySetL['TotalDelay'] > DELAY_THRESHOLD, 1, 0)

In [63]:
## Predictors and binary target for the logistic regression
features = ['CRSDepTime', 'CRSArrTime', 'CRSElapsedTime', 'Distance']
X = DelaySetL.loc[:, features].copy()
y = DelaySetL.loc[:, 'TotalDelay'].copy()

## Same 70/30 split settings (and seed) as used for the regression models
split = train_test_split(X, y, test_size=0.3, random_state=1)
X_train, X_test, y_train, y_test = split

In [64]:
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix

In [65]:
## Logistic regression classifier with scikit-learn's default settings
lr = LogisticRegression()

In [66]:
## Fit the classifier on the training split (binary TotalDelay target)
lr.fit(X=X_train, y=y_train)

LogisticRegression()

In [67]:
## Fitted coefficients (in `features` order) and intercept, one per line
print(lr.coef_, lr.intercept_, sep="\n")

[[ 0.0005679   0.00034084  0.00334171 -0.00027721]]
[-2.16966987]


In [68]:
## Predicted class labels (0/1) for the held-out test set
Delay_Pred = lr.predict(X=X_test)

In [69]:
## Confusion matrix: rows are the true class, columns the predicted class
confusion_matrix(y_true=y_test, y_pred=Delay_Pred)

array([[1390452,   46146],
       [ 705439,   40550]], dtype=int64)

In [70]:
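## Accuracy = (true negatives + true positives) / total
##          = (1390452 + 40550) / 2182587
##          = 0.656 (3 s.f.)
## 65.6% accuracy
## For comparison, always predicting "no delay" (class 0) would score
## (1390452 + 46146) / 2182587 = 0.658, i.e. 65.8%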