# Module 2: Classification

In [2]:
from google.colab import drive
drive.mount('/content/drive') 

Mounted at /content/drive


## Imports

In [3]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.metrics import classification_report, accuracy_score

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import LinearSVC
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier

## Utilities

In [4]:
dropColumns = ['Unnamed: 0', 'Year', 'Quarter', 'Month', 'DayofMonth', 'OriginAirportID', 'Origin', 'DestAirportID', 'Dest', 'DepHour', 'ArrHour', 'FlightDate']

features = ['DepTime', 'DepDelayMinutes', 'WindSpeedKmph_Origin', 'WindDirDegree_Origin', 'WeatherCode_Origin', 'precipMM_Origin', 'Visibility_Origin', 'Pressure_Origin', 'Cloudcover_Origin', 'DewPointF_Origin', 'WindGustKmph_Origin', 'tempF_Origin', 'WindChillF_Origin', 'Humidity_Origin', 'WindSpeedKmph_Dest', 'WindDirDegree_Dest', 'WeatherCode_Dest', 'precipMM_Dest', 'Visibility_Dest', 'Pressure_Dest', 'Cloudcover_Dest', 'DewPointF_Dest', 'WindGustKmph_Dest', 'tempF_Dest', 'WindChillF_Dest', 'Humidity_Dest']

target = 'ArrDel15'

In [5]:
def predict(model, X_train, X_test, y_train, y_test):
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)
    print(classification_report(y_test, predictions))
    print("Accuracy Score: ", accuracy_score(y_test, predictions))

## Dataset

In [6]:
data = pd.read_csv("drive/MyDrive/Data/dataset.csv")
data = data.drop(dropColumns, axis=1)
data = data.astype({"ArrDel15": np.int64, "DepDel15": np.int64})
print(data.shape, "\n")

(1851436, 32) 



In [7]:
data.head()

Unnamed: 0,CRSDepTime,DepTime,DepDelayMinutes,DepDel15,CRSArrTime,ArrTime,ArrDelayMinutes,ArrDel15,WindSpeedKmph_Origin,WindDirDegree_Origin,WeatherCode_Origin,precipMM_Origin,Visibility_Origin,Pressure_Origin,Cloudcover_Origin,DewPointF_Origin,WindGustKmph_Origin,tempF_Origin,WindChillF_Origin,Humidity_Origin,WindSpeedKmph_Dest,WindDirDegree_Dest,WeatherCode_Dest,precipMM_Dest,Visibility_Dest,Pressure_Dest,Cloudcover_Dest,DewPointF_Dest,WindGustKmph_Dest,tempF_Dest,WindChillF_Dest,Humidity_Dest
0,745,741,0.0,0,1602,1610,8.0,0,7,106,113,0.0,10,1030,0,24,8,34,29,67,21,254,113,0.0,10,1016,0,33,24,42,33,74
1,1330,1408,38.0,1,1603,1628,25.0,1,8,227,122,0.1,8,1019,79,72,10,79,78,83,21,254,113,0.0,10,1016,0,33,24,42,33,74
2,1445,1445,0.0,0,1720,1659,0.0,0,8,227,122,0.1,8,1019,79,72,10,79,78,83,21,254,113,0.0,10,1016,0,33,24,42,33,74
3,855,853,0.0,0,1712,1650,0.0,0,12,29,113,0.0,10,1020,1,30,15,50,47,47,21,254,113,0.0,10,1016,0,33,24,42,33,74
4,840,842,2.0,0,1708,1626,0.0,0,12,29,113,0.0,10,1020,1,30,15,50,47,47,21,254,113,0.0,10,1016,0,33,24,42,33,74


### Balance of dataset

In [8]:
print("ArrDel15")
print("0:", round(len(data[data["ArrDel15"]==0]["ArrDel15"])/len(data) * 100, 2), "%")
print("1:", round(len(data[data["ArrDel15"]==1]["ArrDel15"])/len(data) * 100, 2), "%")

ArrDel15
0: 79.04 %
1: 20.96 %


## Models
1. Logistic Regression 
2. Decision trees
3. SVM 
4. Extra Trees Classifier
5. XGBoost

### Separating X and y

In [9]:
X = data[features]
y = data[target].astype({'ArrDel15': np.int64})
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

scalerX = preprocessing.StandardScaler().fit(X_train)

X_train_scaled = scalerX.transform(X_train)
X_test_scaled = scalerX.transform(X_test)

### 1. Logistic Regression

In [10]:
predict(LogisticRegression(), X_train_scaled, X_test_scaled, y_train, y_test)

              precision    recall  f1-score   support

           0       0.92      0.98      0.95    292803
           1       0.89      0.68      0.77     77485

    accuracy                           0.92    370288
   macro avg       0.91      0.83      0.86    370288
weighted avg       0.91      0.92      0.91    370288

Accuracy Score:  0.91602482392084


### 2. Decision Tree Classification

In [11]:
predict(DecisionTreeClassifier(), X_train, X_test, y_train, y_test)

              precision    recall  f1-score   support

           0       0.92      0.91      0.92    292803
           1       0.68      0.70      0.69     77485

    accuracy                           0.87    370288
   macro avg       0.80      0.81      0.80    370288
weighted avg       0.87      0.87      0.87    370288

Accuracy Score:  0.867603054919414


### 3. SVM

In [12]:
predict(LinearSVC(), X_train_scaled, X_test_scaled, y_train, y_test)



              precision    recall  f1-score   support

           0       0.92      0.98      0.95    292803
           1       0.89      0.68      0.77     77485

    accuracy                           0.92    370288
   macro avg       0.91      0.83      0.86    370288
weighted avg       0.91      0.92      0.91    370288

Accuracy Score:  0.9160869377349522


### 4. Extra Trees Classsifier

In [13]:
predict(ExtraTreesClassifier(), X_train, X_test, y_train, y_test)

              precision    recall  f1-score   support

           0       0.92      0.97      0.95    292803
           1       0.88      0.68      0.76     77485

    accuracy                           0.91    370288
   macro avg       0.90      0.83      0.85    370288
weighted avg       0.91      0.91      0.91    370288

Accuracy Score:  0.9123763124918982


### 5. Gradient Boosting Classifier

In [14]:
predict(GradientBoostingClassifier(), X_train, X_test, y_train, y_test)

              precision    recall  f1-score   support

           0       0.92      0.98      0.95    292803
           1       0.89      0.68      0.77     77485

    accuracy                           0.92    370288
   macro avg       0.91      0.83      0.86    370288
weighted avg       0.92      0.92      0.91    370288

Accuracy Score:  0.9166000518515318


## 6. Random Forest Classifier

In [15]:
predict(RandomForestClassifier(), X_train, X_test, y_train, y_test)

              precision    recall  f1-score   support

           0       0.92      0.98      0.95    292803
           1       0.88      0.70      0.78     77485

    accuracy                           0.92    370288
   macro avg       0.90      0.84      0.86    370288
weighted avg       0.92      0.92      0.91    370288

Accuracy Score:  0.9170240461478633
