# Classification Models
Comparing the predictive performance (precision, recall, accuracy and balanced accuracy) of the following classification models:
* Logistic Regression
* Decision Trees
* Random Forest
* XGBoost
* Extra Trees 
* SVM

## Imports

In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.svm import LinearSVC

from sklearn.metrics import classification_report, accuracy_score, balanced_accuracy_score

## Utilities

In [None]:
# Columns listed here are dropped from the loaded DataFrame.
dropColumns = [
    "Year", "Quarter", "Month", "DayofMonth",
    "Origin", "Dest",
    "DepHour", "ArrHour",
    "FlightDate",
]

# Flight-level predictor columns.
_flight_columns = ["OriginAirportID", "DestAirportID", "DepTime", "DepDelayMinutes"]

# Weather measurements; each one appears twice in the feature list,
# once with the "_Origin" suffix and once with the "_Dest" suffix.
_weather_measurements = [
    "WindSpeedKmph", "WindDirDegree", "WeatherCode", "precipMM",
    "Visibility", "Pressure", "Cloudcover", "DewPointF",
    "WindGustKmph", "tempF", "WindChillF", "Humidity",
]

# Order matters: it fixes the column order of the feature matrix.
features = _flight_columns + [
    f"{measurement}{suffix}"
    for suffix in ("_Origin", "_Dest")
    for measurement in _weather_measurements
]

# Name of the label column.
target = "ArrDel15"

len(features)

28

In [None]:
def predict(model, X_train, X_test, y_train, y_test):
    """Fit ``model`` on the training split and print its test-split metrics.

    Parameters
    ----------
    model : estimator exposing ``fit(X, y)`` and ``predict(X)``
    X_train, y_train : features and labels passed to ``model.fit``
    X_test, y_test : features passed to ``model.predict`` and the labels
        the resulting predictions are scored against

    Prints the classification report, the accuracy score and the balanced
    accuracy score, in that order. Returns nothing; ``model`` is fitted
    in place.
    """
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    print(classification_report(y_test, y_pred))

    # Each scalar metric is printed as "<label> <value>".
    scalar_metrics = (
        ("Accuracy Score: ", accuracy_score),
        ("Balanced Accuracy Score: ", balanced_accuracy_score),
    )
    for label, metric in scalar_metrics:
        print(label, metric(y_test, y_pred))

## Dataset

In [None]:
# Load the CSV, drop the columns listed in `dropColumns`, and cast the
# ArrDel15 label column to a 64-bit integer, all in one chain.
data = (
    pd.read_csv("drive/MyDrive/Data/dataset.csv")
    .drop(columns=dropColumns)
    .astype({"ArrDel15": np.int64})
)

## Separating X and y

In [None]:
# Feature matrix and label vector.
X = data[features]
y = data[target]

# A fixed random_state makes the 80/20 split identical on every run, so all
# model cells below are evaluated on the same test rows and their metrics
# are directly comparable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Fit the scaler on the training rows only. Fitting on the full X would let
# the test rows influence the mean/variance used for scaling (data leakage)
# and make the test metrics optimistic.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

## Models

### Logistic Regression

In [None]:
# Logistic regression is trained and scored on the standardized features.
predict(
    model=LogisticRegression(),
    X_train=X_train_scaled,
    X_test=X_test_scaled,
    y_train=y_train,
    y_test=y_test,
)

              precision    recall  f1-score   support

           0       0.92      0.98      0.95    292422
           1       0.89      0.69      0.77     77866

    accuracy                           0.92    370288
   macro avg       0.91      0.83      0.86    370288
weighted avg       0.91      0.92      0.91    370288

Accuracy Score:  0.9161787581558138
Balanced Accuracy Score:  0.831485263842979


### Decision Trees

In [None]:
# random_state pins the tree's internal randomness (feature permutation and
# tie-breaking between equally good splits), so re-running this cell gives
# the same tree and the same metrics.
predict(DecisionTreeClassifier(random_state=42), X_train, X_test, y_train, y_test)

              precision    recall  f1-score   support

           0       0.92      0.91      0.92    292377
           1       0.68      0.71      0.70     77911

    accuracy                           0.87    370288
   macro avg       0.80      0.81      0.81    370288
weighted avg       0.87      0.87      0.87    370288

Accuracy Score:  0.869701421596163
Balanced Accuracy Score:  0.810183580286804


### Random Forest

In [None]:
# random_state makes the bootstrap samples and per-split feature subsets
# reproducible; n_jobs=-1 builds the trees on all available cores and does
# not change the fitted model for a given random_state.
predict(
    RandomForestClassifier(random_state=42, n_jobs=-1),
    X_train, X_test, y_train, y_test,
)

              precision    recall  f1-score   support

           0       0.92      0.98      0.95    292377
           1       0.89      0.70      0.78     77911

    accuracy                           0.92    370288
   macro avg       0.91      0.84      0.87    370288
weighted avg       0.92      0.92      0.91    370288

Accuracy Score:  0.9182339152227456
Balanced Accuracy Score:  0.839000226365511


### XGBoost

In [None]:
# X_train / X_test come from data[features] via train_test_split, so they are
# already DataFrames with exactly the `features` columns; wrapping them in
# pd.DataFrame(..., columns=features) again only copies the data.
predict(XGBClassifier(), X_train, X_test, y_train, y_test)

              precision    recall  f1-score   support

           0       0.92      0.98      0.95    292377
           1       0.90      0.68      0.78     77911

    accuracy                           0.92    370288
   macro avg       0.91      0.83      0.86    370288
weighted avg       0.92      0.92      0.91    370288

Accuracy Score:  0.917280603206153
Balanced Accuracy Score:  0.8311564830055568


### Extra Trees

In [None]:
# random_state makes the randomized split thresholds reproducible; n_jobs=-1
# builds the trees on all available cores and does not change the fitted
# model for a given random_state.
# NOTE(review): the other tree models are fed the unscaled frames; the scaled
# inputs are kept here as in the original. Confirm which is intended.
predict(
    ExtraTreesClassifier(random_state=42, n_jobs=-1),
    X_train_scaled, X_test_scaled, y_train, y_test,
)

              precision    recall  f1-score   support

           0       0.92      0.97      0.94    292377
           1       0.87      0.67      0.76     77911

    accuracy                           0.91    370288
   macro avg       0.90      0.82      0.85    370288
weighted avg       0.91      0.91      0.91    370288

Accuracy Score:  0.9103076524218986
Balanced Accuracy Score:  0.8220946897652577


### SVM

In [None]:
# The training set has far more samples than features (28), which is the case
# where scikit-learn recommends solving the primal problem (dual=False).
# The primal solver is also deterministic, so no random_state is needed.
predict(LinearSVC(dual=False), X_train_scaled, X_test_scaled, y_train, y_test)



              precision    recall  f1-score   support

           0       0.92      0.98      0.95    292377
           1       0.90      0.68      0.77     77911

    accuracy                           0.92    370288
   macro avg       0.91      0.83      0.86    370288
weighted avg       0.92      0.92      0.91    370288

Accuracy Score:  0.9165595428423281
Balanced Accuracy Score:  0.8301820603405525
