# Import Dependencies

In [167]:
import warnings
warnings.simplefilter('ignore')

%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

# Read the CSV and Perform Basic Data Cleaning

In [168]:
df = pd.read_csv("../Data/trial1.csv")
# Drop the null columns where all values are null
df = df.dropna(axis='columns', how='all')
# Drop the null rows
df = df.dropna()
df.head()

Unnamed: 0,AIRLINE,AIRLINE_P,DEST_AIRPORT,DEST_AIRPORT_P,Delays,YEAR,MONTH,Flight_Status
0,EV,5,MEM,69,98.0,2017,4,Delayed
1,EV,5,GPT,40,68.0,2017,4,Delayed
2,EV,5,SAT,98,92.0,2017,4,Delayed
3,EV,5,GPT,40,53.0,2017,4,Delayed
4,NK,8,TPA,112,93.0,2017,4,Delayed


# Select features (columns)

In [169]:
df1 = df.drop(['AIRLINE', 'DEST_AIRPORT'], axis=1)
df1.head()

Unnamed: 0,AIRLINE_P,DEST_AIRPORT_P,Delays,YEAR,MONTH,Flight_Status
0,5,69,98.0,2017,4,Delayed
1,5,40,68.0,2017,4,Delayed
2,5,98,92.0,2017,4,Delayed
3,5,40,53.0,2017,4,Delayed
4,8,112,93.0,2017,4,Delayed


In [170]:
X = df1.iloc[:, :-1]  #independent columns
y = df1['Flight_Status'].values    #target column i.e price range
print(X.shape, y.shape)

(88399, 5) (88399,)


In [171]:
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from tensorflow.keras.utils import to_categorical
label_encoder = LabelEncoder()
label_encoder.fit(y.reshape(-1,1))
encoded_y = label_encoder.transform(y)


# # Step 2: Convert encoded labels to one-hot-encoding
# y_categorical = to_categorical(encoded_y)

print(encoded_y, y)

[0 0 0 ... 0 0 1] ['Delayed' 'Delayed' 'Delayed' ... 'Delayed' 'Delayed' 'Normal']


# Create a Train Test Split

In [172]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,encoded_y,random_state=42,stratify=y)

# Pre-processing

Scale the data using the MinMaxScaler and perform some feature selection

In [173]:
from sklearn.preprocessing import MinMaxScaler
X_scaler = MinMaxScaler().fit(X_train)
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

# Train the model

In [174]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
model = LogisticRegression()

In [175]:
model.fit(X_train, y_train)

LogisticRegression()

## Validate the model using the test data

In [176]:
predictions = model.predict(X_test)

In [177]:
print(f"Training Data Score: {model.score(X_train, y_train)}")
print(f"Testing Data Score: {model.score(X_test, y_test)}")

Training Data Score: 0.913075612000181
Testing Data Score: 0.9149321266968325


In [179]:
print(f"First 10 Predictions:   {predictions[:10].tolist()}")
print(f"First 10 Actual labels: {y_test[:10].tolist()}")

First 10 Predictions:   [0, 0, 0, 1, 1, 0, 0, 0, 1, 1]
First 10 Actual labels: [0, 0, 0, 1, 1, 0, 1, 0, 1, 1]


In [180]:
print(classification_report(y_test, predictions,
                target_names= df1['Flight_Status'].unique()))

              precision    recall  f1-score   support

     Delayed       0.95      0.93      0.94     15648
      Normal       0.84      0.87      0.86      6452

    accuracy                           0.91     22100
   macro avg       0.89      0.90      0.90     22100
weighted avg       0.92      0.91      0.92     22100



# Hyperparameter Tuning

Use `GridSearchCV` to tune the model's parameters

In [181]:
# Create the GridSearchCV model
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.linear_model import LogisticRegression
grid={"C":np.logspace(-3,3,7), "penalty":["l1","l2"]}  # l1 lasso l2 ridge
logreg=LogisticRegression()
logreg_cv=RandomizedSearchCV(logreg,grid,cv=10)
logreg_cv.fit(X_train,y_train)

RandomizedSearchCV(cv=10, estimator=LogisticRegression(),
                   param_distributions={'C': array([1.e-03, 1.e-02, 1.e-01, 1.e+00, 1.e+01, 1.e+02, 1.e+03]),
                                        'penalty': ['l1', 'l2']})

In [182]:
C1 = [1.e-03, 1.e-02, 1.e-01, 1.e+00, 1.e+01, 1.e+02, 1.e+03]

In [183]:
print("Best parameters: ",logreg_cv.best_params_)
print("Accuracy :",logreg_cv.best_score_)

Best parameters:  {'penalty': 'l2', 'C': 1000.0}
Accuracy : 0.913075603403574


In [184]:
log = LogisticRegression(penalty='l2', C=1000.0)
log.fit(X_train,y_train)

LogisticRegression(C=1000.0)

In [185]:
# Make predictions with the hypertuned model
predictions1 = log.predict(X_test)
print(classification_report(y_test, predictions1,
                target_names=df1['Flight_Status'].unique()))

              precision    recall  f1-score   support

     Delayed       0.95      0.93      0.94     15648
      Normal       0.84      0.87      0.86      6452

    accuracy                           0.91     22100
   macro avg       0.89      0.90      0.90     22100
weighted avg       0.92      0.91      0.92     22100



In [186]:
from sklearn.metrics import accuracy_score
print("Accuracy for LR on data: ",accuracy_score(y_test,predictions1))

Accuracy for LR on data:  0.9149321266968325


# Save the Model

In [187]:
# save your model by updating "your_name" with your name
# and "your_model" with your model variable
# be sure to turn this in to BCS
# if joblib fails to import, try running the command to install in terminal/git-bash
import joblib
filename = 'LRmodel.sav'
joblib.dump(model, filename)

['LRmodel.sav']