# Import Dependencies

In [97]:
import warnings
warnings.simplefilter('ignore')

%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

# Read the CSV and Perform Basic Data Cleaning

In [98]:
df = pd.read_csv("../Data/trial1.csv")
# Drop the null columns where all values are null
df = df.dropna(axis='columns', how='all')
# Drop the null rows
df = df.dropna()
df.head()

Unnamed: 0,AIRLINE,AIRLINE_P,DEST_AIRPORT,DEST_AIRPORT_P,Delays,YEAR,MONTH,Flight_Status
0,EV,5,MEM,69,98.0,2017,4,Delayed
1,EV,5,GPT,40,68.0,2017,4,Delayed
2,EV,5,SAT,98,92.0,2017,4,Delayed
3,EV,5,GPT,40,53.0,2017,4,Delayed
4,NK,8,TPA,112,93.0,2017,4,Delayed


In [117]:
df1 = df.drop(['AIRLINE', 'DEST_AIRPORT'], axis=1)
df1.head()

Unnamed: 0,AIRLINE_P,DEST_AIRPORT_P,Delays,YEAR,MONTH,Flight_Status
0,5,69,98.0,2017,4,Delayed
1,5,40,68.0,2017,4,Delayed
2,5,98,92.0,2017,4,Delayed
3,5,40,53.0,2017,4,Delayed
4,8,112,93.0,2017,4,Delayed


In [119]:
s = pd.read_csv('../data.csv')
s.head()

Unnamed: 0,AIRLINE,ORIGIN_AIRPORT,ORIGIN_CITY,DEST_AIRPORT,DEST_CITY,DEP_DELAY_TIME_MINUTES,CANCELLED,NO_FLIGHTS,CARRIER_DELAY,WEATHER_DELAY,NAS_DELAY,SECURITY_DELAY,LATE_AIRCRAFT_DELAY,YEAR,MONTH,Flight_Status
0,EV,IAH,"Houston, TX",MEM,"Memphis, TN",58.0,0.0,1.0,0.0,0.0,40.0,0.0,0.0,2017,4,Delayed
1,EV,IAH,"Houston, TX",GPT,"Gulfport/Biloxi, MS",33.0,0.0,1.0,33.0,0.0,2.0,0.0,0.0,2017,4,Delayed
2,EV,IAH,"Houston, TX",SAT,"San Antonio, TX",45.0,0.0,1.0,45.0,0.0,2.0,0.0,0.0,2017,4,Delayed
3,EV,IAH,"Houston, TX",GPT,"Gulfport/Biloxi, MS",23.0,0.0,1.0,23.0,0.0,7.0,0.0,0.0,2017,4,Delayed
4,NK,IAH,"Houston, TX",TPA,"Tampa, FL",51.0,0.0,1.0,0.0,0.0,0.0,0.0,42.0,2017,4,Delayed


# Select features (columns)

In [100]:
X = df1.iloc[:, :-1]  #independent columns
y = df1['Flight_Status'].values    #target column i.e price range
print(X.shape, y.shape)

(88399, 5) (88399,)


In [101]:
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from tensorflow.keras.utils import to_categorical
label_encoder = LabelEncoder()
label_encoder.fit(y.reshape(-1,1))
encoded_y = label_encoder.transform(y)


# # Step 2: Convert encoded labels to one-hot-encoding
# y_categorical = to_categorical(encoded_y)

print(encoded_y, y)

[0 0 0 ... 0 0 1] ['Delayed' 'Delayed' 'Delayed' ... 'Delayed' 'Delayed' 'Normal']


# Create a Train Test Split

In [102]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,encoded_y,random_state=42,stratify=encoded_y)

# Pre-processing

Scale the data using the MinMaxScaler and perform some feature selection

In [103]:
from sklearn.preprocessing import MinMaxScaler
X_scaler = MinMaxScaler().fit(X_train)
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

# Train the model

In [104]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
model = RandomForestClassifier(random_state=42)

In [105]:
model.fit(X_train, y_train)

RandomForestClassifier(random_state=42)

## Validate the model using the test data

In [106]:
predictions = model.predict(X_test)

In [107]:
print(f"Training Data Score: {model.score(X_train, y_train)}")
print(f"Testing Data Score: {model.score(X_test, y_test)}")

Training Data Score: 0.9937857282915278
Testing Data Score: 0.9161085972850679


In [108]:
print(f"First 10 Predictions:   {predictions[:10].tolist()}")
print(f"First 10 Actual labels: {y_test[:10].tolist()}")

First 10 Predictions:   [0, 0, 0, 1, 1, 0, 0, 0, 1, 1]
First 10 Actual labels: [0, 0, 0, 1, 1, 0, 1, 0, 1, 1]


In [109]:
print(classification_report(y_test, predictions,
                target_names=df1['Flight_Status'].unique()))

              precision    recall  f1-score   support

     Delayed       0.93      0.96      0.94     15648
      Normal       0.89      0.81      0.85      6452

    accuracy                           0.92     22100
   macro avg       0.91      0.89      0.90     22100
weighted avg       0.92      0.92      0.91     22100



# Hyperparameter Tuning

Use `GridSearchCV` to tune the model's parameters

In [110]:
## Create the model ##
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
param_grid = { 
    'n_estimators': [200, 500],
    'max_features': ['auto', 'sqrt', 'log2'],
    'max_depth' : [4,5,6,7,8],
    'criterion' :['gini', 'entropy']
}
grid = RandomizedSearchCV(model, param_grid, verbose=3)

## Train the model ##
grid.fit(X_train, y_train)

Fitting 5 folds for each of 10 candidates, totalling 50 fits
[CV] n_estimators=500, max_features=auto, max_depth=8, criterion=gini 


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV]  n_estimators=500, max_features=auto, max_depth=8, criterion=gini, score=0.924, total=  10.0s
[CV] n_estimators=500, max_features=auto, max_depth=8, criterion=gini 


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    9.9s remaining:    0.0s


[CV]  n_estimators=500, max_features=auto, max_depth=8, criterion=gini, score=0.925, total=   9.7s
[CV] n_estimators=500, max_features=auto, max_depth=8, criterion=gini 


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:   19.6s remaining:    0.0s


[CV]  n_estimators=500, max_features=auto, max_depth=8, criterion=gini, score=0.921, total=   9.6s
[CV] n_estimators=500, max_features=auto, max_depth=8, criterion=gini 
[CV]  n_estimators=500, max_features=auto, max_depth=8, criterion=gini, score=0.924, total=   9.8s
[CV] n_estimators=500, max_features=auto, max_depth=8, criterion=gini 
[CV]  n_estimators=500, max_features=auto, max_depth=8, criterion=gini, score=0.922, total=  10.0s
[CV] n_estimators=500, max_features=auto, max_depth=4, criterion=entropy 
[CV]  n_estimators=500, max_features=auto, max_depth=4, criterion=entropy, score=0.923, total=   6.6s
[CV] n_estimators=500, max_features=auto, max_depth=4, criterion=entropy 
[CV]  n_estimators=500, max_features=auto, max_depth=4, criterion=entropy, score=0.925, total=   6.6s
[CV] n_estimators=500, max_features=auto, max_depth=4, criterion=entropy 
[CV]  n_estimators=500, max_features=auto, max_depth=4, criterion=entropy, score=0.921, total=   6.6s
[CV] n_estimators=500, max_featur

[Parallel(n_jobs=1)]: Done  50 out of  50 | elapsed:  4.7min finished


RandomizedSearchCV(estimator=RandomForestClassifier(random_state=42),
                   param_distributions={'criterion': ['gini', 'entropy'],
                                        'max_depth': [4, 5, 6, 7, 8],
                                        'max_features': ['auto', 'sqrt',
                                                         'log2'],
                                        'n_estimators': [200, 500]},
                   verbose=3)

In [111]:
print("Best params :", grid.best_params_)
print("Best score: ", grid.best_score_)

Best params : {'n_estimators': 500, 'max_features': 'auto', 'max_depth': 8, 'criterion': 'gini'}
Best score:  0.923241663905231


In [112]:
RFmodel = RandomForestClassifier(random_state=42, n_estimators=500, max_features='auto', max_depth=8, criterion='gini')
RFmodel.fit(X_train,y_train)

RandomForestClassifier(max_depth=8, n_estimators=500, random_state=42)

In [113]:
# Make predictions with the hypertuned model
predictions = RFmodel.predict(X_test)
print(classification_report(y_test, predictions,
                target_names=df1['Flight_Status'].unique()))

              precision    recall  f1-score   support

     Delayed       0.92      0.98      0.95     15648
      Normal       0.94      0.80      0.86      6452

    accuracy                           0.92     22100
   macro avg       0.93      0.89      0.90     22100
weighted avg       0.93      0.92      0.92     22100



In [114]:
pred=RFmodel.predict(X_test)
print(f"First 10 Predictions:   {pred[:10].tolist()}")
print(f"First 10 Actual labels: {y_test[:10].tolist()}")

First 10 Predictions:   [0, 0, 0, 1, 1, 0, 0, 0, 1, 1]
First 10 Actual labels: [0, 0, 0, 1, 1, 0, 1, 0, 1, 1]


In [115]:
from sklearn.metrics import accuracy_score
print("Accuracy for Random Forest on data: ",accuracy_score(y_test,pred))

Accuracy for Random Forest on data:  0.924841628959276


# Save the Model

In [116]:
# save your model by updating "your_name" with your name
# and "your_model" with your model variable
# be sure to turn this in to BCS
# if joblib fails to import, try running the command to install in terminal/git-bash
import joblib
filename = 'RFmodel.sav'
joblib.dump(RFmodel, filename)

['RFmodel.sav']