# Import Dependencies

In [95]:
import warnings
warnings.simplefilter('ignore')

%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

# Read the CSV and Perform Basic Data Cleaning

In [96]:
df = pd.read_csv("../Data/trial1.csv")
# Drop the null columns where all values are null
df = df.dropna(axis='columns', how='all')
# Drop the null rows
df = df.dropna()
df.head()

Unnamed: 0,AIRLINE,AIRLINE_P,DEST_AIRPORT,DEST_AIRPORT_P,Delays,YEAR,MONTH,Flight_Status
0,EV,5,MEM,69,98.0,2017,4,Delayed
1,EV,5,GPT,40,68.0,2017,4,Delayed
2,EV,5,SAT,98,92.0,2017,4,Delayed
3,EV,5,GPT,40,53.0,2017,4,Delayed
4,NK,8,TPA,112,93.0,2017,4,Delayed


In [97]:
df1 = df.drop(['AIRLINE', 'DEST_AIRPORT'], axis=1)
df1.head()

Unnamed: 0,AIRLINE_P,DEST_AIRPORT_P,Delays,YEAR,MONTH,Flight_Status
0,5,69,98.0,2017,4,Delayed
1,5,40,68.0,2017,4,Delayed
2,5,98,92.0,2017,4,Delayed
3,5,40,53.0,2017,4,Delayed
4,8,112,93.0,2017,4,Delayed


# Select features (columns)

In [98]:
X = df1.iloc[:, :-1]  #independent columns
y = df1['Flight_Status'].values    #target column i.e price range
print(X.shape, y.shape)

(88399, 5) (88399,)


In [99]:
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from tensorflow.keras.utils import to_categorical
label_encoder = LabelEncoder()
label_encoder.fit(y.reshape(-1,1))
encoded_y = label_encoder.transform(y)


# # Step 2: Convert encoded labels to one-hot-encoding
# y_categorical = to_categorical(encoded_y)

print(encoded_y.shape)

(88399,)


# Create a Train Test Split

In [100]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,encoded_y,random_state=42,stratify=encoded_y)

In [101]:
from sklearn.preprocessing import MinMaxScaler
X_scaler = MinMaxScaler().fit(X_train)
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [102]:
from sklearn.svm import SVC
from sklearn.metrics import classification_report
model = SVC(kernel='rbf')

In [103]:
model.fit(X_train, y_train)

SVC()

## Validate the model using the test data

In [104]:
predictions = model.predict(X_test)

In [105]:
print(f"Training Data Score: {model.score(X_train, y_train)}")
print(f"Testing Data Score: {model.score(X_test, y_test)}")

Training Data Score: 0.9140861853119957
Testing Data Score: 0.9156561085972851


In [106]:
print(f"First 10 Predictions:   {predictions[:10].tolist()}")
print(f"First 10 Actual labels: {y_test[:10].tolist()}")

First 10 Predictions:   [0, 0, 0, 1, 1, 0, 0, 0, 1, 1]
First 10 Actual labels: [0, 0, 0, 1, 1, 0, 1, 0, 1, 1]


In [107]:
pd.DataFrame({"Prediction": predictions, "Actual": y_test}).reset_index(drop=True)

Unnamed: 0,Prediction,Actual
0,0,0
1,0,0
2,0,0
3,1,1
4,1,1
...,...,...
22095,0,0
22096,0,0
22097,0,0
22098,0,0


In [108]:
print(classification_report(y_test, predictions,
                target_names=df1['Flight_Status'].unique()))

              precision    recall  f1-score   support

     Delayed       0.94      0.94      0.94     15648
      Normal       0.85      0.87      0.86      6452

    accuracy                           0.92     22100
   macro avg       0.90      0.90      0.90     22100
weighted avg       0.92      0.92      0.92     22100



# Hyperparameter Tuning

Use `GridSearchCV` to tune the model's parameters

In [109]:
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
model1 = SVC(kernel='rbf')
param_grid = {'kernel':('linear', 'poly', 'rbf', 'sigmoid', 'precomputed'), 'C':(1,0.25,0.5,0.75),'gamma': (1,2,3,'auto'),'decision_function_shape':('ovo','ovr'),'shrinking':(True,False)}
grid = RandomizedSearchCV(model1, param_grid, verbose=3)

In [None]:
grid.fit(X_train, y_train)

Fitting 5 folds for each of 10 candidates, totalling 50 fits
[CV] shrinking=False, kernel=linear, gamma=3, decision_function_shape=ovr, C=1 


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


In [111]:
print(grid.best_params_)
print(grid.best_score_)

AttributeError: 'RandomizedSearchCV' object has no attribute 'best_params_'

In [64]:
# Make predictions with the hypertuned model
predictions = grid.predict(X_test_scaled)
print(classification_report(y_test, predictions,
                target_names=df1['Flight_Status'].unique()))

              precision    recall  f1-score   support

     Delayed       0.95      0.92      0.93     15648
      Normal       0.81      0.88      0.85      6452

    accuracy                           0.91     22100
   macro avg       0.88      0.90      0.89     22100
weighted avg       0.91      0.91      0.91     22100



In [66]:
model1= SVC(kernel='rbf', gamma=2, decision_function_shape='ovo', C=0.75, shrinking=False) 
model1.fit(X_train_scaled, y_train)

SVC(C=0.75, decision_function_shape='ovo', gamma=2, shrinking=False)

In [67]:
pred=model1.predict(X_test)

In [68]:
from sklearn.metrics import accuracy_score
print("Accuracy for SVM on data: ",accuracy_score(y_test,pred))

Accuracy for SVM on data:  0.7080542986425339


# Save the Model

In [69]:
# save your model by updating "your_name" with your name
# and "your_model" with your model variable
# be sure to turn this in to BCS
# if joblib fails to import, try running the command to install in terminal/git-bash
import joblib
filename = 'SVMmodel.sav'
joblib.dump(model1, filename)

['SVMmodel.sav']