In [2]:
from sklearn.tree import DecisionTreeClassifier
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.svm import SVC
from sklearn.preprocessing import scale
from sklearn import metrics

# Load the ML-ready dataset from CSV into a DataFrame (path is relative to the notebook's working directory).
data = pd.read_csv(filepath_or_buffer="cleaned_mlready_combinedData1.csv")

In [3]:
# Drop rows that contain missing values. DataFrame.dropna() returns a new frame,
# so the result has to be assigned back; a bare `data.dropna()` changes nothing.
data = data.dropna()

# Check out the fire / no-fire ratio. Printed explicitly because only the last
# expression of a cell is displayed automatically.
print(data["Fire"].value_counts())

# Rows and columns remaining after cleaning (displayed as the cell output).
data.shape

(7271, 16)

In [4]:
## Feature matrix: every column except the 'Fire' label
X = data.drop('Fire', axis=1)

## Label vector: the 'Fire' column
y = data.loc[:, 'Fire']

In [5]:
# Standardize each feature column (centre on the mean, scale to unit variance).
# The keyword arguments spell out scale()'s defaults.
# NOTE(review): the statistics are computed over all rows, before the train/test
# split below, so test rows influence the scaling - confirm this is acceptable.
X_scaled = scale(X, axis=0, with_mean=True, with_std=True)

In [6]:
# Randomly partition the scaled features and labels: 40% of the rows for training,
# 60% held out for testing. random_state is fixed so the partition is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X_scaled,
    y,
    train_size=0.4,
    test_size=0.6,
    random_state=4,
)

# Report the dimensions of each partition, one per line.
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape, sep="\n")

(2908, 15)
(4363, 15)
(2908,)
(4363,)


In [7]:
# Baseline classifier: SVM with an RBF kernel, chosen because the data is not expected to be linearly separable.

# Build and train the model in one step (fit() returns the fitted estimator itself)
rbfKernelModel = SVC(kernel='rbf', gamma=0.01, C=10).fit(X_train, y_train)

# Predict labels for the held-out test rows
labelPrediction = rbfKernelModel.predict(X_test)

# Report test-set accuracy
print("Accuracy RBF:", metrics.accuracy_score(y_test, labelPrediction), "\n")

Accuracy RBF: 0.9841851936740774 



In [8]:
# Baseline predictions of the RBF model trained without oversampling
prediction_base = rbfKernelModel.predict(X_test)

# Print accuracy, computed from this cell's own predictions so the cell does not
# depend on a prediction variable left over from another cell
print("Accuracy RBF No Oversampling:", metrics.accuracy_score(y_true=y_test, y_pred=prediction_base), "\n")

# Print the confusion matrix of the baseline model (rows = true label, columns = predicted label).
# It must be printed explicitly: it is not the last expression of the cell, so a
# bare call would be evaluated and then silently discarded.
print(confusion_matrix(y_test, prediction_base, labels=[0,1]))

# Get the per-class precision / recall / F1 report on the base model.
# On imbalanced data, read the minority-class recall rather than overall accuracy.
print(classification_report(y_test,prediction_base))

Accuracy RBF No Oversampling: 0.9841851936740774 

              precision    recall  f1-score   support

           0       0.98      1.00      0.99      4294
           1       0.00      0.00      0.00        69

    accuracy                           0.98      4363
   macro avg       0.49      0.50      0.50      4363
weighted avg       0.97      0.98      0.98      4363



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
from imblearn.over_sampling import SMOTE, ADASYN

# Split the set into randomized training and test feature/label sets FIRST.
# Oversampling before the split leaks information: synthetic minority points
# interpolated from rows that end up in the test set would be trained on, and the
# test set itself would contain synthetic rows, so the reported scores would be inflated.
X_train_original, X_test, y_train_original, y_test = train_test_split(X_scaled, y, test_size = 0.3, train_size = 0.7 ,random_state = 10)

# Oversample the minority class with SMOTE on the training portion only.
# The sampler is seeded so a fresh kernel run reproduces the same synthetic rows.
X_resampled, y_resampled = SMOTE(random_state = 10).fit_resample(X_train_original, y_train_original)

# Later cells read X_train / y_train, so expose the balanced training set under those names.
X_train, y_train = X_resampled, y_resampled

# Create Model
rbfKernelResampledModel = SVC(kernel='rbf')
# Train Model on the balanced training data
rbfKernelResampledModel.fit(X_train, y_train)
# Create label prediction for the untouched (real, still imbalanced) test rows
labelPredictionResampled = rbfKernelResampledModel.predict(X_test)

In [None]:
# Evaluate the resampled-data RBF model currently held in rbfKernelResampledModel.

# Predictions used for the confusion matrix and the classification report
prediction_base_rbf = rbfKernelResampledModel.predict(X_test)

# Overall accuracy on the test set
print(
    "Accuracy RBF Resampled:",
    metrics.accuracy_score(y_test, labelPredictionResampled),
    "\n",
)

# Confusion matrix (rows = true label, columns = predicted label)
print(confusion_matrix(y_true=y_test, y_pred=prediction_base_rbf, labels=[0, 1]))

# Per-class precision / recall / F1
print(classification_report(y_true=y_test, y_pred=prediction_base_rbf))

In [None]:
# Split the set into randomized training and test feature/label sets FIRST.
# Oversampling before the split leaks information: synthetic minority points
# generated from rows that end up in the test set would be trained on, and the
# test set itself would contain synthetic rows, so the reported scores would be inflated.
X_train_original, X_test, y_train_original, y_test = train_test_split(X_scaled, y, test_size = 0.3, train_size = 0.7 ,random_state = 10)

# Oversample the minority class with ADASYN on the training portion only.
# The sampler is seeded so a fresh kernel run reproduces the same synthetic rows.
X_resampled, y_resampled = ADASYN(random_state = 10).fit_resample(X_train_original, y_train_original)

# Later cells (including the grid search) read X_train / y_train, so expose the
# balanced training set under those names.
X_train, y_train = X_resampled, y_resampled

# Create Model
rbfKernelResampledModel = SVC(kernel='rbf')
# Train Model on the balanced training data
rbfKernelResampledModel.fit(X_train, y_train)
# Create label prediction for the untouched (real, still imbalanced) test rows
labelPredictionResampled = rbfKernelResampledModel.predict(X_test)

In [None]:
# Evaluate the resampled-data RBF model currently held in rbfKernelResampledModel.

# Predictions used for the confusion matrix and the classification report
prediction_base_rbf = rbfKernelResampledModel.predict(X_test)

# Overall accuracy on the test set
print(
    "Accuracy RBF Resampled:",
    metrics.accuracy_score(y_test, labelPredictionResampled),
    "\n",
)

# Confusion matrix (rows = true label, columns = predicted label)
print(confusion_matrix(y_true=y_test, y_pred=prediction_base_rbf, labels=[0, 1]))

# Per-class precision / recall / F1
print(classification_report(y_true=y_test, y_pred=prediction_base_rbf))

In [None]:
from sklearn.model_selection import GridSearchCV, KFold

# Hyperparameter tuning for the RBF-kernel SVM via exhaustive grid search.

# 5-fold cross-validation splitter; rows are shuffled with a fixed seed so folds are repeatable
foldsGridSearch = KFold(n_splits=5, shuffle=True, random_state=10)

# Candidate values to try: every (C, gamma) combination is cross-validated
paramsToSearch = [{'C': [5, 10], 'gamma': [1e-2, 1e-3, 1e-4]}]

# Estimator whose hyperparameters are being tuned
modelSVC = SVC(kernel="rbf")

# Grid search scored on accuracy; training-fold scores are kept alongside validation scores
modelCrossValidation = GridSearchCV(
    modelSVC,
    paramsToSearch,
    scoring='accuracy',
    cv=foldsGridSearch,
    verbose=1,
    return_train_score=True,
)

# Run the search on the current training data (kept as the last expression so the fitted object is displayed)
modelCrossValidation.fit(X_train, y_train)

In [None]:
# Pull the winning parameter combination and its mean cross-validated accuracy out of the finished grid search
hyperParametersOptimal = modelCrossValidation.best_params_
bestHyperparameterScore = modelCrossValidation.best_score_

print(
    "The most optimal accuracy for the hyperparameters is {0}  {1}".format(
        bestHyperparameterScore,
        hyperParametersOptimal,
    )
)

In [None]:
# Create Model with the optimal parameters actually selected by the grid search,
# rather than hard-coded values that can drift from what the search found.
rbfKernelResampledModel = SVC(kernel='rbf', **modelCrossValidation.best_params_)
# Train Model
rbfKernelResampledModel.fit(X_train, y_train)
# Create label prediction
labelPredictionResampled = rbfKernelResampledModel.predict(X_test)
# Same predictions under the name used for the matrix/report; predicting a second
# time would only repeat the identical computation.
prediction_base_rbf = labelPredictionResampled

# Print accuracy
print("Accuracy RBF Resampled:", metrics.accuracy_score(y_true=y_test, y_pred=labelPredictionResampled), "\n")

# Print the confusion matrix (rows = true label, columns = predicted label).
# It must be printed explicitly: it is not the last expression of the cell, so a
# bare call would be evaluated and then silently discarded.
print(confusion_matrix(y_test, prediction_base_rbf, labels=[0,1]))

# Get the per-class precision / recall / F1 report on the tuned model
print(classification_report(y_test,prediction_base_rbf))