In [32]:
#Load Libraries
import pandas as pd
import numpy as np
import sqlite3 as sqlite3
import numpy as np
import pickle
from sklearn.model_selection import train_test_split
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import RandomOverSampler
from sklearn.metrics import accuracy_score
from sklearn import svm
from sklearn.model_selection import train_test_split, GridSearchCV, KFold
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.multiclass import OneVsRestClassifier

In [2]:
# Load the wildfire records into a DataFrame.
# CONT_TIME is derived in SQL as CONT_DATE - DISCOVERY_DATE.

con = sqlite3.connect("data/wildfire.sqlite")
try:
    fires = pd.read_sql_query(
        "select NWCG_REPORTING_AGENCY,CONT_DATE - DISCOVERY_DATE as CONT_TIME, CONT_DOY, \
    LONGITUDE,LATITUDE, SOURCE_SYSTEM_TYPE,DISCOVERY_DATE, FIRE_YEAR,\
    DISCOVERY_DOY,STAT_CAUSE_DESCR,FIRE_SIZE from fires", con)
finally:
    # Release the SQLite handle even when the query raises.
    con.close()

In [3]:
# Summarise the loaded frame: column names, dtypes and memory footprint.
fires.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1880465 entries, 0 to 1880464
Data columns (total 11 columns):
 #   Column                 Dtype  
---  ------                 -----  
 0   NWCG_REPORTING_AGENCY  object 
 1   CONT_TIME              float64
 2   CONT_DOY               float64
 3   LONGITUDE              float64
 4   LATITUDE               float64
 5   SOURCE_SYSTEM_TYPE     object 
 6   DISCOVERY_DATE         float64
 7   FIRE_YEAR              int64  
 8   DISCOVERY_DOY          int64  
 9   STAT_CAUSE_DESCR       object 
 10  FIRE_SIZE              float64
dtypes: float64(6), int64(2), object(3)
memory usage: 157.8+ MB


In [4]:
# Keep the first occurrence of every fully identical row, then show (rows, columns).
fires = fires[~fires.duplicated()]
fires.shape

(1874278, 11)

In [5]:
# Discard every row that has at least one missing value, then show (rows, columns).
# In the recorded run this reduces the frame from 1,874,278 to 985,122 rows.
fires = fires.dropna(axis=0, how="any")
fires.shape

(985122, 11)

In [6]:
# Keep only rows with a specific cause label: the two catch-all categories are removed.
catch_all_causes = ["Missing/Undefined", "Miscellaneous"]
fires1 = fires[~fires["STAT_CAUSE_DESCR"].isin(catch_all_causes)]
fires1.shape

(751376, 11)

In [7]:
# fires1["combined_date_dis"] = fires1["FIRE_YEAR"]*1000 + fires1["DISCOVERY_DOY"]
# fires1["combined_date_dis"] = pd.to_datetime(fires1["combined_date_dis"], format = "%Y%j")
# fires1["combined_date_dis"] = pd.to_datetime(fires1["combined_date_dis"])
# fires1["combined_date_con"] = fires1["FIRE_YEAR"]*1000 + fires1["CONT_DOY"]
# fires1["combined_date_con"] = pd.to_datetime(fires1["combined_date_con"], format = "%Y%j",errors="ignore")
# fires1["combined_date_con"] = pd.to_datetime(fires1["combined_date_con"],errors="coerce")
# fires1["week_day"] = fires1.combined_date_dis.dt.weekday
# fires1["month"] = fires1.combined_date_dis.dt.month

In [8]:
# fires1=fires1.drop(['CONT_DOY', 'FIRE_YEAR', 'combined_date_dis', 'combined_date_con'],axis=1)
# Remove the two columns that are not kept as predictors, then re-inspect the frame.
# Note: re-running this cell on its own fails because the columns are already gone.
fires1 = fires1.drop(columns=['CONT_DOY', 'FIRE_YEAR'])
fires1.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 751376 entries, 1 to 1880458
Data columns (total 9 columns):
 #   Column                 Non-Null Count   Dtype  
---  ------                 --------------   -----  
 0   NWCG_REPORTING_AGENCY  751376 non-null  object 
 1   CONT_TIME              751376 non-null  float64
 2   LONGITUDE              751376 non-null  float64
 3   LATITUDE               751376 non-null  float64
 4   SOURCE_SYSTEM_TYPE     751376 non-null  object 
 5   DISCOVERY_DATE         751376 non-null  float64
 6   DISCOVERY_DOY          751376 non-null  int64  
 7   STAT_CAUSE_DESCR       751376 non-null  object 
 8   FIRE_SIZE              751376 non-null  float64
dtypes: float64(5), int64(1), object(3)
memory usage: 57.3+ MB


In [9]:
# Target: the recorded fire cause. Predictors: every other column.
yFires = fires1['STAT_CAUSE_DESCR']
xFires = fires1.drop(columns='STAT_CAUSE_DESCR')

# One-hot encode the two categorical predictors.
xFires = pd.get_dummies(
    xFires,
    columns=['NWCG_REPORTING_AGENCY', 'SOURCE_SYSTEM_TYPE'],
)

# Hold out 10% of the rows as the test set (fixed seed for repeatability).
xTrain, xTest, yTrain, yTest = train_test_split(
    xFires,
    yFires,
    test_size=0.1,
    random_state=441,
)

xTrain.shape
# yTrain.shape

(676238, 19)

In [10]:
# Number of training rows per fire cause.
# `counts` is read as a global by count_under_10000().
counts = yTrain.value_counts()
counts

Lightning         204407
Debris Burning    166407
Arson             130911
Campfire           49723
Equipment Use      49171
Children           26106
Smoking            22365
Fireworks           9402
Powerline           7560
Railroad            7459
Structure           2727
Name: STAT_CAUSE_DESCR, dtype: int64

In [11]:
from imblearn.over_sampling import RandomOverSampler

def count_under_10000(colname, class_counts=None, floor=10000):
    """Return the resampling target for one class: its count, raised to ``floor``.

    Parameters
    ----------
    colname
        Label of the class to look up.
    class_counts
        Mapping (dict or Series) from class label to row count. When omitted,
        the notebook-level ``counts`` variable is used, as before. Passing it
        explicitly avoids depending on whichever value ``counts`` currently
        holds in the kernel.
    floor
        Minimum value to return. Defaults to 10000.

    Returns
    -------
    The class's own count when it is at least ``floor``, otherwise ``floor``.

    Raises
    ------
    KeyError
        If ``colname`` is not present in the counts being consulted.
    """
    if class_counts is None:
        class_counts = counts
    observed = class_counts[colname]
    if observed < floor:
        return floor
    return observed

# Oversample the rare classes: each class is given the target returned by
# count_under_10000(), i.e. small classes are raised to 10000 rows and the
# others keep their current size.
causes_to_resample = (
    'Lightning', 'Debris Burning', 'Campfire',
    'Equipment Use',
    'Arson', 'Children', 'Railroad',
    'Smoking', 'Powerline',
    'Fireworks', 'Structure',
)
OverSampleRatio = {cause: count_under_10000(cause) for cause in causes_to_resample}

newSampStrat = RandomOverSampler(
    sampling_strategy=OverSampleRatio,
    random_state=441,
)
# Replaces xTrain / yTrain with the oversampled training set.
xTrain, yTrain = newSampStrat.fit_resample(xTrain, yTrain)

In [12]:
# Recount the classes after oversampling.
# This rebinds the `counts` global that count_under_10000() reads, so running
# the oversampling cell again after this one would use these newer counts.
counts = yTrain.value_counts()
counts

Lightning         204407
Debris Burning    166407
Arson             130911
Campfire           49723
Equipment Use      49171
Children           26106
Smoking            22365
Structure          10000
Powerline          10000
Railroad           10000
Fireworks          10000
Name: STAT_CAUSE_DESCR, dtype: int64

In [13]:
from imblearn.under_sampling import RandomUnderSampler


# Undersample every class except the smallest one so that all classes end up
# the same size (10000 rows each in the recorded run).
newSampStrat = RandomUnderSampler(
    sampling_strategy='not minority',
    random_state=441,
)
# Replaces xTrain / yTrain with the undersampled training set.
xTrain, yTrain = newSampStrat.fit_resample(xTrain, yTrain)

In [14]:
# Class sizes of the training target after undersampling.
yTrain.value_counts()

Structure         10000
Equipment Use     10000
Lightning         10000
Children          10000
Railroad          10000
Debris Burning    10000
Arson             10000
Smoking           10000
Powerline         10000
Campfire          10000
Fireworks         10000
Name: STAT_CAUSE_DESCR, dtype: int64

In [15]:
# RBF kernel with covariate scaling
model_rbf = Pipeline(
    steps=[("scaler", StandardScaler()),
           ("model", svm.SVC(kernel="rbf"))]
)
# NOTE(review): this one-vs-rest wrapper is built but never passed to the grid
# search below, which tunes `model_rbf` directly. The multiclass strategy is
# therefore whatever svm.SVC applies on its own - confirm that is intended.
# Searching over the wrapper would also require the grid keys to be prefixed
# with "estimator__" (e.g. "estimator__model__C").
model_to_set = OneVsRestClassifier(model_rbf)

# tuning parameter grid
# model__xyz specifies that parameter xyz is a parameter to model
param_grid = {
    "model__C": [.01, 1, 100],
    "model__class_weight": [None, "balanced"],
    "model__gamma": ["scale", "auto"]
}

# crossvalidation folds
cv = KFold(
    n_splits=5,       # number of folds
    shuffle=True,     # protects against data being ordered, e.g., all successes first
    random_state=441  # same seed as the split and the resamplers, so folds are reproducible
)

# 3 x 2 x 2 = 12 parameter combinations, each evaluated on all 5 folds.
cv_rbf_onevall = GridSearchCV(
    estimator = model_rbf,
    param_grid = param_grid,
    cv = cv
)

In [16]:
%%time
# Run the grid search on the resampled training set.
# The recorded wall time for this cell is about 15 hours.
cv_rbf_onevall.fit(X=xTrain, y=yTrain)

Wall time: 15h 4min 55s


In [18]:
# Raw grid-search results: per-combination fit/score timings and parameter values.
cv_rbf_onevall.cv_results_

{'mean_fit_time': array([592.73853679, 595.93523722, 665.49140964, 671.71331196,
        507.58613315, 535.73193483, 570.97944741, 578.73534555,
        756.79338703, 807.1588387 , 804.34122252, 873.67104511]),
 'std_fit_time': array([ 0.63446882,  1.12644403,  3.24754835,  4.88556244,  9.52419806,
        18.82196498, 14.8581121 ,  6.62776494,  4.89390872, 68.42738092,
        10.62435302, 43.1783417 ]),
 'mean_score_time': array([222.19781899, 222.40423155, 222.22548881, 222.63417449,
        221.10265083, 225.0935884 , 216.87127857, 213.9867475 ,
        207.81057768, 211.19906588, 206.57024803, 220.17426529]),
 'std_score_time': array([ 0.13968939,  0.14347161,  0.18562101,  0.45195835, 10.21550987,
        11.80446931,  5.57870563,  0.35293487,  3.22504238,  6.86236595,
         0.61013407,  6.53181787]),
 'param_model__C': masked_array(data=[0.01, 0.01, 0.01, 0.01, 1, 1, 1, 1, 100, 100, 100, 100],
              mask=[False, False, False, False, False, False, False, False,
       

In [19]:
# Keep the pipeline that the grid search selected as best.
final_model = cv_rbf_onevall.best_estimator_

In [22]:
# Predict the fire cause for every held-out test row.
yPred = final_model.predict(xTest) 

In [26]:
# Fraction of test rows whose predicted cause matches the recorded cause.
# The test set was never resampled, so it keeps the original class imbalance,
# whereas the model was trained on balanced classes.
accuracy = accuracy_score(yTest, yPred)
accuracy_pct = accuracy * 100.0
print("Test Accuracy: %.2f%%" % (accuracy_pct,))

Test Accuracy: 42.24%


In [29]:
# best_score_ is the mean cross-validated score of the winning parameter
# combination, i.e. accuracy on the held-out folds - not accuracy on the data
# the model was fitted to. The label says so explicitly.
print("The best mean cross-validated accuracy is ", cv_rbf_onevall.best_score_ * 100, "%")

The best training accuracy score is  37.82454545454546 %


In [34]:
%%time
filename='SVM_model_proposal'
# Persist the whole fitted GridSearchCV object (not only the best estimator).
# The context manager closes the file handle, so the pickle is flushed to disk
# and the handle is not left open in the kernel.
with open(filename, 'wb') as model_file:
    pickle.dump(cv_rbf_onevall, model_file) #Saving the model

Wall time: 314 ms


In [42]:
# Save the test-set predictions; `with` guarantees the file is closed and flushed.
with open('predictions', 'wb') as predictions_file:
    pickle.dump(yPred, predictions_file)

In [44]:
# Read the saved predictions back to check the round trip.
# NOTE: only unpickle files you created yourself - pickle.load can execute
# arbitrary code from an untrusted file.
with open('predictions', 'rb') as predictions_file:
    saved_predictions = pickle.load(predictions_file)
saved_predictions

array(['Structure', 'Campfire', 'Fireworks', ..., 'Railroad', 'Lightning',
       'Structure'], dtype=object)

In [41]:
# Display the in-memory test-set predictions.
yPred

array(['Structure', 'Campfire', 'Fireworks', ..., 'Railroad', 'Lightning',
       'Structure'], dtype=object)