In [27]:
# %pip install python-dotenv
# %pip install seaborn
# %pip install tensorflow_data_validation


In [28]:
import os
import pandas as pd
import geopandas as gpd
import pygeos as pg
import numpy as np
# import tensorflow as tf
# import tensorflow_data_validation as tfdv
import sklearn as sk
import scipy as sp
import seaborn as sns
# from datetime import datetime
# from dotenv import load_dotenv
from IPython.display import clear_output
from matplotlib import pyplot as plt
from shapely import wkt


In [29]:
# The following lines adjust the granularity of reporting.
#pd.options.display.max_rows = 10
pd.options.display.float_format = "{:.1f}".format
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
os.chdir('F:\\Uni Files\\4710\\4710 Project\\MLweatherForestFire')


In [30]:
def getGPDfromPD(df: pd.DataFrame, geomCol: str, crs: str = "EPSG:3978") -> gpd.GeoDataFrame:
    """
    Convert a pandas dataframe to a geopandas dataframe
    :param df: pandas dataframe
    :param geomCol: name of the geometry column
    :param crs: coordinate reference system
    :return: geopandas dataframe
    """
    if 'geom' in df.columns:
        df.rename(columns={'geom': 'geometry'}, inplace=True)

    df[geomCol] = df[geomCol].apply(wkt.loads)
    gdf = gpd.GeoDataFrame(df, geometry=geomCol, crs=crs)
    return gdf


In [31]:
fireWeatherTable = "Data/FinalFeature.csv"
dfFireWeather = pd.read_csv(fireWeatherTable)


In [32]:
dfEval = dfFireWeather


In [33]:
# Store our random selection, run once
# randomTrain = "RandomTrain"
# dfTrain.to_sql(randomTrain, db_push_con, if_exists='replace', index=False)

# randomTest = "RandomTest"
# dfTest.to_sql(randomTest, db_push_con, if_exists='replace', index=False)

# randomValidate = "RandomValidate"
# dfValidate.to_sql(randomValidate, db_push_con, if_exists='replace', index=False)


In [34]:
from sklearn.linear_model import RidgeCV
from sklearn.pipeline import make_pipeline
from sklearn.compose import TransformedTargetRegressor
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve
from sklearn.metrics import auc
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import f1_score
# from sklearn.metrics import confusion_matrix
# from sklearn.metrics import plot_confusion_matrix
# from sklearn.metrics import classification_report
# from sklearn.inspection import DecisionBoundaryDisplay


In [35]:
dfEval.columns


Index(['ENTRYID', 'FIRE_ID', 'FIRENAME', 'YEAR', 'MONTH', 'DAY', 'REP_DATE',
       'SIZE_HA', 'SIZE_HA_BIN', 'GEOM', 'ELEVATIONM', 'DIST_TO_WATER',
       'CLIMATEID', 'PROVINCECODE', 'DAYW', 'MAXTEMP', 'MEANHUMIDITY',
       'MEANWINDSPEED', 'MAXWINDSPEED', 'TOTALPRECIP', 'RAIN', 'LONG', 'LAT',
       'LONGBIN', 'LATBIN'],
      dtype='object')

In [36]:
dfTrim = dfEval.copy(deep=True)

# edit this
dfTrim = dfTrim.drop(columns={'ENTRYID', 'FIRE_ID', 'FIRENAME', 'GEOM', 'CLIMATEID', 'PROVINCECODE',
                              'TOTALPRECIP', 'LONG', 'LAT', 'REP_DATE', 'SIZE_HA'})


In [37]:
# replace size_ha_bin 0 1 2 3 with L M H E

dfTrim['SIZE_HA_BIN'] = dfTrim['SIZE_HA_BIN'].astype(str).replace('0', 'L')
dfTrim['SIZE_HA_BIN'] = dfTrim['SIZE_HA_BIN'].astype(str).replace('1', 'M')
dfTrim['SIZE_HA_BIN'] = dfTrim['SIZE_HA_BIN'].astype(str).replace('2', 'H')
dfTrim['SIZE_HA_BIN'] = dfTrim['SIZE_HA_BIN'].astype(str).replace('3', 'E')


In [38]:
# randomly select 6 years from 2010-2019 for training
dfTrain = dfTrim[dfTrim['YEAR'].isin(
    [2010, 2011, 2012, 2013, 2014, 2015, 2016])].drop(columns={'SIZE_HA_BIN'})
dfTest = dfTrim[dfTrim['YEAR'].isin([2017, 2018, 2019, 2020])].drop(
    columns={'SIZE_HA_BIN'})


In [39]:
def scaleData(df: pd.DataFrame) -> pd.DataFrame:
    # regularize y values using z score
    df = (df - df.mean()) / df.std()
    # set max value to 3 zscore
    df[df > 3] = 3
    # set min value to -3 zscore
    df[df < -3] = -3

    # shift the wole train set to be positive
    df = df + 3

    # return dataframe list of means and stds
    return df, df.mean(), df.std()


In [40]:
dfTrainScaled, dfMeans, dfStdevs = scaleData(dfTrain.copy(deep=True))
dfTrainScaled['YEAR'] = dfTrain['YEAR']
dfTrainScaled['SIZE_HA_BIN'] = dfTrim[dfTrim['YEAR'].isin(
    [2010, 2011, 2012, 2013, 2014, 2015, 2016])]['SIZE_HA_BIN']


In [41]:

print(dfTrainScaled.count())
print(dfTrainScaled.isna().sum().sum())
print(dfTrainScaled.dtypes)


YEAR             357
MONTH            357
DAY              357
ELEVATIONM       357
DIST_TO_WATER    357
DAYW             357
MAXTEMP          357
MEANHUMIDITY     357
MEANWINDSPEED    357
MAXWINDSPEED     357
RAIN             357
LONGBIN          357
LATBIN           357
SIZE_HA_BIN      357
dtype: int64
0
YEAR               int64
MONTH            float64
DAY              float64
ELEVATIONM       float64
DIST_TO_WATER    float64
DAYW             float64
MAXTEMP          float64
MEANHUMIDITY     float64
MEANWINDSPEED    float64
MAXWINDSPEED     float64
RAIN             float64
LONGBIN          float64
LATBIN           float64
SIZE_HA_BIN       object
dtype: object


In [42]:
print(dfTrainScaled.describe())


        YEAR  MONTH   DAY  ELEVATIONM  DIST_TO_WATER  DAYW  MAXTEMP  \
count  357.0  357.0 357.0       357.0          357.0 357.0    357.0   
mean  2012.9    3.0   3.0         3.0            3.0   3.0      3.0   
std      2.1    1.0   1.0         1.0            0.9   1.0      0.9   
min   2010.0    0.4   1.4         2.0            2.1   1.4      0.0   
25%   2011.0    2.5   2.0         2.4            2.3   2.0      2.6   
50%   2013.0    3.5   3.1         2.5            2.6   3.1      3.2   
75%   2015.0    3.5   3.9         2.8            3.3   3.9      3.6   
max   2016.0    6.0   4.5         6.0            6.0   4.5      5.1   

       MEANHUMIDITY  MEANWINDSPEED  MAXWINDSPEED  RAIN  LONGBIN  LATBIN  
count         357.0          357.0         357.0 357.0    357.0   357.0  
mean            3.0            3.0           3.0   3.0      3.0     3.0  
std             1.0            0.9           1.0   1.0      1.0     1.0  
min             0.2            1.4           1.2   2.5      0.5 

In [43]:
C = 1  # SVM regularization parameter

model1 = svm.SVC(kernel="linear", C=C, decision_function_shape='ovo')
model2 = svm.LinearSVC(C=C, max_iter=10000)
model3 = svm.SVC(kernel="rbf", gamma=0.7, C=C, decision_function_shape='ovo')
model4 = svm.SVC(kernel="poly", degree=3, gamma="auto",
                 C=C, decision_function_shape='ovo')
model5 = svm.SVC(kernel="sigmoid", gamma="auto", C=C, decision_function_shape='ovo')

# random forest
model6 = RandomForestClassifier(n_estimators=100, max_depth=2, random_state=0)
model7 = RandomForestClassifier(n_estimators=100, max_depth=20, random_state=0)
model8 = RandomForestClassifier(n_estimators=100, max_depth=50, random_state=0)
model9 = RandomForestClassifier(n_estimators=1000, max_depth=100, random_state=0)

dfFeatures = dfTrainScaled.drop(['SIZE_HA_BIN', 'YEAR'], axis=1)
dfLabel = dfTrainScaled['SIZE_HA_BIN']

model1 = model1.fit(dfFeatures, dfLabel)
model2 = model2.fit(dfFeatures, dfLabel)
model3 = model3.fit(dfFeatures, dfLabel)
model4 = model4.fit(dfFeatures, dfLabel)
model5 = model5.fit(dfFeatures, dfLabel)
model6 = model6.fit(dfFeatures, dfLabel)
model7 = model7.fit(dfFeatures, dfLabel)
model8 = model8.fit(dfFeatures, dfLabel)
model9 = model9.fit(dfFeatures, dfLabel)


In [44]:
# scale test data using the same means and stds
dfTestScaled = (dfTest - dfMeans) / dfStdevs

# shift the wole test set to be positive
dfTestScaled = dfTestScaled + 3

# add year column back
dfTestScaled['YEAR'] = dfTest['YEAR']
dfTestScaled['SIZE_HA_BIN'] = dfTrim[dfTrim['YEAR'].isin(
    [2017, 2018, 2019, 2020])]['SIZE_HA_BIN']


In [45]:
print(dfTestScaled.head())

     YEAR  MONTH  DAY  ELEVATIONM  DIST_TO_WATER  DAYW  MAXTEMP  MEANHUMIDITY  \
737  2017    8.1  5.0       378.6         6284.5   4.0     28.3          61.3   
738  2017    7.1 21.0      1683.8          625.3  20.0     19.9          54.8   
739  2017    8.1  5.0       450.5          870.8   4.0     26.0          74.2   
740  2017    8.1 12.0       454.6         7020.1  11.0     28.8          72.9   
741  2017    7.1 27.0       409.0         1151.0  26.0     28.6          70.5   

     MEANWINDSPEED  MAXWINDSPEED  RAIN  LONGBIN  LATBIN SIZE_HA_BIN  
737            7.1          11.2  -0.0    470.0   486.8           M  
738            7.1          10.1  -0.0    352.0   543.5           H  
739            3.5           8.1  -0.0    536.0   454.4           M  
740            3.0           9.1  -0.0    546.0   451.4           M  
741            5.6          10.1  -0.0    503.0   502.0           L  


In [46]:
dfTestFeatures = dfTestScaled.drop(['SIZE_HA_BIN', 'YEAR'], axis=1)

# predict on test data
dfTestScaled['PREDICTED_SIZE_HA_BIN1'] = model1.predict(dfTestFeatures)
dfTestScaled['PREDICTED_SIZE_HA_BIN2'] = model2.predict(dfTestFeatures)
dfTestScaled['PREDICTED_SIZE_HA_BIN3'] = model3.predict(dfTestFeatures)
dfTestScaled['PREDICTED_SIZE_HA_BIN4'] = model4.predict(dfTestFeatures)
dfTestScaled['PREDICTED_SIZE_HA_BIN5'] = model5.predict(dfTestFeatures)
dfTestScaled['PREDICTED_SIZE_HA_BIN6'] = model6.predict(dfTestFeatures)
dfTestScaled['PREDICTED_SIZE_HA_BIN7'] = model7.predict(dfTestFeatures)
dfTestScaled['PREDICTED_SIZE_HA_BIN8'] = model8.predict(dfTestFeatures)
dfTestScaled['PREDICTED_SIZE_HA_BIN9'] = model9.predict(dfTestFeatures)


In [47]:
# show roc curve
def plot_roc_curve(fpr, tpr, label=None):
    plt.plot(fpr, tpr, linewidth=2, label=label)
    plt.plot([0, 1], [0, 1], 'k--')  # dashed diagonal
    plt.axis([0, 1, 0, 1])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')


In [48]:
# show precision recall curve
def plot_precision_recall_curve(precision, recall, label=None):
    plt.plot(recall, precision, linewidth=2, label=label)
    plt.axis([0, 1, 0, 1])
    plt.xlabel('Recall')
    plt.ylabel('Precision')


In [49]:
# # show accuracy, precision, recall, f1 score
# def show_metrics(y_test, y_pred, labels):
#     print("Accuracy: ", accuracy_score(y_test, y_pred))
#     print("Precision: ", precision_score(
#         y_test, y_pred, labels=labels, average='micro'))
#     print("Recall: ", recall_score(
#         y_test, y_pred, labels=labels, average='micro'))
#     print("F1 Score: ", f1_score(y_test, y_pred, labels=labels, average='micro'))


In [50]:
# show accuracy, precision, recall, f1 score
def show_metrics(y_test, y_pred):
    print("Accuracy: ", accuracy_score(y_test, y_pred))
    print("Precision: ", precision_score(
        y_test, y_pred, average='micro'))
    print("Recall: ", recall_score(
        y_test, y_pred, average='micro'))
    print("F1 Score: ", f1_score(y_test, y_pred, average='micro'))


In [51]:
labels = ['L', 'M', 'H', 'E']
# show metrics for each model
for i in range(1, 10):
    print("Model ", i)
    show_metrics(dfTestScaled['SIZE_HA_BIN'],
                 dfTestScaled['PREDICTED_SIZE_HA_BIN' + str(i)])


Model  1
Accuracy:  0.21951219512195122
Precision:  0.21951219512195122
Recall:  0.21951219512195122
F1 Score:  0.21951219512195122
Model  2
Accuracy:  0.25609756097560976
Precision:  0.25609756097560976
Recall:  0.25609756097560976
F1 Score:  0.25609756097560976
Model  3
Accuracy:  0.25609756097560976
Precision:  0.25609756097560976
Recall:  0.25609756097560976
F1 Score:  0.25609756097560976
Model  4
Accuracy:  0.23170731707317074
Precision:  0.23170731707317074
Recall:  0.23170731707317074
F1 Score:  0.23170731707317074
Model  5
Accuracy:  0.25609756097560976
Precision:  0.25609756097560976
Recall:  0.25609756097560976
F1 Score:  0.25609756097560976
Model  6
Accuracy:  0.3048780487804878
Precision:  0.3048780487804878
Recall:  0.3048780487804878
F1 Score:  0.3048780487804878
Model  7
Accuracy:  0.3048780487804878
Precision:  0.3048780487804878
Recall:  0.3048780487804878
F1 Score:  0.3048780487804878
Model  8
Accuracy:  0.3048780487804878
Precision:  0.3048780487804878
Recall:  0.304

In [52]:
dfTestScaled

Unnamed: 0,YEAR,MONTH,DAY,ELEVATIONM,DIST_TO_WATER,DAYW,MAXTEMP,MEANHUMIDITY,MEANWINDSPEED,MAXWINDSPEED,RAIN,LONGBIN,LATBIN,SIZE_HA_BIN,PREDICTED_SIZE_HA_BIN1,PREDICTED_SIZE_HA_BIN2,PREDICTED_SIZE_HA_BIN3,PREDICTED_SIZE_HA_BIN4,PREDICTED_SIZE_HA_BIN5,PREDICTED_SIZE_HA_BIN6,PREDICTED_SIZE_HA_BIN7,PREDICTED_SIZE_HA_BIN8,PREDICTED_SIZE_HA_BIN9
737,2017,8.1,5.0,378.6,6284.5,4.0,28.3,61.3,7.1,11.2,-0.0,470.0,486.8,M,E,E,E,H,E,E,E,E,H
738,2017,7.1,21.0,1683.8,625.3,20.0,19.9,54.8,7.1,10.1,-0.0,352.0,543.5,H,E,E,E,H,E,E,E,E,E
739,2017,8.1,5.0,450.5,870.8,4.0,26.0,74.2,3.5,8.1,-0.0,536.0,454.4,M,E,E,E,H,E,E,E,E,H
740,2017,8.1,12.0,454.6,7020.1,11.0,28.8,72.9,3.0,9.1,-0.0,546.0,451.4,M,E,E,E,H,E,E,H,H,H
741,2017,7.1,27.0,409.0,1151.0,26.0,28.6,70.5,5.6,10.1,-0.0,503.0,502.0,L,E,E,E,H,E,E,E,E,H
742,2017,8.1,12.0,487.0,4222.1,11.0,28.8,72.9,3.0,9.1,-0.0,547.0,449.4,L,E,E,E,H,E,E,H,H,H
743,2017,8.1,5.0,432.3,2654.2,4.0,26.0,74.2,3.5,8.1,-0.0,560.0,450.4,L,E,E,E,H,E,E,E,E,H
744,2017,7.1,27.0,421.2,510.7,26.0,28.6,70.5,5.6,10.1,-0.0,522.0,493.9,L,E,E,E,H,E,E,E,E,H
745,2017,8.1,28.0,601.4,1936.0,27.0,26.2,55.4,7.3,11.2,-0.0,344.0,395.7,L,E,E,E,H,E,E,E,E,E
746,2017,8.1,28.0,572.0,2257.9,27.0,26.2,66.8,4.2,9.1,-0.0,363.0,402.8,H,E,E,E,H,E,E,E,E,H
