Predicting fire size class

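This notebook loads fullDataFinal.csv, one-hot encodes the categorical columns, scales the numeric features to the 0-1 range, splits the data into training and test sets, and trains classifiers to predict fire_size_class.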

Importing packages.

In [2]:
import pandas as pd
import matplotlib.pyplot as plt

Importing data into pandas DataFrame.

In [20]:
from sklearn.preprocessing import MinMaxScaler
# Read the dataset, using the first CSV column as the row index, and strip
# stray leading/trailing whitespace from every header name so that later
# look-ups by exact column name succeed.
df = pd.read_csv(r"fullDataFinal.csv", index_col=0).rename(columns=str.strip)

# Preview five randomly chosen rows.
df.sample(5)

Unnamed: 0,fire_size_class,latitude,longitude,discovery_month,weekday,Vegetation,remoteness,elevation,Temp,Wind,Humidity,Precipitation
26355,B,34.5583,-95.436096,Apr,True,Broadleaf Forest,0.256849,171.0,11.973671,4.575608,55.258984,56.0
21750,B,35.345,-79.231666,May,True,Rock,0.109246,119.0,19.041953,1.884228,71.30006,25.633333
13382,B,35.9167,-77.2833,Feb,True,Rock,0.09183,26.0,8.848082,2.856043,54.896172,0.0
1347,B,38.67778,-75.98361,Jun,True,,0.079015,9.0,21.00984,2.059761,75.557838,0.0
5532,B,41.221674,-73.708792,Aug,False,Rock,0.058205,161.0,22.595147,2.470767,72.336829,78.933333


Separate target variable (fire_size_class) from predictor variables.

In [28]:
# Target: the fire size class label of every record.
dfTarget = df["fire_size_class"]

# Predictors: the feature columns, listed explicitly so the selection and its
# column order are fixed here rather than implied by the CSV layout.
predictor_columns = [
    "latitude",
    "longitude",
    "discovery_month",
    "weekday",
    "Vegetation",
    "remoteness",
    "elevation",
    "Temp",
    "Wind",
    "Humidity",
    "Precipitation",
]
dfPredictor = df[predictor_columns]

Changing categorical variables into one-hot encoded ones.

In [25]:
# First let's do fire_size_class: one indicator column per class label.
# The dummies are built directly from df["fire_size_class"] instead of
# concatenating onto dfTarget and dropping the old column in place. The result
# is the same frame, but the cell can now be re-run safely (the previous
# concat + drop(inplace=True) version raised KeyError on a second run because
# "fire_size_class" had already been removed from dfTarget).
dfTarget = pd.get_dummies(df["fire_size_class"])
dfTarget.sample(5)

Unnamed: 0,B,C,D,E,F,G
3805,1,0,0,0,0,0
21623,1,0,0,0,0,0
11211,1,0,0,0,0,0
5831,1,0,0,0,0,0
11141,1,0,0,0,0,0


In [29]:
# Build the indicator columns for each categorical predictor up front.
# Vegetation columns get a "Veg " prefix; month columns keep the raw month label.
vegetation_dummies = pd.get_dummies(dfPredictor["Vegetation"], prefix="Veg", prefix_sep=" ")
month_dummies = pd.get_dummies(dfPredictor["discovery_month"])

# weekday is boolean, so its dummy columns are labelled False / True;
# give them readable names.
weekday_dummies = pd.get_dummies(dfPredictor["weekday"]).rename(
    columns={False: "is_weekend", True: "is_weekday"}
)

# Swap the three original categorical columns for their encoded versions:
# remaining columns first, then vegetation, month and weekday indicators.
dfPredictor = pd.concat(
    [
        dfPredictor.drop(columns=["Vegetation", "discovery_month", "weekday"]),
        vegetation_dummies,
        month_dummies,
        weekday_dummies,
    ],
    axis=1,
)
dfPredictor.sample(5)

Unnamed: 0,latitude,longitude,remoteness,elevation,Temp,Wind,Humidity,Precipitation,Veg Broadleaf Forest,Veg Desert,...,Jan,Jul,Jun,Mar,May,Nov,Oct,Sep,is_weekend,is_weekday
6808,36.1583,-89.375,0.199774,114.0,25.201349,2.250595,62.566236,4.066667,0,0,...,0,0,0,0,0,0,0,0,0,1
26506,47.87,-94.9794,0.247158,377.0,4.079034,4.346316,68.541329,0.0,0,0,...,0,0,0,0,0,0,0,0,0,1
34095,63.583,-142.4689,0.541358,0.0,12.450442,3.435411,46.540814,0.0,0,0,...,0,0,1,0,0,0,0,0,1,0
35684,43.12111,-115.35334,0.044249,1138.0,27.254121,4.519601,34.656949,0.8,0,0,...,0,0,0,0,0,0,0,0,0,1
19788,34.226111,-109.958333,0.39668,2095.0,22.805297,4.828272,23.526771,0.0,1,0,...,0,0,1,0,0,0,0,0,0,1


Scaling numerical data to 0-1 range.

In [30]:
from sklearn.preprocessing import MinMaxScaler

# Numeric feature columns to rescale; the one-hot indicator columns are left as they are.
numeric_columns = ["latitude", "longitude", "remoteness", "elevation", "Temp", "Wind", "Humidity", "Precipitation"]

# Min-max scale every numeric column to the 0-1 range.
# NOTE(review): the scaler is fitted on the full dataset before the train/test
# split, so test rows influence the scaling limits - consider fitting on the
# training rows only.
dfPredictorScaled = pd.DataFrame(
    MinMaxScaler().fit_transform(dfPredictor[numeric_columns]),
    columns=numeric_columns,
    # fit_transform returns a bare array, so the row labels must be re-attached.
    # assign() below matches rows by index label; without this the scaled frame
    # would get a fresh 0..n-1 index and any row whose label differs from its
    # position would receive NaN or another row's values.
    index=dfPredictor.index,
)

# Replace the unscaled columns of the old frame with the scaled ones (column order is kept).
dfPredictor = dfPredictor.assign(**dfPredictorScaled.to_dict(orient="series"))
dfPredictor.sample(5)

Unnamed: 0,latitude,longitude,remoteness,elevation,Temp,Wind,Humidity,Precipitation,Veg Broadleaf Forest,Veg Desert,...,Jan,Jul,Jun,Mar,May,Nov,Oct,Sep,is_weekend,is_weekday
31326,0.27964,0.840855,0.129439,0.024685,0.615659,0.113919,0.82752,0.004082,0,0,...,0,0,0,0,0,0,0,1,1,0
33674,0.558453,0.576295,0.053794,0.298409,0.613024,0.113936,0.42978,0.0,0,0,...,0,0,0,0,0,0,0,0,0,1
17355,0.305251,0.788459,0.175325,0.081733,0.455078,0.131285,0.724636,0.012173,1,0,...,0,0,0,0,0,0,0,0,0,1
6610,0.304535,0.836853,0.132092,0.059517,0.685959,0.090158,0.774746,0.002277,1,0,...,0,1,0,0,0,0,0,0,0,1
6615,0.370797,0.448066,0.500453,0.030444,0.605677,0.136471,0.49921,0.001972,0,0,...,0,0,1,0,0,0,0,0,0,1


Train-test split

In [31]:
from sklearn.model_selection import train_test_split

# Hold out 10 % of the rows for testing; the fixed seed keeps the split
# identical between runs.
X_train, X_test, y_train, y_test = train_test_split(
    dfPredictor,
    dfTarget,
    test_size=0.1,
    random_state=42,
)

Training models, one per cell:

* K-Nearest Neighbour (KNN)
* Naïve Bayes (NB)
* Support Vector Machine (SVM)
* Decision Tree (DT)
* Random Forest (RF)
* Gradient Boosted Trees (GBT)
* Multi-Layer Perceptron (MLP)
* Artificial Neural Network (ANN)

KNN

In [None]:
from sklearn.neighbors import KNeighborsClassifier
import time  # optional for timing the code

# Fit a k-nearest-neighbour classifier for several odd values of k and compare
# its accuracy on the training set with its accuracy on the held-out test set.
# NOTE(review): y_train / y_test are split from the one-hot encoded dfTarget, so
# scikit-learn presumably treats this as a multilabel problem and score()
# reports exact-match accuracy across all indicator columns - confirm intended.
training_accuracy = []
test_accuracy = []
nNeighbors = range(1, 11, 2)  # k = 1, 3, 5, 7, 9

tic = time.perf_counter()  # timing start
for nNeighbor in nNeighbors:
    # build the model
    clf = KNeighborsClassifier(n_neighbors=nNeighbor, n_jobs=-1)
    clf.fit(X_train, y_train)
    # record training set accuracy
    training_accuracy.append(clf.score(X_train, y_train))
    # record generalization accuracy
    test_accuracy.append(clf.score(X_test, y_test))
toc = time.perf_counter()  # timing end

# The timed span covers fitting *and* scoring for every k, so the message says
# so (the previous text also contained a duplicated word: "Trained in in").
print(f"Trained and scored in {toc - tic:0.4f} seconds")

# Plot both accuracy curves on an explicit figure/axes pair so the chart is
# self-contained and carries a title.
fig, ax = plt.subplots()
ax.plot(nNeighbors, training_accuracy, label="training accuracy")
ax.plot(nNeighbors, test_accuracy, label="test accuracy")
ax.set_title("KNN accuracy vs. number of neighbours")
ax.set_ylabel("Accuracy")
ax.set_xlabel("n_neighbors")
ax.legend();