# Modelling

## Target Variable
burned area in hectares

## Models
- Neural Network
- Support Vector Machine

## Tasks
1. train-test split
2. cross-validation
3. model training

In [379]:
import pandas as pd
import numpy as np

In [396]:
zscore_ff = pd.read_csv("zscore_ff.csv")
minmax_ff = pd.read_csv("minmax_ff.csv")

In [397]:
from sklearn.model_selection import KFold

In [398]:
zscore_x = zscore_ff.iloc[:,:-1]   # Previously normalized features for model training
zscore_y = zscore_ff.iloc[:, -1]   # Target variable for supervised learning

minmax_x = minmax_ff.iloc[:,:-1]   # Previously normalized features for model training
minmax_y = minmax_ff.iloc[:, -1]   # Target variable for supervised learning

In [399]:
# Define the k-fold cross-validation method
k:int = 5
kf = KFold(
    n_splits=k, random_state=None, shuffle=True
)

In [400]:
from sklearn.svm import SVR

In [401]:
zscore_svm = SVR(
    kernel="rbf",
    epsilon=1
)

minmax_svm = SVR(
    kernel="rbf",
    epsilon=1
)

In [402]:
import tensorflow as tf
from keras.layers import Dense, Input

In [403]:
input_dim_z:int = zscore_x.shape[1] # 12 input features for prediction
input_dim_m:int = minmax_x.shape[1] # 12 input features for prediction

z_neurons_1:int = 256                # number of neurons in first hidden layer
z_neurons_2:int = 128                # number of neurons in second hidden layer
z_neurons_3:int = 32                 # number of neurons in third hidden layer

m_neurons_1:int = 32                # number of neurons in first hidden layer
m_neurons_2:int = 16                # number of neurons in second hidden layer
m_neurons_3:int = 8                 # number of neurons in third hidden layer

output_dim:int = 1                  # 1 target for regression task

epoch_no:int = 10                   # Number of complete pass throughs of the dataset
batch_size:int = 8                  # Number of training samples used per iteration

In [404]:
zscore_tf = tf.keras.Sequential()

zscore_tf.add(Dense(input_dim_z, activation="relu"))
zscore_tf.add(Dense(z_neurons_1, activation="relu"))
zscore_tf.add(Dense(z_neurons_2, activation="relu"))
zscore_tf.add(Dense(z_neurons_3, activation="relu"))
zscore_tf.add(Dense(output_dim, activation="relu"))

zscore_tf.compile(
    optimizer="adam",
    loss="mse",
    metrics=["accuracy"]
)

In [405]:
minmax_tf = tf.keras.Sequential()

minmax_tf.add(Dense(input_dim_z, activation="relu"))
minmax_tf.add(Dense(m_neurons_1, activation="relu"))
minmax_tf.add(Dense(m_neurons_2, activation="relu"))
minmax_tf.add(Dense(m_neurons_3, activation="relu"))
minmax_tf.add(Dense(output_dim, activation="relu"))

minmax_tf.compile(
    optimizer="adam",
    loss="mse",
    metrics=["accuracy"]
)

In [406]:
from sklearn.metrics import mean_squared_error
from sklearn.metrics import root_mean_squared_error

In [407]:
z_svm_mse = []
z_tf_mse = []

m_svm_mse = []
m_tf_mse = []

In [408]:
for train_index, test_index in kf.split(X=zscore_x, y=zscore_y):
    X_train, X_test = zscore_x.iloc[train_index], zscore_x.iloc[test_index]
    y_train, y_test = zscore_y.iloc[train_index], zscore_y.iloc[test_index]
    
    # Fit the model to the current split
    # Each split is independent from another
    zscore_svm.fit(
        X=X_train, y=y_train
    )
    zscore_tf.fit(
        X_train, y_train, epochs=epoch_no, batch_size=batch_size, verbose=0
    )

    # Make predictions based on the current train-test split
    svm_pred = zscore_svm.predict(X=X_test)
    model_pred = zscore_tf.predict(X_test)

    # Add r-square score to list for each split
    z_svm_mse.append(
        mean_squared_error(y_true=y_test, y_pred=svm_pred)
    )
    z_tf_mse.append(
        mean_squared_error(y_true=y_test, y_pred=model_pred)
    )

[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 46ms/step
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step 
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step 
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step 
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step 


In [409]:
for train_index, test_index in kf.split(X=minmax_x, y=minmax_y):
    X_train, X_test = minmax_x.iloc[train_index], minmax_x.iloc[test_index]
    y_train, y_test = minmax_y.iloc[train_index], minmax_y.iloc[test_index]
    
    # Fit the model to the current split
    # Each split is independent from another
    minmax_svm.fit(
        X=X_train, y=y_train
    )
    minmax_tf.fit(
        X_train, y_train, epochs=epoch_no, batch_size=batch_size, verbose=0
    )

    # Make predictions based on the current train-test split
    svm_pred = minmax_svm.predict(X=X_test)
    model_pred = minmax_tf.predict(X_test)

    # Add r-square score to list for each split
    m_svm_mse.append(
        mean_squared_error(y_true=y_test, y_pred=svm_pred)
    )
    m_tf_mse.append(
        mean_squared_error(y_true=y_test, y_pred=model_pred)
    )

[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 26ms/step
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step 
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step 
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step 
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step 


In [410]:
# MSE for each cross-validation instance
print(z_svm_mse)
print(z_tf_mse)

# Average MSE for all cross-validation instances
print("SVM MSE:", sum(z_svm_mse)/len(z_svm_mse))
print("NN MSE", sum(z_tf_mse)/len(z_tf_mse))

# Average error in hectares
svm_x = np.sqrt(sum(z_svm_mse)/len(z_svm_mse))
tf_x = np.sqrt(sum(z_tf_mse)/len(z_tf_mse))

print("SVM MSE hectares", np.exp(svm_x)-1)
print("NN MSE hectares", np.exp(tf_x)-1)

[1.7465310953804492, 2.1626130817612994, 2.081075633542362, 1.9419463463482372, 1.742291820180092]
[2.0299215098015564, 2.1602199343883592, 1.8737912769220086, 1.2990418026516666, 0.9156373540936971]
SVM MSE: 1.934891595442488
NN MSE 1.6557223755714578
SVM MSE hectares 3.018882217286757
NN MSE hectares 2.6209946802203428


In [411]:
# MSE for each cross-validation instance
print(m_svm_mse)
print(m_tf_mse)

# Average MSE for all cross-validation instances
print("SVM MSE:", sum(m_svm_mse)/len(m_svm_mse))
print("NN MSE", sum(m_tf_mse)/len(m_tf_mse))

# Average error in hectares
svm_x = np.sqrt(sum(m_svm_mse)/len(m_svm_mse))
tf_x = np.sqrt(sum(m_tf_mse)/len(m_tf_mse))

print("SVM MSE hectares", np.exp(svm_x)-1)
print("NN MSE hectares", np.exp(tf_x)-1)

[1.7431117127044222, 1.8644533387275064, 1.332753545381297, 2.397413218151492, 2.319460713329401]
[1.747680216206736, 1.9529349962853657, 1.3042910824822966, 2.256343431783386, 2.2439228223885443]
SVM MSE: 1.9314385056588237
NN MSE 1.901034509829266
SVM MSE hectares 3.0138947609269477
NN MSE hectares 2.9700555232197954
