## Import Libraries

In [1]:
import numpy as np
import pandas as pd

from sklearn.model_selection import (
    cross_val_score,
    KFold
)
from sklearn.naive_bayes import GaussianNB

## Read Dataset

In [2]:
# Read Transformed_Data.csv; the path is resolved relative to the working directory.
df = pd.read_csv(filepath_or_buffer="Transformed_Data.csv")

# Report the (rows, columns) shape, then preview the first five records.
print(df.shape)
df.head(n=5)

(18900, 20)


Unnamed: 0,Elevation,Aspect,Slope,Horizontal_Distance_To_Hydrology,Vertical_Distance_To_Hydrology,Horizontal_Distance_To_Roadways,Hillshade_9am,Hillshade_Noon,Hillshade_3pm,Horizontal_Distance_To_Fire_Points,Wilderness_Area3,Wilderness_Area4,Soil_Type3,Soil_Type4,Soil_Type10,Soil_Type23,Soil_Type29,Soil_Type30,Soil_Type32,Cover_Type
0,0.48875,0.242953,1.579103,-0.24244,-0.103541,0.840184,0.856814,-2.074582,-2.035292,0.828308,1,0,0,0,0,0,0,0,0,1
1,0.923949,0.924811,-1.243057,-0.080129,-0.914803,-0.563925,-0.384762,0.664501,0.749468,-0.02912,0,0,0,0,0,1,0,0,0,1
2,1.30176,0.543035,0.285613,0.287345,-0.751288,1.38065,0.547494,-1.044093,-0.969251,1.453142,1,0,0,0,0,1,0,0,0,1
3,0.503097,0.76748,-1.125467,-0.680049,-0.820478,1.037096,0.285514,0.232543,0.075034,-0.038221,1,0,0,0,0,0,0,0,0,1
4,1.220459,-1.652851,-1.007877,0.835735,0.226095,0.818457,0.21215,1.266796,0.531908,0.212173,1,0,0,0,0,0,0,0,0,1


## Separate response and predictor variables

In [3]:
# Response variable: the Cover_Type column.
cover_type = df.loc[:, "Cover_Type"]

# Predictor variables: every column other than Cover_Type.
features = df.drop("Cover_Type", axis="columns")

In [4]:
## Ten-feature subset. These features were selected based on the Random Forest
## model, as it is the best model out of all.
subset1 = [
    "Elevation",
    "Horizontal_Distance_To_Roadways",
    "Horizontal_Distance_To_Fire_Points",
    "Horizontal_Distance_To_Hydrology",
    "Vertical_Distance_To_Hydrology",
    "Hillshade_9am",
    "Aspect",
    "Hillshade_Noon",
    "Hillshade_3pm",
    "Wilderness_Area4",
]

## Six-feature subset. Aspect is strongly correlated with Hillshade_3pm, and
## Horizontal_Distance_To_Hydrology with Vertical_Distance_To_Hydrology, so
## Hillshade_3pm and Vertical_Distance_To_Hydrology were removed.
## We then tried different combinations of 6 features and found that the
## following one improved the accuracy of the Naive Bayes Classifier.
## Relative to subset1 it also leaves out Hillshade_Noon and Wilderness_Area4;
## the remaining names keep their subset1 order.
subset2 = [
    name
    for name in subset1
    if name not in (
        "Vertical_Distance_To_Hydrology",
        "Hillshade_Noon",
        "Hillshade_3pm",
        "Wilderness_Area4",
    )
]

# Restrict the predictors to each candidate subset of columns.
features_10, features_6 = features[subset1], features[subset2]

## Naive Bayes Classifier - with all features

In [5]:
## Set up the Naive Bayes Classifier model (Gaussian variant)
nbc = GaussianNB()

# Set up the k-fold cross-validator: 5 folds, shuffled with a fixed seed
# so that the splits are reproducible between runs
kfold = 5
kf = KFold(n_splits=kfold, shuffle=True, random_state=447)

# Out-of-sample accuracy on each held-out fold, using every predictor column
scores = cross_val_score(
    estimator=nbc,
    X=features,
    y=cover_type,
    scoring='accuracy',
    cv=kf,
)

# Tabulate the individual fold accuracies, then report their average
results = pd.DataFrame({'Fold': np.arange(kfold) + 1, 'Accuracy': scores})
print(results)
print(f'Average accuracy: {scores.mean()}')

   Fold  Accuracy
0     1  0.442328
1     2  0.437302
2     3  0.452381
3     4  0.446296
4     5  0.436772
Average accuracy: 0.44301587301587303


## Naive Bayes Classifier - with 10 features

In [6]:
# Cross-validated accuracy on the 10-feature subset, reusing the `nbc`
# classifier and the `kf` splitter defined in an earlier cell
scores = cross_val_score(
    estimator=nbc,
    X=features_10,
    y=cover_type,
    scoring='accuracy',
    cv=kf,
)

# Tabulate the individual fold accuracies, then report their average
results = pd.DataFrame({'Fold': np.arange(kfold) + 1, 'Accuracy': scores})
print(results)
print(f'Average accuracy: {scores.mean()}')

   Fold  Accuracy
0     1  0.499735
1     2  0.505291
2     3  0.485979
3     4  0.506614
4     5  0.494180
Average accuracy: 0.49835978835978834


## Naive Bayes Classifier - with 6 features

In [7]:
# Cross-validated accuracy on the 6-feature subset, reusing the `nbc`
# classifier and the `kf` splitter defined in an earlier cell
scores = cross_val_score(
    estimator=nbc,
    X=features_6,
    y=cover_type,
    scoring='accuracy',
    cv=kf,
)

# Tabulate the individual fold accuracies, then report their average
results = pd.DataFrame({'Fold': np.arange(kfold) + 1, 'Accuracy': scores})
print(results)
print(f'Average accuracy: {scores.mean()}')

   Fold  Accuracy
0     1  0.641799
1     2  0.625926
2     3  0.618783
3     4  0.635450
4     5  0.623810
Average accuracy: 0.6291534391534392


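As seen, the Naive Bayes Classifier with 6 features yields the best out-of-sample accuracy: an average of about 0.629 across the 5 folds, compared with about 0.498 for the 10-feature subset and about 0.443 when all features are used.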