## Import Libraries

In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.datasets import make_classification
import seaborn as sns
from sklearn.metrics import (
    classification_report,
    confusion_matrix,
    f1_score,
    make_scorer,
    ConfusionMatrixDisplay,
    accuracy_score, 
    precision_score, 
    recall_score
)
from sklearn.model_selection import (
    GridSearchCV,
    RandomizedSearchCV,
    cross_val_score,
    cross_validate,
    train_test_split,
    StratifiedKFold,
    KFold
)
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.tree import DecisionTreeClassifier


## Read Dataset

In [2]:
# Load Transformed_Data.csv (resolved relative to the working directory) into a
# DataFrame, report its (rows, columns) shape, and preview the first five rows.
df = pd.read_csv(filepath_or_buffer="Transformed_Data.csv")
print(df.shape)
df.head(5)

(18900, 20)


Unnamed: 0,Elevation,Aspect,Slope,Horizontal_Distance_To_Hydrology,Vertical_Distance_To_Hydrology,Horizontal_Distance_To_Roadways,Hillshade_9am,Hillshade_Noon,Hillshade_3pm,Horizontal_Distance_To_Fire_Points,Wilderness_Area3,Wilderness_Area4,Soil_Type3,Soil_Type4,Soil_Type10,Soil_Type23,Soil_Type29,Soil_Type30,Soil_Type32,Cover_Type
0,0.48875,0.242953,1.579103,-0.24244,-0.103541,0.840184,0.856814,-2.074582,-2.035292,0.828308,1,0,0,0,0,0,0,0,0,1
1,0.923949,0.924811,-1.243057,-0.080129,-0.914803,-0.563925,-0.384762,0.664501,0.749468,-0.02912,0,0,0,0,0,1,0,0,0,1
2,1.30176,0.543035,0.285613,0.287345,-0.751288,1.38065,0.547494,-1.044093,-0.969251,1.453142,1,0,0,0,0,1,0,0,0,1
3,0.503097,0.76748,-1.125467,-0.680049,-0.820478,1.037096,0.285514,0.232543,0.075034,-0.038221,1,0,0,0,0,0,0,0,0,1
4,1.220459,-1.652851,-1.007877,0.835735,0.226095,0.818457,0.21215,1.266796,0.531908,0.212173,1,0,0,0,0,0,0,0,0,1


In [3]:
# Separate the prediction target (Cover_Type) from the predictor columns.
# Neither statement modifies `df`, so this cell is safe to re-run.
cover_type = df["Cover_Type"]
features = df.drop("Cover_Type", axis=1)

In [4]:
## This subset of features was selected based on the Random Forest model,
## as it was the best of all the models
subset = [
    "Elevation",
    "Horizontal_Distance_To_Roadways",
    "Horizontal_Distance_To_Fire_Points",
    "Horizontal_Distance_To_Hydrology",
    "Vertical_Distance_To_Hydrology",
    "Hillshade_9am",
    "Aspect",
    "Hillshade_Noon",
    "Hillshade_3pm",
    "Wilderness_Area4",
]

# Reduced design matrix: only the ten columns above, in the listed order.
features_10 = features.loc[:, subset]

## 5-fold Cross-validation performance 

In [5]:
## Classification Tree - with all features
# Seed the tree as well as the splitter: scikit-learn permutes the candidate
# features at every split, so ties between equally good splits can otherwise
# resolve differently on each run and the accuracies below would not be
# reproducible under Restart & Run All.
ctree = DecisionTreeClassifier(random_state=447)

# Set up the k-fold cross-validator (shuffled with a fixed seed, so the fold
# assignment is identical on every run)
kfold = 5
kf = KFold(n_splits=kfold, shuffle=True, random_state=447)

# Calculate out-of-sample accuracy: one score per held-out fold
scores = cross_val_score(ctree, features, cover_type, cv=kf, scoring='accuracy')

# Print individual fold accuracies and average accuracy
print(f'Accuracy for each fold: {scores}')
print(f'Average accuracy: {np.mean(scores)}')

Accuracy for each fold: [0.79444444 0.77645503 0.77671958 0.78280423 0.78439153]
Average accuracy: 0.7829629629629629


In [6]:
## Classification Tree - with 10 features
# Seed the tree as well as the splitter: scikit-learn permutes the candidate
# features at every split, so ties between equally good splits can otherwise
# resolve differently on each run and the accuracies below would not be
# reproducible under Restart & Run All.
ctree = DecisionTreeClassifier(random_state=447)

# Set up the k-fold cross-validator (shuffled with a fixed seed, so the fold
# assignment is identical on every run)
kfold = 5
kf = KFold(n_splits=kfold, shuffle=True, random_state=447)

# Calculate out-of-sample accuracy on the reduced feature set: one score per
# held-out fold
scores = cross_val_score(ctree, features_10, cover_type, cv=kf, scoring='accuracy')

# Print individual fold accuracies and average accuracy
print(f'Accuracy for each fold: {scores}')
print(f'Average accuracy: {np.mean(scores)}')

Accuracy for each fold: [0.77275132 0.76031746 0.75502646 0.76851852 0.76296296]
Average accuracy: 0.7639153439153439
