Import Libraries

In [4]:
# Basic Import
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt 
import seaborn as sns

# Modelling
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier,AdaBoostClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, classification_report, precision_recall_curve
from sklearn.model_selection import RandomizedSearchCV

from xgboost import XGBClassifier, XGBRFClassifier

import warnings

Import dataset

In [42]:
df = pd.read_csv("../data/raw/dataset.csv")

In [43]:
df.head()

Unnamed: 0,Province,Temperature,Oxygen,Humidity,Vegetation_Type,Drought_Index,Fire_Seasonality,Land_Use,Target
0,New Brunswick,,19.591444,,Grassland,272.801193,,Industrial,No Fire
1,,10.180076,19.147137,84.011601,Forest,716.714059,Summer,Agricultural,No Fire
2,Ontario,28.640759,19.011429,96.870673,Wetland,105.972024,Spring,Agricultural,No Fire
3,Alberta,31.265827,19.939055,65.530465,Wetland,515.947829,Winter,,
4,Newfoundland and Labrador,26.867228,20.79373,52.13722,Grassland,454.735284,Fall,Commercial,No Fire


Drop Null values

In [44]:
scoring_dataset = df[df["Target"].isnull()]

In [45]:
df = df.dropna(subset="Target").reset_index(drop=True)
df.head()

Unnamed: 0,Province,Temperature,Oxygen,Humidity,Vegetation_Type,Drought_Index,Fire_Seasonality,Land_Use,Target
0,New Brunswick,,19.591444,,Grassland,272.801193,,Industrial,No Fire
1,,10.180076,19.147137,84.011601,Forest,716.714059,Summer,Agricultural,No Fire
2,Ontario,28.640759,19.011429,96.870673,Wetland,105.972024,Spring,Agricultural,No Fire
3,Newfoundland and Labrador,26.867228,20.79373,52.13722,Grassland,454.735284,Fall,Commercial,No Fire
4,Ontario,0.763344,20.242969,40.444907,Grassland,757.547237,Spring,Industrial,Medium Risk


In [46]:
df.shape

(900, 9)

In [47]:
cat_columns = [col for col in df.columns if df[col].dtype == "O" and col != "Target"]
num_columns = [col for col in df.columns if df[col].dtype != "O"]

In [56]:
for col in cat_columns:
    df[col].astype("category")

In [57]:
for col in cat_columns:
    df[col].fillna(df[col].mode()[0], inplace=True)

for col in num_columns:
    df[col].fillna(df[col].median(), inplace= True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(df[col].mode()[0], inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(df[col].median(), inplace= True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting valu

In [49]:
#sanity check
df.isnull().sum()

Province            0
Temperature         0
Oxygen              0
Humidity            0
Vegetation_Type     0
Drought_Index       0
Fire_Seasonality    0
Land_Use            0
Target              0
dtype: int64

Machine Learning

In [51]:
df["Target"].unique()

array(['No Fire', 'Medium Risk', 'High Risk', 'Low Risk'], dtype=object)

In [52]:
map_target_column = {"No Fire": 0, "Low Risk":1, "Medium Risk": 2, "High Risk":3}

In [54]:
df["Target"] = df["Target"].map(map_target_column)
df["Target"].astype("category")

0      0
1      0
2      0
3      0
4      2
      ..
895    2
896    2
897    1
898    1
899    0
Name: Target, Length: 900, dtype: category
Categories (4, int64): [0, 1, 2, 3]

In [55]:
#sanity check

df["Target"].unique()

array([0, 2, 3, 1])

Separate dataset into train and test

In [32]:
from sklearn.model_selection import train_test_split
df_train, df_test = train_test_split(df, test_size=0.2,random_state=42)

df_train.shape, df_test.shape

((720, 9), (180, 9))

In [35]:
X_train = df_train[cat_columns + num_columns]
y_train = df_train["Target"]

X_test = df_test[cat_columns + num_columns]
y_test = df_test["Target"]

In [36]:
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer

numeric_transformer = StandardScaler()
oh_transformer = OneHotEncoder()

preprocessor = ColumnTransformer(
    [
        ("OneHotEncoder", oh_transformer, cat_columns),
         ("StandardScaler", numeric_transformer, num_columns),        
    ]
)

In [38]:
X_train_processed = preprocessor.fit_transform(X_train)
X_test_processed = preprocessor.transform(X_test)

In [58]:
LR = LogisticRegression()

In [61]:
LR.fit(X_train_processed, y_train)

In [64]:
y_pred = LR.predict(X_test_processed)

In [68]:
print("Clasification Report: ", classification_report(y_pred, y_test))

Clasification Report:                precision    recall  f1-score   support

   High Risk       0.08      0.18      0.11        11
    Low Risk       0.00      0.00      0.00         5
 Medium Risk       0.05      0.12      0.07        16
     No Fire       0.77      0.38      0.51       148

    accuracy                           0.33       180
   macro avg       0.23      0.17      0.17       180
weighted avg       0.64      0.33      0.43       180



In [69]:
print("Confusion Matrix: ", confusion_matrix(y_pred, y_test))

Confusion Matrix:  [[ 2  2  0  7]
 [ 1  0  0  4]
 [ 1  7  2  6]
 [20 36 36 56]]
