Import Libraries

In [188]:
# Basic Import
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt 
import seaborn as sns

# Modelling
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, classification_report

import warnings
warnings.filterwarnings('ignore')

Import dataset

In [189]:
df = pd.read_csv("../data/raw/dataset.csv")

In [190]:
df.head()

Unnamed: 0,Province,Temperature,Oxygen,Humidity,Vegetation_Type,Drought_Index,Fire_Seasonality,Land_Use,Target
0,New Brunswick,,19.591444,,Grassland,272.801193,,Industrial,No Fire
1,,10.180076,19.147137,84.011601,Forest,716.714059,Summer,Agricultural,No Fire
2,Ontario,28.640759,19.011429,96.870673,Wetland,105.972024,Spring,Agricultural,No Fire
3,Newfoundland and Labrador,26.867228,20.79373,52.13722,Grassland,454.735284,Fall,Commercial,No Fire
4,Ontario,0.763344,20.242969,40.444907,Grassland,757.547237,Spring,Industrial,Medium Risk


In [191]:
df.shape

(900, 9)

In [192]:
cat_columns = [col for col in df.columns if df[col].dtype == "O" and col != "Target"]
num_columns = [col for col in df.columns if df[col].dtype != "O"]

In [193]:
for col in cat_columns:
    df[col].astype("category")

In [194]:
# Fill missing values and store the median and mode
modes = {}
medians = {}

for col in cat_columns:
    mode_value = df[col].mode()[0]
    df[col].fillna(mode_value, inplace=True)
    modes[col] = mode_value
    print(f"The mode for column {col} is: {mode_value}")

print("\n")
print("*" * 100)
print("\n")

for col in num_columns:
    median_value = float(round(df[col].median(), 2))
    df[col].fillna(median_value, inplace=True)
    medians[col] = median_value
    print(f"The median for column {col} is: {median_value}")

The mode for column Province is: Saskatchewan
The mode for column Vegetation_Type is: Forest
The mode for column Fire_Seasonality is: Fall
The mode for column Land_Use is: Industrial


****************************************************************************************************


The median for column Temperature is: 12.94
The median for column Oxygen is: 19.99
The median for column Humidity is: 56.07
The median for column Drought_Index is: 394.28


In [195]:
print(modes)
print(medians)

{'Province': 'Saskatchewan', 'Vegetation_Type': 'Forest', 'Fire_Seasonality': 'Fall', 'Land_Use': 'Industrial'}
{'Temperature': 12.94, 'Oxygen': 19.99, 'Humidity': 56.07, 'Drought_Index': 394.28}


In [196]:
#sanity check
df.isnull().sum()

Province            0
Temperature         0
Oxygen              0
Humidity            0
Vegetation_Type     0
Drought_Index       0
Fire_Seasonality    0
Land_Use            0
Target              0
dtype: int64

Machine Learning

In [197]:
df["Target"].unique()

array(['No Fire', 'Medium Risk', 'High Risk', 'Low Risk'], dtype=object)

In [198]:
map_target_column = {"No Fire": 0, "Low Risk":1, "Medium Risk": 2, "High Risk":3}

In [199]:
df["Target"] = df["Target"].map(map_target_column)
df["Target"].astype("category")

0      0
1      0
2      0
3      0
4      2
      ..
895    2
896    2
897    1
898    1
899    0
Name: Target, Length: 900, dtype: category
Categories (4, int64): [0, 1, 2, 3]

In [200]:
#sanity check
df["Target"].unique()

array([0, 2, 3, 1])

Save these medians, modes, target mapping in YAML file

In [201]:
# Store median and mode in a YAML file
import yaml
preprocessing_params = {
    'modes': modes,
    'medians': medians,
    'map_target_column': map_target_column
}

with open("../params.yaml", "w") as file:
    yaml.dump(preprocessing_params, file)

In [202]:
preprocessing_params

{'modes': {'Province': 'Saskatchewan',
  'Vegetation_Type': 'Forest',
  'Fire_Seasonality': 'Fall',
  'Land_Use': 'Industrial'},
 'medians': {'Temperature': 12.94,
  'Oxygen': 19.99,
  'Humidity': 56.07,
  'Drought_Index': 394.28},
 'map_target_column': {'No Fire': 0,
  'Low Risk': 1,
  'Medium Risk': 2,
  'High Risk': 3}}

Separate dataset into train and test

In [203]:
from sklearn.model_selection import train_test_split
df_train, df_test = train_test_split(df, test_size=0.2,random_state=42)

df_train.shape, df_test.shape

((720, 9), (180, 9))

In [204]:
X_train = df_train[cat_columns + num_columns]
y_train = df_train["Target"]

X_test = df_test[cat_columns + num_columns]
y_test = df_test["Target"]

Preprocessing for Categorical & Numerical columns

In [205]:
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer

numeric_transformer = StandardScaler()
oh_transformer = OneHotEncoder()

preprocessor = ColumnTransformer(
    [
        ("OneHotEncoder", oh_transformer, cat_columns),
         ("StandardScaler", numeric_transformer, num_columns),        
    ]
)

In [206]:
# X_train_processed = preprocessor.fit_transform(X_train)
# X_test_processed = preprocessor.transform(X_test)

In [207]:
from sklearn.pipeline import Pipeline

In [208]:
baseline_model = Pipeline(steps= [("preprocessor", preprocessor),
                          ("lr", LogisticRegression())
                          ])

In [209]:
baseline_model.fit(X_train, y_train)

In [210]:
y_pred = baseline_model.predict(X_test)

In [211]:
print("Clasification Report: \n", classification_report(y_pred, y_test))

Clasification Report: 
               precision    recall  f1-score   support

           0       0.77      0.38      0.51       148
           1       0.00      0.00      0.00         5
           2       0.05      0.12      0.07        16
           3       0.08      0.18      0.11        11

    accuracy                           0.33       180
   macro avg       0.23      0.17      0.17       180
weighted avg       0.64      0.33      0.43       180



In [212]:
print("Confusion Matrix: \n", confusion_matrix(y_pred, y_test))

Confusion Matrix: 
 [[56 36 36 20]
 [ 4  0  0  1]
 [ 6  7  2  1]
 [ 7  2  0  2]]


Save the baseline Model

In [213]:
import joblib

In [214]:
joblib.dump(baseline_model, "../artiifacts/baseline-model.joblib")

['../artiifacts/baseline-model.joblib']

Scoring

In [215]:
baseline_model = joblib.load("../artiifacts/baseline-model.joblib")

In [216]:
# Load preprocessing parameters
with open('../params.yaml') as file:
    preprocessing_params = yaml.load(file, Loader=yaml.FullLoader)

modes = preprocessing_params['modes']
medians = preprocessing_params['medians']
map_target_column = preprocessing_params['map_target_column']

In [235]:
# Sample new data (to be scored)
new_data = {
    'Province': ['Ontario', 'Alberta', 'Quebec'],
    'Temperature': [22.5, 15.0, 18.0],
    'Oxygen': [20.5, 21.0, 19.5],
    'Humidity': [50, 55, 60],
    'Vegetation_Type': ['Forest', 'Grassland', 'Shrubland'],
    'Drought_Index': [400, 350, 300],
    'Fire_Seasonality': ['Spring', 'Summer', 'Fall'],
    'Land_Use': ['Agricultural', 'Residential', 'Industrial']
}

new_df = pd.DataFrame(new_data)

new_df.head()

Unnamed: 0,Province,Temperature,Oxygen,Humidity,Vegetation_Type,Drought_Index,Fire_Seasonality,Land_Use
0,Ontario,22.5,20.5,50,Forest,400,Spring,Agricultural
1,Alberta,15.0,21.0,55,Grassland,350,Summer,Residential
2,Quebec,18.0,19.5,60,Shrubland,300,Fall,Industrial


In [236]:
# Fill missing values using the same median and mode values
for col in modes:
    new_df[col].fillna(modes[col], inplace=True)

for col in medians:
    new_df[col].fillna(medians[col], inplace=True)

In [237]:
baseline_model

In [238]:
# Score the new data
predictions = baseline_model.predict(new_df)
predictions

array([0, 0, 0])

In [240]:
# Convert predictions back to original target labels
inverse_map_target_column = {v: k for k, v in map_target_column.items()}
predicted_labels = [inverse_map_target_column[pred] for pred in predictions]

In [241]:
new_df["Predictions"] = predicted_labels

In [242]:
new_df.head()

Unnamed: 0,Province,Temperature,Oxygen,Humidity,Vegetation_Type,Drought_Index,Fire_Seasonality,Land_Use,Predictions
0,Ontario,22.5,20.5,50,Forest,400,Spring,Agricultural,No Fire
1,Alberta,15.0,21.0,55,Grassland,350,Summer,Residential,No Fire
2,Quebec,18.0,19.5,60,Shrubland,300,Fall,Industrial,No Fire
