Import Libraries

In [57]:
#Import 
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt 
import seaborn as sns

import yaml
import joblib

# Modelling
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report

from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

import warnings
warnings.filterwarnings('ignore')

Import dataset

In [58]:
df = pd.read_csv("../data/raw/dataset.csv")

In [59]:
df.head()

Unnamed: 0,Province,Temperature,Oxygen,Humidity,Vegetation_Type,Drought_Index,Fire_Seasonality,Land_Use,Target
0,New Brunswick,,19.591444,,Grassland,272.801193,,Industrial,No Fire
1,,10.180076,19.147137,84.011601,Forest,716.714059,Summer,Agricultural,No Fire
2,Ontario,28.640759,19.011429,96.870673,Wetland,105.972024,Spring,Agricultural,No Fire
3,Newfoundland and Labrador,26.867228,20.79373,52.13722,Grassland,454.735284,Fall,Commercial,No Fire
4,Ontario,0.763344,20.242969,40.444907,Grassland,757.547237,Spring,Industrial,Medium Risk


In [60]:
df.shape

(900, 9)

In [61]:
cat_columns = [col for col in df.columns if df[col].dtype == "O" and col != "Target"]
num_columns = [col for col in df.columns if df[col].dtype != "O"]

In [62]:
for col in cat_columns:
    df[col].astype("category")

In [63]:
#Fill missing values and store the median and mode
modes = {}
medians = {}

for col in cat_columns:
    mode_value = df[col].mode()[0]
    df[col].fillna(mode_value, inplace=True)
    modes[col] = mode_value
    print(f"The mode for column {col} is: {mode_value}")

print("\n")
print("*" * 100)
print("\n")

for col in num_columns:
    median_value = float(round(df[col].median(), 2))
    df[col].fillna(median_value, inplace=True)
    medians[col] = median_value
    print(f"The median for column {col} is: {median_value}")

The mode for column Province is: Saskatchewan
The mode for column Vegetation_Type is: Forest
The mode for column Fire_Seasonality is: Fall
The mode for column Land_Use is: Industrial


****************************************************************************************************


The median for column Temperature is: 12.94
The median for column Oxygen is: 19.99
The median for column Humidity is: 56.07
The median for column Drought_Index is: 394.28


In [64]:
print(modes)
print(medians)

{'Province': 'Saskatchewan', 'Vegetation_Type': 'Forest', 'Fire_Seasonality': 'Fall', 'Land_Use': 'Industrial'}
{'Temperature': 12.94, 'Oxygen': 19.99, 'Humidity': 56.07, 'Drought_Index': 394.28}


In [65]:
#sanity check
df.isnull().sum()

Province            0
Temperature         0
Oxygen              0
Humidity            0
Vegetation_Type     0
Drought_Index       0
Fire_Seasonality    0
Land_Use            0
Target              0
dtype: int64

Machine Learning

In [66]:
df["Target"].unique()

array(['No Fire', 'Medium Risk', 'High Risk', 'Low Risk'], dtype=object)

In [67]:
# map_target_column = {"No Fire": 0, "Low Risk":1, "Medium Risk": 2, "High Risk":3}

In [68]:
# df["Target"] = df["Target"].map(map_target_column)
df["Target"].astype("category")

0          No Fire
1          No Fire
2          No Fire
3          No Fire
4      Medium Risk
          ...     
895    Medium Risk
896    Medium Risk
897       Low Risk
898       Low Risk
899        No Fire
Name: Target, Length: 900, dtype: category
Categories (4, object): ['High Risk', 'Low Risk', 'Medium Risk', 'No Fire']

In [69]:
#sanity check
df["Target"].unique()

array(['No Fire', 'Medium Risk', 'High Risk', 'Low Risk'], dtype=object)

Save these medians, modes, target mapping in YAML file

In [70]:
# Store median, mode, cat_column, nu_column, target_column in a YAML file for reproducibility
import yaml
preprocessing_params = {
    "cat_columns": cat_columns,
    "num_columns": num_columns,
    "target_column": "Target",
    'modes': modes,
    'medians': medians,
}

with open("../artifacts/yaml/preprocessing-params.yaml", "w") as file:
    yaml.dump(preprocessing_params, file)

In [71]:
preprocessing_params

{'cat_columns': ['Province',
  'Vegetation_Type',
  'Fire_Seasonality',
  'Land_Use'],
 'num_columns': ['Temperature', 'Oxygen', 'Humidity', 'Drought_Index'],
 'target_column': 'Target',
 'modes': {'Province': 'Saskatchewan',
  'Vegetation_Type': 'Forest',
  'Fire_Seasonality': 'Fall',
  'Land_Use': 'Industrial'},
 'medians': {'Temperature': 12.94,
  'Oxygen': 19.99,
  'Humidity': 56.07,
  'Drought_Index': 394.28}}

Separate dataset into train and test

In [72]:
df_train, df_test = train_test_split(df, test_size=0.2,random_state=42)

df_train.shape, df_test.shape

((720, 9), (180, 9))

In [73]:
X_train = df_train[cat_columns + num_columns]
y_train = df_train["Target"]

X_test = df_test[cat_columns + num_columns]
y_test = df_test["Target"]

Preprocessing for Categorical & Numerical columns

In [74]:
numeric_transformer = StandardScaler()
oh_transformer = OneHotEncoder()

preprocessor = ColumnTransformer([("OneHotEncoder", oh_transformer, cat_columns),
                                ("StandardScaler", numeric_transformer, num_columns),])

In [75]:
baseline_model = Pipeline(steps= [("preprocessor", preprocessor), ("lr", LogisticRegression())])

In [76]:
baseline_model.fit(X_train, y_train)

In [77]:
y_pred = baseline_model.predict(X_test)

In [78]:
print("Clasification Report: \n", classification_report(y_pred, y_test))

Clasification Report: 
               precision    recall  f1-score   support

   High Risk       0.08      0.18      0.11        11
    Low Risk       0.00      0.00      0.00         5
 Medium Risk       0.05      0.12      0.07        16
     No Fire       0.77      0.38      0.51       148

    accuracy                           0.33       180
   macro avg       0.23      0.17      0.17       180
weighted avg       0.64      0.33      0.43       180



In [79]:
print("Confusion Matrix: \n", confusion_matrix(y_pred, y_test))

Confusion Matrix: 
 [[ 2  2  0  7]
 [ 1  0  0  4]
 [ 1  7  2  6]
 [20 36 36 56]]


Save the baseline Model

In [80]:
joblib.dump(baseline_model, "../artifacts/baseline-model.joblib")

['../artifacts/baseline-model.joblib']

Scoring

In [81]:
baseline_model = joblib.load("../artifacts/baseline-model.joblib")

In [82]:
# Load preprocessing parameters from the yaml file
with open('../artifacts/yaml/preprocessing-params.yaml') as file:
    preprocessing_params = yaml.load(file, Loader=yaml.FullLoader)

modes = preprocessing_params['modes']
medians = preprocessing_params['medians']

In [83]:
# example shows how to score new data coming in 
scoring_data = {
    'Province': ['Ontario', 'Alberta', 'Quebec'],
    'Temperature': [22.5, 15.0, 18.0],
    'Oxygen': [20.5, 21.0, 19.5],
    'Humidity': [50, 55, 60],
    'Vegetation_Type': ['Forest', 'Grassland', 'Shrubland'],
    'Drought_Index': [400, 350, 300],
    'Fire_Seasonality': ['Spring', 'Summer', 'Fall'],
    'Land_Use': ['Agricultural', 'Residential', 'Industrial']
}

scoring_df = pd.DataFrame(scoring_data)

scoring_df.head()

Unnamed: 0,Province,Temperature,Oxygen,Humidity,Vegetation_Type,Drought_Index,Fire_Seasonality,Land_Use
0,Ontario,22.5,20.5,50,Forest,400,Spring,Agricultural
1,Alberta,15.0,21.0,55,Grassland,350,Summer,Residential
2,Quebec,18.0,19.5,60,Shrubland,300,Fall,Industrial


In [84]:
# Fill missing values using the same median and mode values to maintain reproducibility 
for col in modes:
    scoring_df[col].fillna(modes[col], inplace=True)

for col in medians:
    scoring_df[col].fillna(medians[col], inplace=True)

In [85]:
baseline_model

In [86]:
# Score the new data
predictions = baseline_model.predict(scoring_df)
predictions

array(['No Fire', 'No Fire', 'No Fire'], dtype=object)

In [87]:
# Convert predictions back to original target labels
# inverse_map_target_column = {v: k for k, v in map_target_column.items()}
# predicted_labels = [inverse_map_target_column[pred] for pred in predictions]

In [88]:
scoring_df["Predictions"] = predictions

In [89]:
scoring_df.head()

Unnamed: 0,Province,Temperature,Oxygen,Humidity,Vegetation_Type,Drought_Index,Fire_Seasonality,Land_Use,Predictions
0,Ontario,22.5,20.5,50,Forest,400,Spring,Agricultural,No Fire
1,Alberta,15.0,21.0,55,Grassland,350,Summer,Residential,No Fire
2,Quebec,18.0,19.5,60,Shrubland,300,Fall,Industrial,No Fire


Create ML Pipeline for downstream usage

In [90]:
dataset_file_path = "data/raw/dataset.csv"
model_pipeline_path = "../artiifacts/baseline-model.joblib"
preprocessing_params_file = "../artiifacts/yaml/preprocessing-params.yaml"


def read_dataset(dataset_file_path):
    """Read dataset from the spicified location"""
    df = pd.read_csv(dataset_file_path)
    return df

def split_dataset(df, cat_columns, num_columns, target):
    """split dataset in train and test"""

    df_train, df_test = train_test_split(df, test_size=0.2,random_state=42)
    X_train = df_train[cat_columns + num_columns]
    y_train = df_train[target]

    X_test = df_test[cat_columns + num_columns]
    y_test = df_test[target]

    return X_train, y_train, X_test, y_test


def load_preprocessing_params(preprocessing_params_file):
    """Load preprocessing parameters from yaml file in the artifacts folder"""
    with open(preprocessing_params_file) as file:
        preprocessing_params = yaml.load(file, Loader=yaml.FullLoader)

    modes = preprocessing_params["modes"]
    medians = preprocessing_params["medians"]
    num_columns = preprocessing_params["num_columns"]
    cat_columns = preprocessing_params["cat_columns"]
    target = preprocessing_params["target_column"]

    return modes, medians, num_columns, cat_columns, target


def features_fillna(df, modes, medians):
    """Fill missing values using the same median and mode values"""
    for col in modes:
        df[col] = df[col].fillna(modes[col])

    for col in medians:
        df[col] = df[col].fillna(medians[col])

    return df


def load_model_pipeline(model_pipeline_path):
    """Load preprocessing pipleine and model"""
    baseline_model = joblib.load(model_pipeline_path)