In [1]:
# Enable code formatting using external plugin: nb_black.
%reload_ext nb_black

<IPython.core.display.Javascript object>

# Naive Bayes Classifier - CategoricalNB

## Objective

1. Train a Naive Bayes classifier using `CategoricalNB` from `sklearn`.
2. Train and test the model on a sample weather forecast dataset.
3. Compare the result with the [custom implementation of Naive Bayes classifier][1].

[1]: https://github.com/DheemanthBhat/ML-Concepts/blob/main/4.%20Naive%20Bayes/Naive%20Bayes%20Classifier.ipynb

# [1] Setup

### Import and configure required libraries

In [2]:
# Data manipulation libraries
import pandas as pd

# Data visualization libraries
import prettytable
from prettytable import PrettyTable

# Data modeling libraries
import sklearn
from sklearn.preprocessing import OrdinalEncoder
from sklearn.naive_bayes import CategoricalNB

# Report the versions of the libraries this notebook was run with.
for version_label, library in (
    ("Pandas version:", pd),
    ("PrettyTable version:", prettytable),
    ("Sklearn version:", sklearn),
):
    print(version_label, library.__version__)

# Configure Pandas.
# Limit printed output to 130 characters per line; anything wider wraps onto the next line.
pd.set_option("display.width", 130)

Pandas version: 1.4.2
PrettyTable version: 3.3.0
Sklearn version: 1.0.2


<IPython.core.display.Javascript object>

# [2] Load Dataset

In [3]:
# Read the sample weather-forecast records from the local `Input` folder.
csv_path = "./Input/weather_forecast.csv"
wthr_df = pd.read_csv(csv_path)

# Preview up to the first 14 rows of the loaded data.
wthr_df.head(n=14)

Unnamed: 0,Outlook,Temperature,Humidity,Windy,Play
0,Sunny,Hot,High,Weak,No
1,Sunny,Hot,High,Strong,No
2,Overcast,Hot,High,Weak,Yes
3,Rain,Mild,High,Weak,Yes
4,Rain,Cool,Normal,Weak,Yes
5,Rain,Cool,Normal,Strong,No
6,Overcast,Cool,Normal,Strong,Yes
7,Sunny,Mild,High,Weak,No
8,Sunny,Cool,Normal,Weak,Yes
9,Rain,Mild,Normal,Weak,Yes


<IPython.core.display.Javascript object>

In [4]:
# Count the samples per class-label of the target column `Play` to check the class balance.
wthr_df["Play"].value_counts()

Yes    9
No     5
Name: Play, dtype: int64

<IPython.core.display.Javascript object>

The dataset is imbalanced: there are 9 `Yes` samples but only 5 `No` samples.

# [3] Transform Data

#### Up-sampling for balanced dataset

In [5]:
# Keep only the samples of the minority class ("No").
minority_rows = wthr_df[wthr_df["Play"] == "No"]

# Take the last four minority samples; they are duplicated to up-sample the minority class.
extra_rows = minority_rows.tail(4)

# Stack the duplicates below the original data and renumber the index from 0.
wthr_dfb = pd.concat([wthr_df, extra_rows], ignore_index=True)

# Sort by `Play` column to check the concatenation.
wthr_dfb.sort_values(by=["Play"], ascending=False)

Unnamed: 0,Outlook,Temperature,Humidity,Windy,Play
9,Rain,Mild,Normal,Weak,Yes
2,Overcast,Hot,High,Weak,Yes
3,Rain,Mild,High,Weak,Yes
4,Rain,Cool,Normal,Weak,Yes
6,Overcast,Cool,Normal,Strong,Yes
8,Sunny,Cool,Normal,Weak,Yes
10,Sunny,Mild,Normal,Strong,Yes
11,Overcast,Mild,High,Strong,Yes
12,Overcast,Hot,Normal,Weak,Yes
13,Rain,Mild,High,Strong,No


<IPython.core.display.Javascript object>

In [6]:
# Re-count the samples per class-label after up-sampling to confirm both classes are equally represented.
wthr_dfb["Play"].value_counts()

No     9
Yes    9
Name: Play, dtype: int64

<IPython.core.display.Javascript object>

Now the dataset is balanced!

#### Encoding Categorical features

Since the data in all the features is in string format, encode it to integers using `OrdinalEncoder`.

In [7]:
# Split the balanced dataset into the raw (string) features and the target labels.
# The raw features get their own name so `X_train` always refers to the encoded array.
X_raw = wthr_dfb[["Outlook", "Temperature", "Humidity", "Windy"]]
y_train = wthr_dfb["Play"]

# Replace every category of each feature with an integer code.
# `OrdinalEncoder` is unsupervised (it ignores `y`), so only the features are passed.
enc = OrdinalEncoder()
X_train = enc.fit_transform(X_raw)

print("Encoded features:")
table = PrettyTable(enc.feature_names_in_.tolist())
# Plain loop: rows are added purely for their side effect, so no throw-away list is built.
for row in X_train:
    table.add_row([*row])

print(table)

Encoded features:
+---------+-------------+----------+-------+
| Outlook | Temperature | Humidity | Windy |
+---------+-------------+----------+-------+
|   2.0   |     1.0     |   0.0    |  1.0  |
|   2.0   |     1.0     |   0.0    |  0.0  |
|   0.0   |     1.0     |   0.0    |  1.0  |
|   1.0   |     2.0     |   0.0    |  1.0  |
|   1.0   |     0.0     |   1.0    |  1.0  |
|   1.0   |     0.0     |   1.0    |  0.0  |
|   0.0   |     0.0     |   1.0    |  0.0  |
|   2.0   |     2.0     |   0.0    |  1.0  |
|   2.0   |     0.0     |   1.0    |  1.0  |
|   1.0   |     2.0     |   1.0    |  1.0  |
|   2.0   |     2.0     |   1.0    |  0.0  |
|   0.0   |     2.0     |   0.0    |  0.0  |
|   0.0   |     1.0     |   1.0    |  1.0  |
|   1.0   |     2.0     |   0.0    |  0.0  |
|   2.0   |     1.0     |   0.0    |  0.0  |
|   1.0   |     0.0     |   1.0    |  0.0  |
|   2.0   |     2.0     |   0.0    |  1.0  |
|   1.0   |     2.0     |   0.0    |  0.0  |
+---------+-------------+----------+-

<IPython.core.display.Javascript object>

# [4] Train `CategoricalNB` Model

Train a Naive Bayes Classifier model using `CategoricalNB` class from `scikit-learn` on encoded training data.

In [8]:
# Minimum number of categories per feature, in the order Outlook, Temperature, Humidity, Windy.
# Humidity is given 3 categories although only two codes (0 and 1) occur in the encoded training
# data, which leaves code 2 available for a category that was never observed.
min_cats_per_feature = [3, 3, 3, 2]

nb_clfr = CategoricalNB(min_categories=min_cats_per_feature)

# Keep the value returned by `fit` under the name `model`; it is used for prediction later.
model = nb_clfr.fit(X_train, y_train)

<IPython.core.display.Javascript object>

#### Display Prior

In [9]:
# Print the prior of every class-label learned by the model.
# The values come from `class_log_prior_`, i.e. they are log-priors (hence negative).
# Iterating over the classes avoids hard-coding the number of class-labels.
print("Prior:")
for cl_label, cl_log_prior in zip(nb_clfr.classes_, nb_clfr.class_log_prior_):
    print(cl_label, "=", cl_log_prior)

Prior:
No = -0.693147180559945
Yes = -0.693147180559945


<IPython.core.display.Javascript object>

#### Display Likelihood

Display likelihood computed by `CategoricalNB` model in tabular format.

In [10]:
# Possible categories of each feature, listed in the order of their integer codes.
# "Low" does not occur in the training data; it labels the extra Humidity category
# (code 2) that the model reserves through `min_categories`.
categories = [
    ["Overcast", "Rain", "Sunny"],
    ["Cool", "Hot", "Mild"],
    ["High", "Normal", "Low"],
    ["Strong", "Weak"],
]

# Guard against silently mislabelled rows: the hand-written labels must start with
# exactly the categories the encoder learned, in the encoder's order.
for seen_cats, labels in zip(enc.categories_, categories):
    assert list(seen_cats) == labels[: len(seen_cats)], (list(seen_cats), labels)

ft_names = enc.feature_names_in_.tolist()  # Feature names

# Maps (feature name, category, class-label) -> log-probability.
# Values are stored at full precision so that sums built from them do not
# accumulate rounding error; rounding is applied only to the displayed table.
lhood_table = {}
table = PrettyTable(["Feature name", "Feature value", "Class label", "Log Probability"], align="r")

for ft_idx, ft_name in enumerate(ft_names):  # Feature index, Feature
    ft_log_prob = nb_clfr.feature_log_prob_[ft_idx]  # Indexed as [class-label index][category index]
    for cl_idx, cl_label in enumerate(nb_clfr.classes_):  # Class-label index, Class-label
        for cat_idx, cat_name in enumerate(categories[ft_idx]):  # Category index, Category
            lp = ft_log_prob[cl_idx][cat_idx]
            table.add_row([ft_name, cat_name, cl_label, round(lp, 4)])
            lhood_table[(ft_name, cat_name, cl_label)] = lp

print("Likelihood table:")
print(table)

Likelihood table:
+--------------+---------------+-------------+-----------------+
| Feature name | Feature value | Class label | Log Probability |
+--------------+---------------+-------------+-----------------+
|      Outlook |      Overcast |          No |         -2.4849 |
|      Outlook |          Rain |          No |         -0.8755 |
|      Outlook |         Sunny |          No |         -0.6931 |
|      Outlook |      Overcast |         Yes |         -0.8755 |
|      Outlook |          Rain |         Yes |         -1.0986 |
|      Outlook |         Sunny |         Yes |         -1.3863 |
|  Temperature |          Cool |          No |         -1.3863 |
|  Temperature |           Hot |          No |         -1.0986 |
|  Temperature |          Mild |          No |         -0.8755 |
|  Temperature |          Cool |         Yes |         -1.0986 |
|  Temperature |           Hot |         Yes |         -1.3863 |
|  Temperature |          Mild |         Yes |         -0.8755 |
|     H

<IPython.core.display.Javascript object>

# [5] Test Model

In [11]:
def predict(x_q):
    """
    Print the query point `x_q` followed by the class-label the trained model predicts for it.

    `x_q` is wrapped in a list before prediction, so it is passed to the model as a single sample.
    """
    print("Query Point:", x_q)

    predicted = model.predict([x_q])
    # Exactly one sample was passed in; `.item()` extracts its label as a scalar.
    label = predicted.item()
    print("Output:", label)


def get_log_prb(x_q):
    """
    Print the log-probability score of each class-label for the given query point `x_q`.

    For every class-label the score is the class log-prior plus the sum of the
    log-likelihoods of the feature values in `x_q`, looked up in `lhood_table`.
    The scores are sums of logs and are not normalised across class-labels, so
    they are meant for comparing class-labels with each other.

    Parameters
    ----------
    x_q : sequence
        One category name per feature, in the same order as `ft_names`.

    Raises
    ------
    ValueError
        If `x_q` does not hold exactly one value per feature.
    KeyError
        If a (feature, category, class-label) combination is not in `lhood_table`.
    """
    if len(x_q) != len(ft_names):
        raise ValueError(f"Expected {len(ft_names)} feature values, got {len(x_q)}.")

    cl_sigmas = []  # (class-label, rounded sigma) pairs.
    for cl_idx, cl_label in enumerate(nb_clfr.classes_):  # Class-label index, Class-label
        lp_sum = nb_clfr.class_log_prior_[cl_idx]
        for ft_name, f_val in zip(ft_names, x_q):
            lp_sum += lhood_table[(ft_name, f_val, cl_label)]
        cl_sigmas.append((cl_label, round(lp_sum, 4)))

    # Print only after every class-label was scored, one line per class-label,
    # instead of assuming exactly the two labels "No" and "Yes".
    for cl_label, cl_sigma in cl_sigmas:
        print(f"P(Play = {cl_label} | {x_q}) = {cl_sigma}")

<IPython.core.display.Javascript object>

In [12]:
# Feature order: Outlook, Temperature, Humidity, Windy.
# The same query point is given twice: as integer codes for the model and as category names for the table lookup.
x_q_codes = [2, 0, 0, 0]
x_q_names = ["Sunny", "Cool", "High", "Strong"]

predict(x_q_codes)
get_log_prb(x_q_names)

Query Point: [2, 0, 0, 0]
Output: No
P(Play = No | ['Sunny', 'Cool', 'High', 'Strong']) = -3.63
P(Play = Yes | ['Sunny', 'Cool', 'High', 'Strong']) = -5.2882


<IPython.core.display.Javascript object>

# [6] Testing Additive Smoothing

In [13]:
# Feature order: Outlook, Temperature, Humidity, Windy.
# The same query point is given twice: as integer codes for the model and as category names for the table lookup.
# Humidity code 2 stands for 'Low'.
x_q_codes = [2, 0, 2, 0]
x_q_names = ["Sunny", "Cool", "Low", "Strong"]

predict(x_q_codes)
get_log_prb(x_q_names)

Query Point: [2, 0, 2, 0]
Output: No
P(Play = No | ['Sunny', 'Cool', 'Low', 'Strong']) = -5.7094
P(Play = Yes | ['Sunny', 'Cool', 'Low', 'Strong']) = -6.6745


<IPython.core.display.Javascript object>

In [14]:
# Feature order: Outlook, Temperature, Humidity, Windy.
# The same query point is given twice: as integer codes for the model and as category names for the table lookup.
# Humidity code 2 stands for 'Low'.
x_q_codes = [2, 0, 2, 1]
x_q_names = ["Sunny", "Cool", "Low", "Weak"]

predict(x_q_codes)
get_log_prb(x_q_names)

Query Point: [2, 0, 2, 1]
Output: Yes
P(Play = No | ['Sunny', 'Cool', 'Low', 'Weak']) = -6.269
P(Play = Yes | ['Sunny', 'Cool', 'Low', 'Weak']) = -6.1149


<IPython.core.display.Javascript object>