In [1]:
# Enable code formatting using external plugin: nb_black.
%reload_ext nb_black

<IPython.core.display.Javascript object>

# Naive Bayes classifier

## Objective

1. Implement simple Naive Bayes classifier for categorical features.
2. Train and test model on sample weather forecast dataset using Naive Bayes.

# [1] Setup

### Import and configure required libraries

In [2]:
# Data manipulation libraries
import numpy as np
import pandas as pd

# Data visualization libraries
import prettytable
from prettytable import PrettyTable

# Library versions used in below EDA.
print("NumPy version:", np.__version__)
print("Pandas version:", pd.__version__)
print("PrettyTable version:", prettytable.__version__)

# Configure NumPy.
# Set `Line width` to Maximum 130 characters in the output, post which it will continue in next line.
np.set_printoptions(linewidth=130)

# Configure Pandas.
# Set display width to maximum 130 characters in the output, post which it will continue in next line.
pd.options.display.width = 130

NumPy version: 1.21.5
Pandas version: 1.4.2
PrettyTable version: 3.3.0


<IPython.core.display.Javascript object>

#### Common functions

In [3]:
def is_empty(element) -> bool:
    """
    Function to check if input `element` is empty.

    Other than some special exclusions and inclusions,
    this function returns boolean result of Falsy check.
    """
    if (isinstance(element, int) or isinstance(element, float)) and element == 0:
        # Exclude 0 and 0.0 from the Falsy set.
        return False
    elif isinstance(element, str) and len(element.strip()) == 0:
        # Include string with only one or more empty space(s) into Falsy set.
        return True
    elif isinstance(element, bool):
        # Exclude False from the Falsy set.
        return False
    else:
        # Falsy check.
        return False if element else True


def get_count(items, get_key=lambda item: item):
    """
    Function to count `key` in a list of items.
    """
    count = {}
    for index, item in enumerate(items):
        if is_empty(get_key(item)):
            raise ValueError(f"Specified key not found in the item at index: {index} in the list.")

        count[get_key(item)] = count.get(get_key(item), 0) + 1

    return count.get(True, count)


ENABLE_LOG = False


def log(*args):
    """
    Function print when logging is enabled.
    """
    if ENABLE_LOG is True:
        print(*args)


def title(title_str, padding=[1, 1], line_style="="):
    if ENABLE_LOG is True:
        pad_top, pad_bot = padding
        pt = "\n" * pad_top
        pb = "\n" * pad_bot
        print(pt + title_str + "\n" + line_style * len(title_str) + pb)

<IPython.core.display.Javascript object>

### Load data-points from a `.csv` file

In [4]:
wthr_df = pd.read_csv("./Input/weather_forecast.csv")
wthr_df.head()

Unnamed: 0,Outlook,Temperature,Humidity,Windy,Play
0,Sunny,Hot,High,Weak,No
1,Sunny,Hot,High,Strong,No
2,Overcast,Hot,High,Weak,Yes
3,Rain,Mild,High,Weak,Yes
4,Rain,Cool,Normal,Weak,Yes


<IPython.core.display.Javascript object>

`DataFrame` metadata:

In [5]:
wthr_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14 entries, 0 to 13
Data columns (total 5 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   Outlook      14 non-null     object
 1   Temperature  14 non-null     object
 2   Humidity     14 non-null     object
 3   Windy        14 non-null     object
 4   Play         14 non-null     object
dtypes: object(5)
memory usage: 688.0+ bytes


<IPython.core.display.Javascript object>

**Observations**

1. Dataset contains 14 rows and 5 columns.
2. All five columns in 14 rows have `non-null` values. No missing data.
3. All the column contain data in string data-type. 
4. All features are categorical.

#### Distinct values for each feature

In [6]:
table = PrettyTable(["Features", "Distinct Values", "Count"], align="l")

for feature in wthr_df.columns[0:-1]:
    unq_vals = wthr_df[feature].unique().tolist()
    table.add_row([feature, ", ".join(unq_vals), len(unq_vals)])

print(table)

+-------------+-----------------------+-------+
| Features    | Distinct Values       | Count |
+-------------+-----------------------+-------+
| Outlook     | Sunny, Overcast, Rain | 3     |
| Temperature | Hot, Mild, Cool       | 3     |
| Humidity    | High, Normal          | 2     |
| Windy       | Weak, Strong          | 2     |
+-------------+-----------------------+-------+


<IPython.core.display.Javascript object>

#### Class label

In [7]:
wthr_df["Play"].value_counts()

Yes    9
No     5
Name: Play, dtype: int64

<IPython.core.display.Javascript object>

# [2] Implement Naive Bayes

In [8]:
def NaiveBayes(df):
    """
    Closure to implement simple Naive Bayes Classifier.
    """
    N, cols = df.shape
    D = cols - 1  # -1 for class label.
    # N => Number of data-points.
    # D => Number of dimensions or features.

    # Split features and class-label.
    f_names = df.columns[0:-1]
    l_name = df.columns[-1]
    labels = df[l_name].value_counts().to_dict()

    # Calculate class-label Prior.
    cl_p = {label: count / N for label, count in labels.items()}

    def fit(alpha=1):
        """
        Train model using every data-point based on Naive Bayes technique.
        """

        def laplace_smoothing(a, b, k=D, sep=False):
            """
            Function to compute Laplace Smoothing.
            """
            return ((a + alpha), (b + (alpha * k))) if sep else ((a + alpha) / (b + (alpha * k)))

        title("Prior:")
        [log(f"P({label}) = {count}/{N}") for label, count in labels.items()]

        # Build Likelihood table.
        title("Likelihood with Laplace Smoothing:", [2, 1])
        model = {}
        for f_name in f_names:
            # For each feature calculate conditional probability.
            title(f'Feature: "{f_name}" & label: {list(labels.keys())}', [0, 0], "-")

            vals = df[[f_name, l_name]].values
            f_count = get_count(vals, lambda item: item[0] + item[1])

            # Calculate probability of feature = `feature` | class-label == `label`.
            for feature in df[f_name].unique().tolist():
                for label, count in labels.items():
                    aib = f_count.get(feature + label, 0)
                    nmtr, dnomtr = laplace_smoothing(aib, count, sep=True)
                    model[(feature, label)] = nmtr / dnomtr
                    log(f"P({f_name} = {feature} | {l_name} == {label}) = {nmtr}/{dnomtr}")
            log()

        log("Model training complete!")

        def predict(x_qs):
            """
            Function to compute Posterior using Prior and Likelihood.
            """
            pis = {}  # Calculate pi per class-label.
            for label, count in labels.items():
                pi = cl_p[label]  # Initialize pi with Prior.
                for x_q in x_qs:
                    # If model receives an unseen feature value, use Laplace Smoothing.
                    pi *= model.get((x_q, label), laplace_smoothing(0, count))
                pis[label] = round(pi, 4)

            log("Pi value of class-labels:", pis)
            return max(pis, key=lambda item: pis[item])

        return predict

    return fit

<IPython.core.display.Javascript object>

# [3] Training

#### Initialize classifier

In [9]:
fit = NaiveBayes(wthr_df)

<IPython.core.display.Javascript object>

Train model using training dataset.

In [10]:
ENABLE_LOG = True
predict = fit()


Prior:

P(Yes) = 9/14
P(No) = 5/14


Likelihood with Laplace Smoothing:

Feature: "Outlook" & label: ['Yes', 'No']
-----------------------------------------
P(Outlook = Sunny | Play == Yes) = 3/13
P(Outlook = Sunny | Play == No) = 4/9
P(Outlook = Overcast | Play == Yes) = 5/13
P(Outlook = Overcast | Play == No) = 1/9
P(Outlook = Rain | Play == Yes) = 4/13
P(Outlook = Rain | Play == No) = 3/9

Feature: "Temperature" & label: ['Yes', 'No']
---------------------------------------------
P(Temperature = Hot | Play == Yes) = 3/13
P(Temperature = Hot | Play == No) = 3/9
P(Temperature = Mild | Play == Yes) = 5/13
P(Temperature = Mild | Play == No) = 3/9
P(Temperature = Cool | Play == Yes) = 4/13
P(Temperature = Cool | Play == No) = 2/9

Feature: "Humidity" & label: ['Yes', 'No']
------------------------------------------
P(Humidity = High | Play == Yes) = 4/13
P(Humidity = High | Play == No) = 5/9
P(Humidity = Normal | Play == Yes) = 7/13
P(Humidity = Normal | Play == No) = 2/9

Feature: "Win

<IPython.core.display.Javascript object>

# [4] Testing

In [11]:
ENABLE_LOG = False

# Features: Outlook, Temperature, Humidity, Windy.
x_q = ["Sunny", "Cool", "High", "Strong"]  # Query point
y_q = predict(x_q)

print("Query point:", x_q)
print("Output:", y_q)

Query point: ['Sunny', 'Cool', 'High', 'Strong']
Output: No


<IPython.core.display.Javascript object>

# [5] Testing Laplace Smoothing (Additive smoothing)

Feature `Humidity` contains only `[High, Normal]` values in training dataset. Pass a new value `Low`, not seen in training dataset to check Laplace Smoothing. 

In [12]:
ENABLE_LOG = True

# Features: Outlook, Temperature, Humidity, Windy.
x_q = ["Sunny", "Cool", "Low", "Strong"]  # Query point
y_q = predict(x_q)

print("Query point:", x_q)
print("Output:", y_q)

Pi value of class-labels: {'Yes': 0.0011, 'No': 0.0017}
Query point: ['Sunny', 'Cool', 'Low', 'Strong']
Output: No


<IPython.core.display.Javascript object>

It is _sunny_, temperature is _cool_, humidity is _low_ but wind is _Strong_ hence we cannot play outside. Now if wind is _weak_ its an ideal situation to play outside, so lets check out model prediction when weather is ("Sunny", "Cool", "Low", "Weak"):

In [13]:
# Features: Outlook, Temperature, Humidity, Windy.
x_q = ["Sunny", "Cool", "Low", "Weak"]  # Query point
y_q = predict(x_q)

print("Query point:", x_q)
print("Output:", y_q)

Pi value of class-labels: {'Yes': 0.0019, 'No': 0.0013}
Query point: ['Sunny', 'Cool', 'Low', 'Weak']
Output: Yes


<IPython.core.display.Javascript object>

Well indeed its an ideal weather to play outside!