In [1]:
# (Re)load the external `nb_black` IPython extension to enable code formatting in this notebook.
%reload_ext nb_black

<IPython.core.display.Javascript object>

# Naive Bayes

# [1] Setup

### Import and configure required libraries

In [2]:
# Data manipulation libraries
import numpy as np
import pandas as pd

# Report the versions of the libraries this notebook was executed with.
print("NumPy version:", np.__version__)
print("Pandas version:", pd.__version__)

# Wrap printed output at 130 characters; anything longer continues on the next line.
# Pandas display width:
pd.set_option("display.width", 130)
# NumPy array line width:
np.set_printoptions(linewidth=130)

NumPy version: 1.21.5
Pandas version: 1.4.2


<IPython.core.display.Javascript object>

#### Common functions

In [3]:
def is_empty(element) -> bool:
    """
    Function to check if input `element` is empty.

    Other than some special exclusions and inclusions,
    this function returns boolean result of Falsy check.

    Rules, in order:
      * booleans (built-in or NumPy) are never empty, so `False` is not empty;
      * real numbers (built-in or NumPy scalars) are never empty, so `0` and
        `0.0` are not empty;
      * strings are empty when they contain nothing but whitespace;
      * everything else is empty when it is falsy (`None`, `[]`, `{}`, ...).

    Parameters
    ----------
    element : Any
        Value to test.

    Returns
    -------
    bool
        `True` if `element` is considered empty, `False` otherwise.
    """
    # Exclude False from the Falsy set. `np.bool_` is listed explicitly because
    # it is not a subclass of the built-in `bool`.
    if isinstance(element, (bool, np.bool_)):
        return False

    # Exclude 0 and 0.0 from the Falsy set. Non-zero numbers are truthy anyway,
    # so every real number is "not empty". NumPy integer scalars such as
    # `np.int64` do not subclass `int`, hence the explicit NumPy types.
    if isinstance(element, (int, float, np.integer, np.floating)):
        return False

    # Include string with only one or more empty space(s) into Falsy set.
    if isinstance(element, str):
        return len(element.strip()) == 0

    # Falsy check.
    return not element


def get_count(items, get_key=lambda item: item):
    """
    Function to count `key` in a list of items.

    Parameters
    ----------
    items : iterable
        Items to count.
    get_key : callable, optional
        Maps an item to the (hashable) key it is counted under.
        Defaults to the item itself. It is called exactly once per item.

    Returns
    -------
    dict or int
        Mapping of key -> number of occurrences. If `True` is one of the
        counted keys, only the count stored under `True` is returned, so a
        boolean predicate passed as `get_key` yields the number of matching
        items. Keys equal to `True` (such as `1`) share that entry.

    Raises
    ------
    ValueError
        If `is_empty` reports the key of any item as empty.
    """
    count = {}
    for index, item in enumerate(items):
        # Evaluate the key once and reuse it for the check and the tally.
        key = get_key(item)
        if is_empty(key):
            raise ValueError(f"Specified key not found in the item at index: {index} in the list.")

        count[key] = count.get(key, 0) + 1

    return count.get(True, count)


# Logging is off by default; `log` prints only while this flag is exactly `True`.
ENABLE_LOG = False


def log(*args):
    """
    Forward `args` to `print`, but only when logging is enabled.

    Logging counts as enabled only when `ENABLE_LOG` is the boolean `True`;
    any other value (including truthy ones) keeps the function silent.
    """
    if ENABLE_LOG is not True:
        return

    print(*args)


def title(title_str):
    """
    Print `title_str` as a heading underlined with `=` and framed by blank lines.

    Nothing is printed unless `ENABLE_LOG` is exactly `True`.
    """
    if ENABLE_LOG is not True:
        return

    underline = "=" * len(title_str)
    print("\n" + title_str + "\n" + underline + "\n")

<IPython.core.display.Javascript object>

### Load data-points from a `.csv` file

In [4]:
# Read the weather dataset from the CSV file (path is relative to the notebook's
# working directory) and preview its first five rows.
wthr_df = pd.read_csv("./Input/weather_forecast.csv")
wthr_df.head()

Unnamed: 0,Outlook,Temperature,Humidity,Windy,Play
0,Sunny,Hot,High,Weak,No
1,Sunny,Hot,High,Strong,No
2,Overcast,Hot,High,Weak,Yes
3,Rain,Mild,High,Weak,Yes
4,Rain,Cool,Normal,Weak,Yes


<IPython.core.display.Javascript object>

`DataFrame` metadata:

In [5]:
# Summarise the DataFrame: row count, column names, non-null counts and dtypes.
wthr_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14 entries, 0 to 13
Data columns (total 5 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   Outlook      14 non-null     object
 1   Temperature  14 non-null     object
 2   Humidity     14 non-null     object
 3   Windy        14 non-null     object
 4   Play         14 non-null     object
dtypes: object(5)
memory usage: 688.0+ bytes


<IPython.core.display.Javascript object>

**Observations**

1. Dataset contains 14 rows and 5 columns.
2. All five columns in 14 rows have `non-null` values. No missing data.
3. All the columns contain data of the string data-type.
4. All features are categorical.

Class labels:

In [6]:
# Count how many rows fall under each class label of the `Play` column.
wthr_df["Play"].value_counts()

Yes    9
No     5
Name: Play, dtype: int64

<IPython.core.display.Javascript object>

# [2] Implement Naive Bayes

In [7]:
def NaiveBayes(df):
    """
    Closure to implement simple Naive Bayes Classifier for categorical features.

    The last column of `df` is treated as the class label and every other
    column as a categorical feature. No smoothing is applied: a feature value
    never observed together with a label has likelihood 0 for that label.

    Parameters
    ----------
    df : pandas.DataFrame
        Training data; feature columns first, class-label column last.

    Returns
    -------
    fit : callable
        Zero-argument function that builds the likelihood model (reporting the
        Prior and Likelihood tables through `title`/`log`) and returns the
        `predict` function.
    """
    # Split features and class-label.
    f_names = list(df.columns[0:-1])
    l_name = df.columns[-1]
    labels = df[l_name].value_counts().to_dict()
    n_rows = len(df[l_name])

    # Calculate Prior.
    lp = {key: value / n_rows for key, value in labels.items()}

    # Build Likelihood table.
    likelihood = {}
    for f_name in f_names:
        # For each feature count the joint occurrences of (feature value, label).
        # A tuple key (rather than concatenating the two values) keeps distinct
        # pairs distinct and also works for non-string values.
        vals = df[[f_name, l_name]].values
        likelihood[f_name] = get_count(vals, lambda item: (item[0], item[1]))

    def fit():
        """
        Train model based on Naive Bayes technique.

        Returns
        -------
        predict : callable
            Function mapping a query point to its most probable class label.
        """

        title("Prior:")
        for key, value in labels.items():
            log(f"P({key}) = {value}/{n_rows}")

        title("Likelihood:")
        # (feature name, feature value, label) -> P(value | label).
        # The feature name is part of the key so that two features sharing a
        # value (e.g. "High") do not overwrite each other's probabilities.
        model = {}
        # feature value -> names of the features in which that value occurs.
        owners = {}
        for f_name in f_names:
            log("Feature:", f_name)

            features = df[f_name].value_counts().to_dict()
            for feature in features:
                owners.setdefault(feature, []).append(f_name)
                cols = []
                for key, value in labels.items():
                    aib = likelihood[f_name].get((feature, key), 0)
                    cols.append(f"P({f_name} = {feature} | {l_name} == {key}) = {aib}/{value}")
                    model[(f_name, feature, key)] = aib / value
                log(", ".join(cols))
            log("-" * 85)

        def resolve_feature(position, x_q):
            """
            Find the feature a query value belongs to: the feature at the same
            position when it contains the value, otherwise the single feature
            that contains it.
            """
            candidates = owners.get(x_q, [])
            if position < len(f_names) and f_names[position] in candidates:
                return f_names[position]
            if len(candidates) == 1:
                # Unambiguous value given out of column order.
                return candidates[0]
            raise KeyError(f"Query value {x_q!r} at position {position} is unknown or ambiguous.")

        def predict(x_qs):
            """
            Function to compute Posterior using Prior and Likelihood.

            Parameters
            ----------
            x_qs : sequence
                Feature values of the query point, normally one per feature in
                column order. Values that occur in exactly one feature may be
                given in any order.

            Returns
            -------
            The class label with the largest (unnormalised) posterior; on a
            tie the label that comes first in the class-count order wins.

            Raises
            ------
            KeyError
                If a query value was not seen during training or cannot be
                attributed to a single feature.
            """
            resolved = [(resolve_feature(position, x_q), x_q) for position, x_q in enumerate(x_qs)]

            pi = {}
            for key in labels:
                prod = lp[key]
                for f_name, x_q in resolved:
                    prod *= model[(f_name, x_q, key)]
                # Compare unrounded products: rounding first would collapse
                # small posteriors to 0.0 and make the comparison meaningless.
                pi[key] = prod

            return max(pi, key=pi.get)

        return predict

    return fit

<IPython.core.display.Javascript object>

#### Initialize classifier

In [8]:
# Build the classifier closure over the weather data; `NaiveBayes` returns its `fit` function.
fit = NaiveBayes(wthr_df)

<IPython.core.display.Javascript object>

# [3] Training

Train the model using the training dataset.

In [9]:
# Switch logging on so that training prints the Prior and Likelihood tables.
ENABLE_LOG = True
# Train the model; `fit` returns the `predict` function.
predict = fit()


Prior:

P(Yes) = 9/14
P(No) = 5/14

Likelihood:

Feature: Outlook
P(Outlook = Sunny | Play == Yes) = 2/9, P(Outlook = Sunny | Play == No) = 3/5
P(Outlook = Rain | Play == Yes) = 3/9, P(Outlook = Rain | Play == No) = 2/5
P(Outlook = Overcast | Play == Yes) = 4/9, P(Outlook = Overcast | Play == No) = 0/5
-------------------------------------------------------------------------------------
Feature: Temperature
P(Temperature = Mild | Play == Yes) = 4/9, P(Temperature = Mild | Play == No) = 2/5
P(Temperature = Hot | Play == Yes) = 2/9, P(Temperature = Hot | Play == No) = 2/5
P(Temperature = Cool | Play == Yes) = 3/9, P(Temperature = Cool | Play == No) = 1/5
-------------------------------------------------------------------------------------
Feature: Humidity
P(Humidity = High | Play == Yes) = 3/9, P(Humidity = High | Play == No) = 4/5
P(Humidity = Normal | Play == Yes) = 6/9, P(Humidity = Normal | Play == No) = 1/5
--------------------------------------------------------------------------

<IPython.core.display.Javascript object>

# [4] Testing

In [10]:
# One value per feature, in column order: Outlook, Temperature, Humidity, Windy.
x_q = ["Sunny", "Cool", "High", "Strong"]  # Query point
# Predicted class label (`Play`) for the query point.
y_q = predict(x_q)

print("Query point:", x_q)
print("Output:", y_q)

Query point: ['Sunny', 'Cool', 'High', 'Strong']
Output: No


<IPython.core.display.Javascript object>