<center><H1> Machine Learning Lab #7

<H3>  NAÏVE BAYES CLASSIFIER

In [1]:
#Generic Imports

import math
import re
import numpy as np
import pandas as pd
import matplotlib
from matplotlib import pyplot as plt
from scipy.sparse import csr_matrix
import seaborn as sns

In [2]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from collections import defaultdict, Counter

<H2>Questions

a) Of the students in the college, 60% of the students reside in the hostel and 40% of the students are day
scholars. Previous year results report that 30% of all students who stay in the hostel scored A Grade and 20%
of day scholars scored A grade. At the end of the year, one student is chosen at random and found that he/she
has an A grade. What is the probability that the student is a hosteler?

In [3]:
#Given:

P_hostel = 0.6
P_day_scholar = 0.4

P_A_given_hostel = 0.3
P_A_given_day_scholar = 0.2

#To find: P_hostel_given_A

"""
Applying Bayes Theorem:

P(hostel | A) = P(A|hostel) * P(hostel) / P(A)


To complete this we need P(A):

P(A) = P(A|Hostel) * P(Hostel) + P(A|DayScholar) * P(DayScholar)
P(A) = 0.3 x 0.6 + 0.4 x 0.2
P(A) = 0.26

Continuing:

P(hostel|A) = 0.3 * 0.6 / 0.26
= 0.692
"""
P_A = 0.3 * 0.6 + 0.4 * 0.2
P_hostel_given_A = 0.3 * 0.6 / P_A
P_hostel_given_A = round(P_hostel_given_A, 3)

print(f"Probability of hosteller is {P_hostel_given_A}.")

Probability of hosteller is 0.692.


b) Suppose you're testing for a rare disease, and you have the following information:

1) The disease has a prevalence of 0.01 (1% of the population has the disease).

2) The test is not perfect:

3) The test correctly identifies the disease (true positive) 99% of the time (sensitivity).

4) The test incorrectly indicates the disease (false positive) 2% of the time (1 - specificity).


Calculate the probability of having the disease given a positive test result using Bayes' theorem

In [4]:
p_disease = 0.01
p_pos_given_disease = 0.99  
p_pos_given_no_disease = 0.02

p_no_disease = 1 - p_disease

p_positive = (p_pos_given_disease * p_disease) + (p_pos_given_no_disease * p_no_disease)

numerator = p_pos_given_disease * p_disease
p_disease_given_pos = numerator / p_positive
p_disease_given_pos = round(p_disease_given_pos, 3)

print(f"Probability of disease is {p_disease_given_pos}.")

Probability of disease is 0.333.


2. Develop a function python code for Naïve Bayes classifier from scratch without using scikit-learn library,
to predict whether the buyer should buy computer or not. Consider a following sample training dataset stored
in a CSV file containing information about following buyer conditions (such as “<=30,” “medium,” “Yes,”
and “fair”) and whether the player played golf (“Yes” or “No”).

In [5]:
#Reading from a docstring instead of a csv because I'm lazy like that lol

from io import StringIO

csv_data = """age,income,student,credit_rating,buys_computer
<=30,high,no,fair,no
<=30,high,no,excellent,no
31...40,high,no,fair,yes
>40,medium,no,fair,yes
>40,low,yes,fair,yes
>40,low,yes,excellent,no
31...40,low,yes,excellent,yes
<=30,medium,no,fair,no
<=30,low,yes,fair,yes
>40,medium,yes,fair,yes
<=30,medium,yes,excellent,yes
31...40,medium,no,excellent,yes
31...40,high,yes,fair,yes
>40,medium,no,excellent,no"""

df = pd.read_csv(StringIO(csv_data))

data = df.values.tolist()

features = list(df.columns[:-1])
target_name = df.columns[-1]

print("Data Loaded (first 3 rows):\n", data[:3])

Data Loaded (first 3 rows):
 [['<=30', 'high', 'no', 'fair', 'no'], ['<=30', 'high', 'no', 'excellent', 'no'], ['31...40', 'high', 'no', 'fair', 'yes']]


In [6]:
class NaiveBayesClassifier:
    def __init__(self):
        self.prior_probabilities = {}
        self.likelihoods = {}
        self.classes = []
        self.total_instances = 0
        self.target_index = -1

    def train(self, data):
        self.total_instances = len(data)
        self.target_index = len(data[0]) - 1
        
        class_counts = {}
        for row in data:
            class_label = row[self.target_index]
            class_counts[class_label] = class_counts.get(class_label, 0) + 1
        
        self.classes = list(class_counts.keys())
        
        for class_label, count in class_counts.items():
            self.prior_probabilities[class_label] = count / self.total_instances

        feature_counts = {c: [{} for _ in range(self.target_index)] for c in self.classes}

        for row in data:
            class_label = row[self.target_index]
            for i in range(self.target_index):
                feature_value = row[i]
                
                feature_counts[class_label][i][feature_value] = \
                    feature_counts[class_label][i].get(feature_value, 0) + 1

        for class_label in self.classes:
            self.likelihoods[class_label] = [{} for _ in range(self.target_index)]
            class_total = class_counts[class_label]
            
            for i in range(self.target_index):
                for feature_value, count in feature_counts[class_label][i].items():
                    self.likelihoods[class_label][i][feature_value] = count / class_total
                    
    def predict(self, instance):
        posteriors = {} 
        for class_label in self.classes:
            prior = self.prior_probabilities[class_label]
            
            likelihood = 1.0
            for i in range(len(instance)):
                feature_value = instance[i]
                prob_xi_given_c = self.likelihoods[class_label][i].get(feature_value, 1e-9)
                
                likelihood *= prob_xi_given_c
            
            posteriors[class_label] = prior * likelihood

        best_class = max(posteriors, key=posteriors.get)
        return best_class, posteriors

#Training 
nb_classifier = NaiveBayesClassifier()
nb_classifier.train(data)

In [7]:
test_instance = ["<=30", "medium", "no", "excellent"]
prediction, probabilities = nb_classifier.predict(test_instance)

print(f"\n--- Prediction Result ---")
print(f"Test Instance: {test_instance}")
print(f"Predicted Class: '{prediction}'")
print(f"Posterior Probabilities: {probabilities}")


--- Prediction Result ---
Test Instance: ['<=30', 'medium', 'no', 'excellent']
Predicted Class: 'no'
Posterior Probabilities: {'no': 0.04114285714285714, 'yes': 0.007054673721340387}


3. Write a Python function to implement the Naive Bayes classifier without using the scikit-learn library for the
following sample training dataset stored as a .CSV file.


a. Build a classifier that determines whether a text is about sports or not.

b. Determine which tag the sentence "A very close game" belongs to.

In [8]:
# Sample dataset from the image
data = {
    'Text': ["A great game", "The election was over", "Very clean match", 
             "A clean but forgettable game", "It was a close election"],
    'Tag': ["Sports", "Not sports", "Sports", "Sports", "Not sports"]
}

# Convert data to a pandas DataFrame
df = pd.DataFrame(data)

# Function to preprocess the text (lowercase, remove non-alphanumeric characters)
def preprocess(text):
    text = text.lower()
    text = re.sub(r'\W+', ' ', text)  # Remove non-alphanumeric characters
    return text

# Simplified Naive Bayes function for a single prediction
def naive_bayes_predict(test_example):
    # Calculate prior probabilities
    total = len(df)
    sports_count = len(df[df['Tag'] == 'Sports'])
    not_sports_count = len(df[df['Tag'] == 'Not sports'])
    
    P_sports = sports_count / total
    P_not_sports = not_sports_count / total

    # Preprocess the test example
    test_words = preprocess(test_example).split()

    # Calculate likelihoods for each word given 'Sports' and 'Not sports'
    P_words_given_sports = 1
    P_words_given_not_sports = 1

    for word in test_words:
        P_word_sports = len(df[(df['Text'].str.contains(word)) & (df['Tag'] == 'Sports')]) / sports_count
        P_word_not_sports = len(df[(df['Text'].str.contains(word)) & (df['Tag'] == 'Not sports')]) / not_sports_count

        # Add-1 smoothing to avoid zero probabilities
        P_words_given_sports *= (P_word_sports + 1) / (sports_count + len(test_words))
        P_words_given_not_sports *= (P_word_not_sports + 1) / (not_sports_count + len(test_words))

    # Calculate posterior probabilities
    P_sports_given_text = P_sports * P_words_given_sports
    P_not_sports_given_text = P_not_sports * P_words_given_not_sports

    # Predict the class with the higher posterior probability
    print("P(Not sports|Text):", P_not_sports_given_text)
    print("P(Sports|Text):", P_sports_given_text)
    if P_sports_given_text > P_not_sports_given_text:
        return "Sports"
    else:
        return "Not sports"

# Test example to predict
test_example = "A very close game"

# Prediction
prediction = naive_bayes_predict(test_example)
print("Prediction for test data:", prediction)

P(Not sports|Text): 0.000925925925925926
P(Sports|Text): 0.0008329862557267803
Prediction for test data: Not sports


---