5) Aim: Demonstrate a text classifier using the Naïve Bayes classifier algorithm. Program: Write a program to implement the Naïve Bayesian classifier for a sample training data set stored as a .CSV file. Compute the accuracy of the classifier, considering a few test data sets.

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from collections import defaultdict

class NaiveBayesClassifier:
    """Categorical Naive Bayes classifier for features held in a pandas DataFrame.

    Every feature column is treated as a discrete variable. ``fit`` records,
    per class, the relative frequency of each value observed in each column;
    ``predict`` returns, for every row, the class with the largest
    log-posterior (log prior plus the sum of per-column log likelihoods).

    Parameters
    ----------
    unseen_prob : float, default 1e-6
        Probability substituted when a (column, class, value) combination was
        never observed during ``fit``. Must lie in the interval (0, 1].

    Attributes
    ----------
    classes : numpy.ndarray or None
        Sorted unique labels seen by ``fit``; ``None`` until ``fit`` is called.
    priors : dict
        Maps each class to its share of the training samples.
    likelihoods : dict
        ``likelihoods[column][cls]`` maps a feature value to its relative
        frequency among the training rows of class ``cls``.
    """

    def __init__(self, unseen_prob=1e-6):
        if not 0 < unseen_prob <= 1:
            raise ValueError("unseen_prob must be in the interval (0, 1]")
        self.unseen_prob = unseen_prob
        self.classes = None
        self.priors = {}
        self.likelihoods = defaultdict(dict)

    def fit(self, X, y):
        """Estimate class priors and per-column value frequencies.

        Parameters
        ----------
        X : pandas.DataFrame
            Training features, one row per sample.
        y : pandas.Series, numpy.ndarray or sequence
            Class label of each row of ``X``. A Series is matched to ``X`` by
            index (pandas boolean-mask alignment); anything else is matched
            by position.
        """
        # Start from a clean slate so a refit never mixes in stale statistics.
        self.priors = {}
        self.likelihoods = defaultdict(dict)

        # Plain sequences are converted to an array so that `labels == cls`
        # is guaranteed to be an element-wise comparison.
        labels = y if isinstance(y, (pd.Series, np.ndarray)) else np.asarray(y)

        self.classes = np.unique(labels)
        total_samples = len(labels)

        for cls in self.classes:
            X_cls = X[labels == cls]
            self.priors[cls] = len(X_cls) / total_samples

            for column in X.columns:
                # value_counts(normalize=True) ignores NaN, so missing values
                # do not contribute to the frequency table.
                self.likelihoods[column][cls] = (
                    X_cls[column].value_counts(normalize=True).to_dict()
                )

    def predict(self, X):
        """Return the most probable class for each row of ``X``.

        Parameters
        ----------
        X : pandas.DataFrame
            Samples to classify. Columns or values that were not seen during
            ``fit`` contribute ``unseen_prob`` for every class.

        Returns
        -------
        list
            One predicted label per row, in row order.

        Raises
        ------
        AttributeError
            If called before ``fit``.
        """
        if self.classes is None:
            raise AttributeError(
                "NaiveBayesClassifier has not been fitted; call fit() first"
            )

        results = []

        for i in range(len(X)):
            # Extract the row once instead of once per column per class.
            row = X.iloc[i]
            posteriors = {}

            for cls in self.classes:
                prior = np.log(self.priors[cls])
                # .get() (not [...]) so that predicting on an unknown column
                # does not insert an empty entry into the defaultdict.
                likelihood = sum(
                    np.log(
                        self.likelihoods.get(col, {})
                        .get(cls, {})
                        .get(row[col], self.unseen_prob)
                    )
                    for col in X.columns
                )
                posteriors[cls] = prior + likelihood

            results.append(max(posteriors, key=posteriors.get))

        return results

    def accuracy(self, y_true, y_pred):
        """Return the fraction of positions where ``y_true`` equals ``y_pred``.

        Raises
        ------
        ValueError
            If the two inputs do not have the same shape; comparing them
            would otherwise broadcast or collapse to a single value.
        """
        actual = np.asarray(y_true)
        predicted = np.asarray(y_pred)
        if actual.shape != predicted.shape:
            raise ValueError(
                f"y_true and y_pred must have the same shape, "
                f"got {actual.shape} and {predicted.shape}"
            )
        return np.mean(actual == predicted)

# Load data from CSV file
data = pd.read_csv('/kaggle/input/iris-dataset/Iris_data_sample.csv')

# Inspect the raw data for inconsistencies (left untouched so the checks
# below describe exactly what is in the file)
print(data.info())
print(data.head())

# Check for NaN values and data types in target column
print(data.iloc[:, -1].unique())

# Drop rows whose label is missing; otherwise astype(str) below would turn
# the NaN into the string 'nan' and the model would learn it as a class.
target_column = data.columns[-1]
labelled = data.dropna(subset=[target_column])

# Separate features and target variable
X = labelled.iloc[:, :-1]
y = labelled.iloc[:, -1]

# The exported row-number column identifies a row, it does not describe it,
# so it must not be used as a feature.
X = X.drop(columns=['Unnamed: 0'], errors='ignore')

# '??' and '###' are placeholders for missing measurements in this file;
# mask them to NaN so they are not counted as genuine category values.
X = X.mask(X.isin(['??', '###']))

# Clean target variable to ensure consistent data type
y = y.astype(str)

# Split data into training and testing sets (fixed seed for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize and train the Naive Bayes Classifier
nb_classifier = NaiveBayesClassifier()
nb_classifier.fit(X_train, y_train)

# Make predictions on the test set
y_pred = nb_classifier.predict(X_test)

# Compute the accuracy
accuracy = nb_classifier.accuracy(y_test, y_pred)
print(f'Accuracy: {accuracy * 100:.2f}%')


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 6 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Unnamed: 0     150 non-null    int64  
 1   SepalLengthCm  150 non-null    object 
 2   SepalWidthCm   149 non-null    float64
 3   PetalLengthCm  149 non-null    object 
 4   PetalWidthCm   150 non-null    float64
 5   Species        149 non-null    object 
dtypes: float64(2), int64(1), object(3)
memory usage: 7.2+ KB
None
   Unnamed: 0 SepalLengthCm  SepalWidthCm PetalLengthCm  PetalWidthCm  \
0           1           5.1           3.5           1.4           0.2   
1           2           4.9           NaN           1.4           0.2   
2           3           4.7           3.2           1.3           0.2   
3           4            ??           3.1           1.5           0.2   
4           5             5           3.6           ###           0.2   

       Species  
0  Iris-setosa  
1   