# **Write a program to implement the naïve Bayesian classifier for a sample training data set stored as a .CSV file. Compute the accuracy of the classifier, considering few test data sets.** #


In [10]:
import csv
import random
import math

# 1. Load Data from a CSV file
def load_csv(filename):
    """
    Load the CSV file, skipping the header row, and convert all numeric values to floats,
    while keeping the last column (class label) as a string.
    """
    with open(filename, "rt") as file:
        lines = csv.reader(file)
        next(lines)  # Skip the header row
        dataset = []
        for row in lines:
            # Convert all but the last column to float, keep the last column (class label) as string
            dataset.append([float(x) for x in row[:-1]] + [row[-1]]) 
    return dataset



# 2. Split the dataset into training and testing sets randomly
def split_dataset(dataset, split_ratio):
    """
    Split the dataset into training and testing sets based on the split_ratio.
    """
    train_size = int(len(dataset) * split_ratio)
    train_set = []
    copy = list(dataset)
    
    while len(train_set) < train_size:
        index = random.randrange(len(copy))
        train_set.append(copy.pop(index))
    
    return [train_set, copy]

# 3. Separate the dataset by class values (last column)
def separate_by_class(dataset):
    """
    Separate the dataset by class, returning a dictionary keyed by class value.
    """
    separated = {}
    for i in range(len(dataset)):
        vector = dataset[i]
        class_value = vector[-1]
        if class_value not in separated:
            separated[class_value] = []
        separated[class_value].append(vector)
    return separated

# 4. Calculate the mean of a list of numbers
def mean(numbers):
    return sum(numbers) / float(len(numbers))

# 5. Calculate the standard deviation of a list of numbers
def stdev(numbers):
    avg = mean(numbers)
    variance = sum([pow(x - avg, 2) for x in numbers]) / float(len(numbers) - 1)
    return math.sqrt(variance)

# 6. Summarize the dataset: calculate mean and standard deviation for each attribute
def summarize(dataset):
    """
    Summarize the dataset by calculating the mean and standard deviation for each attribute,
    excluding the last column (class label).
    """
    # Zip the attributes together, excluding the last column (class label)
    summaries = [(mean(attribute), stdev(attribute)) for attribute in zip(*dataset)[:-1]]
    return summaries


# 7. Summarize the dataset by class
def summarize_by_class(dataset):
    """
    Separate the dataset by class and then summarize each class.
    """
    separated = separate_by_class(dataset)
    summaries = {}
    for class_value, instances in separated.items():
        summaries[class_value] = summarize(instances)
    return summaries

# 8. Calculate Gaussian probability density function
def calculate_probability(x, mean, stdev):
    """
    Calculate the Gaussian probability distribution function for x.
    """
    exponent = math.exp(-(math.pow(x - mean, 2) / (2 * math.pow(stdev, 2))))
    return (1 / (math.sqrt(2 * math.pi) * stdev)) * exponent

# 9. Calculate class probabilities
def calculate_class_probabilities(summaries, input_vector):
    """
    Calculate the probabilities of each class for a given input vector.
    """
    probabilities = {}
    for class_value, class_summaries in summaries.items():
        probabilities[class_value] = 1
        for i in range(len(class_summaries)):
            mean, stdev = class_summaries[i]
            x = input_vector[i]
            probabilities[class_value] *= calculate_probability(x, mean, stdev)
    return probabilities

# 10. Make a prediction for a given input vector
def predict(summaries, input_vector):
    """
    Predict the class for a given input vector.
    """
    probabilities = calculate_class_probabilities(summaries, input_vector)
    best_label, best_prob = None, -1
    for class_value, probability in probabilities.items():
        if best_label is None or probability > best_prob:
            best_prob = probability
            best_label = class_value
    return best_label

# 11. Get predictions for a test set
def get_predictions(summaries, test_set):
    """
    Predict the class for each row in the test set.
    """
    predictions = []
    for row in test_set:
        result = predict(summaries, row)
        predictions.append(result)
    return predictions

# 12. Calculate accuracy percentage
def get_accuracy(test_set, predictions):
    """
    Calculate the accuracy percentage of the predictions.
    """
    correct = 0
    for i in range(len(test_set)):
        if test_set[i][-1] == predictions[i]:
            correct += 1
    return (correct / float(len(test_set))) * 100.0

# Main script
filename = 'iris.csv'
split_ratio = 0.70
dataset = load_csv(filename)

# Split the dataset into training and testing sets
training_set, test_set = split_dataset(dataset, split_ratio)
print(f'Split {len(dataset)} rows into train={len(training_set)} and test={len(test_set)} rows')

# Prepare model
summaries = summarize_by_class(training_set)

# Test model
predictions = get_predictions(summaries, test_set)
accuracy = get_accuracy(test_set, predictions)
print(f'Accuracy: {accuracy:.2f}%')


Split 150 rows into train=105 and test=45 rows


TypeError: 'zip' object is not subscriptable