# Bayesian Classification on Telugu Vowel Dataset
### The dataset has 6 classes and 3 features per sample

In [1]:
def read_data(file_path):
    """Read a whitespace-delimited table of integers from a text file.

    Each non-blank line becomes one row (a list of ints). Blank or
    whitespace-only lines are skipped so that, for example, a trailing empty
    line does not produce an empty row that breaks later ``row[0]`` lookups.

    Parameters
    ----------
    file_path : str or path-like
        Location of the text file to read.

    Returns
    -------
    list[list[int]]
        One list of integers per non-blank line, in file order.

    Raises
    ------
    ValueError
        If a field on a non-blank line cannot be parsed as an integer.
    """
    data = []
    with open(file_path, 'r') as file:
        for line in file:
            fields = line.split()
            if not fields:
                # Blank line: nothing to parse, and an empty row would be unusable.
                continue
            data.append([int(field) for field in fields])
    return data

# Load the dataset from a machine-specific absolute path; adjust it before re-running elsewhere.
data = read_data(r"D:\1st_Year_QMS\2nd_Sem\PR\Vowel_Data_Only.txt")

In [2]:
import numpy as np

def split_data(data, train_ratio=0.8, seed=None):
    """Randomly split rows into train and test sets, stratified by class label.

    The first element of each row is treated as its class label. Rows are
    visited in a random order and grouped by label; within each class the
    first ``int(train_ratio * n_class)`` rows go to the training set and the
    rest to the test set. The input sequence itself is left untouched.

    Parameters
    ----------
    data : sequence of rows
        Each row is an indexable whose element 0 is the class label.
    train_ratio : float, default 0.8
        Fraction of each class assigned to the training set; must lie in [0, 1].
    seed : int or None, default None
        When given, the shuffle is reproducible for that seed. When ``None``,
        NumPy's global random state is used.

    Returns
    -------
    (list, list)
        ``(train_data, test_data)``, each ordered by ascending class label.

    Raises
    ------
    ValueError
        If ``train_ratio`` is outside [0, 1].
    """
    if not 0 <= train_ratio <= 1:
        raise ValueError(f"train_ratio must be between 0 and 1, got {train_ratio}")

    # Permute indices instead of shuffling `data`, so the caller's list keeps its order.
    if seed is None:
        order = np.random.permutation(len(data))
    else:
        order = np.random.default_rng(seed).permutation(len(data))

    # Single pass: bucket rows by label, whatever labels the data actually contains.
    rows_by_label = {}
    for idx in order:
        row = data[idx]
        rows_by_label.setdefault(row[0], []).append(row)

    train_data = []
    test_data = []
    for label in sorted(rows_by_label):
        class_data = rows_by_label[label]
        num_train = int(train_ratio * len(class_data))
        train_data.extend(class_data[:num_train])
        test_data.extend(class_data[num_train:])
    return train_data, test_data

# Split using the default train_ratio of 0.8.
train_data, test_data = split_data(data)

In [3]:
def calculate_statistics(data):
    """Compute the per-class feature mean and variance.

    The first element of each row is its class label and the remaining
    elements are its features. Only labels that actually occur in ``data``
    get an entry, so a class with no rows never produces NaN statistics.

    Parameters
    ----------
    data : iterable of rows
        Each row supports ``row[0]`` (label) and ``row[1:]`` (features).

    Returns
    -------
    dict
        ``{label: (mean, variance)}`` with keys in ascending label order.
        ``mean`` and ``variance`` are arrays with one entry per feature;
        the variance is the population variance (``np.var`` default).
    """
    # Single pass: collect feature vectors per label.
    features_by_label = {}
    for row in data:
        features_by_label.setdefault(row[0], []).append(row[1:])

    stats = {}
    for label in sorted(features_by_label):
        class_data = np.array(features_by_label[label])
        class_mean = np.mean(class_data, axis=0)
        class_variance = np.var(class_data, axis=0)
        stats[label] = (class_mean, class_variance)
    return stats

# Fit per-class statistics on the training split; the bare name below displays them as the cell output.
class_stats = calculate_statistics(train_data)
class_stats

{1: (array([ 603.50877193, 1453.50877193, 2354.21052632]),
  array([ 6128.03939674, 35163.12711604, 26620.86795937])),
 2: (array([ 697.88732394, 1239.43661972, 2338.73239437]),
  array([ 3551.87462805, 11543.34457449, 27196.98472525])),
 3: (array([ 344.52554745, 2198.54014599, 2806.78832117]),
  array([ 3163.46102616, 31567.21189195, 45677.27635996])),
 4: (array([ 359.58333333,  979.16666667, 2485.83333333]),
  array([ 2095.65972222, 14190.97222222, 34065.97222222])),
 5: (array([ 502.72727273, 1868.66666667, 2621.45454545]),
  array([ 6462.25895317, 80587.31313131, 37302.12672176])),
 6: (array([ 478.81944444, 1049.72222222, 2501.18055556]),
  array([ 3249.30073302, 12497.14506173, 47936.80073302]))}

In [4]:
def gaussian_probability(x, mean, variance):
    """Evaluate the univariate normal density N(mean, variance) at ``x``.

    All arithmetic is element-wise, so passing arrays for ``mean`` and
    ``variance`` yields one density value per feature.
    """
    squared_deviation = (x - mean) ** 2
    # Height of the density at its mode: 1 / sqrt(2 * pi * variance).
    peak_height = 1 / np.sqrt(2 * np.pi * variance)
    return peak_height * np.exp(-squared_deviation / (2 * variance))

def bayesian_classifier(x, class_stats):
    """Return the label whose per-feature Gaussian model best explains ``x``.

    Features are treated as independent, and no class prior is applied, so
    the decision is the class with the highest likelihood. Scores are summed
    log-densities rather than a product of densities: the product underflows
    to 0.0 for points far from every class mean, which would make all classes
    tie and the first label win regardless of distance.

    Parameters
    ----------
    x : sequence of numbers
        Feature values of the sample to classify.
    class_stats : dict
        ``{label: (mean, variance)}`` with one mean/variance entry per feature.
        Variances are assumed to be strictly positive -- TODO confirm upstream.

    Returns
    -------
    The label (a key of ``class_stats``) with the highest log-likelihood;
    ties resolve to the first such key in dictionary order.
    """
    x = np.asarray(x, dtype=float)
    log_likelihoods = {}
    for label, (class_mean, class_variance) in class_stats.items():
        # log N(x | mean, var) = -0.5 * (log(2*pi*var) + (x - mean)^2 / var), per feature.
        log_density = -0.5 * (
            np.log(2 * np.pi * class_variance)
            + (x - class_mean) ** 2 / class_variance
        )
        log_likelihoods[label] = np.sum(log_density)
    return max(log_likelihoods, key=log_likelihoods.get)

def classify_data(data, class_stats):
    """Predict a label for every row and return rows of ``[predicted_label, *features]``.

    The first element of each input row is ignored for prediction; only
    ``row[1:]`` is passed to ``bayesian_classifier``.
    """
    return [
        [bayesian_classifier(row[1:], class_stats)] + row[1:]
        for row in data
    ]

# Predict a label for every held-out row using the statistics fitted on the training split.
classified_test_data = classify_data(test_data, class_stats)

In [5]:
def calculate_accuracy(test_data, classified_data):
    """Return the fraction of rows whose predicted label matches the true label.

    Rows are compared pairwise by position using their first element.

    Parameters
    ----------
    test_data : sequence of rows
        Rows whose element 0 is the true label.
    classified_data : sequence of rows
        Rows whose element 0 is the predicted label, in the same order.

    Returns
    -------
    float
        ``correct / total``; ``0.0`` when there are no rows to score.

    Raises
    ------
    ValueError
        If the two sequences differ in length (``zip`` would otherwise
        silently ignore the unmatched rows).
    """
    total = len(test_data)
    if total != len(classified_data):
        raise ValueError(
            f"length mismatch: {total} test rows vs {len(classified_data)} classified rows"
        )
    if total == 0:
        # Nothing to score (e.g. train_ratio of 1.0); avoid dividing by zero.
        return 0.0
    correct = sum(
        1 for true, classified in zip(test_data, classified_data)
        if true[0] == classified[0]
    )
    return correct / total

# Score the predictions against the true labels of the held-out rows.
accuracy = calculate_accuracy(test_data, classified_test_data)
print("Accuracy:", accuracy)

Accuracy: 0.8361581920903954


In [6]:
import pandas as pd
def save_to_excel(test_data, classified_data, accuracy, output_file):
    """Write the actual and predicted rows plus the accuracy to an Excel workbook.

    Two sheets are produced:
    - "Classified vs Actual Data": the actual rows and the classified rows
      placed side by side (columns x1..x3 therefore appear twice).
    - "Accuracy Results": a single "Accuracy" cell.

    Rows are expected to have four values each (label, x1, x2, x3).
    """
    feature_columns = ["x1", "x2", "x3"]
    actual_frame = pd.DataFrame(test_data, columns=["Actual y"] + feature_columns)
    predicted_frame = pd.DataFrame(classified_data, columns=["Classified y"] + feature_columns)
    summary_frame = pd.DataFrame({"Accuracy": [accuracy]})

    with pd.ExcelWriter(output_file) as writer:
        side_by_side = pd.concat([actual_frame, predicted_frame], axis=1)
        side_by_side.to_excel(writer, sheet_name="Classified vs Actual Data", index=False)
        summary_frame.to_excel(writer, sheet_name="Accuracy Results", index=False)

# Machine-specific absolute output path; adjust it before re-running elsewhere.
save_path = r"D:\1st_Year_QMS\2nd_Sem\PR\classification_results_bayesian.xlsx"
save_to_excel(test_data, classified_test_data, accuracy, save_path)

In [14]:
# Repeat the split / fit / classify / score cycle several times for each
# training fraction and report the mean accuracy over those runs.
# Note: this rebinds train_data, test_data, class_stats, classified_test_data
# and accuracy, so their values from the earlier cells are overwritten.
split_list = [0.8, 0.7, 0.6]
runs = 10  # number of random splits averaged per training fraction
for ratio in split_list:
    tot_acc = 0
    for run in range(runs):
        train_data, test_data = split_data(data, train_ratio=ratio)
        class_stats = calculate_statistics(train_data)
        classified_test_data = classify_data(test_data, class_stats)
        accuracy = calculate_accuracy(test_data, classified_test_data)
        tot_acc += accuracy  # running sum; divided by `runs` below
    avg_acc = tot_acc / runs
    print(f"At {100*ratio}% training set, accuracy = {round(avg_acc, 4)}")

At 80.0% training set, accuracy = 0.7644
At 70.0% training set, accuracy = 0.7992
At 60.0% training set, accuracy = 0.786
