# Minimum Distance Classification of Telugu Vowel Dataset
### It has 6 classes and 3 features

In [1]:
def read_data(file_path):
    """Read a whitespace-separated text file of integers into a list of rows.

    Each non-blank line becomes one row: a list of the integers found on that
    line, in order.

    Args:
        file_path: Path of the text file to read.

    Returns:
        A list of lists of ints, one inner list per non-blank line.

    Raises:
        ValueError: If a field on any line cannot be parsed as an int.
        OSError: If the file cannot be opened.
    """
    data = []
    with open(file_path, 'r') as file:
        for line in file:
            fields = line.split()
            # Blank or whitespace-only lines (e.g. a trailing newline at the end
            # of the file) would otherwise produce an empty row, and code that
            # indexes row[0] for the label would then fail.
            if not fields:
                continue
            data.append([int(field) for field in fields])
    return data

# Load the vowel dataset as a list of integer rows.
# NOTE(review): this is an absolute Windows path tied to one machine; the
# notebook cannot run elsewhere until it points at a local copy of the file.
data = read_data(r"D:\1st_Year_QMS\2nd_Sem\PR\Vowel_Data_Only.txt")

In [2]:
import numpy as np

def split_data(data, train_ratio=0.8, labels=range(1, 7), seed=None):
    """Randomly split rows into train and test sets, stratified by class label.

    The rows are shuffled, then for every label the first
    ``int(train_ratio * n_class)`` shuffled rows of that class go to the
    training set and the remaining rows go to the test set. The input ``data``
    is left untouched: the shuffle is applied to a shallow copy.

    Args:
        data: Sequence of rows whose first element is the class label.
        train_ratio: Fraction of each class assigned to the training set;
            must lie in [0, 1].
        labels: Iterable of class labels to include. Rows whose label is not
            listed are dropped. Defaults to the labels 1 through 6.
        seed: Optional seed for a reproducible split. When ``None`` the global
            ``np.random`` state is used.

    Returns:
        A ``(train_data, test_data)`` tuple of lists of rows, grouped by label
        in the order given by ``labels``.

    Raises:
        ValueError: If ``train_ratio`` is outside [0, 1].
    """
    if not 0 <= train_ratio <= 1:
        raise ValueError(f"train_ratio must be between 0 and 1, got {train_ratio}")

    # Shuffle a shallow copy so the caller's dataset keeps its original order.
    shuffled = list(data)
    if seed is None:
        np.random.shuffle(shuffled)
    else:
        np.random.default_rng(seed).shuffle(shuffled)

    train_data = []
    test_data = []
    for label in labels:
        class_data = [row for row in shuffled if row[0] == label]
        num_train = int(train_ratio * len(class_data))
        train_data.extend(class_data[:num_train])
        test_data.extend(class_data[num_train:])
    return train_data, test_data

# Per-class split using split_data's default train_ratio of 0.8.
# NOTE(review): no random seed is set in the visible cells, so this split (and
# every number derived from it) changes from one run to the next.
train_data, test_data = split_data(data)

In [3]:
def calculate_centroids(data, labels=range(1, 7)):
    """Compute the mean feature vector (centroid) of every class.

    Args:
        data: Sequence of rows whose first element is the class label and
            whose remaining elements are the numeric features.
        labels: Iterable of class labels to compute centroids for. Defaults to
            the labels 1 through 6.

    Returns:
        A dict mapping each label that has at least one row in ``data`` to a
        1-D numpy array holding the per-feature mean of that class. Labels
        with no rows are omitted.
    """
    centroids = {}
    for label in labels:
        class_rows = [row[1:] for row in data if row[0] == label]
        if not class_rows:
            # The mean of an empty array is nan (with a RuntimeWarning); a nan
            # centroid can never win a distance comparison, so leave it out.
            continue
        centroids[label] = np.mean(np.array(class_rows), axis=0)
    return centroids

# Compute one centroid per class from the training rows only, then show the
# resulting {label: centroid} dict as the cell output.
class_centroids = calculate_centroids(train_data)
class_centroids

{1: array([ 607.01754386, 1457.89473684, 2375.61403509]),
 2: array([ 704.92957746, 1241.54929577, 2340.84507042]),
 3: array([ 340.58394161, 2200.72992701, 2805.40145985]),
 4: array([ 357.33333333,  968.33333333, 2498.66666667]),
 5: array([ 504.84848485, 1843.21212121, 2612.06060606]),
 6: array([ 485.76388889, 1060.48611111, 2503.26388889])}

In [4]:
def euclidean_distance(x1, x2):
    """Return the Euclidean (L2) distance between two points.

    Args:
        x1: First point, as a sequence or array of numbers.
        x2: Second point, with the same length as ``x1``.

    Returns:
        The distance as a numpy float.
    """
    # Coerce to float arrays so that two plain lists can be subtracted and so
    # that unsigned-integer inputs cannot wrap around on subtraction.
    a = np.asarray(x1, dtype=float)
    b = np.asarray(x2, dtype=float)
    return np.sqrt(np.sum((a - b) ** 2))

def minimum_distance_classifier(x, centroids):
    """Return the label of the centroid nearest to ``x``.

    Args:
        x: Feature vector of the sample to classify.
        centroids: Mapping of class label to centroid vector.

    Returns:
        The label whose centroid has the smallest Euclidean distance to ``x``.
        On a tie the label encountered first wins. ``None`` is returned when
        ``centroids`` is empty.
    """
    best_label, best_distance = None, float('inf')
    for candidate, center in centroids.items():
        candidate_distance = euclidean_distance(x, center)
        # Only a strictly smaller distance replaces the current best.
        if not candidate_distance < best_distance:
            continue
        best_label, best_distance = candidate, candidate_distance
    return best_label

def classify_data(data, centroids):
    """Assign a predicted label to every row using the nearest centroid.

    Args:
        data: Sequence of rows whose first element is the true label and whose
            remaining elements are the features. The true label is ignored.
        centroids: Mapping of class label to centroid vector.

    Returns:
        A list with one row per input row, of the form
        ``[predicted_label, feature_1, feature_2, ...]``.
    """
    classified_data = []
    for row in data:
        features = row[1:]
        label = minimum_distance_classifier(features, centroids)
        # Unpack rather than use ``[label] + features``: with numpy rows the
        # ``+`` would add the label to every feature instead of prepending it.
        classified_data.append([label, *features])
    return classified_data

# Predict a label for every held-out row; each result row holds the predicted
# label followed by that row's features.
classified_test_data = classify_data(test_data, class_centroids)

In [5]:
def calculate_accuracy(test_data, classified_data):
    """Return the fraction of rows whose predicted label matches the true one.

    Rows are compared pairwise, in order, on their first element (the label).

    Args:
        test_data: Rows whose first element is the true label.
        classified_data: Rows whose first element is the predicted label,
            in the same order as ``test_data``.

    Returns:
        The number of matching labels divided by ``len(test_data)``, or 0.0
        when ``test_data`` is empty.
    """
    total = len(test_data)
    if total == 0:
        # Nothing to score; avoid dividing by zero.
        return 0.0
    correct = sum(
        1
        for true, classified in zip(test_data, classified_data)
        if true[0] == classified[0]
    )
    return correct / total

# Share of test rows whose predicted label equals the true label, for the
# single split made above.
accuracy = calculate_accuracy(test_data, classified_test_data)
print("Accuracy:", accuracy)

Accuracy: 0.6779661016949152


In [6]:
import pandas as pd

def save_to_excel(test_data, classified_data, accuracy, output_file):
    """Write the actual/predicted rows and the accuracy to an Excel workbook.

    The workbook gets two sheets: "Classified vs Actual Data", with the actual
    rows and the classified rows placed side by side, and "Accuracy Results",
    with a single "Accuracy" cell.

    Args:
        test_data: Rows of the form [actual label, x1, x2, x3].
        classified_data: Rows of the form [classified label, x1, x2, x3].
        accuracy: Accuracy value to record.
        output_file: Path of the workbook to write.
    """
    feature_names = ["x1", "x2", "x3"]
    actual = pd.DataFrame(test_data, columns=["Actual y"] + feature_names)
    predicted = pd.DataFrame(classified_data, columns=["Classified y"] + feature_names)
    summary = pd.DataFrame({"Accuracy": [accuracy]})

    with pd.ExcelWriter(output_file) as writer:
        # Column-wise concat keeps each actual row next to its prediction.
        side_by_side = pd.concat([actual, predicted], axis=1)
        side_by_side.to_excel(writer, sheet_name="Classified vs Actual Data", index=False)
        summary.to_excel(writer, sheet_name="Accuracy Results", index=False)

# Save the single-split results to an Excel workbook.
# NOTE(review): absolute Windows path tied to one machine; adjust before
# running elsewhere.
save_path = r"D:\1st_Year_QMS\2nd_Sem\PR\classification_results_min_distance.xlsx"
save_to_excel(test_data, classified_test_data, accuracy, save_path)

In [7]:
split_list = [0.8, 0.7, 0.6]
runs = 10  # Number of random splits averaged for each training ratio.


def average_accuracy(data, train_ratio, runs):
    """Return the mean test accuracy over ``runs`` random splits of ``data``.

    Each run re-splits the data, recomputes the centroids from the training
    part and scores the classifier on the test part. Everything is kept in
    local variables, so the train/test split, centroids and accuracy produced
    by the earlier cells are not overwritten.

    Args:
        data: Full dataset; rows start with the class label.
        train_ratio: Fraction of each class used for training.
        runs: Number of independent random splits to average over.

    Returns:
        The average of the per-run accuracies.
    """
    tot_acc = 0
    for _ in range(runs):
        train_rows, test_rows = split_data(data, train_ratio=train_ratio)
        centroids = calculate_centroids(train_rows)
        predicted_rows = classify_data(test_rows, centroids)
        # Adding accuracies found in each run to get the average.
        tot_acc += calculate_accuracy(test_rows, predicted_rows)
    return tot_acc / runs


for train_ratio in split_list:
    avg_acc = average_accuracy(data, train_ratio, runs)
    print(f"At {100*train_ratio}% training set, accuracy = {round(avg_acc, 4)}")

At 80.0% training set, accuracy = 0.7147
At 70.0% training set, accuracy = 0.7087
At 60.0% training set, accuracy = 0.692
