# k Nearest Neighbors Classification of Telugu Vowel Dataset

In [1]:
def read_data(file_path):
    data = []
    with open(file_path, 'r') as file:
        for line in file:
            row = list(map(int, line.strip().split()))
            data.append(row)
    return data

data = read_data(r"D:\1st_Year_QMS\2nd_Sem\PR\Vowel_Data_Only.txt")

In [2]:
import numpy as np

def split_data(data, train_ratio=0.8):
    np.random.shuffle(data)
    train_data = []
    test_data = []
    for label in range(1, 7):  # Assuming class labels start from 1
        class_data = [row for row in data if row[0] == label]
        num_train = int(train_ratio * len(class_data))
        train_data.extend(class_data[:num_train])
        test_data.extend(class_data[num_train:])
    return train_data, test_data

train_data, test_data = split_data(data)

In [3]:
import numpy as np

# Function to calculate Euclidean distance between two points
def euclidean_distance(x1, x2):
    return np.sqrt(np.sum((np.array(x1) - np.array(x2)) ** 2))

# Function to classify a single data point using KNN
def knn_classify(x, train_data, k):
    distances = [(row, euclidean_distance(x, row[1:])) for row in train_data]
    sorted_distances = sorted(distances, key=lambda x: x[1])
    k_nearest_neighbors = sorted_distances[:k]
    labels = [neighbor[0][0] for neighbor in k_nearest_neighbors]
    return max(set(labels), key=labels.count)

# Function to classify the entire test dataset using KNN
def classify_data(test_data, train_data, k):
    classified_data = []
    for row in test_data:
        x = row[1:]
        label = knn_classify(x, train_data, k)
        classified_data.append([label] + x)
    return classified_data

# Calculate accuracy
def calculate_accuracy(test_data, classified_data):
    correct = sum(1 for true, classified in zip(test_data, classified_data) if true[0] == classified[0])
    total = len(test_data)
    accuracy = correct / total
    return accuracy

# Calculate distances and classify test data
list_k = [2, 5]
for k in list_k:
    classified_test_data = classify_data(test_data, train_data, k)
    accuracy = calculate_accuracy(test_data, classified_test_data)
    print(f"At k = {k}, Accuracy = {accuracy}")

At k = 2, Accuracy = 0.7627118644067796
At k = 5, Accuracy = 0.8870056497175142


In [4]:
# Save results to Excel
import pandas as pd

def save_to_excel(test_data, classified_data, accuracy, output_file):
    test_df = pd.DataFrame(test_data, columns=["Actual y", "x1", "x2", "x3"])
    classified_df = pd.DataFrame(classified_data, columns=["Classified y", "x1", "x2", "x3"])
    accuracy_df = pd.DataFrame({"Accuracy": [accuracy]})
    
    with pd.ExcelWriter(output_file) as writer:
        combined_df = pd.concat([test_df, classified_df], axis=1)
        combined_df.to_excel(writer, sheet_name="Classified vs Actual Data", index=False)
        accuracy_df.to_excel(writer, sheet_name="Accuracy Results", index=False)

save_path = r"D:\1st_Year_QMS\2nd_Sem\PR\classification_results_knn.xlsx"
save_to_excel(test_data, classified_test_data, accuracy, save_path)

In [6]:
split_list = [0.8, 0.7, 0.6]
k_list = [2, 3, 4, 5, 6, 7, 8, 9, 10]

for i in split_list:
    for k in k_list:
        tot_acc=0
        runs = 10
        for j in range(10):
            train_data, test_data = split_data(data, train_ratio=i)
            classified_test_data = classify_data(test_data, train_data, k)
            accuracy = calculate_accuracy(test_data, classified_test_data)
            tot_acc+=accuracy  # Adding accuracies found in each run to get the average.
        avg_acc = tot_acc/runs
        print(f"At {100*i}% training set and k = {k}, accuracy = {round(avg_acc, 4)}")

At 80.0% training set and k = 2, accuracy = 0.7729
At 80.0% training set and k = 3, accuracy = 0.8554
At 80.0% training set and k = 4, accuracy = 0.8588
At 80.0% training set and k = 5, accuracy = 0.8599
At 80.0% training set and k = 6, accuracy = 0.8655
At 80.0% training set and k = 7, accuracy = 0.8469
At 80.0% training set and k = 8, accuracy = 0.8531
At 80.0% training set and k = 9, accuracy = 0.8446
At 80.0% training set and k = 10, accuracy = 0.835
At 70.0% training set and k = 2, accuracy = 0.7898
At 70.0% training set and k = 3, accuracy = 0.8487
At 70.0% training set and k = 4, accuracy = 0.8472
At 70.0% training set and k = 5, accuracy = 0.8483
At 70.0% training set and k = 6, accuracy = 0.8472
At 70.0% training set and k = 7, accuracy = 0.8438
At 70.0% training set and k = 8, accuracy = 0.8392
At 70.0% training set and k = 9, accuracy = 0.8498
At 70.0% training set and k = 10, accuracy = 0.8498
At 60.0% training set and k = 2, accuracy = 0.7934
At 60.0% training set and k = 