In [1]:
#Import scipy.spatial library to get distance methods
from scipy.spatial import distance

#Method to get the euclidean distance between two points
def euclidean(first_point, second_point):
    return distance.euclidean(first_point, second_point)

def mahalanobis(point1, point2, trainingSet):
    VI = np.linalg.inv(covMatrix)
    return distance.mahalanobis(point1, point2, VI)

#Method to get the cosine distance between two points
def cosine(first_point, second_point):
    return distance.cosine(first_point, second_point)

In [2]:
#Import numpy as np for mathematical manipulation, used throughout program
import numpy as np

#Generate class A data
N = 500
mean = [1,1]
cov = [[0.3, 0.2],[0.2, 0.2]]
A_data = np.random.multivariate_normal(mean, cov, N)

Class_a = []
for i in range(len(A_data)):
    Class_a.append(np.append(A_data[i],[0]))
Class_a = np.asarray(Class_a)

total = [0,0]
for i in A_data:
    total = total + i
average = total / len(A_data)
print("distribution average - A data:", average)

distribution average - A data: [1.03719019 1.03763461]


In [3]:
#Generate class B data
N = 500
mean = [3,4]
cov = [[0.3, 0],[0, 0.2]]
B_data = np.random.multivariate_normal(mean, cov, N)

Class_b = []
for i in range(len(B_data)):
    Class_b.append(np.append(B_data[i],[1]))
Class_b = np.asarray(Class_b)

total = [0,0]
for i in B_data:
    total = total + i
average = total / len(B_data)
print("distribution average - B data:", average)

distribution average - B data: [2.98434661 4.01680581]


In [4]:
#Generate unknown data
N = 300
mean = [2,3]
cov = [[0.3, 0],[0, 0.2]]
unknown_data = np.random.multivariate_normal(mean, cov, N)

total = [0,0]
for i in unknown_data:
    total = total + i
average = total / len(unknown_data)
print("distribution average - unknown data:", average)

distribution average - unknown data: [2.06104643 2.97067293]


In [5]:
from copy import deepcopy
import matplotlib.pyplot as plt

#Set random seed so it is testable
np.random.seed(1) 

#Count the neighbors
def count_neighbors(data):
    neighbor_count = [0,0]
    for n in data:
        if n[2] == 0:
            neighbor_count[0] = neighbor_count[0] +1
        elif n[2] == 1:
            neighbor_count[1] = neighbor_count[1] +1
    if neighbor_count[0] > neighbor_count[1]:
        return 0
    else:
        return 1
    
#Get the knn
def k_nearest_neighbor(k, training_data, data_point):
    neighbors = deepcopy(training_data[:k])
    for i in range(k, len(training_data)):
        for j in range(len(neighbors)):
            first_distance = distance.euclidean(training_data[i][:2], data_point)
            second_distance = distance.euclidean(neighbors[j][:2], data_point)
            if first_distance < second_distance:
                neighbors[j] = deepcopy(training_data[i])
                break
    return neighbors
    
#Evaluate the unknown points
def evaluate_data(k, training_data, unknown_data):
    classified_data = []
    for i in range(len(unknown_data)):
        data_point = unknown_data[i]
        knn_value = k_nearest_neighbor(k, training_data, data_point)
        classified_datapoints = count_neighbors(knn_value)
        classified_data.append(np.append(data_point, classified_datapoints))
    return np.asarray(classified_data)

training_data = np.append(Class_a, Class_b, axis=0)

print("Evaluate using k=1...")
unknown_k1 = evaluate_data(1, training_data, unknown_data)

print("Evaluate using k=30...")
unknown_k30 = evaluate_data(30, training_data, unknown_data)

def plot_knn(class1,class2, classifiedData, k):
    predicted1_samples = []
    predicted2_samples = []
    for data_point in classifiedData:
        if data_point[2] == 0:
            predicted1_samples.append(data_point)
        else:
            predicted2_samples.append(data_point)
    predicted1_samples = np.asarray(predicted1_samples)
    predicted2_samples = np.asarray(predicted2_samples)
    print("Predicted 1 samples length: ", len(predicted1_samples))
    print("Predicted 2 samples length: ", len(predicted2_samples))
    fig_output = "k = "+str(k)
    fig_title = fig_output + " Euclidean Distance Classification"
    fig = plt.figure()
    plt.plot(class1[:, 0], class1[:, 1], 'b.', label='given class a')
    plt.plot(class2[:, 0], class2[:, 1], 'r.', label='given class b')
    plt.plot(predicted1_samples[:, 0], predicted1_samples[:, 1], 'g*', label='predicted class a')
    plt.plot(predicted2_samples[:, 0], predicted2_samples[:, 1], '*', color='orange', label='predicated class b')
    plt.xlabel('X-axis')
    plt.ylabel('Y-Axis')
    plt.title(fig_title)
    plt.tight_layout()
    plt.grid(True, lw=0.5)
    plt.legend()
    fig.savefig(fig_output)

plot_knn(Class_a, Class_b, unknown_k1, 1) #Number of folds
plot_knn(Class_a, Class_b, unknown_k30, 30) #Number of folds

Evaluate using k=1...
Evaluate using k=30...
Predicted 1 samples length:  65
Predicted 2 samples length:  235
Predicted 1 samples length:  24
Predicted 2 samples length:  276


In [6]:
#IRIS CLASSIFIER

In [7]:
import math
import csv

iris_location = 4

#Read file and put into a numpy array
def read_file(): 
    data = []
    with open("iris.data", "r") as file:
        reader = csv.reader(file)
        for row in reader:
            classification = row[iris_location]
            if(classification == Setosa.string_value):
                row[iris_location] = 1
            elif(classification == Versicolor.string_value):
                row[iris_location] = 2
            elif(classification == Virginica.string_value):
                row[iris_location] = 3
            for i in range(len(row)):
                row[i] = float(row[i])
            data.append(row)
    return np.asarray(data)

#Shuffle and randomize the data
def shuffle_data(data):
    np.random.seed(1)
    np.random.shuffle(data)
    return data

#Split testing and training data, ratio of 80/20
def split_data(data):
    number_datapoints = math.floor(len(data) * 0.20)
    split_data_values = np.split(data, [number_datapoints])
    return split_data_values

#Perform a k-split on the data
def k_split(data, k):
    k_split_values = np.array_split(data, k)
    return k_split_values

In [8]:
#Create an iris class to properly track the metrics
class iris_class:
    def __init__(self):
        self.true_positives = 0
        self.true_negatives = 0
        self.false_positives = 0
        self.false_negatives = 0
    def accuracy(self):
        total = self.true_positives + self.true_negatives + self.false_positives + self.false_negatives
        accuracy = (self.true_positives + self.true_negatives) / (total)
        return accuracy
    def recall(self):
        return self.true_positives / (self.true_positives + self.false_negatives)
    def precision(self):
        return self.true_positives / (self.true_positives + self.false_positives)
    def f1score(self):
        p = self.true_positives / (self.true_positives + self.false_positives)
        r = self.true_positives / (self.true_positives + self.false_negatives)
        f1 = 2 * ((p * r) / (p + r))
        return f1
    def performance(self):
        print("Iris type: ", self.string_value)
        print("Accuracy: ",  self.accuracy())
        print("Recall: ",    self.recall())
        print("Precision: ", self.precision())
        print("F1 Score: ",   self.f1score())

#Create class for Iris-setosa
class Setosa(iris_class):
    integer_value = 1
    string_value = "Iris-setosa"
    
#Create class for Iris-versicolor
class Versicolor(iris_class):
    integer_value = 2
    string_value = "Iris-versicolor"

#Create class for Iris-virginica
class Virginica(iris_class):
    integer_value = 3
    string_value = "Iris-virginica"
    
#Create class to store the data
class data:
    full_dataset = shuffle_data(read_file())
    testing_data, training_data = split_data(full_dataset)
    data_with_k_split = k_split(training_data, 5) #Number of folds

In [9]:
#Perform cross validation on k
def crossValidation(k, distance_type):
    folds = data.data_with_k_split
    for i in range(len(folds)):
        testing_data_folds = deepcopy(folds[i])
        folds_removed  = np.delete(deepcopy(folds), i, 0)
        training_data = np.concatenate(folds_removed)
        print("--------------------------------------------")
        print("------------- Initiate Fold ----------------")
        print("--------------------------------------------")
        evaluate(k, training_data, testing_data_folds, distance_type)
        print("--------------------------------------------")
        print("-------------- End Fold", i, "--------------")#Count the neighbors for knn
        print("--------------------------------------------")

#Count neighbors for knn
def count_neighbors_knn(knn):
    neighbor_count = [0, 0, 0]
    for n in knn:
        if n[iris_location] == Versicolor.integer_value:
            neighbor_count[1] = neighbor_count[1] + 1
        if n[iris_location] == Virginica.integer_value:
            neighbor_count[2] = neighbor_count[2] + 1
        if n[iris_location] == Setosa.integer_value:
            neighbor_count[0] = neighbor_count[0] + 1
    if neighbor_count[1] >= neighbor_count[0] and neighbor_count[1] >= neighbor_count[2]:
        return Versicolor.integer_value
    elif neighbor_count[0] >= neighbor_count[1] and neighbor_count[0] >= neighbor_count[2]:
        return Setosa.integer_value
    else:
        return Virginica.integer_value

#Perform knn evaluation
def k_nearest_neighbor(k, training_data, data_point, distance_type):
    neighbors = deepcopy(training_data[:k])
    for i in range(k, len(training_data)):
        for j in range(len(neighbors)):
            if "mahalanobis" in str(distance_type): #If the distance is mahalanobis
                covMatrix = np.cov(training_data[:4], bias=True)
                first_distance = distance_type(training_data[i][:iris_location], data_point[:iris_location], covMatrix)
                second_distance = distance_type(neighbors[j][:iris_location], data_point[:iris_location], covMatrix)
                if first_distance < second_distance:
                    neighbors[j] = deepcopy(training_data[i])
                    break
            else: #If the distance is euclidean or cosine
                first_distance = distance_type(training_data[i][:iris_location], data_point[:iris_location])
                second_distance = distance_type(neighbors[j][:iris_location], data_point[:iris_location])
                if first_distance < second_distance:
                    neighbors[j] = deepcopy(training_data[i])
                    break
    return neighbors

#Get the performance metrics 
def evaluate(k,training_data, testSet, distance_type):
    setosa = Setosa()
    versicolor = Versicolor()
    virginica = Virginica()
    for data_point in testSet:
        knn_value = k_nearest_neighbor(k, training_data, data_point, distance_type)
        class_value = count_neighbors_knn(knn_value)
        if class_value == data_point[iris_location]: #True values
            if class_value == versicolor.integer_value:
                setosa.true_negatives += 1
                versicolor.true_positives += 1
                virginica.true_negatives += 1
            if class_value == setosa.integer_value: 
                setosa.true_positives += 1
                versicolor.true_negatives += 1
                virginica.true_negatives+= 1
            if class_value == virginica.integer_value:
                setosa.true_negatives += 1
                versicolor.true_negatives += 1
                virginica.true_positives += 1
        else: #False values
            if class_value == virginica.integer_value:
                virginica.false_positives += 1
                if data_point[iris_location] == setosa.integer_value:
                    setosa.false_negatives += 1
                    versicolor.true_negatives += 1
                else:
                    setosa.true_negatives += 1
                    versicolor.false_negatives += 1   
            if class_value == setosa.integer_value:
                setosa.false_positives += 1
                if data_point[iris_location] == versicolor.integer_value:
                    versicolor.false_negatives += 1
                    virginica.true_negatives += 1
                else:
                    versicolor.true_negatives += 1
                    virginica.false_negatives += 1     
            if class_value == versicolor.integer_value:
                versicolor.false_positives += 1
                if data_point[iris_location] == setosa.integer_value:
                    setosa.false_negatives += 1
                    virginica.true_negatives += 1
                else:
                    setosa.true_negatives += 1
                    virginica.false_negatives += 1
    #Print performance for each iris
    setosa.performance()
    versicolor.performance()
    virginica.performance()
    #Print the Macro Averaged Values
    print("Macro-averaged Accuracy: ",  (setosa.accuracy() + versicolor.accuracy() + virginica.accuracy())/3)
    print("Macro-averaged Precision: ", (setosa.precision() + versicolor.precision() + virginica.precision())/3)
    print("Macro-averaged Recall: ",    (setosa.recall() + versicolor.recall() + virginica.recall())/3)
    print("Macro-averaged F1score: ",   (setosa.f1score() + versicolor.f1score() + virginica.f1score())/3)
    return (setosa, versicolor, virginica)

In [12]:
training_data = data.training_data
testing_data = data.testing_data

k = 7 #Change k as desired
distance_type = distance.cosine #Change to desired distance function (distance.euclidean, distance.mahalanobis, or distance.cosine)

print("K =", k, "distance_type =", distance_type.__name__)
crossValidation(k, distance_type) #Perform cross validation
print("\n\n\n")
print("--------------------------------------------")
print("---------------- Results -------------------")
print("--------------------------------------------")
print("\n")
evaluate(k, training_data, testing_data, distance_type) #Evaluate data using knn

K = 7 distance_type = cosine
--------------------------------------------
------------- Initiate Fold ----------------
--------------------------------------------
Iris type:  Iris-setosa
Accuracy:  1.0
Recall:  1.0
Precision:  1.0
F1 Score:  1.0
Iris type:  Iris-versicolor
Accuracy:  1.0
Recall:  1.0
Precision:  1.0
F1 Score:  1.0
Iris type:  Iris-virginica
Accuracy:  1.0
Recall:  1.0
Precision:  1.0
F1 Score:  1.0
Macro-averaged Accuracy:  1.0
Macro-averaged Precision:  1.0
Macro-averaged Recall:  1.0
Macro-averaged F1score:  1.0
--------------------------------------------
-------------- End Fold 0 --------------
--------------------------------------------
--------------------------------------------
------------- Initiate Fold ----------------
--------------------------------------------
Iris type:  Iris-setosa
Accuracy:  1.0
Recall:  1.0
Precision:  1.0
F1 Score:  1.0
Iris type:  Iris-versicolor
Accuracy:  1.0
Recall:  1.0
Precision:  1.0
F1 Score:  1.0
Iris type:  Iris-virginica

(<__main__.Setosa at 0x7f1dd0939630>,
 <__main__.Versicolor at 0x7f1dd0939550>,
 <__main__.Virginica at 0x7f1dd091ef98>)