In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import kNN_modules
import random

plt.style.use('dark_background')
# Seed Python's RNG once, up front: the train/test splits further down are drawn
# with random.sample, so without a fixed seed every Restart & Run All would give
# different splits and different accuracy figures.
random.seed(42)
# Walkthrough Video Link: https://www.youtube.com/watch?v=mpfU9n4MzBE

In [None]:
# Load the Iris measurements, then keep one sub-frame per species for plotting.
df = pd.read_csv('Iris.csv')
df_setosa, df_virginica, df_versicolor = (
    df[df['Species'].eq(species_name)]
    for species_name in ("Iris-setosa", "Iris-virginica", "Iris-versicolor")
)
df.head()

In [None]:
# 3D scatter of the three species: sepal length vs. petal length vs. petal width.
fig = plt.figure(figsize = (10,8))
ax = fig.add_subplot(111, projection = '3d')
ax.set_title("Iris species by sepal length, petal length and petal width")
ax.set_xlabel("Sepal Length / cm")
ax.set_ylabel("Petal Length / cm")
ax.set_zlabel("Petal Width / cm")
ax.grid(False)

# One scatter series per species; the label feeds the legend so the colours
# can be identified from the figure itself rather than from the code.
for frame, colour, label in (
    (df_setosa, 'azure', 'Setosa'),
    (df_versicolor, 'cyan', 'Versicolor'),
    (df_virginica, 'navy', 'Virginica'),
):
    ax.scatter3D(
        frame['SepalLengthCm'],
        frame['PetalLengthCm'],
        frame['PetalWidthCm'],
        s=50,
        color=colour,
        label=label,
    )

ax.legend()
plt.show()

In [None]:
"""
Need to prepare the data for our KNN algorithm.
x = Sepal Length, y = Petal Length, z = Petal Width.
We want a list of known inputs as [x, y, z] and the known outputs as numbers.
At present the outputs are Iris-setosa, Iris-versicolor and Iris-virginica.
We want to change this to 1, 2 and 3 respectively.
"""

# Numeric class label for each species name.
species_to_number = {'Iris-setosa': 1, 'Iris-versicolor': 2, 'Iris-virginica':3}

# Known inputs as [sepal length, petal length, petal width] lists.
# Pulled column-wise in one step instead of looping over df.iterrows(), which
# builds a Series per row and is the slow, non-idiomatic way to read a frame.
known_inputs = df[['SepalLengthCm', 'PetalLengthCm', 'PetalWidthCm']].values.tolist()

# Plain dict lookup (rather than Series.map) so an unexpected species name
# still raises KeyError instead of silently turning into NaN.
known_outputs = [species_to_number[species] for species in df['Species']]

print(known_inputs[0:5])
print(known_outputs[0:5])

In [None]:
# Measurements (in cm) of the unknown flower we want to classify.
sepal_length = 7.0
petal_length = 6.0
petal_width = 2.0

fig = plt.figure(figsize = (10,8))
ax = fig.add_subplot(111, projection = '3d')
ax.set_title("Iris species with the test point to classify")
ax.set_xlabel("Sepal Length / cm")
ax.set_ylabel("Petal Length / cm")
ax.set_zlabel("Petal Width / cm")
ax.grid(False)

# Test point coordinates; these names are read again when the kNN query is built.
x3, y3, z3 = sepal_length, petal_length, petal_width

# One scatter series per species; the label feeds the legend so the colours
# can be identified from the figure itself rather than from the code.
for frame, colour, label in (
    (df_setosa, 'azure', 'Setosa'),
    (df_versicolor, 'cyan', 'Versicolor'),
    (df_virginica, 'navy', 'Virginica'),
):
    ax.scatter3D(
        frame['SepalLengthCm'],
        frame['PetalLengthCm'],
        frame['PetalWidthCm'],
        s=50,
        color=colour,
        label=label,
    )

ax.scatter3D(x3, y3, z3, s=50, color='red', label='Test point')
ax.legend()
plt.show()

In [None]:
# Test kNN prediction.
# Query point built from the test coordinates (x3, y3, z3) defined with the red test point.
test = [x3, y3, z3]

# Positional arguments: query point, known inputs, known outputs, then 5 and True.
# NOTE(review): 5 is presumably k (the number of neighbours) and True an optional
# flag of kNN_modules.KNN — confirm against that module, it is not visible here.
prediction = kNN_modules.KNN(test, known_inputs, known_outputs, 5, True)
prediction

In [None]:
# Making a 3D map of points for the kNN algorithm domains:
# every point of a regular grid is classified and filed under its predicted species.
setosa_x, setosa_y, setosa_z = [], [], []
versicolor_x, versicolor_y, versicolor_z = [], [], []
virginica_x, virginica_y, virginica_z = [], [], []

# Grid axes: 21 sepal lengths from 4.0 in 0.2 steps, 31 petal lengths from 1.0
# in 0.2 steps and 26 petal widths from 0.0 in 0.1 steps.
xs = [4.0 + (0.2 * step) for step in range(21)]
ys = [1.0 + (0.2 * step) for step in range(31)]
zs = [0.0 + (0.1 * step) for step in range(26)]

# (class label, coordinate lists) pairs, tested in the order 1, 2, 3.
domains = (
    (1, (setosa_x, setosa_y, setosa_z)),
    (2, (versicolor_x, versicolor_y, versicolor_z)),
    (3, (virginica_x, virginica_y, virginica_z)),
)

for x in xs:
    for y in ys:
        for z in zs:
            prediction = kNN_modules.KNN([x, y, z], known_inputs, known_outputs, 5)
            # The first matching label wins; any other prediction is dropped.
            for label, (bucket_x, bucket_y, bucket_z) in domains:
                if prediction == label:
                    bucket_x.append(x)
                    bucket_y.append(y)
                    bucket_z.append(z)
                    break

In [None]:
# Plot the grid points by predicted species to show the kNN decision regions.
fig = plt.figure(figsize = (10,8))
ax = fig.add_subplot(111, projection = '3d')
ax.set_title("Predicted species regions from the kNN grid")
ax.set_xlabel("Sepal Length / cm")
ax.set_ylabel("Petal Length / cm")
ax.set_zlabel("Petal Width / cm")
ax.grid(False)

# Labels feed the legend so each colour can be identified from the figure itself.
ax.scatter3D(setosa_x, setosa_y, setosa_z, s=10, color='azure', label='Setosa')
# The Versicolor region is currently not drawn; uncomment the next line to include it.
#ax.scatter3D(versicolor_x, versicolor_y, versicolor_z, s=10, color='cyan', label='Versicolor')
ax.scatter3D(virginica_x, virginica_y, virginica_z, s=10, color='navy', label='Virginica')
ax.legend()
plt.show()

EXPLORING 4D DATA

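Once we use all four measurements (sepal length, sepal width, petal length and petal width), we can no longer visualise the data directly in a 3D graph, so we evaluate the classifier numerically instead.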

In [None]:
# Known inputs using all four measurements:
# [sepal length, sepal width, petal length, petal width].
# Pulled column-wise in one step instead of looping over df.iterrows(), which
# builds a Series per row and is the slow, non-idiomatic way to read a frame.
known_inputs_4D = df[
    ['SepalLengthCm', 'SepalWidthCm', 'PetalLengthCm', 'PetalWidthCm']
].values.tolist()

print(known_inputs_4D[0:5])

In [None]:
def accuracy(test_inputs, test_outputs, train_inputs, train_outputs, k_value):
    """Return the percentage of test points that kNN classifies correctly.

    Each entry of ``test_inputs`` is classified with ``kNN_modules.KNN`` against
    the training data, and the prediction is compared with the entry at the same
    position in ``test_outputs``.

    Args:
        test_inputs: Sequence of points to classify.
        test_outputs: Expected label for each point, in the same order.
        train_inputs: Points passed to kNN as the known inputs.
        train_outputs: Labels passed to kNN as the known outputs.
        k_value: Passed through to ``kNN_modules.KNN`` as its fourth argument.

    Returns:
        Accuracy as a percentage between 0 and 100.

    Raises:
        ValueError: If ``test_inputs`` is empty (accuracy is undefined).
    """
    total = len(test_inputs)
    if total == 0:
        # Previously surfaced as an unexplained ZeroDivisionError in the return.
        raise ValueError("accuracy() needs at least one test input")

    correct = 0
    for index, test in enumerate(test_inputs):
        if kNN_modules.KNN(test, train_inputs, train_outputs, k_value) == test_outputs[index]:
            correct += 1
    return (correct / total) * 100

# A function to create the test and train sets.
def test_train(full_inputs, full_outputs, percent_test):
    test_inputs, test_outputs, train_inputs, train_outputs = [], [], [], []
    
    number_test = int((len(full_inputs) / 100) * percent_test)
    random_indices = random.sample(range(0, len(full_inputs)), number_test)
    for index, item in enumerate(full_inputs):
        if index in random_indices:
            test_inputs.append(item)
            test_outputs.append(full_outputs[index])
        else:
            train_inputs.append(item)
            train_outputs.append(full_outputs[index])
    return test_inputs, test_outputs, train_inputs, train_outputs

In [None]:
# Get the test and train sets: 30% of the 4D samples are held out for testing.
test_inputs, test_outputs, train_inputs, train_outputs = test_train(
    full_inputs=known_inputs_4D,
    full_outputs=known_outputs,
    percent_test=30,
)
# Preview the first five entries of each list, one list per line.
print(
    test_inputs[0:5],
    test_outputs[0:5],
    train_inputs[0:5],
    train_outputs[0:5],
    sep="\n",
)

In [None]:
# Test accuracy of the kNN algorithm on the held-out test set, using k = 5.
acc = accuracy(
    test_inputs=test_inputs,
    test_outputs=test_outputs,
    train_inputs=train_inputs,
    train_outputs=train_outputs,
    k_value=5,
)
acc

In [None]:
# Can we optimise to find the best k-value?
# Repeat the experiment on 30 independent random splits (30% test each) and,
# for every split, measure the accuracy for k = 1..15.
records = []
for trial in range(30):
    # Kept in its own name so the split and accuracy shown in the earlier cells
    # are not overwritten by this sweep.
    split = test_train(known_inputs_4D, known_outputs, 30)
    for k in range(1, 16):
        # split unpacks to (test_inputs, test_outputs, train_inputs, train_outputs).
        records.append({"k-value": k, "accuracy": accuracy(*split, k)})

# One row per (split, k) measurement; the default 0..n-1 index replaces the
# hand-maintained counter and index list.
ka = pd.DataFrame(records, columns=["k-value", "accuracy"])

In [None]:
# Preview the first five rows of the k-value / accuracy table.
ka.head(n=5)

In [None]:
# Accuracy against k. Each k has one measurement per random split, which
# seaborn aggregates into a single line (by default the mean with an
# uncertainty band — see the seaborn lineplot docs).
accuracy_ax = sns.lineplot(data=ka, x="k-value", y="accuracy")
accuracy_ax.set(
    title="kNN accuracy vs. k",
    xlabel="k (number of neighbours)",
    ylabel="Accuracy / %",
)
plt.show()