Starting with setup by importing the data and normalizing all features by z-score

In [43]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.random_projection import GaussianRandomProjection
import matplotlib.pyplot as plt
from scipy.cluster.hierarchy import dendrogram, linkage
import umap

# Seed shared by every stochastic step in the notebook
SEED = 98532

# Raw table: the first seven columns are features, the eighth is the class label
df_read = pd.read_table("seeds.tsv", header=None)

# Standardize the seven feature columns to z-scores
scalar = StandardScaler()
features_scaled = scalar.fit_transform(df_read.iloc[:, :7])
df = pd.DataFrame(features_scaled)

# Re-attach the (unscaled) label column and make every column name a string
df["label"] = df_read.iloc[:, 7]
df.columns = df.columns.astype(str)

Counting how many samples of each class (labels 1–3) are present in the dataset. This did not end up being used.

In [None]:
# Number of rows carrying each class label (1, 2, 3)
num_each_label = {
    label: len(df.loc[df["label"] == label])
    for label in range(1, 4)
}
num_each_label

Calculating and plotting the inertias for k-means where k ranges from 1 to 7

In [None]:
# Fit k-means for k = 1..7 and record the inertia of each fit.
# Only the seven standardized feature columns are clustered: the "label"
# column is excluded so the ground-truth classes cannot leak into the
# distance computation.
k_values = range(1, 8)
inertia = []
for k in k_values:
    kmeans = KMeans(n_clusters=k, random_state=SEED).fit(df.iloc[:, :7])
    inertia.append(kmeans.inertia_)

# elbow plot: markers plus a connecting line, one point per k
plt.scatter(k_values, inertia)
plt.plot(k_values, inertia)
plt.xlabel("Number of Means in k-means")
plt.ylabel("Inertia")
plt.title("\"Elbow Plot\" of Inertia by k")
plt.show()

Plotting each feature compared to every other feature in a scatter plot.

In [None]:
# Colour used for each class label in the scatter plots
color_map = {1: "red", 2: "green", 3: "blue"}

# Upper triangle of a 7x7 grid: the panel in grid row `row`, grid column `col`
# plots feature `row` on the x-axis against feature `col` on the y-axis.
for row in range(7):
    for col in range(row + 1, 7):
        plt.subplot(7, 7, row * 7 + col + 1)
        plt.scatter(
            df.iloc[:, row],
            df.iloc[:, col],
            c=df["label"].map(color_map),
            s=1,
        )
plt.show()

Plotting the feature combination that stood out the most individually

In [None]:

# Of all the pairs plotted, i = 1 against j = 7 was judged the best
x_feature = df.iloc[:, 0]
y_feature = df.iloc[:, 6]
point_colors = df["label"].map(color_map)

plt.scatter(x_feature, y_feature, c=point_colors, s=5)
plt.title("Relationship Between Select Features of Samples")
plt.xlabel("Feature 1")
plt.ylabel("Feature 7")
plt.show()

Using Gaussian Random Projection to reduce our data to be visualized in 2 dimensions

In [None]:
# Project the seven standardized features down to two dimensions with a
# seeded Gaussian random projection.
transformer = GaussianRandomProjection(n_components=2, random_state=SEED)
df_random = transformer.fit_transform(df.iloc[:, :7])

# the green is pretty clustered, but the red and blue are mixing significantly
# when not seeding, there was an occasional good one, but most were pretty mixed up
plt.scatter(df_random[:, 0], df_random[:, 1], c=df["label"].map(color_map))
# The axes are random projection directions, not principal components,
# so they are labelled as such rather than "PC1"/"PC2".
plt.xlabel("Random Projection Component 1")
plt.ylabel("Random Projection Component 2")
plt.title("Sample Features Projected in 2 Dimensions by Gaussian Random Projection")
plt.show()

Using UMAP to reduce our data to be visualized in 2 dimensions

In [None]:
# Embed the seven standardized features in two dimensions with a seeded UMAP.
reducer = umap.UMAP(random_state=SEED)
df_umap = reducer.fit_transform(df.iloc[:, :7])

plt.scatter(df_umap[:, 0], df_umap[:, 1], c=df["label"].map(color_map))
# UMAP embedding dimensions are not principal components, so the axes are
# labelled by embedding dimension rather than "PC1"/"PC2".
plt.xlabel("UMAP 1")
plt.ylabel("UMAP 2")
plt.title("Sample Features Projected in 2 Dimensions by UMAP")
plt.show()

Determining the accuracy and Rand-index of our 3-means clusters

In [None]:
# Cluster on the seven standardized features only. The "label" column is left
# out so that the ground truth being scored against is not also a model input.
kmeans = KMeans(n_clusters=3, random_state=SEED).fit(df.iloc[:, :7])
k_labels = kmeans.labels_
true_labels = df["label"].to_numpy().astype(np.int32)
num_items = df.shape[0]

# For every pair i < j, count how many of the two partitions (k-means and
# ground truth) place the pair together: 0 = neither, 1 = exactly one, 2 = both.
# Broadcasting builds the full pairwise comparison at once; np.triu keeps only
# the strict upper triangle so each unordered pair is represented once.
pair_same_cluster = k_labels[:, None] == k_labels[None, :]
pair_same_label = true_labels[:, None] == true_labels[None, :]
same_cluster = np.triu(
    pair_same_cluster.astype(float) + pair_same_label.astype(float), k=1
)

# A pair is an agreement when the two partitions concur, i.e. the count is
# 0 or 2 (anything but 1).
upper = np.triu_indices(num_items, k=1)
same = int(np.count_nonzero(same_cluster[upper] != 1))

# Contingency table: rows are k-means cluster ids (0..2), columns are the
# true labels (1..3 shifted to 0..2).
correct_cluster = np.zeros((3, 3))
np.add.at(correct_cluster, (k_labels, true_labels - 1), 1)

# Each cluster is credited with its most common true label. This assumes the
# clustering is reasonably aligned with the labels; two clusters may pick the
# same label, so this is a purity-style accuracy rather than a one-to-one match.
num_correct_cluster = correct_cluster.max(axis=1).sum()

# Rand index = agreeing pairs / total number of unordered pairs
rand_index = same / ((num_items * (num_items - 1)) / 2)
accuracy = num_correct_cluster / num_items

rand_index, accuracy

Creating a hierarchical clustering of the samples by using complete linkage with Euclidean distance, and graphing the dendrogram. Original labels are placed under the corresponding leaf. Except for one noticeable section, the clusters here are, for the most part, quite accurate.

In [None]:
# Ground-truth labels are used only to annotate the dendrogram leaves.
labels_arr = np.array(df["label"])

# Build the hierarchy from the seven standardized features only; the "label"
# column is excluded so it does not contribute to the Euclidean distances.
linkage_matrix = linkage(df.iloc[:, :7], method="complete", metric="euclidean")

dendrogram(linkage_matrix, labels=labels_arr)
plt.title("Dendrogram of Samples Clustered with Complete Linkage")
plt.xlabel("Original Label in Dataset")
plt.ylabel("Euclidean Distance")
plt.show()