# k-Means Clustering Testing

## Setup

In [1]:
# Data Structures
import pandas as pd
pd.options.display.max_columns = None
pd.options.display.max_rows = None

import numpy as np

# Random number generation
import random

# Visualization
import plotly.express as px

# Output Control
from IPython.display import display

## Data Loading

In [2]:
data = pd.read_csv("../data/input/iris.csv")

display(data.shape)
display(data.head())

(150, 6)

Unnamed: 0,SepalLength,SepalWidth,PetalLength,PetalWidth,Species,Target
0,5.8,2.8,5.1,2.4,Iris-virginica,2
1,6.0,2.2,4.0,1.0,Iris-versicolor,1
2,5.5,4.2,1.4,0.2,Iris-setosa,0
3,7.3,2.9,6.3,1.8,Iris-virginica,2
4,5.0,3.4,1.5,0.2,Iris-setosa,0


## Data Preparation

In [3]:
preproc = data.iloc[:, 0:2]

display(preproc.shape)
display(preproc.head())

fig = px.scatter(
    preproc,
    x="SepalLength",
    y="SepalWidth"
)
fig.show()

(150, 2)

Unnamed: 0,SepalLength,SepalWidth
0,5.8,2.8
1,6.0,2.2
2,5.5,4.2
3,7.3,2.9
4,5.0,3.4


## Development

In [55]:
k = 3

# Creating copy of input data to preserve original
df = preproc.copy()

# Shuffling data to ensure initial clusters are random
df = df.sample(frac=1).reset_index(drop=True)

# Randomly splitting data into k clusters
clusters = np.array_split(df, k)

# Calculating centroids for each cluster
centroids = []

for cluster in clusters:
    centroids.append(cluster.mean())



[SepalLength    5.688
 SepalWidth     3.032
 dtype: float64,
 SepalLength    5.908
 SepalWidth     3.066
 dtype: float64,
 SepalLength    5.934
 SepalWidth     3.064
 dtype: float64]

In [4]:
# Create samples
samples = []

for index, row in preproc.iterrows():
    samples.append(row)

# Create clusters
k = 3
initial_centroids = []

# get min and max of each feature
feature_ranges = preproc.apply(lambda x: pd.Series([x.min(), x.max()])).T.values.tolist()

# generate random centroid coordinate
random_centroid_coordinates = []

for feature_range in feature_ranges:
    random_coordinate = np.random.uniform(feature_range[0], feature_range[1])

    random_centroid_coordinates.append(random_coordinate)

random_centroid = np.array(random_centroid_coordinates)

display(feature_ranges)
display(random_centroid)

test = preproc.iloc[1, :]
display(test)

distance = np.linalg.norm(random_centroid - test)
display(distance)

for i in range(k):
    print(i)

[[4.3, 7.9], [2.0, 4.4]]

array([4.95686534, 4.3024445 ])

SepalLength    6.0
SepalWidth     2.2
Name: 1, dtype: float64

2.3469986730594674

0
1
2
