In [1]:
import numpy as np
import pandas as pd

In [244]:
# Load the iris measurements; the path is relative to the notebook's working directory.
iris_csv_path = "./archive/iris.csv"
df = pd.read_csv(iris_csv_path)
df

Unnamed: 0,sepal.length,sepal.width,petal.length,petal.width,variety
0,5.1,3.5,1.4,0.2,Setosa
1,4.9,3.0,1.4,0.2,Setosa
2,4.7,3.2,1.3,0.2,Setosa
3,4.6,3.1,1.5,0.2,Setosa
4,5.0,3.6,1.4,0.2,Setosa
...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,Virginica
146,6.3,2.5,5.0,1.9,Virginica
147,6.5,3.0,5.2,2.0,Virginica
148,6.2,3.4,5.4,2.3,Virginica


In [245]:
import copy

# Keep an untouched copy of the frame (species column included) for the final
# comparison, remember the species labels, then reduce `df` to the numeric features.
# NOTE: `df` is rebound here, so re-running this cell raises KeyError because the
# "variety" column is already gone.
new_df = copy.deepcopy(df)
variety = df["variety"]
df = df.drop(columns=["variety"])
df.describe()

Unnamed: 0,sepal.length,sepal.width,petal.length,petal.width
count,150.0,150.0,150.0,150.0
mean,5.843333,3.057333,3.758,1.199333
std,0.828066,0.435866,1.765298,0.762238
min,4.3,2.0,1.0,0.1
25%,5.1,2.8,1.6,0.3
50%,5.8,3.0,4.35,1.3
75%,6.4,3.3,5.1,1.8
max,7.9,4.4,6.9,2.5


In [246]:
def euclidean_distance(X, Y):
    """Return the Euclidean distance between each row of `X - Y` and the origin.

    `X` and `Y` are subtracted with normal broadcasting rules, so one of them may
    be a single point and the other a 2-D array of points. The squared
    differences are summed along axis 1, which means the difference must have at
    least two dimensions.
    """
    delta = X - Y
    squared_norms = np.sum(np.square(delta), axis=1)
    return np.sqrt(squared_norms)

In [247]:
# Convert the feature frame to a plain ndarray for the distance computations.
# The isinstance guard makes the cell safe to re-run: once `df` is already an
# ndarray it has no `to_numpy` method and an unguarded call would raise.
if isinstance(df, pd.DataFrame):
    df = df.to_numpy()

In [248]:
# Sanity check: distance from every sample to the first sample
# (the first entry is the first sample's distance to itself).
first_sample = df[0]
d = euclidean_distance(df, first_sample)
print(d)

[0.         0.53851648 0.50990195 0.64807407 0.14142136 0.6164414
 0.51961524 0.17320508 0.92195445 0.46904158 0.37416574 0.37416574
 0.59160798 0.99498744 0.88317609 1.1045361  0.54772256 0.1
 0.74161985 0.33166248 0.43588989 0.3        0.64807407 0.46904158
 0.59160798 0.54772256 0.31622777 0.14142136 0.14142136 0.53851648
 0.53851648 0.38729833 0.6244998  0.80622577 0.45825757 0.37416574
 0.41231056 0.24494897 0.8660254  0.14142136 0.17320508 1.34907376
 0.76811457 0.45825757 0.6164414  0.59160798 0.36055513 0.58309519
 0.3        0.2236068  4.00374824 3.61662826 4.16413256 3.09354166
 3.79209705 3.41613817 3.78549865 2.34520788 3.74966665 2.88790582
 2.70370117 3.22800248 3.14642654 3.7        2.58069758 3.62767143
 3.43511281 3.00998339 3.76828874 2.88270706 3.85356977 3.0757113
 4.04722127 3.65786823 3.41613817 3.59722115 4.04722127 4.24499706
 3.53128872 2.49399278 2.81780056 2.70185122 2.89482297 4.13521463
 3.41174442 3.51994318 3.91152144 3.6180105  3.         3.02158899
 3.3

In [249]:
def cluster_data(solutions, solution_idx):
    """Assign every sample in the global `df` to its nearest centroid.

    The chromosome `solutions[solution_idx]` is read as consecutive centroids,
    each `df.shape[1]` genes long.

    Returns a 4-tuple:
        cluster_centers    -- array with one centroid per row
        all_clusters_dists -- array with one row per centroid holding the
                              distance from every sample to that centroid
        clusters           -- list with, per centroid, the `np.where` result
                              (a 1-tuple holding the member sample indices)
        clusters_sum_dist  -- array with, per centroid, the summed distance of
                              its members to it (0 for a centroid with no members)
    """
    solution = solutions[solution_idx]
    feature_vector_length = df.shape[1]
    num_cluster = len(solution) // feature_vector_length

    cluster_centers = np.array([
        solution[feature_vector_length * k:feature_vector_length * (k + 1)]
        for k in range(num_cluster)
    ])
    all_clusters_dists = np.array([
        euclidean_distance(df, center) for center in cluster_centers
    ])
    # For each sample, the index of the centroid it is closest to.
    cluster_indices = np.argmin(all_clusters_dists, axis=0)

    clusters = []
    clusters_sum_dist = []
    for k in range(num_cluster):
        members = np.where(cluster_indices == k)
        clusters.append(members)
        # `np.where(condition)` returns a tuple of index arrays, so the emptiness
        # test has to look at the array inside it; `len(members)` is always 1.
        member_indices = members[0]
        if member_indices.size == 0:
            clusters_sum_dist.append(0.0)
        else:
            clusters_sum_dist.append(np.sum(all_clusters_dists[k, member_indices]))
    clusters_sum_dist = np.array(clusters_sum_dist)
    return cluster_centers, all_clusters_dists, clusters, clusters_sum_dist

In [250]:
def fitness_func(solutions, solution_idx):
    """Score a candidate as the reciprocal of its total within-cluster distance.

    The per-cluster distance sums come from `cluster_data`; a smaller total
    distance therefore yields a larger fitness.
    """
    per_cluster_totals = cluster_data(solutions, solution_idx)[3]
    total_distance = np.sum(per_cluster_totals)
    # The small constant keeps the division defined when the total is zero.
    return 1.0 / (total_distance + 1e-8)

In [304]:
##initial population
import random

# Kept so that cells referring to the module-level name still find it;
# initial_population() no longer appends to this list.
population = []


def initial_population(size=200, num_clusters=3):
    """Create a fresh random population of centroid chromosomes.

    Each chromosome is a flat list of `num_clusters` centroids, four genes per
    centroid, every gene rounded to five decimals.

    Parameters:
        size         -- number of chromosomes to generate (default 200)
        num_clusters -- number of centroids encoded per chromosome (default 3)

    Returns a new list on every call. Earlier versions appended to a shared
    module-level list, so repeated calls kept growing the population unless the
    caller cleared it.
    """
    # (low, high) sampling interval for each of the four genes of a centroid.
    # NOTE(review): the last gene is drawn from [0, 0.5] although the data's
    # fourth feature (petal.width) reaches 2.5 -- confirm this is intended.
    gene_ranges = ((0, 8), (2, 5), (1, 7), (0, 0.5))
    members = []
    for _ in range(size):
        chromosome = []
        for _ in range(num_clusters):
            for low, high in gene_ranges:
                chromosome.append(float('%0.5f' % random.uniform(low, high)))
        members.append(chromosome)
    return members

In [305]:
##choose parents
##only solutions with a fitness score of at least 0.0025 are accepted
def selection(population):
    """Return the members of `population` whose fitness is at least 0.0025.

    The returned list holds the same chromosome objects as `population`, in
    their original order.
    """
    return [
        candidate
        for position, candidate in enumerate(population)
        if fitness_func(population, position) >= 0.0025
    ]

In [306]:
def doCrossover(parents):
    """Breed two randomly drawn parents and keep the children that score well.

    Both parents are drawn independently, so the same chromosome may be picked
    twice. The chromosomes are split after the first or second group of four
    genes and the tails are swapped. A child is appended to `parents` (in place)
    only when its fitness is at least 0.0035. Nothing is returned.
    """
    last = len(parents) - 1
    first_parent = parents[random.randint(0, last)]
    second_parent = parents[random.randint(0, last)]
    split = 4 * random.randint(1, 2)
    offspring = (
        np.concatenate((first_parent[:split], second_parent[split:12]), axis=0),
        np.concatenate((second_parent[:split], first_parent[split:12]), axis=0),
    )
    for child in offspring:
        if fitness_func([child], 0) >= 0.0035:
            parents.append(child)

In [307]:
def crossover(parents):
    """Run 100 crossover attempts on `parents`; accepted children are added in place."""
    attempts = 100
    for _ in range(attempts):
        doCrossover(parents)


In [308]:
def doMutation(parents):
    """Overwrite one random gene of one random chromosome in `parents`, in place.

    A chromosome, a centroid slot inside it and one of the four features are
    drawn at random; the chosen gene is replaced by a new value sampled from
    that feature's interval and rounded to five decimals. Nothing is returned.

    Raises ValueError (from `random.randint`) when `parents` is empty.
    """
    # (low, high) sampling interval per feature; replaces the former if/elif
    # chain, whose final `else` could never run because the feature index is
    # always 0..3.
    gene_ranges = ((0, 8), (2, 5), (1, 7), (0, 0.5))
    genes_per_cluster = len(gene_ranges)

    # The draw order (chromosome, slot, feature, value) is kept unchanged so
    # seeded runs produce the same mutations as before.
    index_parent = random.randint(0, len(parents) - 1)
    parent_to_mute = parents[index_parent]
    # Derive the number of centroid slots from the chromosome length instead of
    # hard-coding three clusters; for 12-gene chromosomes this is randint(0, 2).
    num_slots = len(parent_to_mute) // genes_per_cluster
    cut = random.randint(0, num_slots - 1)
    feature_to_mute = random.randint(0, genes_per_cluster - 1)
    low, high = gene_ranges[feature_to_mute]
    new_feature_val = float('%0.5f' % random.uniform(low, high))
    parent_to_mute[cut * genes_per_cluster + feature_to_mute] = new_feature_val


In [309]:
def mutation(parents):
    """Apply 20 single-gene mutations to `parents`, modifying chromosomes in place."""
    rounds = 20
    for _ in range(rounds):
        doMutation(parents)

In [310]:
# Each of the 150 iterations is an independent run: a new random population is
# drawn, filtered by fitness, recombined and mutated, and the fittest survivor
# is stored in `result` under its fitness score. Nothing is carried over from
# one iteration to the next.
result = {}
for run_number in range(150):
    population = initial_population()
    parents = selection(population)
    crossover(parents)
    mutation(parents)
    fits = [fitness_func(parents, j) for j in range(len(parents))]
    fit_max = max(fits)
    idx = fits.index(fit_max)
    cluster_centers, all_clusters_dists, clusters, clusters_sum_dist = cluster_data(parents, idx)
    result[fit_max] = [idx, cluster_centers, all_clusters_dists, clusters, clusters_sum_dist]
    population.clear()


In [311]:
##after all runs, the best result is:
## `result` is keyed by fitness score, so the best run is simply the largest key.
## Comparing the stored records (`result[key] > result[max_key]`) instead orders
## them by their first element, the parent index, and not by fitness.
max_key = max(result)
print("best fitness score: ",max_key)
print("index of best result in parents population:"  , result[max_key][0])

best fitness score:  0.004808424555784932
index of best result in parents population: 159


In [312]:
# Centroids of the selected run, one row per cluster.
best_record = result[max_key]
cluster_centers = np.array(best_record[1])
print("cluster centers are:")
cluster_centers

cluster centers are:


array([[5.36283, 3.62483, 1.29194, 0.3009 ],
       [6.25444, 2.67311, 4.40537, 0.3172 ],
       [6.52493, 4.98671, 6.61971, 0.03817]])

In [313]:
# Distance of every sample to every centroid of the selected run (one row per centroid).
best_record = result[max_key]
all_clusters_dists = np.array(best_record[2])
print("distance of each datapoint from centroids:")
all_clusters_dists

distance of each datapoint from centroids:


array([[0.32637388, 0.7915061 , 0.79376943, 0.954372  , 0.39258109,
        0.5034222 , 0.80258078, 0.48545228, 1.21419846, 0.75717099,
        0.24597136, 0.68731937, 0.87133685, 1.26381245, 0.59203371,
        0.87617459, 0.29493374, 0.31038671, 0.55756785, 0.37822468,
        0.47814633, 0.35754708, 0.82336864, 0.61704774, 0.86443271,
        0.79192166, 0.53564532, 0.30913737, 0.31451536, 0.85141407,
        0.83504845, 0.32409861, 0.57961186, 0.609503  , 0.73697212,
        0.5751173 , 0.21129106, 0.51659066, 1.15225861, 0.41605037,
        0.38378889, 1.58104899, 1.05724544, 0.57583844, 0.69233078,
        0.84786079, 0.45259906, 0.88557773, 0.25113325, 0.50893802,
        3.96026513, 3.60356655, 4.13452245, 3.17895894, 3.78874675,
        3.47618727, 3.77972696, 2.49709149, 3.74131393, 2.98191581,
        2.85236532, 3.25171   , 3.20288244, 3.72715601, 2.62811147,
        3.59604337, 3.48943433, 3.06926211, 3.8026943 , 2.96007262,
        3.87592801, 3.0912059 , 4.07424446, 3.69

In [314]:
# Member sample indices of each cluster in the selected run.
best_record = result[max_key]
clusters = best_record[3]
print("clusters are:")
clusters

clusters are:


[(array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
         17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33,
         34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49],
        dtype=int64),),
 (array([ 50,  51,  52,  53,  54,  55,  56,  57,  58,  59,  60,  61,  62,
          63,  64,  65,  66,  67,  68,  69,  70,  71,  72,  73,  74,  75,
          76,  77,  78,  79,  80,  81,  82,  83,  84,  85,  86,  87,  88,
          89,  90,  91,  92,  93,  94,  95,  96,  97,  98,  99, 100, 101,
         102, 103, 104, 106, 107, 108, 110, 111, 112, 113, 114, 115, 116,
         118, 119, 120, 121, 123, 124, 125, 126, 127, 128, 129, 130, 132,
         133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145,
         146, 147, 148, 149], dtype=int64),),
 (array([105, 109, 117, 122, 131], dtype=int64),)]

In [315]:
# Per-cluster sum of member-to-centroid distances in the selected run.
best_record = result[max_key]
clusters_sum_dist = np.array(best_record[4])
print("sum of all distance of each datapoint from its centroid in each cluster:")
clusters_sum_dist

sum of all distance of each datapoint from its centroid in each cluster:


array([ 32.23526825, 161.14281262,  14.59024396])

In [316]:
# Record, for every sample, the index of the cluster it was assigned to.
labels_idx = [0 for _ in range(len(variety))]
for cluster_id, members in enumerate(clusters):
    # Each entry of `clusters` is a 1-tuple holding the member index array.
    for sample in members[0]:
        labels_idx[sample] = cluster_id

# Cluster numbers only reflect the order of the centroids inside the chromosome,
# so a fixed table (0 -> Setosa, 1 -> Versicolor, 2 -> Virginica) pairs clusters
# with species arbitrarily. Name each cluster after the most common true variety
# among its members instead. Two clusters may end up with the same name.
race = {}
for cluster_id, members in enumerate(clusters):
    if len(members[0]) > 0:
        race[cluster_id] = variety.iloc[members[0]].mode().iloc[0]

labels = [race[cluster_id] for cluster_id in labels_idx]
labels

['Setosa',
 'Setosa',
 'Setosa',
 'Setosa',
 'Setosa',
 'Setosa',
 'Setosa',
 'Setosa',
 'Setosa',
 'Setosa',
 'Setosa',
 'Setosa',
 'Setosa',
 'Setosa',
 'Setosa',
 'Setosa',
 'Setosa',
 'Setosa',
 'Setosa',
 'Setosa',
 'Setosa',
 'Setosa',
 'Setosa',
 'Setosa',
 'Setosa',
 'Setosa',
 'Setosa',
 'Setosa',
 'Setosa',
 'Setosa',
 'Setosa',
 'Setosa',
 'Setosa',
 'Setosa',
 'Setosa',
 'Setosa',
 'Setosa',
 'Setosa',
 'Setosa',
 'Setosa',
 'Setosa',
 'Setosa',
 'Setosa',
 'Setosa',
 'Setosa',
 'Setosa',
 'Setosa',
 'Setosa',
 'Setosa',
 'Setosa',
 'Versicolor',
 'Versicolor',
 'Versicolor',
 'Versicolor',
 'Versicolor',
 'Versicolor',
 'Versicolor',
 'Versicolor',
 'Versicolor',
 'Versicolor',
 'Versicolor',
 'Versicolor',
 'Versicolor',
 'Versicolor',
 'Versicolor',
 'Versicolor',
 'Versicolor',
 'Versicolor',
 'Versicolor',
 'Versicolor',
 'Versicolor',
 'Versicolor',
 'Versicolor',
 'Versicolor',
 'Versicolor',
 'Versicolor',
 'Versicolor',
 'Versicolor',
 'Versicolor',
 'Versicolor',


In [317]:
# Attach the cluster index and the derived label to the copy that still holds `variety`.
df2 = new_df.assign(cluster_number=labels_idx, labels=labels)
df2

Unnamed: 0,sepal.length,sepal.width,petal.length,petal.width,variety,cluster_number,labels
0,5.1,3.5,1.4,0.2,Setosa,0,Setosa
1,4.9,3.0,1.4,0.2,Setosa,0,Setosa
2,4.7,3.2,1.3,0.2,Setosa,0,Setosa
3,4.6,3.1,1.5,0.2,Setosa,0,Setosa
4,5.0,3.6,1.4,0.2,Setosa,0,Setosa
...,...,...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,Virginica,1,Versicolor
146,6.3,2.5,5.0,1.9,Virginica,1,Versicolor
147,6.5,3.0,5.2,2.0,Virginica,1,Versicolor
148,6.2,3.4,5.4,2.3,Virginica,1,Versicolor


In [318]:
##total accuracy
# Share of rows whose derived label equals the true variety, printed as a percentage.
matches = [predicted == actual for predicted, actual in zip(df2['labels'], df2['variety'])]
correct = sum(matches)
accuracy = correct / len(df2)
print("total accuracy: ",accuracy * 100)

total accuracy:  70.0
