In [1]:
import numpy as np
import pandas as pd

In [2]:
df = pd.read_csv("./archive/bezdekIris.data")

In [3]:
label = df["Iris-setosa"]

In [4]:
df = df.drop("Iris-setosa", axis=1)
df

Unnamed: 0,5.1,3.5,1.4,0.2
0,4.9,3.0,1.4,0.2
1,4.7,3.2,1.3,0.2
2,4.6,3.1,1.5,0.2
3,5.0,3.6,1.4,0.2
4,5.4,3.9,1.7,0.4
...,...,...,...,...
144,6.7,3.0,5.2,2.3
145,6.3,2.5,5.0,1.9
146,6.5,3.0,5.2,2.0
147,6.2,3.4,5.4,2.3


In [5]:
def euclidean_distance(X, Y):
    return np.sqrt(np.sum(np.power(X - Y, 2), axis=1))

In [6]:
df = df.to_numpy()

In [7]:
d = euclidean_distance(df, df[0])
# d = euclidean_distance(data[0], data)
print(d)

[0.         0.3        0.33166248 0.60827625 1.09087121 0.50990195
 0.42426407 0.50990195 0.17320508 0.8660254  0.45825757 0.14142136
 0.678233   1.36014705 1.62788206 1.05356538 0.54772256 1.17473401
 0.83666003 0.70710678 0.76157731 0.78102497 0.55677644 0.64807407
 0.2236068  0.5        0.59160798 0.5        0.34641016 0.24494897
 0.678233   1.14891253 1.34164079 0.14142136 0.3        0.78740079
 0.60827625 0.50990195 0.45825757 0.52915026 0.81853528 0.54772256
 0.678233   0.98488578 0.14142136 0.84852814 0.36055513 0.81240384
 0.31622777 4.09633983 3.68646172 4.23674403 2.96984848 3.81182371
 3.39116499 3.86005181 2.14709106 3.78813938 2.80535203 2.46170673
 3.24499615 3.04138127 3.71214224 2.55929678 3.7        3.43365694
 2.97153159 3.69188299 2.79284801 3.89358447 3.07408523 4.01870626
 3.65650106 3.44673759 3.65102725 4.08044115 4.29534632 3.5383612
 2.41867732 2.7        2.57875939 2.85482048 4.11703777 3.39852909
 3.59722115 3.97869325 3.55808937 2.99833287 2.9291637  3.24345

In [146]:
def cluster_data(solutions, solution_idx):
    solution = solutions[solution_idx]
    feature_vector_length = df.shape[1]
    cluster_centers = []
    all_clusters_dists = []
    clusters = []
    clusters_sum_dist = []
    num_cluster = int(len(solution) / feature_vector_length)
    for clust_idx in range(num_cluster):
        cluster_centers.append(solution[feature_vector_length*clust_idx:feature_vector_length*(clust_idx+1)])
        cluster_center_dists = euclidean_distance(df, cluster_centers[clust_idx])
        all_clusters_dists.append(np.array(cluster_center_dists))
    cluster_centers = np.array(cluster_centers)
    all_clusters_dists = np.array(all_clusters_dists)
    cluster_indices = np.argmin(all_clusters_dists, axis=0)
    for clust_idx in range(num_cluster):
        clusters.append(np.where(cluster_indices == clust_idx))
        if len(clusters[clust_idx]) == 0:
            clusters_sum_dist.append(0)
        else:
            clusters_sum_dist.append(np.sum(all_clusters_dists[clust_idx, clusters[clust_idx]]))
    clusters_sum_dist = np.array(clusters_sum_dist)
    return cluster_centers, all_clusters_dists, clusters, clusters_sum_dist

In [147]:
def fitness_func(solutions, solution_idx):
    _, _, _, clusters_sum_dist = cluster_data(solutions, solution_idx)
    fitness = 1.0 / (np.sum(clusters_sum_dist) + 0.00000001)
    return fitness

In [148]:
##initial population
import random
population = []
def initial_population():
    feature_vector_length = df.shape[1]
    ##population has 100 members
    for i in range(200):
        ##default number of clusters is 3
        Chromosome = []
        for j in range(3):
            gen1 = float('%0.1f' % random.uniform(0, 8))
            gen2 = float('%0.1f' % random.uniform(2, 5))
            gen3 = float('%0.1f' % random.uniform(1, 7))
            gen4 = float('%0.1f' % random.uniform(0, 0.5))
            Chromosome.append(gen1)
            Chromosome.append(gen2)
            Chromosome.append(gen3)
            Chromosome.append(gen4)
        population.append(Chromosome)
    return population

In [149]:
##choose parents
##just solutions with fitness score more than 0.0025 are accepted
def choose_parents(population):
    parents = []
    for i in range(len(population)):
        fit = fitness_func(population, i)
        if fit >= 0.0025:
            parents.append(population[i])
    return parents

In [150]:
def doCrossover(parents):
    index_parent1 = random.randint(0, len(parents)- 1)
    index_parent2 = random.randint(0, len(parents) - 1)
    parent1 = parents[index_parent1]
    parent2 = parents[index_parent2]
    cut = random.randint(1, 2)
    child1 = np.concatenate((parent1[0:cut * 4], parent2[cut * 4 : 12]), axis=0)
    child2 = np.concatenate((parent2[0:cut * 4], parent1[cut * 4 : 12]), axis=0)
    fit_child1 = fitness_func([child1], 0)
    fit_child2 = fitness_func([child2], 0)
    if fit_child1 >= 0.0035:
        parents.append(child1)
    if fit_child2 >= 0.0035:
        parents.append(child2)

In [151]:
def crossover(parents):
    for i in range(100):
        doCrossover(parents)


In [152]:
def doMutation(parents):
    index_parent = random.randint(0, len(parents)- 1)
    cut = random.randint(0, 2)
    feature_to_mute = random.randint(0, 3)
    parent_to_mute = parents[index_parent]
    if feature_to_mute == 0:
        new_feature_val = float('%0.1f' % random.uniform(0, 8))
    elif feature_to_mute == 1:
        new_feature_val = float('%0.1f' % random.uniform(2, 5))
    elif feature_to_mute == 2:
        new_feature_val = float('%0.1f' % random.uniform(1, 7))
    elif feature_to_mute == 3:
        new_feature_val = float('%0.1f' % random.uniform(0, 0.5))
    else:
        new_feature_val = float('%0.1f' % random.uniform(2, 5))
    parent_to_mute[cut * 4 + feature_to_mute] = new_feature_val
    fit = fitness_func([parent_to_mute], 0)
    if fit < 0.0025:
        parents.pop(index_parent)

In [153]:
def mutation(parents):
    for i in range(20):
        doMutation(parents)

In [154]:
result = {}
for i in range(100):
    population = initial_population()
    parents = choose_parents(population)
    crossover(parents)
    mutation(parents)
    fits = []
    for j in range(len(parents)):
        fit = fitness_func(parents, j)
        fits.append(fit)
    fit_max = max(fits)
    idx = fits.index(fit_max)
    cluster_centers, all_clusters_dists, clusters, clusters_sum_dist = cluster_data(parents, idx)
    result.update({fit_max: [idx, cluster_centers, all_clusters_dists, clusters, clusters_sum_dist]})
    population.clear()


In [156]:
##after 100 generation, best result is:
max_key = next(iter(result))
for key in result:
    if result[key] > result[max_key]:
        max_key = key
print("best fitness score: ",max_key)
print("index of best result in parents population:"  , result[max_key][0])

best fitness score:  0.004837856865189446
index of best result in parents population: 146


In [157]:
cluster_centers = np.array(result[max_key][1])
print("cluster centers are:")
cluster_centers

cluster centers are:


array([[5. , 3.6, 1.3, 0.1],
       [4.1, 3.8, 3.3, 0.1],
       [6.8, 2.6, 4.6, 0.4]])

In [158]:
all_clusters_dists = np.array(result[max_key][2])
print("distance of each datapoint from centroids:")
all_clusters_dists

distance of each datapoint from centroids:


array([[0.6244998 , 0.50990195, 0.678233  , 0.14142136, 0.70710678,
        0.5       , 0.3       , 0.93273791, 0.54772256, 0.46904158,
        0.42426407, 0.64031242, 0.94339811, 0.90553851, 1.12249722,
        0.58309519, 0.26457513, 0.85440037, 0.36055513, 0.60827625,
        0.38729833, 0.50990195, 0.64807407, 0.67082039, 0.678233  ,
        0.46904158, 0.31622777, 0.31622777, 0.59160798, 0.6244998 ,
        0.57445626, 0.57445626, 0.79372539, 0.55677644, 0.42426407,
        0.51961524, 0.14142136, 0.85440037, 0.31622777, 0.2236068 ,
        1.40712473, 0.72801099, 0.59160798, 0.70710678, 0.67082039,
        0.38729833, 0.58309519, 0.38729833, 0.33166248, 4.17252921,
        3.78417759, 4.33358974, 3.26649659, 3.96736688, 3.57910603,
        3.9484174 , 2.5019992 , 3.92173431, 3.04959014, 2.86530976,
        3.39705755, 3.32565783, 3.86652299, 2.75317998, 3.8       ,
        3.59444015, 3.17804972, 3.94968353, 3.05450487, 4.01372645,
        3.25269119, 4.22137418, 3.82361086, 3.59

In [159]:
clusters = result[max_key][3]
print("clusters are:")
clusters

clusters are:


[(array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
         17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33,
         34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48],
        dtype=int64),),
 (array([56, 59, 92, 97], dtype=int64),),
 (array([ 49,  50,  51,  52,  53,  54,  55,  57,  58,  60,  61,  62,  63,
          64,  65,  66,  67,  68,  69,  70,  71,  72,  73,  74,  75,  76,
          77,  78,  79,  80,  81,  82,  83,  84,  85,  86,  87,  88,  89,
          90,  91,  93,  94,  95,  96,  98,  99, 100, 101, 102, 103, 104,
         105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117,
         118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130,
         131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143,
         144, 145, 146, 147, 148], dtype=int64),)]

In [160]:
clusters_sum_dist = np.array(result[max_key][4])
print("sum of all distance of each datapoint from its centroid in each cluster:")
clusters_sum_dist

sum of all distance of each datapoint from its centroid in each cluster:


array([ 28.10248514,   7.97166666, 170.62894583])