forked from ahmedfgad/GeneticAlgorithmPython
-
Notifications
You must be signed in to change notification settings - Fork 0
/
example_clustering_3.py
134 lines (109 loc) · 6.12 KB
/
example_clustering_3.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
import numpy
import matplotlib.pyplot
import pygad
cluster1_num_samples = 20
cluster1_x1_start = 0
cluster1_x1_end = 5
cluster1_x2_start = 2
cluster1_x2_end = 6
cluster1_x1 = numpy.random.random(size=(cluster1_num_samples))
cluster1_x1 = cluster1_x1 * (cluster1_x1_end - cluster1_x1_start) + cluster1_x1_start
cluster1_x2 = numpy.random.random(size=(cluster1_num_samples))
cluster1_x2 = cluster1_x2 * (cluster1_x2_end - cluster1_x2_start) + cluster1_x2_start
cluster2_num_samples = 20
cluster2_x1_start = 4
cluster2_x1_end = 12
cluster2_x2_start = 14
cluster2_x2_end = 18
cluster2_x1 = numpy.random.random(size=(cluster2_num_samples))
cluster2_x1 = cluster2_x1 * (cluster2_x1_end - cluster2_x1_start) + cluster2_x1_start
cluster2_x2 = numpy.random.random(size=(cluster2_num_samples))
cluster2_x2 = cluster2_x2 * (cluster2_x2_end - cluster2_x2_start) + cluster2_x2_start
cluster3_num_samples = 20
cluster3_x1_start = 12
cluster3_x1_end = 18
cluster3_x2_start = 8
cluster3_x2_end = 11
cluster3_x1 = numpy.random.random(size=(cluster3_num_samples))
cluster3_x1 = cluster3_x1 * (cluster3_x1_end - cluster3_x1_start) + cluster3_x1_start
cluster3_x2 = numpy.random.random(size=(cluster3_num_samples))
cluster3_x2 = cluster3_x2 * (cluster3_x2_end - cluster3_x2_start) + cluster3_x2_start
c1 = numpy.array([cluster1_x1, cluster1_x2]).T
c2 = numpy.array([cluster2_x1, cluster2_x2]).T
c3 = numpy.array([cluster3_x1, cluster3_x2]).T
data = numpy.concatenate((c1, c2, c3), axis=0)
matplotlib.pyplot.scatter(cluster1_x1, cluster1_x2)
matplotlib.pyplot.scatter(cluster2_x1, cluster2_x2)
matplotlib.pyplot.scatter(cluster3_x1, cluster3_x2)
matplotlib.pyplot.title("Optimal Clustering")
matplotlib.pyplot.show()
def euclidean_distance(X, Y):
"""
Calculate the euclidean distance between X and Y. It accepts:
:X should be a matrix of size (N, f) where N is the number of samples and f is the number of features for each sample.
:Y should be of size f. In other words, it is a single sample.
Returns a vector of N elements with the distances between the N samples and the Y.
"""
return numpy.sqrt(numpy.sum(numpy.power(X - Y, 2), axis=1))
def cluster_data(solution, solution_idx):
"""
Clusters the data based on the current solution.
"""
global num_clusters, feature_vector_length, data
cluster_centers = [] # A list of size (C, f) where C is the number of clusters and f is the number of features representing each sample.
all_clusters_dists = [] # A list of size (C, N) where C is the number of clusters and N is the number of data samples. It holds the distances between each cluster center and all the data samples.
clusters = [] # A list with C elements where each element holds the indices of the samples within a cluster.
clusters_sum_dist = [] # A list with C elements where each element represents the sum of distances of the samples with a cluster.
for clust_idx in range(num_clusters):
# Return the current cluster center.
cluster_centers.append(solution[feature_vector_length*clust_idx:feature_vector_length*(clust_idx+1)])
# Calculate the distance (e.g. euclidean) between the current cluster center and all samples.
cluster_center_dists = euclidean_distance(data, cluster_centers[clust_idx])
all_clusters_dists.append(numpy.array(cluster_center_dists))
cluster_centers = numpy.array(cluster_centers)
all_clusters_dists = numpy.array(all_clusters_dists)
# A 1D array that, for each sample, holds the index of the cluster with the smallest distance.
# In other words, the array holds the sample's cluster index.
cluster_indices = numpy.argmin(all_clusters_dists, axis=0)
for clust_idx in range(num_clusters):
clusters.append(numpy.where(cluster_indices == clust_idx)[0])
# Calculate the sum of distances for the cluster.
if len(clusters[clust_idx]) == 0:
# In case the cluster is empty (i.e. has zero samples).
clusters_sum_dist.append(0)
else:
# When the cluster is not empty (i.e. has at least 1 sample).
clusters_sum_dist.append(numpy.sum(all_clusters_dists[clust_idx, clusters[clust_idx]]))
# clusters_sum_dist.append(numpy.sum(euclidean_distance(data[clusters[clust_idx], :], cluster_centers[clust_idx])))
clusters_sum_dist = numpy.array(clusters_sum_dist)
return cluster_centers, all_clusters_dists, cluster_indices, clusters, clusters_sum_dist
def fitness_func(solution, solution_idx):
_, _, _, _, clusters_sum_dist = cluster_data(solution, solution_idx)
# The tiny value 0.00000001 is added to the denominator in case the average distance is 0.
fitness = 1.0 / (numpy.sum(clusters_sum_dist) + 0.00000001)
return fitness
num_clusters = 3
feature_vector_length = data.shape[1]
num_genes = num_clusters * feature_vector_length
ga_instance = pygad.GA(num_generations=100,
sol_per_pop=10,
init_range_low=0,
init_range_high=20,
num_parents_mating=5,
keep_parents=2,
num_genes=num_genes,
fitness_func=fitness_func,
suppress_warnings=True)
ga_instance.run()
best_solution, best_solution_fitness, best_solution_idx = ga_instance.best_solution()
print("Best solution is {bs}".format(bs=best_solution))
print("Fitness of the best solution is {bsf}".format(bsf=best_solution_fitness))
print("Best solution found after {gen} generations".format(gen=ga_instance.best_solution_generation))
cluster_centers, all_clusters_dists, cluster_indices, clusters, clusters_sum_dist = cluster_data(best_solution, best_solution_idx)
for cluster_idx in range(num_clusters):
cluster_x = data[clusters[cluster_idx], 0]
cluster_y = data[clusters[cluster_idx], 1]
matplotlib.pyplot.scatter(cluster_x, cluster_y)
matplotlib.pyplot.scatter(cluster_centers[cluster_idx, 0], cluster_centers[cluster_idx, 1], linewidths=5)
matplotlib.pyplot.title("Clustering using PyGAD")
matplotlib.pyplot.show()