forked from dbish/pc-means
-
Notifications
You must be signed in to change notification settings - Fork 0
/
seq_kmeans.c
159 lines (129 loc) · 3.62 KB
/
seq_kmeans.c
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
#include <stdio.h>
#include <stdlib.h>
#include <sys/time.h>
#include <math.h>
int **data;
double **clusters;
/*
* generate test data from the range
* 0 to max and set up global vars
* int n: number of data points
* int max: all data is within the (0, max) range
* int k: number of clusters
*/
void setup(int n, int max, int k){
int i;
data = (int **)malloc(n*sizeof(int *));
clusters = (double **)malloc(k*sizeof(double *));
//generate random data for testing
for (i = 0; i < n; i++){
data[i] = (int *)malloc(2*sizeof(int));
data[i][0] = (rand()%max);
data[i][1] = (rand()%max);
}
//initialize clusters variable
for (i = 0; i < k; i++){
clusters[i] = (double *)malloc(2*sizeof(double));
}
}
/* returns 2d euclidean distance
* double x0, y0: coordinates of first point: (x0, y0)
* double x1, y1: coordinates of second point: (x1, y1)
*/
double distance(double x0, double y0, double x1, double y1){
return sqrt((x0-x1)*(x0-x1)+(y0-y1)*(y0-y1));
}
/* find the cluter these coordinates belong to
* double x, y: coordinates of the point: (x, y)
* int k: number of cluters
*/
int assign_cluster(double x, double y, int k){
int i, cluster;
double min, cur;
min = distance(x, y, clusters[0][0], clusters[0][1]);
cluster = 0;
for (i = 1; i < k; i++){
cur = distance(x, y, clusters[i][0], clusters[i][1]);
if (cur < min){
min = cur;
cluster = i;
}
}
return cluster;
}
/*
* int n: number of data points
* int k: number of clusters
*/
void kmeans(int n, int k){
int i, count, cluster;
double percent_change;
int* cluster_membership;
double **new_clusters;
new_clusters = (double **)malloc(k*sizeof(double *));
cluster_membership = (int *)malloc(n*sizeof(int));
count = 0;
int threshold = 10;
percent_change = 100;
//choose first set of cluster centers
for (i = 0; i < k; i++){
clusters[i][0] = data[i][0];
clusters[i][1] = data[i][1];
}
//initialize new clusters variable
//it has three coords: [x][y][number of members]
for (i = 0; i < k; i++){
new_clusters[i] = (double *)malloc(3*sizeof(double));
new_clusters[i][0] = 0;
new_clusters[i][1] = 0;
new_clusters[i][2] = 0;
}
for (i = 0; i < n; i++){
cluster_membership[i] = -1;
}
while((percent_change > threshold) && (count < 100)){
percent_change = 0;
for(i = 0; i < n; i++){
//find the cluster it belongs to
cluster = assign_cluster(data[i][0], data[i][1], k);
//check to see if this data point changed clusters
if (cluster != cluster_membership[i]) {
percent_change++;
cluster_membership[i] = cluster;
}
//track the new cluster sum
new_clusters[cluster][0] += data[i][0];
new_clusters[cluster][1] += data[i][1];
new_clusters[cluster][2]++;
}
//calculate the new cluster centers and update
for (i = 0; i < k; i++){
if (new_clusters[i][2] > 0){
new_clusters[i][0] /= new_clusters[i][2];
new_clusters[i][1] /= new_clusters[i][2];
}
clusters[i][0] = new_clusters[i][0];
clusters[i][1] = new_clusters[i][1];
new_clusters[i][0] = new_clusters[i][1] = new_clusters[i][2] = 0;
}
percent_change /= n;
count++;
}
for (i = 0; i < n; i++){
printf("%d %d %d\n", data[i][0], data[i][1], cluster_membership[i]);
}
}
int main(int argc, char* argv[]){
int i, n, max, k;
n = strtol(argv[1], NULL, 10);
max = strtol(argv[2], NULL, 10);
k = strtol(argv[3], NULL, 10);
setup(n, max, k);
/*for (i = 0; i < 2; i++){
printf("id:%d x:%d y:%d\n", i, data[i][0], data[i][1]);
}
printf("Distance between points 0 and 1: %f\n", distance(data[0][0], data[0][1], data[1][0], data[1][1]));
*/
kmeans(n, k);
return 0;
}