forked from cms-sw/cmssw
-
Notifications
You must be signed in to change notification settings - Fork 5
/
gpuClusterTracksIterative.h
213 lines (183 loc) · 6.64 KB
/
gpuClusterTracksIterative.h
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
#ifndef RecoPixelVertexing_PixelVertexFinding_src_gpuClusterTracksIterative_h
#define RecoPixelVertexing_PixelVertexFinding_src_gpuClusterTracksIterative_h
#include <algorithm>
#include <cmath>
#include <cstdint>
#include "HeterogeneousCore/CUDAUtilities/interface/HistoContainer.h"
#include "HeterogeneousCore/CUDAUtilities/interface/cuda_assert.h"
#include "gpuVertexFinder.h"
namespace gpuVertexFinder {
// this algo does not really scale as it works in a single block...
// enough for <10K tracks we have
__global__ void clusterTracksIterative(ZVertices* pdata,
WorkSpace* pws,
int minT, // min number of neighbours to be "core"
float eps, // max absolute distance to cluster
float errmax, // max error to be "seed"
float chi2max // max normalized distance to cluster
) {
constexpr bool verbose = false; // in principle the compiler should optmize out if false
if (verbose && 0 == threadIdx.x)
printf("params %d %f %f %f\n", minT, eps, errmax, chi2max);
auto er2mx = errmax * errmax;
auto& __restrict__ data = *pdata;
auto& __restrict__ ws = *pws;
auto nt = ws.ntrks;
float const* __restrict__ zt = ws.zt;
float const* __restrict__ ezt2 = ws.ezt2;
uint32_t& nvFinal = data.nvFinal;
uint32_t& nvIntermediate = ws.nvIntermediate;
uint8_t* __restrict__ izt = ws.izt;
int32_t* __restrict__ nn = data.ndof;
int32_t* __restrict__ iv = ws.iv;
assert(pdata);
assert(zt);
using Hist = cms::cuda::HistoContainer<uint8_t, 256, 16000, 8, uint16_t>;
__shared__ Hist hist;
__shared__ typename Hist::Counter hws[32];
for (auto j = threadIdx.x; j < Hist::totbins(); j += blockDim.x) {
hist.off[j] = 0;
}
__syncthreads();
if (verbose && 0 == threadIdx.x)
printf("booked hist with %d bins, size %d for %d tracks\n", hist.nbins(), hist.capacity(), nt);
assert(nt <= hist.capacity());
// fill hist (bin shall be wider than "eps")
for (auto i = threadIdx.x; i < nt; i += blockDim.x) {
assert(i < ZVertices::MAXTRACKS);
int iz = int(zt[i] * 10.); // valid if eps<=0.1
// iz = std::clamp(iz, INT8_MIN, INT8_MAX); // sorry c++17 only
iz = std::min(std::max(iz, INT8_MIN), INT8_MAX);
izt[i] = iz - INT8_MIN;
assert(iz - INT8_MIN >= 0);
assert(iz - INT8_MIN < 256);
hist.count(izt[i]);
iv[i] = i;
nn[i] = 0;
}
__syncthreads();
if (threadIdx.x < 32)
hws[threadIdx.x] = 0; // used by prefix scan...
__syncthreads();
hist.finalize(hws);
__syncthreads();
assert(hist.size() == nt);
for (auto i = threadIdx.x; i < nt; i += blockDim.x) {
hist.fill(izt[i], uint16_t(i));
}
__syncthreads();
// count neighbours
for (auto i = threadIdx.x; i < nt; i += blockDim.x) {
if (ezt2[i] > er2mx)
continue;
auto loop = [&](uint32_t j) {
if (i == j)
return;
auto dist = std::abs(zt[i] - zt[j]);
if (dist > eps)
return;
if (dist * dist > chi2max * (ezt2[i] + ezt2[j]))
return;
nn[i]++;
};
cms::cuda::forEachInBins(hist, izt[i], 1, loop);
}
__shared__ int nloops;
nloops = 0;
__syncthreads();
// cluster seeds only
bool more = true;
while (__syncthreads_or(more)) {
if (1 == nloops % 2) {
for (auto i = threadIdx.x; i < nt; i += blockDim.x) {
auto m = iv[i];
while (m != iv[m])
m = iv[m];
iv[i] = m;
}
} else {
more = false;
for (auto k = threadIdx.x; k < hist.size(); k += blockDim.x) {
auto p = hist.begin() + k;
auto i = (*p);
auto be = std::min(Hist::bin(izt[i]) + 1, int(hist.nbins() - 1));
if (nn[i] < minT)
continue; // DBSCAN core rule
auto loop = [&](uint32_t j) {
assert(i != j);
if (nn[j] < minT)
return; // DBSCAN core rule
auto dist = std::abs(zt[i] - zt[j]);
if (dist > eps)
return;
if (dist * dist > chi2max * (ezt2[i] + ezt2[j]))
return;
auto old = atomicMin(&iv[j], iv[i]);
if (old != iv[i]) {
// end the loop only if no changes were applied
more = true;
}
atomicMin(&iv[i], old);
};
++p;
for (; p < hist.end(be); ++p)
loop(*p);
} // for i
}
if (threadIdx.x == 0)
++nloops;
} // while
// collect edges (assign to closest cluster of closest point??? here to closest point)
for (auto i = threadIdx.x; i < nt; i += blockDim.x) {
// if (nn[i]==0 || nn[i]>=minT) continue; // DBSCAN edge rule
if (nn[i] >= minT)
continue; // DBSCAN edge rule
float mdist = eps;
auto loop = [&](int j) {
if (nn[j] < minT)
return; // DBSCAN core rule
auto dist = std::abs(zt[i] - zt[j]);
if (dist > mdist)
return;
if (dist * dist > chi2max * (ezt2[i] + ezt2[j]))
return; // needed?
mdist = dist;
iv[i] = iv[j]; // assign to cluster (better be unique??)
};
cms::cuda::forEachInBins(hist, izt[i], 1, loop);
}
__shared__ unsigned int foundClusters;
foundClusters = 0;
__syncthreads();
// find the number of different clusters, identified by a tracks with clus[i] == i;
// mark these tracks with a negative id.
for (auto i = threadIdx.x; i < nt; i += blockDim.x) {
if (iv[i] == int(i)) {
if (nn[i] >= minT) {
auto old = atomicInc(&foundClusters, 0xffffffff);
iv[i] = -(old + 1);
} else { // noise
iv[i] = -9998;
}
}
}
__syncthreads();
assert(foundClusters < ZVertices::MAXVTX);
// propagate the negative id to all the tracks in the cluster.
for (auto i = threadIdx.x; i < nt; i += blockDim.x) {
if (iv[i] >= 0) {
// mark each track in a cluster with the same id as the first one
iv[i] = iv[iv[i]];
}
}
__syncthreads();
// adjust the cluster id to be a positive value starting from 0
for (auto i = threadIdx.x; i < nt; i += blockDim.x) {
iv[i] = -iv[i] - 1;
}
nvIntermediate = nvFinal = foundClusters;
if (verbose && 0 == threadIdx.x)
printf("found %d proto vertices\n", foundClusters);
}
} // namespace gpuVertexFinder
#endif // RecoPixelVertexing_PixelVertexFinding_src_gpuClusterTracksIterative_h