/
run_dbscan.py
258 lines (192 loc) · 8.45 KB
/
run_dbscan.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
"""
run_dbscan.py
DBSCAN cluster analysis on ROIs
Remove fiducial locs by filtering out locs
> 500 neighbours within 30 nm
INPUT:
Directory containing region(s) of interest .npy file(s)
MEASUREMENTS:
num: number of locs per cluster
xymedian: median pairwise distance between locs,
within a cluster in x-y
OUTPUT:
Cluster measurements saved as single file:
AllMeasurements.csv
Average cluster from nomalised x-y-z coordinates as .dat file:
AverageCluster-%i-%i-%i.dat
%i-%i-%i are histogram shape dimensions in x-y-z
Number of clusters per ROI as single file:
AllROI_NumberOfClusters.csv
Written by Christopher Bartlett and Alistair Curd
University of Leeds 11-Sept-2017
"""
import time
import os
from Tkinter import Tk
from tkFileDialog import askdirectory
import numpy as np
from sklearn.cluster import DBSCAN
from scipy.spatial import distance
def removefiducial(data):
"""
Remove fiducial localisations from palm3d files
Search filter applied to localisations
If localisation has > 1000 locs within [40,40,60] nm cuboid
- considered part of fiducial
Save all localisations other than fiducial for DBSCAN analysis
Args:
x-y-z palm3d localisation data as NumPy array
Returns:
Array of same shape as input without fiducial localisations
"""
# Distance to search for localisations
filterarray = np.array([40, 40, 60])
x_coords = []
y_coords = []
z_coords = []
loc = 0
while loc < len(data):
loc_a = data[loc]
# A filter to search for localisation coordinates within filter distance
# Returns an np array of True / False
testfilter = np.logical_and(data > loc_a - filterarray,
data < loc_a + filterarray)
chosenlocs = np.all(testfilter, axis=1)
subxyz = data[chosenlocs]
if len(subxyz) < 1000:
x_coords = np.append(x_coords, loc_a[0])
y_coords = np.append(y_coords, loc_a[1])
z_coords = np.append(z_coords, loc_a[2])
loc += 1
xyz = np.array([x_coords, y_coords, z_coords])
xyz = np.transpose(xyz)
print 'Locs classified as fiducial and removed: %i' %(len(data) - len(xyz))
return xyz
def run_dbscan(data, neighbourhood, nlocs):
"""
DBSCAN clustering on ROI
Args:
x-y-z palm3d localisation data as NumPy array
Search radius for DBSCAN
Min points for DBSCAN
Returns:
Arrays containing measurements for ROI
"""
# DBSCAN clustering
clusterer = DBSCAN(eps=neighbourhood, min_samples=nlocs).fit(data)
# Combine XYZ coordinates with cluster identity from DBSCAN
# Coordinates identified as "noise" are labelled with cluster id of -1
data_dbscan = np.c_[data, clusterer.labels_]
id_ = []
num = []
xymedian = []
xavg_cluster = []
yavg_cluster = []
zavg_cluster = []
for cluster_id in range(clusterer.labels_.max() + 1):
xyzclust = data_dbscan[data_dbscan[:, 3] == cluster_id]
loc_a = 0
while loc_a < len(xyzclust):
# Register localisation coordinates to cluster centroid position
xnorm = np.subtract(xyzclust[loc_a, 0], np.mean(xyzclust[:, 0]))
ynorm = np.subtract(xyzclust[loc_a, 1], np.mean(xyzclust[:, 1]))
znorm = np.subtract(xyzclust[loc_a, 2], np.mean(xyzclust[:, 2]))
# Only save coordinates if fall inside 500 nm cuboid around centroid
if xnorm > -250 and xnorm < 250:
if ynorm > -250 and ynorm < 250:
if znorm > -250 and znorm < 250:
xavg_cluster = np.append(xavg_cluster, xnorm)
yavg_cluster = np.append(yavg_cluster, ynorm)
zavg_cluster = np.append(zavg_cluster, znorm)
loc_a += 1
# Calculate pairwise Euclidean distances between localisations in XY
xydist = distance.pdist(xyzclust[:, 0:2], 'euclidean')
id_ = np.append(id_, cluster_id)
num = np.append(num, len(xyzclust))
xymedian = np.append(xymedian, np.median(xydist))
total_clusters = clusterer.labels_.max() + 1
print 'Clusters in this ROI: %i' % total_clusters
return (id_, num, xymedian,
xavg_cluster, yavg_cluster, zavg_cluster,
total_clusters)
def main():
"""
Calls removefiducial function
Calls run_dbscan function
User input to specify DBSCAN clustering parameters
"""
Tk().withdraw()
file_list = []
filename_total = []
id_total = []
num_total = []
xymedian_total = []
xavg_cluster_total = []
yavg_cluster_total = []
zavg_cluster_total = []
filename_roi = []
num_clusters = []
print '\nWhere are the .npy region(s) of interest file(s)?'
directory = askdirectory()
directory_list = os.listdir(directory)
for item in directory_list:
if item.startswith('Region') and item.endswith('.npy'):
file_list.append(item)
print '%i file(s) in this folder to to be analysed' % len(file_list)
# Parameters for DBSCAN clustering
search_radius = int(raw_input('\nWhat is the DBSCAN search radius?: '))
min_cluster_size = int(raw_input('How many locs within this radius?: '))
start_time = time.time()
for item in file_list:
print '\nAnalysing region %i/%i: %s' %(file_list.index(item) + 1,
len(file_list),
item[7:-4])
xyz = np.load(directory + '/' + item)
# Remove fiducial locs
xyz = removefiducial(data=xyz)
# Run DBSCAN analysis
(id_, num, xymedian,
xavg_cluster, yavg_cluster, zavg_cluster,
total_clusters) = run_dbscan(data=xyz,
neighbourhood=search_radius,
nlocs=min_cluster_size)
# Keep output from run_dbscan for each region of interest
filename_total = np.append(filename_total, np.full(len(id_), item[7:-4]))
id_total = np.append(id_total, id_)
num_total = np.append(num_total, num)
xymedian_total = np.append(xymedian_total, xymedian)
xavg_cluster_total = np.append(xavg_cluster_total, xavg_cluster)
yavg_cluster_total = np.append(yavg_cluster_total, yavg_cluster)
zavg_cluster_total = np.append(zavg_cluster_total, zavg_cluster)
filename_roi = np.append(filename_roi, item[7:-4])
num_clusters = np.append(num_clusters, total_clusters)
print '\nFinished DBSCAN analysis'
print '%.2f sec to analyse all ROIs' % (time.time() - start_time)
# Save all measurements as AllMeasurements.csv
parameters = np.transpose(np.array([filename_total,
id_total,
num_total,
xymedian_total]))
np.savetxt('%s\\AllMeasurements.csv' % directory, parameters,
fmt='%s', delimiter=',',
header='ROI,cluster ID,number of locs,xymedian')
# Save avg cluster as 3D histogram
# Histogram - 4 nm px bins within 500 nm bounding box
# %i-%i-%i = histogram shape for ImageJ .dat import
avg_cluster_total = np.transpose(np.array([xavg_cluster_total,
yavg_cluster_total,
zavg_cluster_total]))
edge = np.arange(-250, 254, step=4)
hist, _ = np.histogramdd(avg_cluster_total, bins=(edge, edge, edge))
hist.tofile('%s\\AverageCluster-%i-%i-%i.dat' % (directory,
hist.shape[2],
hist.shape[1],
hist.shape[0]))
# Save number of clusters / region
all_clusters_roi = np.transpose(np.array([filename_roi, num_clusters]))
np.savetxt('%s\\AllROI_NumberOfClusters.csv' % directory, all_clusters_roi,
fmt='%s', delimiter=',',
header='ROI, Number of Clusters')
return None
if __name__ == '__main__':
main()