import logging
import sys
import threading

import numpy as np
import pandas as pd
from sklearn.ensemble import AdaBoostClassifier

# These path inserts point at a development checkout and must run before the
# cpa imports below.
sys.path.insert(1, '/home/vagrant/cpa-multiclass/CellProfiler-Analyst/cpa')
sys.path.insert(1, '/home/vagrant/cpa-multiclass/CellProfiler-Analyst/')

import cpa.sqltools

from .dbconnect import (DBConnect, UniqueObjectClause, UniqueImageClause,
                        image_key_columns, object_key_columns,
                        GetWhereClauseForImages, GetWhereClauseForObjects,
                        object_key_defs)
from .properties import Properties
from .datamodel import DataModel
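
# Module-level handles to the database, properties and data model shared
# throughout this file.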
db = DBConnect()
p = Properties()
dm = DataModel()
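
# Names for temporary tables and filter tables created in the database.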
temp_stump_table = "_stump"
temp_score_table = "_scores"
temp_class_table = "_class"
filter_table_prefix = '_filter_'
def create_perobject_class_table(classifier, classNames, updater):
    '''
    classifier: GeneralClassifier object
    classNames: list/array of class names
    updater: progress callback, called as updater(percent, message)
    RETURNS: nothing; saves a table with columns TableNumber, ImageNumber,
       ObjectNumber, class and class_number (the predicted class) to the
       predefined class table in the database.
    '''
updater(0, "Preparing to score")
if p.class_table is None:
raise ValueError('"class_table" in properties file is not set.')
index_cols = UniqueObjectClause()
class_cols = UniqueObjectClause() + ', class_number, class'
class_col_defs = f"{object_key_defs()}, class VARCHAR ({max(map(len, classNames))}), class_number INT"
    # Any existing class table is dropped; Classifier.ScoreAll requests this
    # explicitly.
print('Drop table...')
db.execute('DROP TABLE IF EXISTS %s'%(p.class_table))
print('Create table...')
db.execute('CREATE TABLE %s (%s)'%(p.class_table, class_col_defs))
print('Create index...')
db.execute('CREATE INDEX idx_%s ON %s (%s)'%(p.class_table, p.class_table, index_cols))
print('Getting data...')
chunk_size = 10000
cap = dm.get_total_object_count()
updater(0, "Classifying objects...")
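    # Fetch and classify the object table in fixed-size chunks
    # (LIMIT offset, size) so large tables are never loaded all at once.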
for start in range(0, cap, chunk_size):
updater(int(start / cap * 100))
print(f"Classifying object... {start}")
data = db.execute(f'SELECT {UniqueObjectClause(p.object_table)}, {",".join(db.GetColnamesForClassifier())} '
f'FROM {p.object_table} LIMIT {start}, {chunk_size}')
print('Getting predictions...')
cell_data, object_keys = processData(data)
predicted_classes = classifier.Predict(cell_data)
try:
print('Preparing data table...')
# We need to pass a connection object to Pandas so it can do all the work for us.
            connID = threading.current_thread().name
if connID not in db.connections:
db.connect()
conn = db.connections[connID]
class_data = pd.DataFrame(data=object_keys, columns=object_key_columns())
class_data["class"] = [classNames[i - 1] for i in predicted_classes]
class_data["class_number"] = predicted_classes
print('Writing to database...')
class_data.to_sql(p.class_table, conn, if_exists="append", index=False)
        except Exception as e:
            # Fall back to the old writing method, which may still be needed
            # for unusual db connection types.
            print(f"Faster database writing method failed ({e}), retrying with slow method...")
print('Drop table...')
db.execute('DROP TABLE IF EXISTS %s'%(p.class_table))
print('Create table...')
db.execute('CREATE TABLE %s (%s)'%(p.class_table, class_col_defs))
print('Create index...')
db.execute('CREATE INDEX idx_%s ON %s (%s)'%(p.class_table, p.class_table, index_cols))
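            # Build one big CASE expression mapping each object key to its
            # predicted class number (and a second one for the class name),
            # e.g. "CASE WHEN ImageNumber=1 AND ObjectNumber=7 THEN '2' ... END"
            # (key values here are illustrative).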
            if object_keys.shape[1] > 2:
expr = 'CASE '+ ''.join(["WHEN %s=%d AND %s=%d AND %s=%d THEN '%s'"%(p.table_id,
object_keys[ii][0], p.image_id, object_keys[ii][1], p.object_id, object_keys[ii][2], predicted_classes[ii] )
for ii in range(0, len(predicted_classes))])+ " END"
expr2 = 'CASE '+ ''.join(["WHEN %s=%d AND %s=%d AND %s=%d THEN '%s'"%(p.table_id,
object_keys[ii][0], p.image_id, object_keys[ii][1], p.object_id, object_keys[ii][2],
classNames[predicted_classes[ii] - 1]) for ii in range(0, len(predicted_classes))])+ " END"
            elif object_keys.shape[1] == 2:
expr = 'CASE '+ ''.join(["WHEN %s=%d AND %s=%d THEN '%s'"%(p.image_id,
object_keys[ii][0], p.object_id, object_keys[ii][1], predicted_classes[ii] )
for ii in range(0, len(predicted_classes))])+ " END"
expr2 = 'CASE '+ ''.join(["WHEN %s=%d AND %s=%d THEN '%s'"%(p.image_id,
object_keys[ii][0], p.object_id, object_keys[ii][1], classNames[predicted_classes[ii] - 1])
for ii in range(0, len(predicted_classes))])+ " END"
else:
                raise Exception(f'object keys have {object_keys.shape[1]} columns, but 2 or 3 are expected')
print('Writing to database...')
db.execute('INSERT INTO %s (%s) SELECT %s, %s, %s FROM %s'%(p.class_table, class_cols, index_cols, expr, expr2, p.object_table),
silent=True)
db.Commit()
def FilterObjectsFromClassN(classNum, classifier, filterKeys, uncertain=False):
    '''
    classNum: 1-based index of the class to retrieve obKeys for
    classifier: trained classifier object
    filterKeys: (optional) a list of specific imKeys OR obKeys (NOT BOTH)
       to classify.
       * WARNING: if this list is too long, you may exceed the size limit of
         MySQL queries.
       * Useful when fetching N objects from a particular class. Use the
         DataModel to get batches of random objects, and sift through them
         here until N objects of the desired class have been accumulated.
       * Also useful for classifying a specific image or group of images.
    uncertain: if True, return the objects the classifier is uncertain about
       (the gap between its two largest class probabilities is below a
       threshold) instead of the objects predicted to be in class classNum.
    RETURNS: a list of object keys that fall in the specified class;
       if Properties.area_scoring_column is specified, area sums are also
       reported for each class
    '''
    if filterKeys != [] and filterKeys is not None:
        if isinstance(filterKeys, str):
            whereclause = filterKeys
        else:
            isImKey = len(filterKeys[0]) == len(image_key_columns())
            if isImKey:
                whereclause = GetWhereClauseForImages(filterKeys)
            else:
                whereclause = GetWhereClauseForObjects(filterKeys)
    else:
        whereclause = "1 = 1"  # no filtering, but keep the WHERE clause valid
    if p.area_scoring_column:
        data = db.execute('SELECT %s, %s, %s FROM %s WHERE %s'
                          %(UniqueObjectClause(p.object_table),
                            ",".join(db.GetColnamesForClassifier()),
                            _objectify(p, p.area_scoring_column),
                            p.object_table, whereclause))
        area_score = [row[-1] for row in data]  # separate the area column from the feature data
        data = [row[:-1] for row in data]
else:
data = db.execute('SELECT %s, %s FROM %s WHERE %s'%(UniqueObjectClause(p.object_table),
",".join(db.GetColnamesForClassifier()), p.object_table, whereclause))
    cell_data, object_keys = processData(data)
    if uncertain:
        # Uncertain means the gap between the two largest class probabilities
        # is smaller than the threshold.
        probabilities = classifier.PredictProba(cell_data)
        threshold = 0.1  # TODO: this threshold should be adjustable
        sorted_p = np.sort(probabilities)[:, -2:]  # two largest probabilities per object
        diff = sorted_p[:, 1] - sorted_p[:, 0]
        indices = np.where(diff < threshold)[0]  # indices where the gap is below threshold
        res = [object_keys[i] for i in indices]
    else:
        predicted_classes = classifier.Predict(cell_data)
        res = object_keys[predicted_classes == classNum].tolist()
    return list(map(tuple, res))  # return object keys as tuples
def processData(data):
    '''Take the rows returned by a query and split them into an array of
    feature values and an array of object keys.'''
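    # Example (hypothetical values, assuming two classifier feature columns):
    #   data = [(1, 1, '0.5', None), (1, 2, '0.7', '0.1')]
    #   -> object_keys == [[1, 1], [1, 2]]; cell_data == [[0.5, 0.0], [0.7, 0.1]]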
col_names = db.GetColnamesForClassifier()
number_of_features = len(col_names)
# Old method of generating data arrays
# cell_data = []
# object_keys = []
# for row in data:
# cell_data.append(row[-number_of_features:])#last number_of_features columns in row
# object_keys.append(row[:-number_of_features])#all elements in row before last (number_of_features) elements
# cell_data = np.array(cell_data)
# object_keys = np.array(object_keys)
# New method, Mar 2021
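    # Split each row into (key columns, feature columns): the last
    # number_of_features columns are features, everything before them is key.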
object_keys, cell_data = np.split(np.array(data), [-number_of_features], axis=1)
object_keys = np.array(list(map(tuple, object_keys))).astype(int)
    # If the array is already floats, leave it alone; if it contains strings
    # or SQL NULLs, replace None with '0' and convert via strings.
    if not np.issubdtype(cell_data.dtype, np.floating):
        cell_data = np.where(cell_data == np.array(None), '0', cell_data).astype(str)
data_shape = cell_data.shape
        try:
            cell_data = np.apply_along_axis(pd.to_numeric, 1, cell_data, errors="coerce")
        except Exception as e:
            logging.info("Data conversion failed, trying slower method: %s", e)
            try:
                cell_data = np.reshape(np.genfromtxt(cell_data.ravel(), delimiter=','), data_shape)
            except Exception as e:
                logging.info("Fallback data conversion failed, will try proceeding anyway: %s", e)
    cell_data = np.nan_to_num(cell_data)
    logging.info('Any values that could not be converted to float were set to 0')
return cell_data, object_keys
def _objectify(p, field):
return "%s.%s"%(p.object_table, field)
def _where_clauses(p, dm, filter_name):
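    '''Return a list of SQL WHERE clauses that partition the image keys into
    batches of roughly equal size, so the object table can be scanned in
    manageable chunks.'''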
imkeys = dm.GetAllImageKeys(filter_name)
imkeys.sort()
stepsize = max(len(imkeys) // 100, 50)
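    # Take every stepsize-th image key (counting back from the last key) as a
    # batch boundary.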
key_thresholds = imkeys[-1:1:-stepsize]
key_thresholds.reverse()
if len(key_thresholds) == 0:
return ['(1 = 1)']
if p.table_id:
# split each table independently
def splitter():
yield "(%s = %d) AND (%s <= %d)"%(_objectify(p, p.table_id), key_thresholds[0][0],
_objectify(p, p.image_id), key_thresholds[0][1])
for lo, hi in zip(key_thresholds[:-1], key_thresholds[1:]):
if lo[0] == hi[0]:
# block within one table
yield "(%s = %d) AND (%s > %d) AND (%s <= %d)"%(_objectify(p, p.table_id), lo[0],
_objectify(p, p.image_id), lo[1],
_objectify(p, p.image_id), hi[1])
else:
# query spans a table boundary
yield "(%s >= %d) AND (%s > %d)"%(_objectify(p, p.table_id), lo[0],
_objectify(p, p.image_id), lo[1])
yield "(%s <= %d) AND (%s <= %d)"%(_objectify(p, p.table_id), hi[0],
_objectify(p, p.image_id), hi[1])
return list(splitter())
else:
return (["(%s <= %d)"%(_objectify(p, p.image_id), key_thresholds[0][0])] +
["(%s > %d) AND (%s <= %d)"
%(_objectify(p, p.image_id), lo[0], _objectify(p, p.image_id), hi[0])
for lo, hi in zip(key_thresholds[:-1], key_thresholds[1:])])
def PerImageCounts(classifier, num_classes, filter_name=None, cb=None):
    '''
    classifier: trained classifier object
    num_classes: number of classes the classifier predicts
    filter_name: name of a filter, or None
    cb: callback function to update with the fraction complete
    RETURNS: a list of lists of imKeys and the per-class object counts.
       Note that the imKeys are exploded, so each row has the form:
       [TableNumber, ImageNumber, Class1_ObjectCount, Class2_ObjectCount, ...]
       where TableNumber is only present if table_id is defined in Properties.
       If p.area_scoring_column is set, per-class area scores are appended
       after the object counts.
    '''
# I'm pretty sure this would be even faster if we were to run two
# or more parallel threads and split the work between them.
# For each image clause, classify the cells using the model
# then for each image key, count the number in each class (and maybe area)
def do_by_steps(tables, filter_name, area_score=False):
filter_clause = '1 = 1'
join_clause = ''
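        # When a filter is given, either join against a legacy filter
        # sub-query or fold the filter's SQL into the WHERE clause.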
if filter_name is not None:
filter = p._filters[filter_name]
if isinstance(filter, cpa.sqltools.OldFilter):
join_table = '(%s) as filter' % str(filter)
else:
if p.object_table in tables:
join_table = None
else:
join_table = p.object_table
filter_clause = str(filter)
if join_table:
join_clause = 'JOIN %s USING (%s)' % (join_table, ','.join(image_key_columns()))
wheres = _where_clauses(p, dm, filter_name)
num_clauses = len(wheres)
counts = {}
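        # counts maps (image number, predicted class) -> [object count], or
        # [object count, summed area] when area scoring is enabled.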
# iterate over where clauses to go through whole set
for idx, where_clause in enumerate(wheres):
if filter_clause is not None:
where_clause += ' AND ' + filter_clause
            if area_score:
                data = db.execute('SELECT %s, %s, %s FROM %s '
                                  '%s WHERE %s'
                                  %(UniqueImageClause(p.object_table),
                                    ",".join(db.GetColnamesForClassifier()),
                                    _objectify(p, p.area_scoring_column), tables,
                                    join_clause, where_clause),
                                  silent=(idx > 10))
                area_values = [row[-1] for row in data]  # separate the area column from the feature data
                data = [row[:-1] for row in data]
else:
data = db.execute('SELECT %s, %s FROM %s '
'%s WHERE %s'
%(UniqueObjectClause(p.object_table),
",".join(db.GetColnamesForClassifier()), tables,
join_clause, where_clause),
silent=(idx > 10))
cell_data, image_keys = processData(data)
predicted_classes = classifier.Predict(cell_data)
for i in range(0, len(predicted_classes)):
row_cls = tuple(np.append(image_keys[i][0], predicted_classes[i]))
oneCount = np.array([1])
                if area_score:
                    oneCount = np.append(oneCount, area_values[i])
if row_cls in counts:
counts[row_cls] += oneCount
else:
counts[row_cls] = oneCount
if cb:
cb(min(1, (idx + 1)/num_clauses)) #progress
return counts
counts = do_by_steps(p.object_table, filter_name, p.area_scoring_column)
def get_count(im_key, classnum):
return counts.get(im_key + (classnum, ), np.array([0]))[0]
def get_area(im_key, classnum):
return counts.get(im_key + (classnum, ), np.array([0, 0]))[1]
def get_results():
for imkey in dm.GetImageKeysAndObjectCounts(filter_name):
if p.area_scoring_column is None:
yield list(imkey[0]) + [get_count(imkey[0], cl) for cl in range(1, num_classes+1)]
else:
yield list(imkey[0]) + [get_count(imkey[0], cl) for cl in range(1, num_classes+1)] + [get_area(imkey[0], cl) for cl in range(1, num_classes+1)]
return list(get_results())
if __name__ == "__main__":
from .trainingset import TrainingSet
from io import StringIO
from . import generalclassifier
from .datatable import DataGrid
import wx
p = Properties()
db = DBConnect()
dm = DataModel()
props = '/vagrant/az-dnaonly.properties'
ts = '/vagrant/Anne_DNA_66.txt'
nRules = 5
filter = None
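    # Demo: wrap a scikit-learn AdaBoostClassifier in CPA's GeneralClassifier.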
classifier = AdaBoostClassifier(n_estimators=nRules)
GC = generalclassifier.GeneralClassifier(classifier)
p.LoadFile(props)
trainingSet = TrainingSet(p)
trainingSet.Load(ts)
    print(trainingSet.label_matrix.shape)
    print(trainingSet.labels)
    print(len(trainingSet.colnames))
    print(trainingSet.values.shape)
    output = StringIO()
    print(f'Training classifier with {nRules} rules...')
    labels = np.nonzero(trainingSet.label_matrix + 1)[1] + 1  # 1-based classes
    print(len(labels))
GC.Train(labels,trainingSet.values)
num_classes = trainingSet.label_matrix.shape[1]
    '''
    table = PerImageCounts(GC.classifier, num_classes, filter_name=filter)
    table.sort()
    labels = ['table', 'image'] + list(trainingSet.labels) + list(trainingSet.labels)
    for row in table:
        print(row)
    '''
    #obkey_list = FilterObjectsFromClassN(2, GC, filterKeys=None)
    #for row in obkey_list:
    #    print(row)
#object_scores()
p.class_table = 'testmulticlassql'
    # create_perobject_class_table expects string class names and an updater
    # callback; stringify the names and pass a no-op updater here.
    create_perobject_class_table(GC, [str(c) for c in range(num_classes)],
                                 lambda frac, msg=None: None)
#_objectify()