import warnings

import numpy as np
import oyaml as yaml
import pandas as pd
from sklearn.metrics import auc, confusion_matrix


def confusion_matrix_fine(
        Y_true, Y_pred, is_true_incomplete, is_pred_incomplete):
"""
Counts overall numbers of true positives (TP), false positives (FP),
and false negatives (FN) in the predictions of a system, for a number K
of fine-level classes within a given coarse category, in a dataset of N
different samples. In addition to the K so-called "complete" tags (i.e.
with a determinate fine-level category as well as a determinate
coarse-level category), we consider the potential presence of an "incomplete"
tag, i.e. denoting the presence of a class with a determinate coarse-level
    category yet no determinate fine-level category. This incomplete tag
    may be present in either the prediction or the ground truth.
Our method for evaluating a multilabel classifier on potentially incomplete
knowledge of the ground truth consists of two parts, which are ultimately
aggregated into a single count.
    For the samples with complete knowledge of the ground truth (Part I in the
code below), we simply apply classwise Boolean logic to compute TP, FP, and
FN independently for every fine-level tag, and finally aggregate across
all tags.
However, for the samples with incomplete knowledge of the ground truth
    (Part II in the code below), we perform a "coarsening" of the prediction by
    applying a disjunction over the fine-level complete tags as well as the
coarse incomplete tag. If that coarsened prediction is positive, the sample
produces a true positive; otherwise, it produces a false negative.
    Samples which contain the incomplete tag in the prediction but not in the
    ground truth fall under both Parts I and II. In this case, we sum the zero,
    one, or multiple false alarm(s) from Part I with the single false alarm
    from Part II to produce the final number of false positives FP.
Parameters
----------
Y_true: array of bool, shape = [n_samples, n_classes]
One-hot encoding of true presence for complete fine tags.
Y_true[n, k] is equal to 1 if the class k is truly present in sample n,
and equal to 0 otherwise.
    Y_pred: array of bool, shape = [n_samples, n_classes]
        One-hot encoding of predicted presence for complete fine tags.
        Y_pred[n, k] is equal to 1 if the class k is predicted to be present
        in sample n, and equal to 0 otherwise.
    is_true_incomplete: array of bool, shape = [n_samples]
        One-hot encoding of true presence for the incomplete fine tag.
        is_true_incomplete[n] is equal to 1 if sample n truly contains an
        item that belongs to the coarse category at hand, but whose
        fine-level tag is truly uncertain, or truly unlike any of the K
        available fine tags.
    is_pred_incomplete: array of bool, shape = [n_samples]
        One-hot encoding of predicted presence for the incomplete fine tag.
        is_pred_incomplete[n] is equal to 1 if the system predicts the
        existence of an item that belongs to the coarse category at hand,
        yet whose fine-level tag is uncertain or unlike any of the K
        available fine tags.
Returns
-------
TP: int
Number of true positives.
FP: int
Number of false positives.
FN: int
Number of false negatives.
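
    Examples
    --------
    A small illustrative example (not from the original test suite).
    Sample 0 has a complete ground truth with tag 0 present, and the
    system predicts tags 0 and 1: one TP and one FP. Sample 1 has an
    incomplete ground truth, and the system predicts tag 1, so its
    coarsened prediction counts as one TP.

    >>> Y_true = np.array([[1, 0], [0, 0]], dtype=bool)
    >>> Y_pred = np.array([[1, 1], [0, 1]], dtype=bool)
    >>> is_true_inc = np.array([0, 1], dtype=bool)
    >>> is_pred_inc = np.array([0, 0], dtype=bool)
    >>> TP, FP, FN = confusion_matrix_fine(
    ...     Y_true, Y_pred, is_true_inc, is_pred_inc)
    >>> print(int(TP), int(FP), int(FN))
    2 1 0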
"""
## PART I. SAMPLES WITH COMPLETE GROUND TRUTH AND COMPLETE PREDICTION
# Negate the true_incomplete Boolean and replicate it K times, where
# K is the number of fine tags.
# For each sample and fine tag, this mask is equal to 0 if the
# ground truth contains the incomplete fine tag and 1 if the ground
# truth does not contain the incomplete fine tag.
# The result is a (N, K) matrix.
Y_true_complete = np.tile(np.logical_not(
is_true_incomplete)[:, np.newaxis], (1, Y_pred.shape[1]))
# Compute true positives for samples with complete ground truth.
# For each sample n and each complete tag k, is_TP_complete is equal to 1
# if and only if the following two conditions are met:
# (i) the ground truth of sample n contains complete fine tag k
# (ii) the prediction of sample n contains complete fine tag k
# The result is a (N, K) matrix.
is_TP_complete = np.logical_and.reduce((Y_true, Y_pred))
# Compute false positives for samples with complete ground truth.
# For each sample n and each complete tag k, is_FP_complete is equal to 1
# if and only if the following three conditions are met:
# (i) the ground truth of sample n is complete
# (ii) the ground truth of sample n does not contain complete fine tag k
# (iii) the prediction of sample n contains complete fine tag k
# The result is a (N, K) matrix.
is_FP_complete = np.logical_and.reduce(
(np.logical_not(Y_true), Y_pred, Y_true_complete))
# Compute false negatives for samples with complete ground truth.
# For each sample n and each complete tag k, is_FN_complete is equal to 1
# if and only if the following two conditions are met:
# (i) the ground truth of sample n contains complete fine tag k
# (ii) the prediction of sample n does not contain complete fine tag k
# The result is a (N, K) matrix.
is_FN_complete = np.logical_and(Y_true, np.logical_not(Y_pred))
## PART II. SAMPLES WITH INCOMPLETE GROUND TRUTH OR INCOMPLETE PREDICTION.
# Compute a vector of "coarsened prediction".
# For each sample, the coarsened prediction is equal to 1 if any of the
# complete fine tags is predicted as present, or if the incomplete fine
# tag is predicted as present. Conversely, it is set equal to 0 if all
    # of the complete fine tags are predicted as absent, and the incomplete
    # fine tag is predicted as absent.
# The result is a (N,) vector.
y_pred_coarsened_without_incomplete = np.logical_or.reduce(Y_pred, axis=1)
y_pred_coarsened = np.logical_or(
y_pred_coarsened_without_incomplete, is_pred_incomplete)
# Compute a vector of "coarsened ground truth".
# For each sample, the coarsened ground truth is equal to 1 if none of the
# complete fine tags are truly present, and if the incomplete fine tag is
# truly present. Conversely, it is set equal to 0 if any of the complete
# fine tags is truly present, or if the incomplete fine tag is truly absent.
# The result is a (N,) vector.
y_true_coarsened_without_incomplete =\
np.logical_and.reduce(np.logical_not(Y_true), axis=1)
y_true_coarsened = np.logical_and(
y_true_coarsened_without_incomplete, is_true_incomplete)
# Compute true positives for samples with incomplete ground truth.
# For each sample n, is_TP_incomplete is equal to 1
# if and only if the following three conditions are met:
# (i) the ground truth contains the incomplete fine tag
# (ii) the coarsened prediction of sample n contains at least one tag
# (iii) none of the predicted complete tags match a true complete tag
# The result is a (N,) vector.
is_TP_incomplete = np.logical_and.reduce((
is_true_incomplete,
y_pred_coarsened,
np.logical_and.reduce(np.logical_not(is_TP_complete), axis=1)))
# Compute false positives for samples with incomplete ground truth.
# For each sample n, is_FP_incomplete is equal to 1
    # if and only if the following four conditions are met:
# (i) the ground truth does not contain the incomplete fine tag
# (ii) no complete fine tags are in the ground truth
# (iii) the prediction contains the incomplete fine tag
# (iv) not all complete fine tags are in the prediction
# The result is a (N,) vector.
is_FP_incomplete = np.logical_and.reduce((
np.logical_not(is_true_incomplete),
np.logical_not(np.logical_or.reduce(Y_true, axis=1)),
is_pred_incomplete,
np.logical_not(np.logical_and.reduce(Y_pred, axis=1))))
# Compute false negatives for samples with incomplete ground truth.
# For each sample n, is_FN_incomplete is equal to 1
# if and only if the following two conditions are met:
    # (i) the coarsened ground truth of sample n is positive, i.e. the
    #     incomplete fine tag is truly present and no complete fine tag
    #     is truly present
    # (ii) the coarsened prediction of sample n does not contain any tag
# The result is a (N,) vector.
is_FN_incomplete = np.logical_and(
y_true_coarsened, np.logical_not(y_pred_coarsened))
## PART III. AGGREGATE EVALUATION OF ALL SAMPLES
# The following three sums are performed over NxK Booleans,
# implicitly converted as integers 0 (False) and 1 (True).
TP_complete = np.sum(is_TP_complete)
FP_complete = np.sum(is_FP_complete)
FN_complete = np.sum(is_FN_complete)
# The following three sums are performed over N Booleans,
# implicitly converted as integers 0 (False) and 1 (True).
TP_incomplete = np.sum(is_TP_incomplete)
FP_incomplete = np.sum(is_FP_incomplete)
FN_incomplete = np.sum(is_FN_incomplete)
# Sum FP, TP, and FN for samples that have complete ground truth
# with FP, TP, and FN for samples that have incomplete ground truth.
TP = TP_complete + TP_incomplete
FP = FP_complete + FP_incomplete
FN = FN_complete + FN_incomplete
return TP, FP, FN


def confusion_matrix_coarse(y_true, y_pred):
"""
Counts overall numbers of true positives (TP), false positives (FP),
and false negatives (FN) in the predictions of a system, for a single
Boolean attribute, in a dataset of N different samples.
Parameters
----------
y_true: array of bool, shape = [n_samples,]
One-hot encoding of true presence for a given coarse tag.
y_true[n] is equal to 1 if the tag is present in the sample.
y_pred: array of bool, shape = [n_samples,]
One-hot encoding of predicted presence for a given coarse tag.
y_pred[n] is equal to 1 if the tag is present in the sample.
Returns
-------
TP: int
Number of true positives.
FP: int
Number of false positives.
FN: int
Number of false negatives.
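
    Examples
    --------
    A small illustrative example (not from the original test suite):

    >>> y_true = np.array([0, 1, 1, 0])
    >>> y_pred = np.array([1, 1, 0, 0])
    >>> TP, FP, FN = confusion_matrix_coarse(y_true, y_pred)
    >>> print(int(TP), int(FP), int(FN))
    1 1 1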
"""
    # Pass labels explicitly so that the matrix is always 2x2, even when
    # only one class occurs; ravel the inputs in case 2-D column vectors
    # are passed in.
    cm = confusion_matrix(np.ravel(y_true), np.ravel(y_pred), labels=[0, 1])
    FP = cm[0, 1]
    FN = cm[1, 0]
    TP = cm[1, 1]
return TP, FP, FN


def evaluate(prediction_path, annotation_path, yaml_path, mode):
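    """
    Evaluate system predictions against a ground truth table, sweeping over
    all observed confidence thresholds, independently for every coarse
    category of the taxonomy.

    Returns a dictionary which maps each coarse ID to a DataFrame with one
    row per threshold and columns for TP, FP, FN, precision ("P"), recall
    ("R"), and F1-score ("F").

    Illustrative usage (the file paths are hypothetical; the expected CSV
    and YAML formats are documented in the parse_* functions below):

        df_dict = evaluate("predictions.csv", "annotations.csv",
                           "taxonomy.yaml", mode="fine")
    """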
# Set minimum threshold.
min_threshold = 0.01
# Create dictionary to parse tags
with open(yaml_path, 'r') as stream:
yaml_dict = yaml.load(stream, Loader=yaml.Loader)
# Parse ground truth.
gt_df = parse_ground_truth(annotation_path, yaml_path)
# Parse predictions.
    if mode == "fine":
        pred_df = parse_fine_prediction(prediction_path, yaml_path)
    elif mode == "coarse":
        pred_df = parse_coarse_prediction(prediction_path, yaml_path)
    else:
        raise ValueError("mode must be either 'fine' or 'coarse'.")
# Check consistency between ground truth and predictions.
# Make sure the files evaluated in both tables match.
pred_audio_set = set(pred_df['audio_filename'].tolist())
true_audio_set = set(gt_df['audio_filename'].tolist())
if not (pred_audio_set == true_audio_set):
extra_files = pred_audio_set - true_audio_set
missing_files = true_audio_set - pred_audio_set
err_msg =\
"File mismatch between ground truth and prediction table.\n\n" \
"Missing files: {}\n\n Extra files: {}"
raise ValueError(err_msg.format(list(missing_files), list(extra_files)))
# Make sure the size of the tables match
if not (len(gt_df) == len(pred_df)):
err_msg =\
"Size mismatch between ground truth ({} files) " \
"and prediction table ({} files)."
raise ValueError(err_msg.format(len(gt_df), len(pred_df)))
# Initialize dictionary of DataFrames.
df_dict = {}
# Loop over coarse categories.
for coarse_id in yaml_dict["coarse"]:
# List columns corresponding to that category
if mode == "coarse":
columns = [str(coarse_id)]
else:
columns = [column for column in pred_df.columns
if (str(column).startswith(str(coarse_id))) and
("-" in str(column)) and
(not str(column).endswith("X"))]
# Sort columns in alphanumeric order.
columns.sort()
# Restrict prediction to columns of interest.
restricted_pred_df = pred_df[columns]
# Restrict ground truth to columns of interest.
restricted_gt_df = gt_df[columns]
# Aggregate all prediction values into a "raveled" vector.
        # We make an explicit numpy copy, so that the original DataFrame
        # is left unchanged.
thresholds = np.ravel(np.copy(restricted_pred_df.values))
# Sort in place.
thresholds.sort()
# Skip very low values.
# This is to speed up the computation of the precision-recall curve
# in the low-precision regime.
thresholds = thresholds[np.searchsorted(thresholds, min_threshold):]
# Append a 1 to the list of thresholds.
        # This will cause TP and FP to drop to zero, but FN will be nonzero.
        # This is useful for estimating the low-recall regime, and it
        # facilitates micro-averaged AUPRC because it provides an upper bound
# on valid thresholds across coarse categories.
thresholds = np.append(thresholds, 1.0)
# List thresholds by restricting observed confidences to unique elements.
thresholds = np.unique(thresholds)[::-1]
# Count number of thresholds.
n_thresholds = len(thresholds)
TPs = np.zeros((n_thresholds,)).astype('int')
FPs = np.zeros((n_thresholds,)).astype('int')
FNs = np.zeros((n_thresholds,)).astype('int')
# FINE MODE.
if mode == "fine":
incomplete_tag = str(coarse_id) + "-X"
# Load ground truth as numpy array.
Y_true = restricted_gt_df.values
is_true_incomplete = gt_df[incomplete_tag].values
# Loop over thresholds in a decreasing order.
for i, threshold in enumerate(thresholds):
# Threshold prediction for complete tag.
Y_pred = restricted_pred_df.values >= threshold
# Threshold prediction for incomplete tag.
is_pred_incomplete =\
pred_df[incomplete_tag].values >= threshold
# Evaluate.
TPs[i], FPs[i], FNs[i] = confusion_matrix_fine(
Y_true, Y_pred, is_true_incomplete, is_pred_incomplete)
# COARSE MODE.
elif mode == "coarse":
# Load ground truth as numpy array.
Y_true = restricted_gt_df.values
# Loop over thresholds in a decreasing order.
for i, threshold in enumerate(thresholds):
# Threshold prediction.
Y_pred = restricted_pred_df.values >= threshold
# Evaluate.
TPs[i], FPs[i], FNs[i] = confusion_matrix_coarse(Y_true, Y_pred)
# Build DataFrame from columns.
eval_df = pd.DataFrame({
"threshold": thresholds, "TP": TPs, "FP": FPs, "FN": FNs})
# Add columns for precision, recall, and F1-score.
# NB: we take the maximum between TPs+FPs and mu=0.5 in the
# denominator in order to avoid division by zero.
# This only ever happens if TP+FP < 1, which
# implies TP = 0 (because TP and FP are nonnegative integers),
# and therefore a numerator of exactly zero. Therefore, any additive
# offset mu would do as long as 0 < mu < 1. Choosing mu = 0.5 is
# purely arbitrary and has no effect on the outcome (i.e. zero).
mu = 0.5
eval_df["P"] = TPs / np.maximum(TPs + FPs, mu)
# Likewise for recalls, although this numerical safeguard is probably
# less necessary given that TP+FN=0 implies that there are zero
        # positives in the ground truth, which is unlikely but not unheard of.
eval_df["R"] = TPs / np.maximum(TPs + FNs, mu)
# Compute F1-scores.
# NB: we use the harmonic mean formula (2/F = 1/P + 1/R) rather than
        # the more common F = (2*P*R)/(P+R) in order to circumvent the edge case
# where both P and R are equal to 0 (i.e. TP = 0).
eval_df["F"] = 2 / (1/eval_df["P"] + 1/eval_df["R"])
# Store DataFrame in the dictionary.
df_dict[coarse_id] = eval_df
# Return dictionary.
return df_dict


def micro_averaged_auprc(df_dict, return_df=False):
"""
Compute micro-averaged area under the precision-recall curve (AUPRC)
from a dictionary of class-wise DataFrames obtained via `evaluate`.
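
    Illustrative usage, assuming `df_dict` was produced by `evaluate`:

        micro_auprc = micro_averaged_auprc(df_dict)
        micro_auprc, pr_df = micro_averaged_auprc(df_dict, return_df=True)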
"""
# List all unique values of thresholds across coarse categories.
thresholds = np.unique(
np.hstack([x["threshold"] for x in df_dict.values()]))
# Count number of unique thresholds.
n_thresholds = len(thresholds)
# Initialize arrays for TP, FP, and FN
TPs = np.zeros((n_thresholds,)).astype('int')
FPs = np.zeros((n_thresholds,)).astype('int')
FNs = np.zeros((n_thresholds,)).astype('int')
# Loop over thresholds.
for i, threshold in enumerate(thresholds):
# Initialize counters of TP, FP, and FN across all categories.
global_TP, global_FP, global_FN = 0, 0, 0
# Loop over coarse categories.
for coarse_id in df_dict.keys():
            # Find the last row whose threshold is still >= the current
            # threshold (rows are sorted by decreasing threshold).
coarse_df = df_dict[coarse_id]
coarse_thresholds = coarse_df["threshold"]
row = coarse_df[coarse_thresholds>=threshold].iloc[-1]
# Increment TP, FP, and FN.
global_TP += row["TP"]
global_FP += row["FP"]
global_FN += row["FN"]
# Store micro-averaged values of TP, FP, and FN for the given threshold.
TPs[i] = global_TP
FPs[i] = global_FP
FNs[i] = global_FN
# Build DataFrame from columns.
eval_df = pd.DataFrame({
"threshold": thresholds, "TP": TPs, "FP": FPs, "FN": FNs})
# Add columns for precision, recall, and F1-score.
# NB: we take the maximum between TPs+FPs and mu = 0.5 in the
# denominator in order to avoid division by zero.
# This only ever happens if TP+FP < 1, which
# implies TP = 0 (because TP and FP are nonnegative integers),
# and therefore a numerator of exactly zero. Therefore, any additive
# offset mu would do as long as 0 < mu < 1. Choosing mu = 0.5 is
# purely arbitrary and has no effect on the outcome (i.e. zero).
mu = 0.5
eval_df["P"] = TPs / np.maximum(TPs + FPs, mu)
# Likewise for recalls, although this numerical safeguard is probably
# less necessary given that TP+FN=0 implies that there are zero
    # positives in the ground truth, which is unlikely but not unheard of.
eval_df["R"] = TPs / np.maximum(TPs + FNs, mu)
# Sort PR curve by ascending recall.
sorting_indices = np.argsort(list(eval_df["R"]))
recalls = np.array([0.0] + list(eval_df["R"][sorting_indices]) + [1.0])
precisions = np.array([1.0] + list(eval_df["P"][sorting_indices]) + [0.0])
auprc = auc(recalls, precisions)
# If the DataFrame containing the full P-R curve is requested.
if return_df:
# Compute F1-scores.
# NB: we use the harmonic mean formula (2/F = 1/P + 1/R) rather than
        # the more common F = (2*P*R)/(P+R) in order to circumvent the edge case
# where both P and R are equal to 0 (i.e. TP = 0).
eval_df["F"] = 2 / (1/eval_df["P"] + 1/eval_df["R"])
# Return
return auprc, eval_df
else:
# Otherwise, return only the AUPRC as a scalar.
return auprc


def macro_averaged_auprc(df_dict, return_classwise=False):
"""
Compute macro-averaged area under the precision-recall curve (AUPRC)
    from a dictionary of class-wise DataFrames obtained via `evaluate`.
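
    Illustrative usage, assuming `df_dict` was produced by `evaluate`:

        macro_auprc = macro_averaged_auprc(df_dict)
        macro_auprc, class_auprc = macro_averaged_auprc(
            df_dict, return_classwise=True)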
"""
# Initialize list of category-wise AUPRCs.
auprcs = []
coarse_id_list = df_dict.keys()
# Loop over coarse categories.
for coarse_id in coarse_id_list:
# Load precisions and recalls.
# NB: we prepend a (1,0) and append a (0,1) to the curve so that the
        # curve reaches the top-left and bottom-right corners of the
        # precision-recall square.
sorting_indices = df_dict[coarse_id]["R"].argsort()
recalls = np.array(
[0.0] + list(df_dict[coarse_id]["R"][sorting_indices]) + [1.0])
precisions = np.array(
[1.0] + list(df_dict[coarse_id]["P"][sorting_indices]) + [0.0])
auprcs.append(auc(recalls, precisions))
# Average AUPRCs across coarse categories with uniform weighting.
mean_auprc = np.mean(auprcs)
if return_classwise:
class_auprc = {coarse_id: auprc
for coarse_id, auprc in zip(coarse_id_list, auprcs)}
return mean_auprc, class_auprc
else:
return mean_auprc


def parse_coarse_prediction(pred_csv_path, yaml_path):
"""
Parse coarse-level predictions from a CSV file containing both fine-level
and coarse-level predictions (and possibly additional metadata).
Returns a Pandas DataFrame in which the column names are coarse
IDs of the form 1, 2, 3 etc.
Parameters
----------
pred_csv_path: string
Path to the CSV file containing predictions.
yaml_path: string
Path to the YAML file containing coarse taxonomy.
Returns
-------
pred_coarse_df: DataFrame
Coarse-level complete predictions.
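
    Examples
    --------
    Illustrative sketch of the expected CSV layout (hypothetical names and
    values). Prediction columns follow the "<coarse ID>_<coarse name>"
    convention of the taxonomy file:

        audio_filename,1_engine,2_machinery-impact
        00_000066.wav,0.83,0.05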
"""
# Create dictionary to parse tags
with open(yaml_path, 'r') as stream:
yaml_dict = yaml.load(stream, Loader=yaml.Loader)
    # Collect coarse tag names as strings and map them to coarse IDs.
rev_coarse_dict = {"_".join([str(k), yaml_dict["coarse"][k]]): k
for k in yaml_dict["coarse"]}
# Read comma-separated values with the Pandas library
pred_df = pd.read_csv(pred_csv_path)
# Assign a predicted column to each coarse key, by using the tag as an
# intermediate hashing step.
pred_coarse_dict = {}
for c in rev_coarse_dict:
if c in pred_df:
pred_coarse_dict[str(rev_coarse_dict[c])] = pred_df[c]
else:
pred_coarse_dict[str(rev_coarse_dict[c])] = np.zeros((len(pred_df),))
warnings.warn("Column not found: " + c)
# Copy over the audio filename strings corresponding to each sample.
pred_coarse_dict["audio_filename"] = pred_df["audio_filename"]
# Build a new Pandas DataFrame with coarse keys as column names.
pred_coarse_df = pd.DataFrame.from_dict(pred_coarse_dict)
# Return output in DataFrame format.
# The column names are of the form 1, 2, 3, etc.
return pred_coarse_df.sort_values('audio_filename')


def parse_fine_prediction(pred_csv_path, yaml_path):
"""
Parse fine-level predictions from a CSV file containing both fine-level
and coarse-level predictions (and possibly additional metadata).
Returns a Pandas DataFrame in which the column names are mixed (coarse-fine)
IDs of the form 1-1, 1-2, 1-3, ..., 1-X, 2-1, 2-2, 2-3, ... 2-X, 3-1, etc.
Parameters
----------
pred_csv_path: string
Path to the CSV file containing predictions.
yaml_path: string
Path to the YAML file containing fine taxonomy.
Returns
-------
pred_fine_df: DataFrame
Fine-level complete predictions.
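
    Examples
    --------
    Illustrative sketch of the expected taxonomy YAML (hypothetical tag
    names; "X" denotes the incomplete fine tag of a coarse category):

        coarse:
          1: engine
        fine:
          1:
            1: small-sounding-engine
            2: medium-sounding-engine
            X: engine-of-uncertain-size

    The prediction CSV is then expected to contain an "audio_filename"
    column along with confidence columns such as
    "1-1_small-sounding-engine" and "1-X_engine-of-uncertain-size".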
"""
# Create dictionary to parse tags
with open(yaml_path, 'r') as stream:
yaml_dict = yaml.load(stream, Loader=yaml.Loader)
# Collect tag names as strings and map them to mixed (coarse-fine) ID pairs.
# The "mixed key" is a hyphenation of the coarse ID and fine ID.
fine_dict = {}
for coarse_id in yaml_dict["fine"]:
for fine_id in yaml_dict["fine"][coarse_id]:
mixed_key = "-".join([str(coarse_id), str(fine_id)])
fine_dict[mixed_key] = "_".join([
mixed_key, yaml_dict["fine"][coarse_id][fine_id]])
# Invert the key-value relationship between mixed key and tag.
# Now, tags are the keys, and mixed keys (coarse-fine IDs) are the values.
# This is possible because tags are unique.
rev_fine_dict = {fine_dict[k]: k for k in fine_dict}
# Read comma-separated values with the Pandas library
pred_df = pd.read_csv(pred_csv_path)
# Assign a predicted column to each mixed key, by using the tag as an
# intermediate hashing step.
pred_fine_dict = {}
for f in sorted(rev_fine_dict.keys()):
if f in pred_df:
pred_fine_dict[rev_fine_dict[f]] = pred_df[f]
else:
pred_fine_dict[rev_fine_dict[f]] = np.zeros((len(pred_df),))
warnings.warn("Column not found: " + f)
# Loop over coarse tags.
n_samples = len(pred_df)
coarse_dict = yaml_dict["coarse"]
for coarse_id in yaml_dict["coarse"]:
# Construct incomplete fine tag by appending -X to the coarse tag.
incomplete_tag = str(coarse_id) + "-X"
        # If the incomplete tag is not in the taxonomy, append a column of
        # zeros. This is the case e.g. for the coarse category "dog", which
        # has a single fine-level tag ("dog-barking-whining") and thus no
        # incomplete fine tag.
if incomplete_tag not in fine_dict.keys():
pred_fine_dict[incomplete_tag] =\
np.zeros((n_samples,)).astype('int')
# Copy over the audio filename strings corresponding to each sample.
pred_fine_dict["audio_filename"] = pred_df["audio_filename"]
# Build a new Pandas DataFrame with mixed keys as column names.
pred_fine_df = pd.DataFrame.from_dict(pred_fine_dict)
# Return output in DataFrame format.
# Column names are 1-1, 1-2, 1-3 ... 1-X, 2-1, 2-2, 2-3 ... 2-X, 3-1, etc.
return pred_fine_df.sort_values('audio_filename')


def parse_ground_truth(annotation_path, yaml_path):
"""
    Parse ground truth annotations from a CSV file containing both fine-level
    and coarse-level labels (and possibly additional metadata).
    Returns a Pandas DataFrame in which the column names are coarse IDs of
    the form 1, 2, 3, etc., as well as mixed (coarse-fine) IDs of the form
    1-1, 1-2, ..., 1-X, 2-1, etc.
Parameters
----------
annotation_path: string
        Path to the CSV file containing annotations.
yaml_path: string
Path to the YAML file containing coarse taxonomy.
Returns
-------
gt_df: DataFrame
Ground truth.
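
    Examples
    --------
    Illustrative sketch of the expected annotation CSV (hypothetical names
    and values). Besides "audio_filename", "annotator_id", and "split",
    presence columns follow the "<ID>_<tag name>_presence" convention:

        audio_filename,annotator_id,split,1_engine_presence,1-1_small-sounding-engine_presence
        00_000066.wav,0,validate,1,0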
"""
# Create dictionary to parse tags
with open(yaml_path, 'r') as stream:
yaml_dict = yaml.load(stream, Loader=yaml.Loader)
# Load CSV file into a Pandas DataFrame.
ann_df = pd.read_csv(annotation_path)
# Restrict to ground truth ("annotator zero").
gt_df = ann_df[
(ann_df["annotator_id"]==0) & (ann_df["split"]=="validate")]
# Rename coarse columns.
coarse_dict = yaml_dict["coarse"]
coarse_renaming = {
"_".join([str(c), coarse_dict[c], "presence"]): str(c)
for c in coarse_dict}
gt_df = gt_df.rename(columns=coarse_renaming)
# Collect tag names as strings and map them to mixed (coarse-fine) ID pairs.
# The "mixed key" is a hyphenation of the coarse ID and fine ID.
fine_dict = {}
for coarse_id in yaml_dict["fine"]:
for fine_id in yaml_dict["fine"][coarse_id]:
mixed_key = "-".join([str(coarse_id), str(fine_id)])
fine_dict[mixed_key] = yaml_dict["fine"][coarse_id][fine_id]
# Rename fine columns.
fine_renaming = {"_".join([k, fine_dict[k], "presence"]): k
for k in fine_dict}
gt_df = gt_df.rename(columns=fine_renaming)
# Loop over coarse tags.
n_samples = len(gt_df)
coarse_dict = yaml_dict["coarse"]
for coarse_id in yaml_dict["coarse"]:
# Construct incomplete fine tag by appending -X to the coarse tag.
incomplete_tag = str(coarse_id) + "-X"
        # If the incomplete tag is not in the ground truth table, append a
        # column of zeros. This is the case e.g. for the coarse category
        # "dog", which has a single fine-level tag ("dog-barking-whining")
        # and thus no incomplete fine tag.
if incomplete_tag not in gt_df.columns:
gt_df[incomplete_tag] = np.zeros((n_samples,)).astype('int')
# Return output in DataFrame format.
return gt_df.sort_values('audio_filename')