/
prepare_data.py
102 lines (85 loc) · 3.33 KB
/
prepare_data.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
"""
G & T - embeddinG&Training data, a package for crowdsourcing image labels
Copyright (C) 2020 Popsa.
Author: Łukasz Kopeć
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see <https://www.gnu.org/licenses/>.
"""
# Create manifest file (used by Ground Truth to know which images to display) -
# see README for more details.
import json
import os
def prepare_manifest(filename_list, positive_examples, negative_examples,
event_names, other_category_label='Other', group_size=20):
"""
Prepare a manifest from list of filenames. Chunk into groups with roughly
`group_size` members.
Parameters
----------
filename_list: Iterable
List of S3 URIs with files to be labelled
positive_examples: list(str)
List of S3 URIs with positive examples of categories
negative_examples: list(str)
List of S3 URIs with negative examples of categories
event_names: list(str)
List of labels for different event types that a labeller will choose
from
other_category_label: str
The title for the "other" category (usually 'Other' or
'None of the above')
group_size: int
How many images to be labelled at once
Returns
-------
manifest: list(dict)
The manifest file containing images to label and the categories, ready
to be written by `write_manifest_file`
"""
# Number of tasks to create
task_count = ((len(filename_list) // group_size) +
(1 if len(filename_list) % group_size > 0 else 0))
manifest = [{'source': []} for _ in range(task_count)]
for idx, record in enumerate(filename_list):
group = manifest[idx % task_count]
group['source'].append({
'source-ref': record,
'index': idx,
})
# because ground truth won't accept a JSON value for source or source-ref
for record in manifest:
additional_data = {
'event_names': event_names,
'positive_examples': positive_examples,
'negative_examples': negative_examples,
'other_category_label': other_category_label,
}
additional_data.update(record['source'][0])
record['source'][0] = additional_data
record['source'] = json.dumps(record['source'])
return manifest
def write_manifest_file(manifest, fname, output_dir='../data/processed/'):
"""
Write out the manifest file to a given filename
Parameters
----------
manifest: Iterable of dict
The contents of the manifest
fname: str
Filename
output_dir: str, default: '../data/processed/'
The output directory for writing the manifest file
Returns
-------
None
"""
with open(os.path.join(output_dir, fname), 'w+') as f:
f.writelines([json.dumps(x) + '\n' for x in manifest])