/
processDocuments.py
35 lines (26 loc) · 1.28 KB
/
processDocuments.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
import sys
from ThreadedDocumentProcessor import ThreadedDocumentProcessor
class DocumentProcessor(ThreadedDocumentProcessor):
    '''
    ThreadedDocumentProcessor specialisation that supplies the per-document
    callback (`processDocument`) used for every document matched by the query.
    '''

    def __init__(self, collection, number_of_threads, query):
        '''
        Forward the collection, thread count and query to the base class,
        together with the function to apply to each document.
        '''
        # The callback is looked up on the class, so the base class receives
        # the plain function rather than a method bound to this instance.
        per_document_callback = DocumentProcessor.processDocument
        super().__init__(collection, number_of_threads, query, per_document_callback)

    def processDocument(self, document):
        '''
        Per-document hook: invoked by each thread for each document.
        Any value returned here must be a dictionary; that dictionary is
        written in JSON format to the output.json file.
        Edit this function to do whatever work is needed on each document.

        As written, it returns a one-entry dictionary mapping the document's
        `properties.NAME` value to its `properties.STUSPS` value.
        '''
        properties = document['properties']
        # NAME becomes the key and STUSPS the value
        # (presumably a state name and its postal code - confirm against the data).
        return {properties['NAME']: properties['STUSPS']}
def main(collection, number_of_threads):
    '''
    Build a DocumentProcessor for `collection` with `number_of_threads`
    threads and an empty query, then run it.
    '''
    query = {}  # Update the `query` field to specify a mongo query
    DocumentProcessor(collection, number_of_threads, query).run()
if __name__ == '__main__':
    # Expected command line: <script> <collection> <number_of_threads>
    usage_error = 'Invalid args. Check the `README.md` file for program usage'

    if len(sys.argv) != 3:
        print(usage_error)
        # A bad invocation must not report success to the calling shell.
        sys.exit(1)

    collection = sys.argv[1]
    try:
        number_of_threads = int(sys.argv[2])
    except ValueError:
        # A non-numeric thread count is a usage error, not a crash: show the
        # same message instead of an uncaught traceback.
        print(usage_error)
        sys.exit(1)

    main(collection, number_of_threads)