This repository has been archived by the owner on Dec 8, 2017. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 6
/
state.py
executable file
·89 lines (71 loc) · 2.28 KB
/
state.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
#!/usr/bin/env python
###
#
# Expects data from 18f/foia's state scraper at a data/state dir.
#
# options:
# limit: cut off after X documents
#
# localhost:9200 by default
# uses index 'foia', to load 'documents' mapping run
# curl -XPUT "http://localhost:9200/foia/_mapping/documents" -d "@config/mappings/documents.json"
#
###
# system dependencies
import utils  # project-local helper; provides options() parsing for the CLI entry point
import glob
import os
import json
# third party dependencies
from elasticsearch import Elasticsearch
es = Elasticsearch()  # default client -> localhost:9200 (see header note above)
index = 'foia'  # Elasticsearch index name targeted by run()
mapping = 'documents'  # doc type / mapping name (load config/mappings/documents.json first)
def run(options):
    """Load scraped State Dept. FOIA documents from data/state into Elasticsearch.

    options (dict):
      limit: stop after this many documents
      start: 1-indexed position in the sorted document list to begin at

    Prints progress per document and a summary; refreshes the index when done.
    """
    limit = options.get("limit")
    start = options.get("start")
    i = 0
    missing = 0

    doc_paths = glob.glob("data/state/*/*")
    doc_paths.sort()

    if start:
        # 1-indexed start offset. NOTE: the previous slice ended at -1,
        # which silently dropped the final document; slice to the end instead.
        doc_paths = doc_paths[(int(start) - 1):]
    if limit:
        doc_paths = doc_paths[0:int(limit)]

    # each one is e.g. 'data/state/0139/DOCUMENTS-StateChile3-00008305'
    for doc_path in doc_paths:
        json_path = os.path.join(doc_path, "document.json")
        # context manager so the file handle is closed promptly
        with open(json_path) as json_file:
            metadata = json.load(json_file)

        # RSS-type fields that might easily be common across agencies
        document_id = metadata["document_id"]
        document = {
            "document_id": document_id,
            "url": metadata['url'],
            "title": metadata['subject'],
            "source": "state"
        }

        # State docs don't always have a postedDate - research should be done
        # to identify the correct posted date for individual tranches, but
        # for now, just fall back to its creation date. Use .get() so an
        # absent key (not just a null value) also falls through cleanly.
        document["published_on"] = metadata.get('postedDate') or metadata.get('docDate')

        # include anything else as State-specific extra data
        document["state"] = metadata

        # the full text, if the scraper extracted any
        text_path = os.path.join(doc_path, "document.txt")
        if os.path.exists(text_path):
            with open(text_path) as text_file:
                document["text"] = text_file.read()
        else:
            print("[%s] NO TEXT ON DISK." % document_id)
            missing += 1

        # throw it in elasticsearch
        print("[%i][%s] Loading into elasticsearch." % (i, document_id))
        es.index(index=index, doc_type=mapping, id=document_id, body=document)
        i += 1

        # doc_paths was already truncated to `limit` above, so this guard is
        # belt-and-suspenders only (it could never fire: i reaches limit, not >).
        if limit and (i > int(limit)):
            break

    print("Okay, loaded %i documents into Elasticsearch." % i)
    print("Missing text on disk: %i" % missing)

    # reload the index so the newly loaded documents are immediately searchable
    es.indices.refresh()
run(utils.options()) if (__name__ == "__main__") else None