# get_green_line_branch_info.py
"""
Inserts hand-crafted JSON representing the branch information for the Green Line
T branches. Includes neighboring stops inbound and outbound, as well as a nice
readable name.
"""
import urllib.request
import json
import dml
import prov.model
import datetime
import uuid
def main():
    """Fetch the hand-crafted Green Line branch JSON and store it in the repo.

    Downloads the JSON document, prints it, drops and recreates the
    ``t_branch_info`` collection, inserts the downloaded records, and then
    records and prints a provenance document for the run.

    The database session is always logged out, even when the download,
    parsing, or insertion fails.
    """
    teamname = 'ciestu12_sajarvis'

    # Set up the database connection.
    client = dml.pymongo.MongoClient()
    repo = client.repo
    repo.authenticate(teamname, teamname)

    try:
        startTime = datetime.datetime.now()

        # Not using an API here because one did not exist. Created from a
        # subway map published by the MBTA.
        url = 'http://cs-people.bu.edu/sajarvis/datamech/green_line_branch_info.json'
        # Use the response as a context manager so the HTTP connection is
        # closed as soon as the body has been read.
        with urllib.request.urlopen(url) as response:
            payload = response.read().decode("utf-8")

        # Parse before touching the collection so a malformed payload does not
        # leave the collection dropped and empty.
        r = json.loads(payload)
        print(json.dumps(r, sort_keys=True, indent=2))

        collection = 't_branch_info'
        repo.dropPermanent(collection)
        repo.createPermanent(collection)
        # NOTE(review): insert_many presumably expects the payload to be a
        # non-empty JSON array of objects -- confirm against the hosted file.
        repo['{}.{}'.format(teamname, collection)].insert_many(r)

        endTime = datetime.datetime.now()

        doc = create_prov(startTime, endTime)
        repo.record(doc.serialize())  # Record the provenance document.
        print(doc.get_provn())
    finally:
        # Release the authenticated session regardless of success or failure.
        repo.logout()
def create_prov(startTime, endTime):
    """Build the provenance document describing one run of this script.

    Args:
        startTime: Start of the run; used as the activity start time and as
            the time of the resource usage.
        endTime: End of the run; used as the activity end time and as the
            generation time of the resulting data set.

    Returns:
        The populated ``prov.model.ProvDocument``.
    """
    type_key = prov.model.PROV_TYPE
    document = prov.model.ProvDocument()

    # Namespace prefixes, registered in this order.
    namespaces = (
        # The scripts in <folder>/<filename> format.
        ('alg', 'http://datamechanics.io/algorithm/ciestu12_sajarvis/'),
        # The data sets in <user>/<collection> format.
        ('dat', 'http://datamechanics.io/data/ciestu12_sajarvis/'),
        # 'Extension', 'DataResource', 'DataSet', 'Retrieval', 'Query', or 'Computation'.
        ('ont', 'http://datamechanics.io/ontology#'),
        # The event log.
        ('log', 'http://datamechanics.io/log#'),
        ('bu', 'http://cs-people.bu.edu/sajarvis/datamech/'),
        ('mbta_img', 'http://www.mbta.com/images/'),
    )
    for prefix, uri in namespaces:
        document.add_namespace(prefix, uri)

    # This run has an agent (the script), an entity (the source), and an
    # activity (execution).
    agent_attrs = {
        type_key: prov.model.PROV['SoftwareAgent'],
        'ont:Extension': 'py',
    }
    script_agent = document.agent('alg:get_green_line_branch_info', agent_attrs)

    json_attrs = {
        'prov:label': 'Green Line Branch Info',
        type_key: 'ont:DataResource',
        'ont:Extension': 'json',
    }
    json_source = document.entity('bu:green_line_branch_info', json_attrs)

    activity_id = 'log:a' + str(uuid.uuid4())
    execution = document.activity(activity_id, startTime, endTime)
    document.wasAssociatedWith(execution, script_agent)

    usage_attrs = {type_key: 'ont:Retrieval', 'ont:Query': ''}
    document.usage(execution, json_source, startTime, None, usage_attrs)

    # The original sources are entities, too. A map of lines from the MBTA and
    # stop IDs from the GPS file. These are the sources used for the
    # handcrafted data.
    map_attrs = {
        'prov:label': 'Map of T Lines and Stops',
        type_key: 'ont:DataResource',
        'ont:Extension': 'jpg',
    }
    subway_map = document.entity('mbta_img:subway-spider', map_attrs)
    document.wasDerivedFrom(json_source, subway_map,
                            execution, execution, execution)

    stop_attrs = {
        'prov:label': 'Collection with T Stop IDs',
        type_key: 'ont:DataSet',
    }
    stop_locations = document.entity('dat:t_stop_locations', stop_attrs)
    document.wasDerivedFrom(json_source, stop_locations,
                            execution, execution, execution)

    # Now define entity for the dataset we obtained.
    dataset_attrs = {
        prov.model.PROV_LABEL: 'Green Line Branch Info',
        type_key: 'ont:DataSet',
    }
    branch_dataset = document.entity('dat:t_branch_info', dataset_attrs)
    document.wasAttributedTo(branch_dataset, script_agent)
    document.wasGeneratedBy(branch_dataset, execution, endTime)
    document.wasDerivedFrom(branch_dataset, json_source,
                            execution, execution, execution)

    return document
# Run the retrieval only when executed as a script, not when imported.
if __name__ == "__main__":
    main()