# Data Model Explorer

Code to illustrate structure of data model files and how to access data.


In [None]:
import json

## Load data

`segment_encodings.json` is not loaded, and is not required in the current set of analysis tools.


In [None]:
def load_json(file_path):
    """Parse a UTF-8 encoded JSON file and return the resulting object.

    Parameters
    ----------
    file_path : str
        Path of the JSON file to read.

    Returns
    -------
    object
        Whatever ``json.load`` produces for the file's top-level value.

    Raises
    ------
    OSError
        If the file cannot be opened (e.g. ``FileNotFoundError``).
    json.JSONDecodeError
        If the file content is not valid JSON.
    """
    # The ``with`` statement closes the file when the block exits, so no
    # explicit close() call is needed.
    with open(file_path, 'r', encoding='utf-8') as f:
        return json.load(f)


# Each model file is read into its own notebook-level variable; the later
# cells index these by document ID, segment ID and topic respectively.
documents_dict = load_json('../../model/documents_dict.json')
encoded_segments = load_json('../../model/encoded_segments.json')
segments_dict = load_json('../../model/segments_dict.json')
sat_segments_dict = load_json('../../model/sat_segments_dict.json')
    

## Documents

In this application, documents are national constitutions.


In [None]:
# Peek at the first 10 document IDs held in the dictionary
print(list(documents_dict)[:10], end='\n\n')


doc_id = 'Slovenia_2016'

# Full record stored for this document
doc_record = documents_dict[doc_id]
print(doc_record, end='\n\n')

# Year the document was enacted
print(doc_record['year_enacted'], end='\n\n')

# The part of a segment ID before the first '/' is its document ID; collect the
# segment IDs belonging to this document - only the count and first 10 are shown
segment_ids = [key for key in segments_dict if key.partition('/')[0] == doc_id]
print('Total number of segments in document:', len(segment_ids))
print(segment_ids[:10], end='\n\n')

# Same filter over the encoded segments - there may be fewer of these than
# there are segments in the document
encoded_ids = [key for key in encoded_segments if key.partition('/')[0] == doc_id]
print('Total number of encoded segments in document:', len(encoded_ids), end='\n\n')



## Segments

Segments are sections of a constitution's text.


In [None]:
# Peek at the first 10 segment IDs held in the dictionary
print(list(segments_dict)[:10], end='\n\n')

# Size of the whole corpus, counted in segments
print('Total segments in corpus:', len(segments_dict), end='\n\n')

# Number of segments that have been encoded
print('Total encoded segments in corpus:', len(encoded_segments), end='\n\n')

segment_id = 'Slovenia_2016/2'

# Cleaned text of the segment
segment_record = segments_dict[segment_id]
print(segment_record['text'], end='\n\n')

# The part of the segment ID before the first '/' is the key of its document
owning_doc_id = segment_id.partition('/')[0]
doc_name = documents_dict[owning_doc_id]['name']
print(doc_name, end='\n\n')



## SAT segments

SAT means "segments as topic". A SAT comprises a set of segments (described above) that have been tagged by a human with a topic. The topics are the dictionary keys.


In [None]:

# Number of topics
print(len(sat_segments_dict), end='\n\n')

# Peek at the first 10 topics
print(list(sat_segments_dict)[:10], end='\n\n')

# Segment IDs stored under one SAT topic
sat_segment_ids = sat_segments_dict['equalgr5']
print(sat_segment_ids, end='\n\n')


# Look each ID up in the segments dictionary to recover its text
for segment_id in sat_segment_ids:
    segment_text = segments_dict[segment_id]['text']
    print(segment_id, segment_text, end='\n\n')
