# Data Model Explorer

Code illustrating the structure of the data model files and how to access the data they contain.


In [1]:
import json
import os

import matplotlib.pyplot as plt
import numpy as np

## Load data

`segment_encodings.json` is not loaded here; it is not required by the current set of analysis tools.


In [2]:
def load_json(file_name):
    """Read a UTF-8 encoded JSON file and return the decoded Python object.

    Args:
        file_name: Path of the JSON file to read.

    Returns:
        Whatever ``json.load`` produces for the file's contents.
    """
    # The with-statement closes the file when the block exits, so no
    # explicit close() call is needed.
    with open(file_name, 'r', encoding='utf-8') as f:
        return json.load(f)


# One load per data model file; the names below are used by the later cells.
documents_dict = load_json('../../model/documents_dict.json')
encoded_segments = load_json('../../model/encoded_segments.json')
segments_dict = load_json('../../model/segments_dict.json')
sat_segments_dict = load_json('../../model/sat_segments_dict.json')
    

## Documents

In this application, documents are national constitutions.


In [3]:
# Peek at the first ten document IDs held in the documents dictionary
print(list(documents_dict)[:10])
print()


doc_id = 'Slovenia_2016'

# Full record stored for the chosen document
print(documents_dict[doc_id])
print()

# The record's 'year_enacted' field
print(documents_dict[doc_id]['year_enacted'])
print()

# Segment IDs have the form '<document ID>/<number>', so a document's segments
# are the keys whose prefix matches - report the count and the first ten only
segment_ids = [key for key in segments_dict if key.split('/', 1)[0] == doc_id]
print('Total number of segments in document:', len(segment_ids))
print(segment_ids[:10])
print()

# Same prefix filter over the encoded segments; this count may be lower than
# the document's total number of segments
encoded_ids = [key for key in encoded_segments if key.split('/', 1)[0] == doc_id]
print('Total number of encoded segments in document:', len(encoded_ids))
print()



['Slovenia_2016', 'Burkina_Faso_2015', 'Chad_2018', 'Kosovo_2016', 'Fiji_2013', 'Serbia_2006', 'Azerbaijan_2016', 'Tuvalu_2010', 'Turkey_2017', 'Philippines_1987']

{'name': 'Slovenia_2016', 'region': 'Europe', 'year_enacted': '1991'}

1991

Total number of segments in document: 382
['Slovenia_2016/2', 'Slovenia_2016/5', 'Slovenia_2016/7', 'Slovenia_2016/9', 'Slovenia_2016/10', 'Slovenia_2016/12', 'Slovenia_2016/13', 'Slovenia_2016/14', 'Slovenia_2016/15', 'Slovenia_2016/17']

Total number of encoded segments in document: 382



## Segments

Segments are sections of a constitution's text.


In [4]:
# Show the first ten segment IDs in the segments dictionary
print(list(segments_dict)[:10])
print()

# Corpus-wide segment count
print('Total segments in corpus:', len(segments_dict))
print()

# Corpus-wide count of encoded segments
print('Total encoded segments in corpus:', len(encoded_segments))
print()

segment_id = 'Slovenia_2016/2'

# Text stored for this segment
print(segments_dict[segment_id]['text'])
print()

# The part of the segment ID before '/' is the key of its parent document
doc_name = documents_dict[segment_id.split('/', 1)[0]]['name']
print(doc_name)
print()



['Slovenia_2016/2', 'Slovenia_2016/5', 'Slovenia_2016/7', 'Slovenia_2016/9', 'Slovenia_2016/10', 'Slovenia_2016/12', 'Slovenia_2016/13', 'Slovenia_2016/14', 'Slovenia_2016/15', 'Slovenia_2016/17']

Total segments in corpus: 163596

Total encoded segments in corpus: 163596

Proceeding from the Basic Constitutional Charter on the Sovereignty and Independence of the Republic of Slovenia, and from fundamental human rights and freedoms, and the fundamental and permanent right of the Slovene nation to self-determination; and from the historical fact that in a centuries-long struggle for national liberation we Slovenes have established our national identity and asserted our statehood, the Assembly of the Republic of Slovenia hereby adopts

Slovenia_2016



## SAT segments

SAT stands for "segments as topic". A SAT comprises a set of segments (see above) that have been tagged by a human with a topic. The topics are the dictionary keys.


In [5]:

# Number of topics
print(len(sat_segments_dict))
print()

# First ten topic keys
print(list(sat_segments_dict)[:10])
print()

# Segment IDs tagged with one particular topic
sat_segment_ids = sat_segments_dict['equalgr5']
print(sat_segment_ids)
print()


# Look up each tagged segment's text by its ID, one blank line between entries
for segment_id in sat_segment_ids:
    print(segment_id, segments_dict[segment_id]['text'], end='\n\n')


329

['referen', 'flag', 'anthem', 'seprel', 'freerel', 'equalgr5', 'equalgr11', 'equalgr4', 'equalgr15', 'equalgr13']

['Slovenia_2016/43', 'Burkina_Faso_2015/24', 'Kosovo_2016/161', 'Fiji_2013/590', 'Serbia_2006/75', 'Turkey_2017/46', 'Kenya_2010/380', 'Kenya_2010/382', 'Rwanda_2015/89', 'Morocco_2011/14', 'Lesotho_2018/18', 'Lithuania_2019/104', 'Ecuador_2021/89', 'Marshall_Islands_1995/145', 'Cote_DIvoire_2016/41', 'Djibouti_2010/19', 'Burundi_2018/78', 'Finland_2011/22', 'Nigeria_2011/299', 'Kyrgyz_Republic_2016/156', 'Bhutan_2008/281', 'Bhutan_2008/367', 'Bhutan_2008/369', 'Czech_Republic_2013/845', 'German_Federal_Republic_2014/21', 'Vanuatu_2013/31', 'South_Sudan_2013/1868', 'Kazakhstan_2017/101', 'Tajikistan_2016/77', 'South_Africa_2012/100', 'Sweden_2012/11', 'Samoa_2017/164', 'Croatia_2013/83', 'Moldova_2016/93', 'Nicaragua_2014/134', 'Switzerland_2014/46', 'Albania_2016/119', 'Somalia_2012/85', 'Somalia_2012/89', 'Russia_2014/123', 'Mongolia_2001/110', 'Colombia_2015/35', '

In [None]:

# Path to the directory where SAT similarity matrices are stored
sats_dir = '../../model/sats'  # Update the path if necessary


def list_sat_files(directory):
    """Return the names of the ``.json`` files in ``directory``, sorted.

    Sorting makes the result (and therefore which SAT counts as "first")
    independent of the arbitrary order in which ``os.listdir`` reports entries.

    Raises:
        OSError: if ``directory`` cannot be listed (propagated from ``os.listdir``).
    """
    return sorted(name for name in os.listdir(directory) if name.endswith('.json'))


def load_similarity_matrices(directory, file_names):
    """Load the named JSON files from ``directory`` into a dictionary.

    Args:
        directory: Directory containing the files.
        file_names: Names of the files to load, relative to ``directory``.

    Returns:
        A dict whose keys are the file names with a trailing ``.json`` removed
        and whose values are the decoded JSON contents, in ``file_names`` order.
    """
    suffix = '.json'
    matrices = {}
    for file_name in file_names:
        # Strip only the trailing extension; str.replace would also delete
        # any '.json' that happens to occur earlier in the name.
        sat_name = file_name[:-len(suffix)] if file_name.endswith(suffix) else file_name
        with open(os.path.join(directory, file_name), 'r', encoding='utf-8') as f:
            matrices[sat_name] = json.load(f)
    return matrices


def matrix_shape(matrix):
    """Return ``(rows, columns)`` of a sequence of rows; ``(0, 0)`` when it is empty.

    NOTE(review): assumes each matrix is stored as a list of row lists - confirm
    against the files in ``sats_dir``.
    """
    n_rows = len(matrix)
    n_cols = len(matrix[0]) if n_rows else 0
    return n_rows, n_cols


# List the SAT files, telling the reader clearly when the directory is missing
if os.path.isdir(sats_dir):
    files = list_sat_files(sats_dir)
else:
    print(f"Directory not found: {sats_dir!r} - update sats_dir to point at the SAT similarity matrices.")
    files = []

# Dictionary of similarity matrices keyed by SAT name
similarity_matrices = load_similarity_matrices(sats_dir, files)

# Display the structure of the loaded similarity matrices
print("Sample structure of similarity matrices:")

# Inspect only the first 5 SATs rather than walking the whole dictionary
for sat_name, similarity_matrix in list(similarity_matrices.items())[:5]:
    n_rows, n_cols = matrix_shape(similarity_matrix)
    print(f"SAT: {sat_name}")
    print(f"Similarity Matrix (shape: {n_rows}x{n_cols}):")
    print(np.array(similarity_matrix))  # Convert to NumPy array for easy viewing
    print("-" * 50)

# Visualize the first SAT's similarity matrix as a heatmap (if any were loaded)
if similarity_matrices:
    first_sat_name = next(iter(similarity_matrices))
    first_similarity_matrix = similarity_matrices[first_sat_name]

    # Plot the similarity matrix
    plt.figure(figsize=(8, 6))
    plt.imshow(first_similarity_matrix, cmap='viridis', interpolation='nearest')
    plt.colorbar()
    plt.title(f"Semantic Similarity Matrix for SAT: {first_sat_name}")
    plt.xlabel("Sentence Index")
    plt.ylabel("Sentence Index")
    plt.show()

In [None]:
# Function to load and display JSON files
def display_json(file_path, num_entries=5):
    """Print a short preview of a JSON file.

    A top-level JSON object is previewed as ``key: value`` lines and a
    top-level array as ``index: item`` lines. Any other top-level value
    produces only the header line. Nothing is returned.

    Args:
        file_path: Path of the JSON file to read (decoded as UTF-8).
        num_entries: Maximum number of entries to print. Zero or a negative
            value prints the header only.
    """
    with open(file_path, 'r', encoding='utf-8') as f:
        data = json.load(f)

    if isinstance(data, dict):
        entries = data.items()
    elif isinstance(data, list):
        entries = enumerate(data)
    else:
        entries = ()

    print(f"Preview of {file_path}:")
    for shown, (label, value) in enumerate(entries):
        # Check the limit before printing so that num_entries <= 0 shows nothing.
        if shown >= num_entries:
            break
        print(f"{label}: {value}")
    print("\n")

# Locations of the data model files (adjust these to your checkout if needed)
segments_dict_path = '../../model/segments_dict.json'
sat_segments_dict_path = '../../model/sat_segments_dict.json'
segment_encodings_path = '../../model/segment_encodings.json'
encoded_segments_path = '../../model/encoded_segments.json'
documents_dict_path = '../../model/documents_dict.json'
bb = '../../model/SBERT_segment_encodings.json'

# Preview the segments dictionary and the file that `bb` points to.
# The other files can be previewed the same way by uncommenting a call:
#display_json(documents_dict_path)
#display_json(encoded_segments_path)
#display_json(segment_encodings_path)
#display_json(sat_segments_dict_path)
for preview_path in (segments_dict_path, bb):
    display_json(preview_path)