In [1]:
import pandas as pd
import numpy as np
import json
import copy
import uuid

In [2]:
source_file_statements = "source_data/DDB Sellan Coding source.xlsx"
source_file_actants = "source_data/DDB Mockup ACTANTS (entities Person, Concepts, Locations, Objects from Sellan).xlsx"

In [3]:
with open('statement-json-structure-example.json') as f:
    data = json.load(f)

# Output: {'name': 'Bob', 'languages': ['English', 'Fench']}
print(data)

{'id': '1', 'text': 'David, strong friend of Peter, saw Adam in Brno. ', 'tree': {'id': '101', '_label': 'A: saw (A090)', 'type': 'root', 'actants': [{'id': 'P201', '_label': 'P: David (P201)', 'type': 'subject', 'props': [{'id': '501', '_label': 'A:has property', 'type': 'property', 'actants': [{'id': 'C601', '_label': 'C: friend_of (C601)', 'type': 'actant1', 'props': [{'id': '501', '_label': 'A:has property', 'type': 'property', 'actants': [{'id': 'C801', '_label': 'C: intensity (C801)', 'type': 'actant1', 'props': []}, {'id': 'C901', '_label': 'C: strong (C901)', 'type': 'actant2', 'props': []}]}]}, {'id': 'P701', '_label': 'P: Peter (P701)', 'type': 'actant2', 'props': []}], 'props': []}]}, {'id': 'P301', '_label': 'P: Adam (P301)', 'type': 'actant1', 'props': []}], 'props': [{'id': '401', '_label': 'A: has property', 'type': 'property', 'actants': [{'id': 'C201', '_label': 'C: location_where (C201)', 'type': 'actant1', 'props': []}, {'id': 'L301', '_label': 'L: Brno (L301)', 'typ

In [4]:
statement_template = {
    "uuid": "",
    "_source_id": "", # e.g. T3-1-01-013 if applicable
    "action_id": "", # A0090 (saw)
    "certainty": "",
    "epistemic_level": "",
    "meta": "",
    "modality": "",
    "tree": {},
    "note": "",
    "resources": [],
    "tags": [],
    "territory": "",
    "text": "David, strong friend of Peter, saw Adam in Brno.",
    "editor":"",
    "primary_reference_part": "",
    "secondary_reference_id": "",
    "secondary_reference_part": "",
    "secondary_reference_id": ""
}

node_tree_root_template = {
    "uuid": "",
     "_label": "",  
    "type": "root",
    "actants": [],
    "props": []   
}

has_property_template = {  # A:has property
     "uuid": "",
     "_label": "A: has property",
    "type": "property",
    "actants": [],
    "props": []  
}

actant_template = {  # entity
     "uuid": "",
     "_label": "", #P Person
    "type": "",  # subject,  actant1, actant2
    "props": []  
}


In [5]:
# Pretty Printing JSON string back
print(json.dumps(data, indent = 4, sort_keys=True))

{
    "action_id": "A0090",
    "certainty": 100,
    "epistemic_level": 1,
    "id": "1",
    "meta": "",
    "modality": 1,
    "note": "",
    "resources": [],
    "tags": [],
    "territory_id": "TE1",
    "text": "David, strong friend of Peter, saw Adam in Brno. ",
    "tree": {
        "_label": "A: saw (A090)",
        "actants": [
            {
                "_label": "P: David (P201)",
                "id": "P201",
                "props": [
                    {
                        "_label": "A:has property",
                        "actants": [
                            {
                                "_label": "C: friend_of (C601)",
                                "id": "C601",
                                "props": [
                                    {
                                        "_label": "A:has property",
                                        "actants": [
                                            {
                                           

In [6]:
#read files
df_s = pd.read_excel(source_file_statements)
df_a = pd.read_excel(source_file_actants)


#clean statement dataframe
df_s = df_s.replace(np.nan, '', regex=True)  #rempve nans
df_s = df_s.astype(str) #cast all columns as string

In [7]:
#creating dataset of territories
# - territories are like folders, they are nested in each other
# - each statement can be in exactly one territory 

#examples "T3", "T3-1", "T3-1-00"

#in present import - the territory is just simply parsed from "id"

#for key, row in df_s.iterrows():
#    print(row['id'])
    



In [8]:
#generate uuid for actants
def gen_uuid(row):
    return str(uuid.uuid4())
    
df_a['uuid'] = df_a.apply(lambda row: gen_uuid(row),axis=1) 

In [9]:
def create_object(template_dict):
    o = copy.deepcopy(template_dict)
    o['uuid'] = str(uuid.uuid4())
    return o

def create_statement(source_id=''):
    o = copy.deepcopy(statement_template)
    o['_source_id'] = source_id
    o['uuid'] = str(uuid.uuid4())
    return o



def create_actant(atype = 'subject', source_id = ''):
    o = copy.deepcopy(actant_template)
    #search for actant in master dataframe  df_a
    search =  df_a.loc[df_a['codeid'] == source_id]
    uuid = search['uuid']

    if uuid.empty:
        return False
    
    o['type'] = atype
    o['uuid'] = str(uuid.item())
    
    return o

In [10]:
s = create_statement('source_id')
print(s)

a = create_actant('subject','P0004')
print(a)

{'uuid': 'd95ab859-4f33-4db0-a7df-92001d00baaa', '_source_id': 'source_id', 'action_id': '', 'certainty': '', 'epistemic_level': '', 'meta': '', 'modality': '', 'tree': {}, 'note': '', 'resources': [], 'tags': [], 'territory': '', 'text': 'David, strong friend of Peter, saw Adam in Brno.', 'editor': '', 'primary_reference_part': '', 'secondary_reference_id': '', 'secondary_reference_part': ''}
{'uuid': '67895c30-baee-4841-abbd-b43a1c0635a8', '_label': '', 'type': 'subject', 'props': []}


In [11]:
statements = []  # list objectu statments

#idiot simple structure which ignores almost any complexity
#  1) does not work with multiples
#  2) needs C or P in id_subject


##############################################################
i = 0
for key, row in df_s.iterrows():
    i += 1
    if (i>5):
        #break
        pass
    
    #basic 'inteligence'
    if ('P' not in row['id_subject'] and 'C' not in row['id_subject']):
        print ('Skipping '+  row['id'] + " : strange id_subject "+row['id_subject'])
        continue
    
    
    #cretate statement object
    s = create_statement(statement_template)
    s['_source_id'] = row['id']
    s['action_id'] = row['id_action_or_relation']
    s['epistemic_level'] = row['epistemological_level']
    s['modality'] = row['modality']
    s['text'] = row['text'].strip()
    s['territory'] = row['text_part_id']
    
    s["primary_reference_part"] = row["primary_reference_part"]
    s["secondary_reference_id"] = row["secondary_reference_id"]
    s["secondary_reference_part"] = row["secondary_reference_part"]
    s["secondary_reference_id"] = row["secondary_reference_id"]
    
    
    #create root action object
    ra = create_object(node_tree_root_template)
    #print(ra['uuid'])
    
    #create subject
    su = create_actant('subject',row['id_subject'])
    if su:
        su['_label'] = row['id_subject']
        ra['actants'].append(su)
    else:
        print('Subject '+row['id_subject']+" not recognized.")
    
    
    #create actant1
    if (row['id_actant1']):    
        a1 = create_actant('actant1',row['id_actant1'])
        if a1:
            a1['_label'] = row['id_actant1']
            ra['actants'].append(a1)
    else:
        #print('Empty a1')
        pass
    
    #create actant2
    if (row['id_actant2']):    
        a2 = create_actant('actant2',row['id_actant2'])
        if (a2):
            a2['_label'] = row['id_actant2']
            ra['actants'].append(a2)
    else:
        #print('Empty a2')
        pass
        
    s['tree'] = ra
    statements.append(s)

Skipping T3-1-00-001 : strange id_subject T3-1
Skipping T3-1-00-002 : strange id_subject T3-1
Subject P0356 #P0770 not recognized.
Skipping T3-1-00-003 : strange id_subject T3-1
Skipping T3-1-00-004 : strange id_subject T3-1
Skipping T3-1-00-005 : strange id_subject T3-1
Skipping T3-1-9918-010 : strange id_subject T3-1-9918-008
Skipping T3-1-9918-011 : strange id_subject T3-1-9918-009
Skipping T3-1-9919-009 : strange id_subject T3-1-9919-006
Skipping T3-1-9919-010 : strange id_subject T3-1-9919-007
Skipping T3-1-9919-011 : strange id_subject T3-1-9919-008
Skipping T3-1-9920-007 : strange id_subject T3-1-9920-003
Skipping T3-1-9920-008 : strange id_subject T3-1-9920-004
Skipping T3-1-9920-009 : strange id_subject T3-1-9920-005
Skipping T3-1-9921-016 : strange id_subject T3-1-9921-007
Skipping T3-1-9921-017 : strange id_subject T3-1-9921-008
Skipping T3-1-9921-018 : strange id_subject T3-1-9921-009
Skipping T3-1-9921-019 : strange id_subject T3-1-9921-010
Skipping T3-1-9921-020 : strange

Skipping T3-1-9965-009 : strange id_subject T3-1-9965-006
Skipping T3-1-9965-010 : strange id_subject T3-1-9965-007
Skipping T3-1-9965-011 : strange id_subject T3-1-9965-008
Skipping T3-1-9966-005 : strange id_subject T3-1-9966-004
Skipping T3-1-9967-005 : strange id_subject T3-1-9967-004
Skipping T3-1-9967-008 : strange id_subject T3-1-9967-005
Skipping T3-1-9967-009 : strange id_subject T3-1-9967-006
Skipping T3-1-9967-010 : strange id_subject T3-1-9967-007
Skipping T3-1-9968-013 : strange id_subject T3-1-9968-009
Skipping T3-1-9968-014 : strange id_subject T3-1-9968-010
Skipping T3-1-9968-015 : strange id_subject T3-1-9968-011
Skipping T3-1-9968-016 : strange id_subject T3-1-9968-012
Skipping T3-1-9969-014 : strange id_subject T3-1-9969-012
Skipping T3-1-9969-027 : strange id_subject T3-1-9969-018
Skipping T3-1-9969-028 : strange id_subject T3-1-9969-019
Skipping T3-1-9969-029 : strange id_subject T3-1-9969-020
Skipping T3-1-9969-030 : strange id_subject T3-1-9969-021
Skipping T3-1-

In [12]:
statements

[{'uuid': 'b02adf94-5a97-41a6-9d41-3b5ff0153dc6',
  '_source_id': 'T3-1-01-001',
  'action_id': 'A0164',
  'certainty': '',
  'epistemic_level': '2-interpretive',
  'meta': '',
  'modality': '',
  'tree': {'uuid': 'bf8d2b01-a379-4f54-b117-7b704aa15579',
   '_label': '',
   'type': 'root',
   'actants': [{'uuid': 'f858969b-fd3a-496c-a81a-7201b1632b03',
     '_label': 'P0454',
     'type': 'subject',
     'props': []}],
   'props': []},
  'note': '',
  'resources': [],
  'tags': [],
  'territory': 'T3-1-01',
  'text': '',
  'editor': '',
  'primary_reference_part': '30.0',
  'secondary_reference_id': '',
  'secondary_reference_part': '185v'},
 {'uuid': '96464fd2-1266-408c-a170-c55995132476',
  '_source_id': 'T3-1-01-002',
  'action_id': 'A0093',
  'certainty': '',
  'epistemic_level': '2-interpretive',
  'meta': '',
  'modality': '',
  'tree': {'uuid': '5875923b-0d58-48cf-99e3-8668e385d7ef',
   '_label': '',
   'type': 'root',
   'actants': [{'uuid': 'f858969b-fd3a-496c-a81a-7201b1632b03

In [None]:
df_a.to_excel('sellan-actants.xlsx')
df_a.to_csv('sellan-actants.csv')


statements_json = json.dumps(statements)
print(statements_json)

with open('sellan-statements.json', 'w') as file:
    file.write(statements_json)