In [6]:
import pandas as pd
import numpy as np
import json
import copy
import uuid

In [7]:
#Sellan data
source_file_statements = "source_data/DDB Sellan Coding source.xlsx"
source_file_actants = "source_data/DDB Mockup ACTANTS (entities Person, Concepts, Locations, Objects from Sellan).xlsx"

In [9]:
statement_template = {
    "id": "",
    "_source_id": "", # e.g. T3-1-01-013 if applicable
    "actionId": "", # A0090 (saw)
    "certainty": 0,
    "epistemicLevel": 0,
    "meta": "",
    "modality": 0,
    "tree": {},
    "note": "",
    "resources": [],
    "tags": [],
    "territoryId": "",
    "text": "David, strong friend of Peter, saw Adam in Brno."
    #"editor":"",
    #"primary_reference_part": "",
    #"secondary_reference_id": "",
    #"secondary_reference_part": "",
    #"secondary_reference_id": ""
}

node_tree_root_template = {
    "id": "",
     "_label": "",  
    "type": "root",
    "actants": [],
    "props": []   
}

has_property_template = {  # A:has property
     "id": "",
     "_label": "A: has property",
    "type": "property",
    "actants": [],
    "props": []  
}

actant_template = {  # entity
     "id": "",
     "_label": "", #P Person
    "type": "",  # subject,  actant1, actant2
    "props": []  
}


In [10]:
#read files
df_s = pd.read_excel(source_file_statements)
df_a = pd.read_excel(source_file_actants)


#clean statement dataframe
df_s = df_s.replace(np.nan, '', regex=True)  #rempve nans
df_s = df_s.astype(str) #cast all columns as string

In [11]:
#creating dataset of territories
#in present import - the territory is just simply text_part_id



# - territories are like folders, they are nested in each other
# - each statement can be in exactly one territory 
#for key, row in df_s.iterrows():
#    print(row['id']

In [12]:
#generate uuid for actants
def gen_uuid(row):
    return str(uuid.uuid4())
    
df_a['uuid'] = df_a.apply(lambda row: gen_uuid(row),axis=1) 

In [17]:
def create_object(template_dict):
    o = copy.deepcopy(template_dict)
    o['id'] = str(uuid.uuid4())
    return o

def create_statement(source_id=''):
    o = copy.deepcopy(statement_template)
    o['_source_id'] = source_id
    o['id'] = str(uuid.uuid4())
    return o


#simple
def create_actant(atype = 'subject', source_id = ''):
    o = copy.deepcopy(actant_template)
    #search for actant in master dataframe  df_a
    search =  df_a.loc[df_a['codeid'] == source_id]
    uuid = search['uuid']

    if uuid.empty:
        return False
    
    o['type'] = atype
    o['id'] = str(uuid.item())
    
    return o

#works with multiples
def create_actant_multiple(atype = 'subject', source_id = ''):
    
    actanct_ids = source_id.split("#")
    actancts = []
    
    for item in actanct_ids:
        o = copy.deepcopy(actant_template)
        search =  df_a.loc[df_a['codeid'] == item.strip()]  #search for actant in master dataframe  df_a
        uuid = search['uuid']
        if uuid.empty:
            return False

        o['_label'] = item.strip()
        o['type'] = atype
        o['id'] = str(uuid.item())
        actancts.append(o)
    
    return actancts




def returnTerritoryId(text_part):
    return text_part

In [27]:
str(uuid.uuid4())

'd411e27f-d5a7-4aed-9858-9d9f3c66e34f'

In [10]:
s = create_statement('aaaaaaa')
print(s)

a = create_actant('subject','P0004')
print(a)



{'uuid': 'd95ab859-4f33-4db0-a7df-92001d00baaa', '_source_id': 'source_id', 'actionId': '', 'certainty': '', 'epistemicLevel': '', 'meta': '', 'modality': '', 'tree': {}, 'note': '', 'resources': [], 'tags': [], 'territory': '', 'text': 'David, strong friend of Peter, saw Adam in Brno.', 'editor': '', 'primary_reference_part': '', 'secondary_reference_id': '', 'secondary_reference_part': ''}
{'uuid': '67895c30-baee-4841-abbd-b43a1c0635a8', '_label': '', 'type': 'subject', 'props': []}


In [18]:
statements = []  # list objectu statments

#idiot simple structure which ignores almost any complexity
#  1) does not work with multiples
#  2) needs C or P in id_subject


##############################################################
i = 0
for key, row in df_s.iterrows():
    i += 1
    if (i>5):
        #break
        pass
    
    #basic 'inteligence'
    if ('P' not in row['id_subject'] and 'C' not in row['id_subject']):
        print ('Skipping '+  row['id'] + " : strange id_subject "+row['id_subject'])
        continue
    
    
    #cretate statement object
    s = create_statement(statement_template)
    s['_source_id'] = row['id']
    s['actionId'] = row['id_action_or_relation']
    s['epistemicLevel'] = int(row['epistemological_level'].split('-')[0]) #konverze 2-interpretative na 2
    s['modality'] = row['modality']
    s['certainty'] =  row['certainty']
    s['text'] = row['text'].strip()
    s['territoryId'] = row['text_part_id']  #nevraci uuid, ale label "T3-1-64", nicmene by to nemelo vadit
    
    #s["primary_reference_part"] = row["primary_reference_part"]
    #s["secondary_reference_id"] = row["secondary_reference_id"]
    #s["secondary_reference_part"] = row["secondary_reference_part"]
    #s["secondary_reference_id"] = row["secondary_reference_id"]
    
    
    #create root action object
    ra = create_object(node_tree_root_template)
    #print(ra['uuid'])
    
    #create subject
    su = create_actant_multiple('subject',row['id_subject'])
    if su:
        ra['actants'].extend(su)
    else:
        print('Subject '+row['id_subject']+" not recognized.")
    
    
    #create actant1
    if (row['id_actant1']):    
        a1 = create_actant_multiple('actant1',row['id_actant1'])
        if a1:
            ra['actants'].extend(a1)
    else:
        #print('Empty a1')
        pass
    
    #create actant2
    if (row['id_actant2']):    
        a2 = create_actant_multiple('actant2',row['id_actant2'])
        if (a2):
            ra['actants'].extend(a2)
    else:
        #print('Empty a2')
        pass
    
    #specialni pripady,  konstrukce has_properties vazeb u lokaci
    ############################################################
    #C0223  location: where
    #C9224	location: to	CONCEPT
    #C9225	location: from	CONCEPT  : v Sellanu neexistuje

    
    
    #id_location  - understood as location_where   C0223
    # : nove has propertion, ktere ma jako subjekt root akci
    if (row['id_location']):
        
        pass
    
    #id_location)_to - understood as location_where   C0224
    # : nove has propertion, ktere ma jako subjekt concept v id_actant2 root akci
    if ("C" in row['id_actant2'] and row['id_location_to']):
        pass
    
    
    
    s['tree'] = ra
    statements.append(s)

Skipping T3-1-00-001 : strange id_subject T3-1
Skipping T3-1-00-002 : strange id_subject T3-1
Skipping T3-1-00-003 : strange id_subject T3-1
Skipping T3-1-00-004 : strange id_subject T3-1
Skipping T3-1-00-005 : strange id_subject T3-1
Skipping T3-1-9918-010 : strange id_subject T3-1-9918-008
Skipping T3-1-9918-011 : strange id_subject T3-1-9918-009
Skipping T3-1-9919-009 : strange id_subject T3-1-9919-006
Skipping T3-1-9919-010 : strange id_subject T3-1-9919-007
Skipping T3-1-9919-011 : strange id_subject T3-1-9919-008
Skipping T3-1-9920-007 : strange id_subject T3-1-9920-003
Skipping T3-1-9920-008 : strange id_subject T3-1-9920-004
Skipping T3-1-9920-009 : strange id_subject T3-1-9920-005
Skipping T3-1-9921-016 : strange id_subject T3-1-9921-007
Skipping T3-1-9921-017 : strange id_subject T3-1-9921-008
Skipping T3-1-9921-018 : strange id_subject T3-1-9921-009
Skipping T3-1-9921-019 : strange id_subject T3-1-9921-010
Skipping T3-1-9921-020 : strange id_subject T3-1-9921-011
Skipping T3

Skipping T3-1-9963-010 : strange id_subject T3-1-9963-007
Skipping T3-1-9963-011 : strange id_subject T3-1-9963-008
Skipping T3-1-9963-012 : strange id_subject T3-1-9963-009
Skipping T3-1-9964-010 : strange id_subject T3-1-9964-007
Skipping T3-1-9964-011 : strange id_subject T3-1-9964-008
Skipping T3-1-9964-012 : strange id_subject T3-1-9964-009
Skipping T3-1-9965-009 : strange id_subject T3-1-9965-006
Skipping T3-1-9965-010 : strange id_subject T3-1-9965-007
Skipping T3-1-9965-011 : strange id_subject T3-1-9965-008
Skipping T3-1-9966-005 : strange id_subject T3-1-9966-004
Skipping T3-1-9967-005 : strange id_subject T3-1-9967-004
Skipping T3-1-9967-008 : strange id_subject T3-1-9967-005
Skipping T3-1-9967-009 : strange id_subject T3-1-9967-006
Skipping T3-1-9967-010 : strange id_subject T3-1-9967-007
Skipping T3-1-9968-013 : strange id_subject T3-1-9968-009
Skipping T3-1-9968-014 : strange id_subject T3-1-9968-010
Skipping T3-1-9968-015 : strange id_subject T3-1-9968-011
Skipping T3-1-

In [19]:
statements

[{'id': '884614b7-8181-47fa-aca5-9e4acc17f214',
  '_source_id': 'T3-1-01-001',
  'actionId': 'A0164',
  'certainty': '',
  'epistemicLevel': '2-interpretive',
  'meta': '',
  'modality': '',
  'tree': {'id': '8a58a971-69f6-46a3-a8ec-dea23bde6808',
   '_label': '',
   'type': 'root',
   'actants': [{'id': '2cfaf91a-f796-4c1a-a0c0-b587aba3abe1',
     '_label': 'P0454',
     'type': 'subject',
     'props': []}],
   'props': []},
  'note': '',
  'resources': [],
  'tags': [],
  'territoryId': 'T3-1-01',
  'text': ''},
 {'id': '9c202e5d-342a-4fa6-9117-0bbaec02025f',
  '_source_id': 'T3-1-01-002',
  'actionId': 'A0093',
  'certainty': '',
  'epistemicLevel': '2-interpretive',
  'meta': '',
  'modality': '',
  'tree': {'id': '1af7a24a-ad4d-4566-9861-2066a75c83b0',
   '_label': '',
   'type': 'root',
   'actants': [{'id': '2cfaf91a-f796-4c1a-a0c0-b587aba3abe1',
     '_label': 'P0454',
     'type': 'subject',
     'props': []},
    {'id': '16ff2712-4628-4e9c-905c-d73c6e00c6f6',
     '_label': 

In [20]:
df_a.to_excel('sellan-actants.xlsx')
df_a.to_csv('sellan-actants.csv')
df_a.to_json('sellan-actants.json', orient='records')


statements_json = json.dumps(statements)
print(statements_json)

with open('sellan-statements.json', 'w') as file:
    file.write(statements_json)

[{"id": "884614b7-8181-47fa-aca5-9e4acc17f214", "_source_id": "T3-1-01-001", "actionId": "A0164", "certainty": "", "epistemicLevel": "2-interpretive", "meta": "", "modality": "", "tree": {"id": "8a58a971-69f6-46a3-a8ec-dea23bde6808", "_label": "", "type": "root", "actants": [{"id": "2cfaf91a-f796-4c1a-a0c0-b587aba3abe1", "_label": "P0454", "type": "subject", "props": []}], "props": []}, "note": "", "resources": [], "tags": [], "territoryId": "T3-1-01", "text": ""}, {"id": "9c202e5d-342a-4fa6-9117-0bbaec02025f", "_source_id": "T3-1-01-002", "actionId": "A0093", "certainty": "", "epistemicLevel": "2-interpretive", "meta": "", "modality": "", "tree": {"id": "1af7a24a-ad4d-4566-9861-2066a75c83b0", "_label": "", "type": "root", "actants": [{"id": "2cfaf91a-f796-4c1a-a0c0-b587aba3abe1", "_label": "P0454", "type": "subject", "props": []}, {"id": "16ff2712-4628-4e9c-905c-d73c6e00c6f6", "_label": "L0022", "type": "actant1", "props": []}], "props": []}, "note": "", "resources": [], "tags": [], "