In [42]:
from typing import Union, List
from pathlib import Path
import pandas as pd
import numpy as np
import pickle
pd.set_option('display.max_colwidth', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

mpdf = pd.read_pickle("../BiH/mpdf_corrected.pickle")
mpdf["party"] = mpdf.party.fillna("NEZAVISNI")
partiesdf = pd.read_pickle("../BiH/partiesdf_corrected.pickle")
termdata = pd.read_csv("../BiH/terms.csv").set_index("Term").astype(str)


with open("005_additional_persons.pickle", "rb") as f:
    additional_persons = pickle.load(f)
with open("005_ignore_keys.pickle", "rb") as f:
    keys_to_delete = pickle.load(f)


from tqdm import tqdm
tqdm.pandas()



In [43]:
mpdf["reference_name"] = mpdf.fullname.apply(lambda fullname: "".join(fullname.split()).replace(",", "").replace(" ", "").replace("–", "-"))

We have to mitigate the fact that in a significant proportion of cases there is no codeMP available.

In these instances I will assign the codeMP as `i999`, with 999 standing for row number.

In [44]:
c = mpdf.codemp.isna()
new_code_mps = np.array([f"i{i:03d}" for i, j in enumerate(c)])

mpdf.loc[c, "codemp"] = new_code_mps[c]
mpdf.codemp.isna().sum()

0


Fix missing dates where we have years available:

In [45]:
c = ( mpdf.date_of_birth == "-" )& (~mpdf.year_of_birth.isna()) & (~(mpdf.year_of_birth == "-"))
mpdf.loc[c, "date_of_birth"] = mpdf.year_of_birth[c]

Let's see which parties have non-trivial names:

In [46]:
gb = partiesdf.groupby("party").agg({
    "full_name": lambda l: len(set(l)),
    # "full_name": set
})
problematic_abbreviations = gb[gb.full_name != 1].index.values
problematic_abbreviations

gb = partiesdf[partiesdf.party.isin(problematic_abbreviations)].groupby("party").agg({
    "full_name": set,
})
gb

Unnamed: 0_level_0,full_name
party,Unnamed: 1_level_1
DNZ_BiH,"{Demokratska narodna zajednica, Demokratska narodna zajednica BiH}"


<!-- # Recipe:

* `Demohrišćanska Stranka Srbije` -> `Demohrišćanska stranka Srbije` in accordance with Wikipedia.
* `Pokret socilalista` -> `Pokret socialista`, seems a human error.
* `Socijaldemokratska partija` -> `Socijaldemokratska partija Srbije`, in accordance with Wikipedia. The abbreviation to be changed to SDPS in accordance with Wikipedia. There is no extant party with SDPS abbreviation. `Sandžačka demokratska partija` can remain abbreviated SDP.
* `Liga za Šumadiju` to be assigned LŠ, `Lista za Sandžak` to be left LS
* `Bogata Srbija` to be assigned BogS, `Bolja Srbija` preserves BS

While this is easily done in parties table, but for MPs we have no party affiliation other than the abbreviation. This will have to be done manually for parties BS, LS, SDP. -->

In [47]:
partiesdf["full_name"] = partiesdf.full_name.replace({
    "Demokratska narodna zajednica": "Demokratska narodna zajednica BiH",
})

In [48]:
from xml.etree.ElementTree import Element, SubElement, tostring, XML, parse
from xml.dom import minidom
def pretty_print(s) -> None:
    print(minidom.parseString(tostring(s).decode("utf")).toprettyxml("\t"))
def pretty_string(s) -> None:
    return minidom.parseString(tostring(s).decode("utf")).toprettyxml("\t")

# Terms (line 438):

path: `/teiCorpus/teiHeader/profileDesc/particDesc/listOrg/org[2]/listEvent`

In [49]:
listEvent = Element("listEvent")
head = SubElement(listEvent, "head")
head.set("xml:lang", "bs")
head.text = "Mandatno obdoblje"
head = SubElement(listEvent, "head")
head.set("xml:lang", "en")
head.text = "Legislative period"

for i, row in termdata.iterrows():
    event = SubElement(listEvent, "event")
    event.set("xml:id", f"PS.{i}")
    event.set("from", row["From"])
    to = row["To"]
    if not pd.isna(to):
        event.set("to", row["To"])
    
    label = SubElement(event, "label")
    label.set("xml:lang", "bs")
    label.text = f"{int(i)}. saziv"
    label = SubElement(event, "label")
    label.set("xml:lang", "en")
    label.text = f"Term {int(i)}"
listEvent_str = pretty_string(listEvent)
listEvent_str = "\n".join(listEvent_str.split("\n")[1:])

# Parties: line 447

path:
`/teiCorpus/teiHeader/profileDesc/particDesc/listOrg/org[3:]`

In [50]:
partiesdf["id"] = partiesdf.party.copy()

partydata = partiesdf["party full_name id".split()].drop_duplicates()

orgs = []
for i, row in partydata.iterrows():
    org = Element("org")
    org.set("xml:id", row["id"])
    org.set("role", "parliamentaryGroup")
    orgName = SubElement(org, "orgName")
    orgName.set("full", "yes")
    orgName.set("xml:lang", "bs")
    orgName.text = row["full_name"]
    
    orgName = SubElement(org, "orgName")
    orgName.set("full", "abb")
    orgName.text = row["party"]
    orgs.append(org)
    
# Add nezavisni:
org = Element("org")
org.set("xml:id", "party.NEZAVISNI")
org.set("role", "parliamentaryGroup")
orgName = SubElement(org, "orgName")
orgName.set("full", "yes")
orgName.set("xml:lang", "bs")
orgName.text = "nezavisni"
orgName = SubElement(org, "orgName")
orgName.set("full", "abb")
orgName.text = "NEZAVISNI"
orgs.append(org)

# Add HSS:
org = Element("org")
org.set("xml:id", "party.HSS")
org.set("role", "parliamentaryGroup")
orgName = SubElement(org, "orgName")
orgName.set("full", "yes")
orgName.set("xml:lang", "bs")
orgName.text = "Hrvatska seljačka stranka"
orgName = SubElement(org, "orgName")
orgName.set("full", "abb")
orgName.text = "HSS"
orgs.append(org)

# Add GDS:
org = Element("org")
org.set("xml:id", "party.GDS")
org.set("role", "parliamentaryGroup")
orgName = SubElement(org, "orgName")
orgName.set("full", "yes")
orgName.set("xml:lang", "bs")
orgName.text = "Građanska demokratska stranka"
orgName = SubElement(org, "orgName")
orgName.set("full", "abb")
orgName.text = "GDS"
orgs.append(org)

# Add BSP:
org = Element("org")
org.set("xml:id", "party.BSP")
org.set("role", "parliamentaryGroup")
orgName = SubElement(org, "orgName")
orgName.set("full", "yes")
orgName.set("xml:lang", "bs")
orgName.text = "Bosanskohercegovačka stranka prava 1861"
orgName = SubElement(org, "orgName")
orgName.set("full", "abb")
orgName.text = "BSP"
orgs.append(org)


orgs = [pretty_string(i) for i in orgs]

orgstring = "\n".join(
    ["\n".join(org.split("\n")[1:])for org in orgs]
)
# print(orgstring)

# People: line 525

path: `/teiCorpus/teiHeader/profileDesc/particDesc/listPerson`

In [51]:
listPerson = additional_persons
codemps = mpdf.codemp.drop_duplicates()


names = mpdf.reference_name.unique()
from utils import drop_punctuation
for i, name in enumerate(names):    
    subset = mpdf.query(f"reference_name=='{name}'").reset_index(drop=True)
    surname = subset.lastname[0]
    forename = subset.firstname[0]
    fullname = subset.fullname[0]
    sex = "M" if subset.gender[0] == 0 else "F"
    birth = str(subset.date_of_birth[0])
    c = subset.codemp[0]
    try:
        birth = f"{birth[0:4]}-{birth[4:6]}-{birth[-2:]}"
    except:
        pass
    reference_name = name
    
    person = Element("person")
    person.set("xml:id", reference_name)
    persName = SubElement(person, "persName")
    
    buf = SubElement(persName, "surname")
    buf.text = surname
    
    buf = SubElement(persName, "forename")
    buf.text = forename
    
    buf = SubElement(person, "sex")
    buf.set("value", sex)
    try:
        int(birth) # This will trip if date isn't a numerically-parsable string
        buf = SubElement(person, "birth")
        buf.set("when", birth)
    except:
        pass
    

    for i, row in subset.drop_duplicates().iterrows():  
        if not ((row['party'] == "-") or (row['party'] not in partiesdf.party.values.tolist())):
            term = row["term2"]
            fromto = termdata.loc[term, :]
            aff = SubElement(person, "affiliation")
            aff.set("role", "member")
            aff.set("ref", f"#{row['party']}")
            aff.set("from", fromto.From)
            if not pd.isna(fromto.To):
                aff.set("to", fromto.To)
            rolename = SubElement(aff, "roleName")
            rolename.set("xml:lang", "en")
            rolename.text = "Member"

        aff = SubElement(person, "affiliation")
        aff.set("role", "member")
        aff.set("ref", "#PS")
        aff.set("ana", f"#PS.{row['term2']}")
        aff.set("from", fromto.From)
        if not pd.isna(fromto.To):
            aff.set("to", fromto.To)
        rolename = SubElement(aff, "roleName")
        rolename.set("xml:lang", "en")
        rolename.text = "MP"
    listPerson.append(person)

In [52]:
listPerson = ["\n".join(pretty_string(p).split("\n")[1:]) for p in listPerson]
listPerson = "\n".join(listPerson)


In [53]:
path_to_term_tmxs = Path("/home/rupnik/parlamint/BiH/S/")
assert path_to_term_tmxs.exists()


xiincludes = ""
for file in sorted(list(path_to_term_tmxs.glob("ParlaMint-BA_T*.xml")))[0:3]:
    xiincludes += f"""    <xi:include xmlns:xi="http://www.w3.org/2001/XInclude" href="{file.name}"/>\n"""


# Relations

path: `/teiCorpus/teiHeader/profileDesc/particDesc/listOrg/listRelation`

In [54]:
# gb = partiesdf.groupby(["term2", "coalition"]).agg(
#     {
#         "id": lambda l: " ".join(set(["#"+i for i in l]))
#     }
# )
partiesdf["coalition"] = partiesdf.coalition.astype(str)
gb = partiesdf.groupby(["term2", "coalition"]).agg(
    {
        "id": lambda l: " ".join(set(["#"+i for i in l if i in partydata.id.values
                                      ]))
    }
)
terms = partiesdf.term2.unique()
listRelation = ""
for term in terms:
    if term > mpdf.term2.max():
        continue
    fromto = termdata.loc[term, :]
    ind = (term, "1")
    coalition = gb.loc[ind].id
    ind = (term, "0")
    opposition = gb.loc[ind].id
    listRelation += f"""
    <relation name="coalition"
            mutual="{coalition}"
            from="{fromto.From}"
            {'to="'+fromto.To+'"' if not pd.isna(fromto.To) else ""}
            ana="#PS.{term}"/>
    <relation name="opposition"
            active="{opposition}"
            passive="#government.BA"
            from="{fromto.From}"
            {'to="'+fromto.To+'"' if not pd.isna(fromto.To) else ""}
            ana="#PS.{term}"/>\n"""    

# Counting extents and tagUsages

In [55]:
from typing import Union
from pathlib import Path
def get_extent(path: Union[str, Path]):
    if isinstance(path, Path):
        assert path.exists()
        path = str(path)
    namespace= {
        "n": "http://www.tei-c.org/ns/1.0",
        
    }
    termtei = parse(path)
    measures = termtei.getroot().findall(".//n:measure", namespaces=namespace)
    for i in measures:
        if i.get("unit") == "speeches":
            speeches = i.get("quantity")
        if i.get("unit") == "words":
            words = i.get("quantity")
    return {"words": int(words), "speeches": int(speeches)}

def get_tagusage(path: Union[str, Path]):
    if isinstance(path, Path):
        assert path.exists()
        path = str(path)
    namespace= {
        "n": "http://www.tei-c.org/ns/1.0",
        
    }
    termtei = parse(path)
    tagusages = termtei.getroot().findall(".//n:tagUsage", namespaces=namespace)
    for i in tagusages:
        if i.get("unit") == "speeches":
            speeches = i.get("quantity")
        if i.get("unit") == "words":
            words = i.get("quantity")
    return {i.get("gi"): i.get("occurs") for i in tagusages}

path_to_term_tmxs = Path("/home/rupnik/parlamint/BiH/S/")
assert path_to_term_tmxs.exists()


extents = []
tagusages = []
for file in sorted(list(path_to_term_tmxs.glob("ParlaMint-BA_T*.xml"))):
    extents.append(get_extent(file))
    tagusages.append(get_tagusage(file))
tus = pd.DataFrame(data=tagusages).astype(int)

tagusage = ""
for gi, occurs in tus.sum().items():
    tagusage += f"""<tagUsage gi="{gi}" occurs="{int(occurs)}"/>\n"""
extent_count = pd.DataFrame(data=extents).astype(int).sum()
words = extent_count["words"]
speeches = extent_count["speeches"]
extent = f"""<measure unit="speeches" quantity="{speeches}" xml:lang="bs">{f'{speeches:,d}'.replace(',','.')} govora</measure>
<measure unit="speeches" quantity="{speeches}" xml:lang="en">{speeches:,d} speeches</measure>
<measure unit="words" quantity="{words}" xml:lang="bs">{f'{words:,d}'.replace(',','.')} riječi</measure>
<measure unit="words" quantity="{words}" xml:lang="en">{words:,d} words</measure>
"""

# Putting it all together

In [56]:
with open("/home/rupnik/parlamint/BiH/ParlaMint-BA_template.xml") as f:
    content = f.read()
from string import Template
template = Template(content)
filled_in = template.substitute(
    listEvent=listEvent_str,
    orgs=orgstring,
    listPerson=listPerson,
    xiincludes=xiincludes,
    listRelation=listRelation,
    extent=extent,
    tagusage=tagusage
)

with open("/home/rupnik/parlamint/BiH/ParlaMint-BA.xml", "w") as f:
    f.write(
        filled_in
    )


In [57]:
%%bash

mkdir ~/parlamint/ParlaMint/Data/ParlaMint-BA

cp ~/parlamint/BiH/S/* ~/parlamint/ParlaMint/Data/ParlaMint-BA/
cp ~/parlamint/BiH/ParlaMint-BA.xml ~/parlamint/ParlaMint/Data/ParlaMint-BA/

mkdir: cannot create directory ‘/home/rupnik/parlamint/ParlaMint/Data/ParlaMint-BA’: File exists


In [58]:
# Fixing unusual speakers in the copied data:

import os, shutil
for key in keys_to_delete:
    os.system(f"""sed -i 's/<u who="#{key}"/<u/g' /home/rupnik/parlamint/ParlaMint/Data/ParlaMint-BA/* """)
os.system(f"""sed -i 's/<u who="#LučićMiloš;JovićNedeljko"/<u who="#LučićMiloš"/g' /home/rupnik/parlamint/ParlaMint/Data/ParlaMint-BA/* """)
os.system(f"""sed -i 's/"Marinković–LepićMirjana"/"Marinković-LepićMirjana"/g' /home/rupnik/parlamint/ParlaMint/Data/ParlaMint-BA/* """)
os.system(f"""sed -i 's/"Novaković–BursaćSnježana"/"Novaković-BursaćSnježana"/g' /home/rupnik/parlamint/ParlaMint/Data/ParlaMint-BA/* """)

0

In [59]:
%%bash


cd /home/rupnik/parlamint/ParlaMint

# Validation
make add-common-content-BA
cd Data/ParlaMint-BA/
cp add-common-content/* .
rm -r add-common-content

rm -rf Data/ParlaMint-BA/add-common-content
mkdir -p Data/ParlaMint-BA/add-common-content
java -jar /usr/share/java/saxon.jar outDir=Data/ParlaMint-BA/add-common-content \
   -xsl:Scripts/parlamint-add-common-content.xsl \
   Data/ParlaMint-BA/ParlaMint-BA.ana.xml || :
java -jar /usr/share/java/saxon.jar outDir=Data/ParlaMint-BA/add-common-content \
   anaDir=`pwd`/Data/ParlaMint-BA/add-common-content/ParlaMint-BA \
   -xsl:Scripts/parlamint-add-common-content.xsl \
   Data/ParlaMint-BA/ParlaMint-BA.xml || :
for component in `echo Data/ParlaMint-BA/ParlaMint-BA.ana.xml| xargs -I % java -cp /usr/share/java/saxon.jar net.sf.saxon.Query -xi:off \!method=adaptive -qs:'//*[local-name()="teiHeader"]//*[local-name()="include"]/@href' -s:% |sed 's/^ *href="//;s/"//'`; do \
  echo "copying header component: Data/ParlaMint-BA/${component}" ; \
    cp Data/ParlaMint-BA/${component} Data/ParlaMint-BA/add-common-content/ParlaMint-BA; \
done;
echo "Result is in: Data/ParlaMint-BA/add-common-content/

Source file Data/ParlaMint-BA/ParlaMint-BA.ana.xml does not exist
ERROR : cannot locate .ana file /home/rupnik/parlamint/ParlaMint/Data/ParlaMint-BA/add-common-content/ParlaMint-BA/ParlaMint-BA_T02S01.ana.xml
ERROR : cannot locate .ana file /home/rupnik/parlamint/ParlaMint/Data/ParlaMint-BA/add-common-content/ParlaMint-BA/ParlaMint-BA_T02S02.ana.xml
ERROR : cannot locate .ana file /home/rupnik/parlamint/ParlaMint/Data/ParlaMint-BA/add-common-content/ParlaMint-BA/ParlaMint-BA_T02S03.ana.xml
INFO: Starting to process ParlaMint-BA
INFO: Processing ParlaMint-BA_T02S01.xml
ERROR ParlaMint-BA_T02S01: no date in setting!
Error at char 64 in expression in xsl:when/@test on line 328 column 59 of parlamint-add-common-content.xsl:
  FORG0001  Invalid date "" (Too short)
  In template rule with match="element(Q{http://www.tei-c.org/ns/1.0}TEI)/attribute(Q{}ana)" on line 307 of parlamint-add-common-content.xsl
Invalid date "" (Too short)
Source file Data/ParlaMint-BA/ParlaMint-BA.ana.xml does not e

In [60]:
%%bash
cd /home/rupnik/parlamint/ParlaMint
make validate-parlamint-BA

Scripts/validate-parlamint.pl Schema 'Data/ParlaMint-BA'


INFO: Validating directory /home/rupnik/parlamint/ParlaMint/Data/ParlaMint-BA
INFO: Validating TEI root /home/rupnik/parlamint/ParlaMint/Data/ParlaMint-BA/ParlaMint-BA.xml
INFO: XML validation for ParlaMint-BA.xml
/home/rupnik/parlamint/ParlaMint/Data/ParlaMint-BA/ParlaMint-BA.xml:1521:43: error: value of attribute "xml:id" is invalid; must be an XML name without colons
/home/rupnik/parlamint/ParlaMint/Data/ParlaMint-BA/ParlaMint-BA.xml:1787:37: error: value of attribute "xml:id" is invalid; must be an XML name without colons
/home/rupnik/parlamint/ParlaMint/Data/ParlaMint-BA/ParlaMint-BA.xml:7489:42: error: ID "Marinković-LepićMirjana" has already been defined
/home/rupnik/parlamint/ParlaMint/Data/ParlaMint-BA/ParlaMint-BA.xml:1808:42: error: first occurrence of ID "Marinković-LepićMirjana"
/home/rupnik/parlamint/ParlaMint/Data/ParlaMint-BA/ParlaMint-BA.xml:7601:43: error: ID "Novaković-BursaćSnježana" has already been defined
/home/rupnik/parlamint/ParlaMint/Data/ParlaMint-BA/ParlaMi

# Notes on the output:


# Adding missing persons:


In [61]:
# # f = "005_output.txt"
# # with open(f) as f:
# #     content = f.readlines()

# # pattern = '''{begining}find local id for u/@who="#{name}"\n'''
# # from parse import compile
# # p = compile(pattern)

# # missing = set()
# # for line in content:
# #     result = p.parse(line)
# #     try:
# #         missing.add(result["name"])
# #     except:
# #         continue
    
# with open("005_missings.txt", "r") as f:
#     content = f.read()
# from ast import literal_eval
# missings = literal_eval(content)


# def split_on_capitals(s:str):
#     splits = []
#     for i, c in enumerate(s):
#         if i == 0:
#             pass
#         if c == c.capitalize():
#             splits.append(i)
#     splits.append(-1)
#     return [s[start:stop] if stop != splits[-1] else s[start:] for start, stop in zip(splits, splits[1:])]

# length = dict()
# segments = dict()
# for i in missings:
#     length[i] = len(split_on_capitals(i))
#     segments[i] = split_on_capitals(i)
#     if length[i] != 2:
#         print(i)
        
        
# def generate_person(reference_name, lastname, firstname):
#     person = Element("person")
#     person.set("xml:id", reference_name)
#     persName = SubElement(person, "persName")
    
#     buf = SubElement(persName, "surname")
#     buf.text = lastname
    
#     buf = SubElement(persName, "forename")
#     buf.text = firstname

#     return person

# pretty_print(generate_person("PelivanJure", "Pelivan", "Jure"))


# additional_persons = list()
# to_delete = list()
# for key in missings:
#     l = length.get(key)
#     s = segments.get(key)
#     if l == 2:
#         additional_persons.append(generate_person(key,s[1],s[0]))
#     else:
#         print(f"The key is: {key}", flush=True)
#         lastname = input("Lastname:")
#         if lastname == "":
#             to_delete.append(key)
#             continue
#         firstname = input("Firstname")
#         if firstname == "":
#             continue
#         additional_persons.append(generate_person(key,lastname, firstname))
        
# import pickle
# with open("005_additional_persons.pickle", "wb") as f:
#     pickle.dump(additional_persons, f)
# with open("005_ignore_keys.pickle", "wb") as f:
#     pickle.dump(to_delete, f)
    

# with open("005_additional_persons.pickle", "rb") as f:
#     additional_persons_unpickled = pickle.load(f)
# with open("005_ignore_keys.pickle", "rb") as f:
#     keys_to_delete = pickle.load(f)