In [1]:
from typing import Union, List
from pathlib import Path
import pandas as pd
pd.set_option('display.max_colwidth', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

mpdf = pd.read_pickle("mpdf_corrected")
partiesdf = pd.read_pickle("partiesdf_corrected")
termdata = pd.read_csv("../terms.csv").set_index("Term")

def transform_abbreviation(s: str) -> str:
    return f"party.{s.replace(' ', '_').upper().replace('!', '')}"

# Fix ambiguous codemps:
mpdf.loc[mpdf.fullname.isin(["Lovrić Merzel, Marina"]), "codemp"] = "M393"
mpdf.loc[mpdf.fullname.isin(["Vrbat Grgić, Tanja"]), "codemp"] = "M581" 

# Fix swapped first- and last-names:
def make_firstname(s: str) -> str:
    return s.split(", ")[-1]
def make_lastname(s: str) -> str:
    return s.split(", ")[0]
from tqdm import tqdm
tqdm.pandas()
mpdf["firstname"] = mpdf.fullname.progress_apply(make_firstname)
mpdf["lastname"] = mpdf.fullname.progress_apply(make_lastname)


  0%|          | 0/1141 [00:00<?, ?it/s]100%|██████████| 1141/1141 [00:00<00:00, 413058.94it/s]
  0%|          | 0/1141 [00:00<?, ?it/s]100%|██████████| 1141/1141 [00:00<00:00, 718789.56it/s]


# Preparing persons:

In [2]:
from xml.etree.ElementTree import Element, SubElement, tostring, XML, parse
from xml.dom import minidom
def pretty_print(s) -> None:
    print(minidom.parseString(tostring(s).decode("utf")).toprettyxml("\t"))
def pretty_string(s) -> None:
    return minidom.parseString(tostring(s).decode("utf")).toprettyxml("\t")

# Terms (line 438):

path: `/teiCorpus/teiHeader/profileDesc/particDesc/listOrg/org[2]/listEvent`

In [3]:
listEvent = Element("listEvent")
head = SubElement(listEvent, "head")
head.set("xml:lang", "hr")
head.text = "Mandatno obdoblje"
head = SubElement(listEvent, "head")
head.set("xml:lang", "en")
head.text = "Legislative period"

for i, row in termdata.iterrows():
    event = SubElement(listEvent, "event")
    event.set("xml:id", f"HS.{i}")
    event.set("from", row["From"])
    to = row["To"]
    if not pd.isna(to):
        event.set("to", row["To"])
    
    label = SubElement(event, "label")
    label.set("xml:lang", "hr")
    label.text = f"{i}. mandat"
    label = SubElement(event, "label")
    label.set("xml:lang", "en")
    label.text = f"Term {i}"
listEvent_str = pretty_string(listEvent)
listEvent_str = "\n".join(listEvent_str.split("\n")[1:])
print(listEvent_str)

<listEvent>
	<head xml:lang="hr">Mandatno obdoblje</head>
	<head xml:lang="en">Legislative period</head>
	<event xml:id="HS.5" from="2003-12-22" to="2007-10-12">
		<label xml:lang="hr">5. mandat</label>
		<label xml:lang="en">Term 5</label>
	</event>
	<event xml:id="HS.6" from="2008-01-11" to="2011-10-28">
		<label xml:lang="hr">6. mandat</label>
		<label xml:lang="en">Term 6</label>
	</event>
	<event xml:id="HS.7" from="2011-12-22" to="2015-09-25">
		<label xml:lang="hr">7. mandat</label>
		<label xml:lang="en">Term 7</label>
	</event>
	<event xml:id="HS.8" from="2015-12-03" to="2016-07-15">
		<label xml:lang="hr">8. mandat</label>
		<label xml:lang="en">Term 8</label>
	</event>
	<event xml:id="HS.9" from="2016-10-14" to="2020-05-18">
		<label xml:lang="hr">9. mandat</label>
		<label xml:lang="en">Term 9</label>
	</event>
	<event xml:id="HS.10" from="2020-07-22">
		<label xml:lang="hr">10. mandat</label>
		<label xml:lang="en">Term 10</label>
	</event>
</listEvent>



# Parties: line 447

path:
`/teiCorpus/teiHeader/profileDesc/particDesc/listOrg/org[3:]`

In [10]:
partiesdf["id"] = partiesdf.party.apply(transform_abbreviation)

partydata = partiesdf["party full_name id".split()].drop_duplicates()

orgs = []
for i, row in partydata.iterrows():
    org = Element("org")
    org.set("xml:id", row["id"])
    org.set("role", "parliamentaryGroup")
    orgName = SubElement(org, "orgName")
    orgName.set("full", "yes")
    orgName.set("xml:lang", "hr")
    orgName.text = row["full_name"]
    
    orgName = SubElement(org, "orgName")
    orgName.set("full", "abb")
    orgName.text = row["party"]
    orgs.append(org)
    
# Add nezavisni:
org = Element("org")
org.set("xml:id", "party.NEZAVISNI")
org.set("role", "parliamentaryGroup")
orgName = SubElement(org, "orgName")
orgName.set("full", "yes")
orgName.set("xml:lang", "hr")
orgName.text = "nezavisni"
orgName = SubElement(org, "orgName")
orgName.set("full", "abb")
orgName.text = "nezavisni"
orgs.append(org)

# Add Hrvatski suverenisti:
org = Element("org")
org.set("xml:id", "party.HS")
org.set("role", "parliamentaryGroup")
orgName = SubElement(org, "orgName")
orgName.set("full", "yes")
orgName.set("xml:lang", "hr")
orgName.text = "Hrvatski Suverenisti"
orgName = SubElement(org, "orgName")
orgName.set("full", "abb")
orgName.text = "HS"
orgs.append(org)

# Add Hrvatska konzervativna stranka
org = Element("org")
org.set("xml:id", "party.HKS")
org.set("role", "parliamentaryGroup")
orgName = SubElement(org, "orgName")
orgName.set("full", "yes")
orgName.set("xml:lang", "hr")
orgName.text = "Hrvatska konzervativna stranka"
orgName = SubElement(org, "orgName")
orgName.set("full", "abb")
orgName.text = "HKS"
orgs.append(org)


orgs = [pretty_string(i) for i in orgs]

orgstring = "\n".join(
    ["\n".join(org.split("\n")[1:])for org in orgs]
)
# print(orgstring)

# People: line 525

path: `/teiCorpus/teiHeader/profileDesc/particDesc/listPerson`

In [11]:
for name in mpdf.fullname.unique():
    if mpdf.loc[mpdf.fullname==name, "codemp"].unique().shape[0] != 1:
        print(name, "\n", mpdf.loc[mpdf.fullname==name, "codemp"])

In [12]:
listPerson = []
codemps = mpdf.codemp.drop_duplicates()
n = 0
for i, c in enumerate(codemps):    
    subset = mpdf.query(f"codemp=='{c}'").reset_index(drop=True)
    surname = subset.lastname[0]
    forename = subset.firstname[0]
    sex = "M" if subset.gender[0] == 0 else "F"
    birth = str(subset.date_of_birth[0])
    birth = f"{birth[0:4]}-{birth[4:6]}-{birth[-2:]}"
    reference_name = f"{surname.replace(' ', '')}{forename.replace(' ', '')}"
    
    person = Element("person")
    person.set("xml:id", reference_name)
    person.set("n", str(c))
    n += 1
    persName = SubElement(person, "persName")
    
    buf = SubElement(persName, "surname")
    buf.text = surname
    
    buf = SubElement(persName, "forename")
    buf.text = forename
    
    buf = SubElement(person, "sex")
    buf.set("value", sex)
    
    buf = SubElement(person, "birth")
    buf.set("when", birth)
    

    for i, row in subset.drop_duplicates().iterrows():  
        term = row["term2"]
        fromto = termdata.loc[term, :]
        aff = SubElement(person, "affiliation")
        aff.set("role", "member")
        aff.set("ref", f"#{transform_abbreviation(row['party'])}")
        aff.set("from", fromto.From)
        if not pd.isna(fromto.To):
            aff.set("to", fromto.To)
        
        rolename = SubElement(aff, "roleName")
        rolename.set("xml:lang", "en")
        rolename.text = "Member"

        aff = SubElement(person, "affiliation")
        aff.set("role", "member")
        aff.set("ref", "#HS")
        aff.set("ana", f"#HS.{row['term2']}")
        aff.set("from", fromto.From)
        if not pd.isna(fromto.To):
            aff.set("to", fromto.To)
        rolename = SubElement(aff, "roleName")
        rolename.set("xml:lang", "en")
        rolename.text = "MP"
    listPerson.append(person)

# # Add Unknown to listPersons:
# reference_name = "Unknown"
# person = Element("person")
# person.set("xml:id", reference_name)
# person.set("n", str(n))
# n += 1
# persName = SubElement(person, "persName")
# buf = SubElement(persName, "surname")
# buf.text = "Unknown"
# buf = SubElement(persName, "forename")
# buf.text = "Unknown"
# listPerson.append(person)

# Add Marina to listPersons
reference_name = "MatulovićDropulićMarina"
person = Element("person")
person.set("xml:id", reference_name)
person.set("n", str(n))
n += 1
persName = SubElement(person, "persName")
buf = SubElement(persName, "surname")
buf.text = "Matulović Dropulić"
buf = SubElement(persName, "forename")
buf.text = "Marina"
buf = SubElement(person, "sex")
buf.set("value", "F")
listPerson.append(person)



In [13]:
listPerson = ["\n".join(pretty_string(p).split("\n")[1:]) for p in listPerson]
listPerson = "\n".join(listPerson)


In [14]:
path_to_term_tmxs = Path("/home/rupnik/parlamint/S/")
assert path_to_term_tmxs.exists()


xiincludes = ""
for file in sorted(list(path_to_term_tmxs.glob("ParlaMint-HR_T*.xml"))):
    xiincludes += f"""    <xi:include xmlns:xi="http://www.w3.org/2001/XInclude" href="{file.name}"/>\n"""


# Relations

path: `/teiCorpus/teiHeader/profileDesc/particDesc/listOrg/listRelation`

In [15]:
# gb = partiesdf.groupby(["term2", "coalition"]).agg(
#     {
#         "id": lambda l: " ".join(set(["#"+i for i in l]))
#     }
# )

gb = partiesdf.groupby(["term2", "coalition"]).agg(
    {
        "id": lambda l: " ".join(set(["#"+i for i in l if i in partydata.id.values
                                      ]))
    }
)
terms = partiesdf.term2.unique()
listRelation = ""
for term in terms:
    fromto = termdata.loc[term, :]
    ind = (term, 1)
    coalition = gb.loc[ind].id
    ind = (term, 0)
    opposition = gb.loc[ind].id
    listRelation += f"""
    <relation name="coalition"
            mutual="{coalition}"
            from="{fromto.From}"
            {'to="'+fromto.To+'"' if not pd.isna(fromto.To) else ""}
            ana="#HS.{term}"/>
    <relation name="opposition"
            active="{opposition}"
            passive="#government.HR"
            from="{fromto.From}"
            {'to="'+fromto.To+'"' if not pd.isna(fromto.To) else ""}
            ana="#HS.{term}"/>\n"""    

# Counting extents and tagUsages

In [16]:
from typing import Union
from pathlib import Path
def get_extent(path: Union[str, Path]):
    if isinstance(path, Path):
        assert path.exists()
        path = str(path)
    namespace= {
        "n": "http://www.tei-c.org/ns/1.0",
        
    }
    termtei = parse(path)
    measures = termtei.getroot().findall(".//n:measure", namespaces=namespace)
    for i in measures:
        if i.get("unit") == "speeches":
            speeches = i.get("quantity")
        if i.get("unit") == "words":
            words = i.get("quantity")
    return {"words": int(words), "speeches": int(speeches)}

def get_tagusage(path: Union[str, Path]):
    if isinstance(path, Path):
        assert path.exists()
        path = str(path)
    namespace= {
        "n": "http://www.tei-c.org/ns/1.0",
        
    }
    termtei = parse(path)
    tagusages = termtei.getroot().findall(".//n:tagUsage", namespaces=namespace)
    for i in tagusages:
        if i.get("unit") == "speeches":
            speeches = i.get("quantity")
        if i.get("unit") == "words":
            words = i.get("quantity")
    return {i.get("gi"): i.get("occurs") for i in tagusages}

path_to_term_tmxs = Path("/home/rupnik/parlamint/S/")
assert path_to_term_tmxs.exists()


extents = []
tagusages = []
for file in sorted(list(path_to_term_tmxs.glob("ParlaMint-HR_T*.xml"))):
    extents.append(get_extent(file))
    tagusages.append(get_tagusage(file))

tagusage = ""
import pandas as pd
tus = pd.DataFrame(data=tagusages).astype(int)
for gi, occurs in tus.sum().items():
    tagusage += f"""<tagUsage gi="{gi}" occurs="{int(occurs)}"/>\n"""
extent_count = pd.DataFrame(data=extents).astype(int).sum()
words = extent_count["words"]
speeches = extent_count["speeches"]
extent = f"""<measure unit="speeches" quantity="{speeches}" xml:lang="hr">{f'{speeches:,d}'.replace(',','.')} govora</measure>
<measure unit="speeches" quantity="{speeches}" xml:lang="en">{speeches:,d} speeches</measure>
<measure unit="words" quantity="{words}" xml:lang="hr">{f'{words:,d}'.replace(',','.')} riječi</measure>
<measure unit="words" quantity="{words}" xml:lang="en">{words:,d} words</measure>
"""

# Putting it all together

In [20]:
with open("/home/rupnik/parlamint/T/ParlaMint-HR_template2.xml") as f:
    content = f.read()
from string import Template
template = Template(content)
filled_in = template.substitute(
                    listEvent=listEvent_str,
                    orgs=orgstring,
                    listPerson = listPerson,
                    xiincludes = xiincludes,
                    listRelation = listRelation,
                    extent = extent,
                    tagusage = tagusage
                    )

with open("/home/rupnik/parlamint/S/ParlaMint-HR.xml", "w") as f:
    f.write(
        filled_in
    )

In [21]:
%%bash
pwd

/home/rupnik/parlamint/task15


In [24]:
%%bash

cp /home/rupnik/parlamint/S/ParlaMint-HR*.xml /home/rupnik/parlamint/ParlaMint/Data/ParlaMint-HR/
# cp /home/rupnik/parlamint/S/ParlaMint-HR.xml /home/rupnik/parlamint/ParlaMint/Data/ParlaMint-HR/

cd /home/rupnik/parlamint/ParlaMint
make add-common-content-HR
cd Data/ParlaMint-HR/
cp add-common-content/* .
rm -r add-common-content

rm -rf Data/ParlaMint-HR/add-common-content
mkdir -p Data/ParlaMint-HR/add-common-content
java -jar /usr/share/java/saxon.jar outDir=Data/ParlaMint-HR/add-common-content \
   -xsl:Scripts/parlamint-add-common-content.xsl \
   Data/ParlaMint-HR/ParlaMint-HR.ana.xml || :
java -jar /usr/share/java/saxon.jar outDir=Data/ParlaMint-HR/add-common-content \
   anaDir=`pwd`/Data/ParlaMint-HR/add-common-content/ParlaMint-HR \
   -xsl:Scripts/parlamint-add-common-content.xsl \
   Data/ParlaMint-HR/ParlaMint-HR.xml || :
for component in `echo Data/ParlaMint-HR/ParlaMint-HR.ana.xml| xargs -I % java -cp /usr/share/java/saxon.jar net.sf.saxon.Query -xi:off \!method=adaptive -qs:'//*[local-name()="teiHeader"]//*[local-name()="include"]/@href' -s:% |sed 's/^ *href="//;s/"//'`; do \
  echo "copying header component: Data/ParlaMint-HR/${component}" ; \
    cp Data/ParlaMint-HR/${component} Data/ParlaMint-HR/add-common-content/ParlaMint-HR; \
done;
echo "Result is in: Data/ParlaMint-HR/add-common-content/

Source file Data/ParlaMint-HR/ParlaMint-HR.ana.xml does not exist
ERROR : cannot locate .ana file /home/rupnik/parlamint/ParlaMint/Data/ParlaMint-HR/add-common-content/ParlaMint-HR/ParlaMint-HR_T05_S01.ana.xml
ERROR : cannot locate .ana file /home/rupnik/parlamint/ParlaMint/Data/ParlaMint-HR/add-common-content/ParlaMint-HR/ParlaMint-HR_T05_S02.ana.xml
ERROR : cannot locate .ana file /home/rupnik/parlamint/ParlaMint/Data/ParlaMint-HR/add-common-content/ParlaMint-HR/ParlaMint-HR_T05_S03.ana.xml
ERROR : cannot locate .ana file /home/rupnik/parlamint/ParlaMint/Data/ParlaMint-HR/add-common-content/ParlaMint-HR/ParlaMint-HR_T05_S04.ana.xml
ERROR : cannot locate .ana file /home/rupnik/parlamint/ParlaMint/Data/ParlaMint-HR/add-common-content/ParlaMint-HR/ParlaMint-HR_T05_S05.ana.xml
ERROR : cannot locate .ana file /home/rupnik/parlamint/ParlaMint/Data/ParlaMint-HR/add-common-content/ParlaMint-HR/ParlaMint-HR_T05_S06.ana.xml
ERROR : cannot locate .ana file /home/rupnik/parlamint/ParlaMint/Data/

In [1]:
%%bash
cd /home/rupnik/parlamint/ParlaMint
make validate-parlamint-HR

Scripts/validate-parlamint.pl Schema 'Data/ParlaMint-HR'


INFO: Validating directory /home/rupnik/parlamint/ParlaMint/Data/ParlaMint-HR
INFO: Validating TEI root /home/rupnik/parlamint/ParlaMint/Data/ParlaMint-HR/ParlaMint-HR.xml
INFO: XML validation for ParlaMint-HR.xml
INFO: Content validaton for ParlaMint-HR.xml
INFO: particDesc content validaton for ParlaMint-HR.xml
INFO[10] : Total number of affiliations with government.HR: 0
ERROR[10] : government-role organisation without affiliation: #government.HR
INFO[10] : Total number of affiliations with HS: 1141
INFO[10] : Total number of affiliations with party.DC: 4
INFO[10] : Total number of affiliations with party.HDSSB: 23
INFO[10] : Total number of affiliations with party.HDZ: 458
INFO[10] : Total number of affiliations with party.HNS: 56
INFO[10] : Total number of affiliations with party.HSLS: 9
INFO[10] : Total number of affiliations with party.HSP: 8
INFO[10] : Total number of affiliations with party.HSS: 35
INFO[10] : Total number of affiliations with party.HSU: 14
INFO[10] : Total num

# Notes on the output:
<!-- 
`ParlaMint-HR.xml:473:55: error: value of attribute "to" is invalid; must be an ISO year and month, must be an ISO date, must be an ISO date and time or must be a year`: this is because I set the end of term 10 as ongoing instead of a date. 

`ParlaMint-HR.xml:595:66: error: value of attribute "xml:id" is invalid; must be an XML name without colons`: ~~Will try to fix by removing punctuations from party references. There aren't any colons in the name, though.~~ Yup, this worked.

`ParlaMint-HR.xml:727:26: error: value of attribute "name" is invalid; must be equal to "coalition", "opposition", "renaming", "representing" or "successor"`: value was "oposition" [sic.] This is since corrected. -->

<!-- ```
ParlaMint-HR.xml:7829:45: error: ID "LovrićMerzelMarina" has already been defined
ParlaMint-HR.xml:6797:45: error: first occurrence of ID "LovrićMerzelMarina"
ParlaMint-HR.xml:8645:42: error: ID "VrbatGrgićTanja" has already been defined
ParlaMint-HR.xml:6506:42: error: first occurrence of ID "VrbatGrgićTanja"
```
this stems from the fact that those two MP have twin `codeMPs`.  -->

<!-- Errors I should focus on:
`ERROR ParlaMint-HR_T06: ERROR: Can't find local id for u/@who="#ErrorError:row['Speaker_name']='Nepoznat',row['lastname']=nan,row['firstname']=nan"`: Clearly missing data.

Other errors that I do not know how to remedy:

```
ERROR ParlaMint-HR: Duplicate party affiliation for #HS
ERROR ParlaMint-HR: Duplicate party affiliation for #party.SDP
```

ERROR ParlaMint-HR_T05: ERROR: Can't find local id for u/@who="#PalarićAntun"
 -->


# Adding missing persons:
~~Only {'MatulovićDropulićMarina'} will have to be added artificially.~~ Added {'MatulovićDropulićMarina'}.

In [None]:
f = "008_validation_out"
with open(f) as f:
    content = f.readlines()
# Pattern to catch:
# ERROR: Can't find local id for u/@who="#MatulovićDropulićMarina"

pattern = '''{begining}find local id for u/@who="#{name}"\n'''
from parse import compile
p = compile(pattern)

missing = set()
for line in content:
    result = p.parse(line)
    try:
        missing.add(result["name"])
    except:
        continue
missing

set()