In [1]:
from typing import Union, List
from pathlib import Path
import pandas as pd
pd.set_option('display.max_colwidth', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

mpdf = pd.read_pickle("mpdf_corrected.pickle")
mpdf["party"] = mpdf.party.fillna("NEZAVISNI")
partiesdf = pd.read_pickle("partiesdf_corrected.pickle")
termdata = pd.read_csv("../SRB/terms.csv").set_index("Term").astype(str)

def transform_abbreviation(s: str) -> str:
    return f"party.{s.replace(' ', '_').upper().replace('!', '')}"



from tqdm import tqdm
tqdm.pandas()



Fix missing dates where we have years available:

In [2]:
c = ( mpdf.date_of_birth == "-" )& (~mpdf.year_of_birth.isna()) & (~(mpdf.year_of_birth == "-"))
mpdf.loc[c, "date_of_birth"] = mpdf.year_of_birth[c]

Let's see which parties have non-trivial names:

In [3]:
gb = partiesdf.groupby("party").agg({
    "full_name": lambda l: len(set(l)),
    # "full_name": set
})
problematic_abbreviations = gb[gb.full_name != 1].index.values
problematic_abbreviations

gb = partiesdf[partiesdf.party.isin(problematic_abbreviations)].groupby("party").agg({
    "full_name": set,
})
gb

Unnamed: 0_level_0,full_name
party,Unnamed: 1_level_1
BS,"{Bogata Srbija, Bolja Srbija}"
DHSS,"{Demohrišćanska stranka Srbije, Demohrišćanska Stranka Srbije}"
LS,"{Liga za Šumadiju, Lista za Sandžak}"
PS,"{Pokret socilalista, Pokret socijalista}"
SDP,"{Sandžačka demokratska partija, Socijaldemokratska partija Srbije, Socijaldemokratska partija}"


# Recipe:

* `Demohrišćanska Stranka Srbije` -> `Demohrišćanska stranka Srbije` in accordance with Wikipedia.
* `Pokret socilalista` -> `Pokret socialista`, seems a human error.
* `Socijaldemokratska partija` -> `Socijaldemokratska partija Srbije`, in accordance with Wikipedia. The abbreviation to be changed to SDPS in accordance with Wikipedia. There is no extant party with SDPS abbreviation. `Sandžačka demokratska partija` can remain abbreviated SDP.
* `Liga za Šumadiju` to be assigned LŠ, `Lista za Sandžak` to be left LS
* `Bogata Srbija` to be assigned BogS, `Bolja Srbija` preserves BS

While this is easily done in parties table, but for MPs we have no party affiliation other than the abbreviation. This will have to be done manually for parties BS, LS, SDP.

In [4]:
partiesdf["full_name"] = partiesdf.full_name.replace({
    "Demohrišćanska Stranka Srbije": "Demohrišćanska stranka Srbije",
    "Pokret socilalista": "Pokret socijalista",
    "Socijaldemokratska partija": "Socijaldemokratska partija Srbije"
})

partiesdf.loc[partiesdf.full_name == "Liga za Šumadiju", "party"] = "LŠ"
partiesdf.loc[partiesdf.full_name == "Bogata Srbija", "party"] = "BogS"
partiesdf.loc[partiesdf.full_name == "Socijaldemokratska partija Srbije", "party"] = "SDPS"


In [5]:
# Inspection:
# mpdf[mpdf.party.isin(["BS", "LS", "SDP"])]

mpdf.loc[[4,218,215,231], "party"] = "LŠ"
mpdf.loc[[608, 653,  662, 919,940,1138,1183, 1215,1322,1357,1415, 1439, 1445, 1459,1464,1498, 1556, 1644, 1729,1735, 1746, 1757, 1760, 1770,1786,1815,1942,1952,1964,1969,2010,2015,2023,2031,2041,2066,2186,2238,2248,2257,2304,2306,2414,2428], "party"] = "SDPS"
mpdf.loc[[1593], "party"] = "BogS"

In [6]:
from xml.etree.ElementTree import Element, SubElement, tostring, XML, parse
from xml.dom import minidom
def pretty_print(s) -> None:
    print(minidom.parseString(tostring(s).decode("utf")).toprettyxml("\t"))
def pretty_string(s) -> None:
    return minidom.parseString(tostring(s).decode("utf")).toprettyxml("\t")

# Terms (line 438):

path: `/teiCorpus/teiHeader/profileDesc/particDesc/listOrg/org[2]/listEvent`

In [7]:
listEvent = Element("listEvent")
head = SubElement(listEvent, "head")
head.set("xml:lang", "sr")
head.text = "Mandatno obdoblje"
head = SubElement(listEvent, "head")
head.set("xml:lang", "en")
head.text = "Legislative period"

for i, row in termdata.iterrows():
    event = SubElement(listEvent, "event")
    event.set("xml:id", f"NS.{i}")
    event.set("from", row["From"])
    to = row["To"]
    if not pd.isna(to):
        event.set("to", row["To"])
    
    label = SubElement(event, "label")
    label.set("xml:lang", "sr")
    label.text = f"{int(i)}. saziv"
    label = SubElement(event, "label")
    label.set("xml:lang", "en")
    label.text = f"Term {int(i)}"
listEvent_str = pretty_string(listEvent)
listEvent_str = "\n".join(listEvent_str.split("\n")[1:])
print(listEvent_str)

<listEvent>
	<head xml:lang="sr">Mandatno obdoblje</head>
	<head xml:lang="en">Legislative period</head>
	<event xml:id="NS.4" from="19971203" to="20010121">
		<label xml:lang="sr">4. saziv</label>
		<label xml:lang="en">Term 4</label>
	</event>
	<event xml:id="NS.5" from="20010122" to="20040126">
		<label xml:lang="sr">5. saziv</label>
		<label xml:lang="en">Term 5</label>
	</event>
	<event xml:id="NS.6" from="20040127" to="20070213">
		<label xml:lang="sr">6. saziv</label>
		<label xml:lang="en">Term 6</label>
	</event>
	<event xml:id="NS.7" from="20070214" to="20080610">
		<label xml:lang="sr">7. saziv</label>
		<label xml:lang="en">Term 7</label>
	</event>
	<event xml:id="NS.8" from="20080611" to="20120530">
		<label xml:lang="sr">8. saziv</label>
		<label xml:lang="en">Term 8</label>
	</event>
	<event xml:id="NS.9" from="20120531" to="20140415">
		<label xml:lang="sr">9. saziv</label>
		<label xml:lang="en">Term 9</label>
	</event>
	<event xml:id="NS.10" from="20140416" to="201606

# Parties: line 447

path:
`/teiCorpus/teiHeader/profileDesc/particDesc/listOrg/org[3:]`

In [8]:
partiesdf["id"] = partiesdf.party.apply(transform_abbreviation)

partydata = partiesdf["party full_name id".split()].drop_duplicates()

orgs = []
for i, row in partydata.iterrows():
    org = Element("org")
    org.set("xml:id", row["id"])
    org.set("role", "parliamentaryGroup")
    orgName = SubElement(org, "orgName")
    orgName.set("full", "yes")
    orgName.set("xml:lang", "sr")
    orgName.text = row["full_name"]
    
    orgName = SubElement(org, "orgName")
    orgName.set("full", "abb")
    orgName.text = row["party"]
    orgs.append(org)
    
# Add nezavisni:
org = Element("org")
org.set("xml:id", "party.NEZAVISNI")
org.set("role", "parliamentaryGroup")
orgName = SubElement(org, "orgName")
orgName.set("full", "yes")
orgName.set("xml:lang", "sr")
orgName.text = "nezavisni"
orgName = SubElement(org, "orgName")
orgName.set("full", "abb")
orgName.text = "nezavisni"
orgs.append(org)

orgs = [pretty_string(i) for i in orgs]

orgstring = "\n".join(
    ["\n".join(org.split("\n")[1:])for org in orgs]
)
# print(orgstring)

# People: line 525

path: `/teiCorpus/teiHeader/profileDesc/particDesc/listPerson`

In [9]:
listPerson = []
codemps = mpdf.codemp.drop_duplicates()
n = 0

from utils import drop_punctuation
for i, c in enumerate(codemps):    
    subset = mpdf.query(f"codemp=='{c}'").reset_index(drop=True)
    surname = subset.lastname[0]
    forename = subset.firstname[0]
    sex = "M" if subset.gender[0] == 0 else "F"
    birth = str(subset.date_of_birth[0])
    try:
        birth = f"{birth[0:4]}-{birth[4:6]}-{birth[-2:]}"
    except:
        pass
    reference_name = f"{surname.replace(' ', '')}{forename.replace(' ', '')}"
    reference_name = drop_punctuation(reference_name)
    
    person = Element("person")
    person.set("xml:id", reference_name)
    person.set("n", str(c))
    n += 1
    persName = SubElement(person, "persName")
    
    buf = SubElement(persName, "surname")
    buf.text = surname
    
    buf = SubElement(persName, "forename")
    buf.text = forename
    
    buf = SubElement(person, "sex")
    buf.set("value", sex)
    try:
        int(birth) # This will trip if date isn't a numeric string
        buf = SubElement(person, "birth")
        buf.set("when", birth)
    except:
        pass
    

    for i, row in subset.drop_duplicates().iterrows():  
        term = row["term2"]
        fromto = termdata.loc[term, :]
        aff = SubElement(person, "affiliation")
        aff.set("role", "member")
        aff.set("ref", f"#{transform_abbreviation(row['party'])}")
        aff.set("from", fromto.From)
        if not pd.isna(fromto.To):
            aff.set("to", fromto.To)
        
        rolename = SubElement(aff, "roleName")
        rolename.set("xml:lang", "en")
        rolename.text = "Member"

        aff = SubElement(person, "affiliation")
        aff.set("role", "member")
        aff.set("ref", "#NS")
        aff.set("ana", f"#NS.{row['term2']}")
        aff.set("from", fromto.From)
        if not pd.isna(fromto.To):
            aff.set("to", fromto.To)
        rolename = SubElement(aff, "roleName")
        rolename.set("xml:lang", "en")
        rolename.text = "MP"
    listPerson.append(person)

# # Add Unknown to listPersons:
# reference_name = "Unknown"
# person = Element("person")
# person.set("xml:id", reference_name)
# person.set("n", str(n))
# n += 1
# persName = SubElement(person, "persName")
# buf = SubElement(persName, "surname")
# buf.text = "Unknown"
# buf = SubElement(persName, "forename")
# buf.text = "Unknown"
# listPerson.append(person)

# # Add Marina to listPersons
# reference_name = "MatulovićDropulićMarina"
# person = Element("person")
# person.set("xml:id", reference_name)
# person.set("n", str(n))
# n += 1
# persName = SubElement(person, "persName")
# buf = SubElement(persName, "surname")
# buf.text = "Matulović Dropulić"
# buf = SubElement(persName, "forename")
# buf.text = "Marina"
# buf = SubElement(person, "sex")
# buf.set("value", "F")
# listPerson.append(person)



In [10]:
listPerson = ["\n".join(pretty_string(p).split("\n")[1:]) for p in listPerson]
listPerson = "\n".join(listPerson)


In [12]:
path_to_term_tmxs = Path("/home/rupnik/parlamint/SRB/S/")
assert path_to_term_tmxs.exists()


xiincludes = ""
for file in sorted(list(path_to_term_tmxs.glob("ParlaMint-RS_T*.xml"))):
    xiincludes += f"""    <xi:include xmlns:xi="http://www.w3.org/2001/XInclude" href="{file.name}"/>\n"""


# Relations

path: `/teiCorpus/teiHeader/profileDesc/particDesc/listOrg/listRelation`

In [13]:
# gb = partiesdf.groupby(["term2", "coalition"]).agg(
#     {
#         "id": lambda l: " ".join(set(["#"+i for i in l]))
#     }
# )
partiesdf["coalition"] = partiesdf.coalition.astype(str)
gb = partiesdf.groupby(["term2", "coalition"]).agg(
    {
        "id": lambda l: " ".join(set(["#"+i for i in l if i in partydata.id.values
                                      ]))
    }
)
terms = partiesdf.term2.unique()
listRelation = ""
for term in terms:
    if term > mpdf.term2.max():
        continue
    fromto = termdata.loc[term, :]
    ind = (term, "1")
    coalition = gb.loc[ind].id
    ind = (term, "0")
    opposition = gb.loc[ind].id
    listRelation += f"""
    <relation name="coalition"
            mutual="{coalition}"
            from="{fromto.From}"
            {'to="'+fromto.To+'"' if not pd.isna(fromto.To) else ""}
            ana="#NS.{term}"/>
    <relation name="opposition"
            active="{opposition}"
            passive="#government.RS"
            from="{fromto.From}"
            {'to="'+fromto.To+'"' if not pd.isna(fromto.To) else ""}
            ana="#NS.{term}"/>\n"""    

# Counting extents and tagUsages

In [14]:
from typing import Union
from pathlib import Path
def get_extent(path: Union[str, Path]):
    if isinstance(path, Path):
        assert path.exists()
        path = str(path)
    namespace= {
        "n": "http://www.tei-c.org/ns/1.0",
        
    }
    termtei = parse(path)
    measures = termtei.getroot().findall(".//n:measure", namespaces=namespace)
    for i in measures:
        if i.get("unit") == "speeches":
            speeches = i.get("quantity")
        if i.get("unit") == "words":
            words = i.get("quantity")
    return {"words": int(words), "speeches": int(speeches)}

def get_tagusage(path: Union[str, Path]):
    if isinstance(path, Path):
        assert path.exists()
        path = str(path)
    namespace= {
        "n": "http://www.tei-c.org/ns/1.0",
        
    }
    termtei = parse(path)
    tagusages = termtei.getroot().findall(".//n:tagUsage", namespaces=namespace)
    for i in tagusages:
        if i.get("unit") == "speeches":
            speeches = i.get("quantity")
        if i.get("unit") == "words":
            words = i.get("quantity")
    return {i.get("gi"): i.get("occurs") for i in tagusages}

path_to_term_tmxs = Path("/home/rupnik/parlamint/SRB/S/")
assert path_to_term_tmxs.exists()


extents = []
tagusages = []
for file in sorted(list(path_to_term_tmxs.glob("ParlaMint-RS_T*.xml"))):
    extents.append(get_extent(file))
    tagusages.append(get_tagusage(file))
tus = pd.DataFrame(data=tagusages).astype(int)

tagusage = ""
for gi, occurs in tus.sum().items():
    tagusage += f"""<tagUsage gi="{gi}" occurs="{int(occurs)}"/>\n"""
extent_count = pd.DataFrame(data=extents).astype(int).sum()
words = extent_count["words"]
speeches = extent_count["speeches"]
extent = f"""<measure unit="speeches" quantity="{speeches}" xml:lang="sr">{f'{speeches:,d}'.replace(',','.')} govora</measure>
<measure unit="speeches" quantity="{speeches}" xml:lang="en">{speeches:,d} speeches</measure>
<measure unit="words" quantity="{words}" xml:lang="sr">{f'{words:,d}'.replace(',','.')} reči</measure>
<measure unit="words" quantity="{words}" xml:lang="en">{words:,d} words</measure>
"""

# Putting it all together

In [15]:
with open("/home/rupnik/parlamint/SRB/ParlaMint-RS_template.xml") as f:
    content = f.read()
from string import Template
template = Template(content)
filled_in = template.substitute(
    listEvent=listEvent_str,
    orgs=orgstring,
    listPerson=listPerson,
    xiincludes=xiincludes,
    listRelation=listRelation,
    extent=extent,
    tagusage=tagusage
)

with open("/home/rupnik/parlamint/SRB/ParlaMint-RS.xml", "w") as f:
    f.write(
        filled_in
    )


In [16]:
%%bash
pwd

/home/rupnik/parlamint/task16


In [17]:
%%bash

mkdir ~/parlamint/ParlaMint/Data/ParlaMint-RS

cp ~/parlamint/SRB/S/* ~/parlamint/ParlaMint/Data/ParlaMint-RS/
cp ~/parlamint/SRB/ParlaMint-RS.xml ~/parlamint/ParlaMint/Data/ParlaMint-RS/


cd /home/rupnik/parlamint/ParlaMint

# Validation
make add-common-content-RS
cd Data/ParlaMint-RS/
cp add-common-content/* .
rm -r add-common-content

rm -rf Data/ParlaMint-RS/add-common-content
mkdir -p Data/ParlaMint-RS/add-common-content
java -jar /usr/share/java/saxon.jar outDir=Data/ParlaMint-RS/add-common-content \
   -xsl:Scripts/parlamint-add-common-content.xsl \
   Data/ParlaMint-RS/ParlaMint-RS.ana.xml || :
java -jar /usr/share/java/saxon.jar outDir=Data/ParlaMint-RS/add-common-content \
   anaDir=`pwd`/Data/ParlaMint-RS/add-common-content/ParlaMint-RS \
   -xsl:Scripts/parlamint-add-common-content.xsl \
   Data/ParlaMint-RS/ParlaMint-RS.xml || :
for component in `echo Data/ParlaMint-RS/ParlaMint-RS.ana.xml| xargs -I % java -cp /usr/share/java/saxon.jar net.sf.saxon.Query -xi:off \!method=adaptive -qs:'//*[local-name()="teiHeader"]//*[local-name()="include"]/@href' -s:% |sed 's/^ *href="//;s/"//'`; do \
  echo "copying header component: Data/ParlaMint-RS/${component}" ; \
    cp Data/ParlaMint-RS/${component} Data/ParlaMint-RS/add-common-content/ParlaMint-RS; \
done;
echo "Result is in: Data/ParlaMint-RS/add-common-content/

mkdir: cannot create directory ‘/home/rupnik/parlamint/ParlaMint/Data/ParlaMint-RS’: File exists
Source file Data/ParlaMint-RS/ParlaMint-RS.ana.xml does not exist
ERROR : cannot locate .ana file /home/rupnik/parlamint/ParlaMint/Data/ParlaMint-RS/add-common-content/ParlaMint-RS/ParlaMint-RS_T04_S1.ana.xml
ERROR : cannot locate .ana file /home/rupnik/parlamint/ParlaMint/Data/ParlaMint-RS/add-common-content/ParlaMint-RS/ParlaMint-RS_T04_S2.ana.xml
ERROR : cannot locate .ana file /home/rupnik/parlamint/ParlaMint/Data/ParlaMint-RS/add-common-content/ParlaMint-RS/ParlaMint-RS_T04_S3.ana.xml
ERROR : cannot locate .ana file /home/rupnik/parlamint/ParlaMint/Data/ParlaMint-RS/add-common-content/ParlaMint-RS/ParlaMint-RS_T04_S4.ana.xml
ERROR : cannot locate .ana file /home/rupnik/parlamint/ParlaMint/Data/ParlaMint-RS/add-common-content/ParlaMint-RS/ParlaMint-RS_T04_Sv1.ana.xml
ERROR : cannot locate .ana file /home/rupnik/parlamint/ParlaMint/Data/ParlaMint-RS/add-common-content/ParlaMint-RS/ParlaM

In [18]:
%%bash
cd /home/rupnik/parlamint/ParlaMint
make validate-parlamint-RS

# Notes on the output:


# Adding missing persons:


In [None]:
f = "008_validation_out"
with open(f) as f:
    content = f.readlines()

pattern = '''{begining}find local id for u/@who="#{name}"\n'''
from parse import compile
p = compile(pattern)

missing = set()
for line in content:
    result = p.parse(line)
    try:
        missing.add(result["name"])
    except:
        continue
missing

Unnamed: 0,codemp,order_id,term1,term2,term_id,type_of_list,fullname,firstname,lastname,party,date_of_birth,year_of_birth,gender,place_of_birth,field_of_study,education_y,bp_lat,bp_lon
30,M6,31,1997-2000,4,31,various,"Đorđević, Aleksandar",Aleksandar,Đorđević,SRS,-,1950,0,-,-,16,-,-
106,M7,107,1997-2000,4,107,various,"Đorđević, Aleksandar",Aleksandar,Đorđević,SRS,-,1950,0,Beograd,2,16,44.688045,20.22171
538,M7,539,2003-2007,6,18,normal,"Đorđević, Aleksandar",Aleksandar,Đorđević,SRS,19590820,1959,0,-,-,16,-,-
