After header, we have the following hierarchy:

```xml
<?xml version="1.0" encoding="UTF-8"?>
<TEI xmlns="http://www.tei-c.org/ns/1.0"
     xml:lang="hr"
     xml:id="ParlaMint-HR_S02"
     ana="#parla.session #reference">
   <teiHeader>
   </teiHeader>
   <text xml:lang="hr" ana="#reference">
      <body>
         <div type="debateSection">
            <head>2. sjednica</head>
            <u who="#PetrovBožo"
               ana="#chair"
               xml:id="ParlaMint-HR_S02.u1"
               n="325729">
               <seg xml:id="seg21209">Gospođe i gospodo zastupnici...</seg>
               <seg xml:id="seg21210">Prvi na redu za postavljanja ...</seg>
               <seg xml:id="seg21211">Izvolite.</seg>
            </u>
         </div>
      </body>
   </text>
</TEI>
```

In [1]:
from xml.etree.ElementTree import Element, SubElement, tostring
from xml.dom import minidom
TEI = Element('TEI')
teiHEADER = SubElement(TEI, "teiHEADER")
teiHEADER.text = "This is a draft"

text = SubElement(TEI, "text")
body = SubElement(text, "body")
div = SubElement(body, "div")
head = SubElement(div, "head")
head.text = "2. sjednica"

u = SubElement(div, "u")
u.set("who", "#PetrovBožo")
u.set("ana", "#chair")
u.set("xml:id","ParlaMint-HR_S02.u1")
u.set("n","325729")

for i in range(5):
    seg = SubElement(u, "seg")
    seg.set("xml:id", "seg"+str(i))
    seg.text = "blabla"+str(i)

TEI.append(seg)
u = SubElement(div, "u")
u.set("who", "me")
u.set("ana", "#civilian")

seg = SubElement(u, "seg")
seg.set("xml:id", "seg213123123")
seg.text = "blablabla"

TEI.append(u)
def pretty_print(s:bytes) -> None:
    print(minidom.parseString(tostring(s).decode("utf")).toprettyxml("\t"))

pretty_print(TEI)


<?xml version="1.0" ?>
<TEI>
	<teiHEADER>This is a draft</teiHEADER>
	<text>
		<body>
			<div>
				<head>2. sjednica</head>
				<u who="#PetrovBožo" ana="#chair" xml:id="ParlaMint-HR_S02.u1" n="325729">
					<seg xml:id="seg0">blabla0</seg>
					<seg xml:id="seg1">blabla1</seg>
					<seg xml:id="seg2">blabla2</seg>
					<seg xml:id="seg3">blabla3</seg>
					<seg xml:id="seg4">blabla4</seg>
				</u>
				<u who="me" ana="#civilian">
					<seg xml:id="seg213123123">blablabla</seg>
				</u>
			</div>
		</body>
	</text>
	<seg xml:id="seg4">blabla4</seg>
	<u who="me" ana="#civilian">
		<seg xml:id="seg213123123">blablabla</seg>
	</u>
</TEI>



In [2]:
import pandas as pd
pd.set_option('display.max_colwidth', 30)
pd.set_option('display.max_columns', None)

merged = pd.read_pickle("01_merged_data")


In [3]:
def get_who_field(row) -> str:
    try:
        lastname = "".join(row["lastname"].split())
        firstname = "".join(row["firstname"].split())
        return f"#{lastname}{firstname}"
    except:
        try:
            lastname = row["Speaker_name"].split(",")[0]
            firstname = row["Speaker_name"].split(",")[1]
            lastname = "".join(lastname.split())
            firstname = "".join(firstname.split())
            return f"#{lastname}{firstname}"
        except:
            print("Getting errors for ", row["Speaker_name"], row["lastname"], row["firstname"])

def get_ana_field(row) -> str:
    mapping = dict(
        Chairperson="#chair",
        Regular="#regular"
    )
    try:
        return mapping.get(row["Speaker_role"])
    except KeyError:
        raise KeyError("Can't find mapping for "+row["Speaker_role"])

Looping prep

In [4]:
from xml.etree.ElementTree import Element, SubElement, tostring
from xml.dom import minidom
TEI = Element('TEI')
teiHEADER = SubElement(TEI, "teiHEADER")
teiHEADER.text = "Header should be filled out. Right now it is not."

text = SubElement(TEI, "text")
body = SubElement(text, "body")
div = SubElement(body, "div")

current_u_n = 0
seg_index = 0
file_index = 5
title = None
for i, row in merged.iterrows():
    if row["Title"] != title:
        head = SubElement(div, "head")
        head.text = row["Title"]
        title = row["Title"]
    u = SubElement(div, "u")
    u.set("who", get_who_field(row))
    u.set("ana", get_ana_field(row))
    u.set("xml:id", row["ID"])
    u.set("n",str(current_u_n))

    for segment in row["sentences"]:
        seg = SubElement(u, "seg")
        seg.set("xml:id", f"seg{file_index:02}{seg_index:010}")
        seg.text = segment
        seg_index += 1
    current_u_n += 1

In [7]:
with open(f"ParlaMint-HR_{file_index:02}.xml", "w") as f:
    f.write(minidom.parseString(tostring(TEI).decode("utf")).toprettyxml("\t"))

In [38]:
pd.set_option('display.max_colwidth', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
gb = merged.groupby(["Speaker_name", "Term", "Session", "Meeting"]).Speaker_role.apply(set).dropna().apply(len)
gb = gb[~(gb.values.astype(int) == 1)]
gb.reset_index().Speaker_name.unique().tolist()

['Adlešič, Đurđa',
 'Antičević Marinović, Ingrid',
 'Arlović, Mato',
 'Bebić, Luka',
 'Kovačević, Pero',
 'Lalić, Ljubica',
 'Lesar, Dragutin',
 'Letica, Slaven',
 'Matušić, Frano',
 'Milinović, Darko',
 'Mršić, Zvonimir',
 'Pecek, Željko',
 'Pusić, Vesna',
 'Stazić, Nenad',
 'Šeks, Vladimir',
 'Šuker, Ivan']

In [8]:
merged.head(2)

Unnamed: 0,ID,Text,Title,From,To,House,Term,Session,Meeting,Sitting,Agenda,Subcorpus,Speaker_role,Speaker_type,Speaker_party,Speaker_party_name,Party_status,Speaker_name,Speaker_gender,Speaker_birth,Codemp,Codeparty,term2,codemp,order_id,term1_x,term_id,type_of_list,fullname,firstname,lastname,party,date_of_birth,year_of_birth,gender,place_of_birth,field_of_study,education_y,constituency,bp_lat,bp_lon,codeparty,term1_y,full_name,established,chairman,ideology_LR,party_family,election_result,no_seats,coalition,coalition_composition,ruling,sentences
0,ParlaMint-HR_T5.S1.u1,Cijenjene gospođe i gospod...,Minutes of the National As...,2003-12-22,2003-12-22,,5,1,-,,Izbor predsjednika Hrvatsk...,Reference,Chairperson,MP,HSS,Hrvatska seljačka stranka,Opposition,"Tomčić, Zlatko",M,1945.0,M702,P7,5,M702,351.0,2003-2007,159.0,not_active,"Tomčić, Zlatko",Zlatko,Tomčić,HSS,19450710,1945.0,0.0,Zagreb,2,16,2,45.815011,15.981919,P7,2003-2007,Hrvatska seljačka stranka,1989.0,Josip Friščić,3.0,1,7.15,9.0,0.0,-,0.0,[Cijenjene gospođe i gospo...
1,ParlaMint-HR_T5.S1.u2,"Gospodine predsjedavajući,...",Minutes of the National As...,2003-12-22,2003-12-22,,5,1,-,,Izbor predsjednika Hrvatsk...,Reference,Regular,MP,HDZ,Hrvatska demokratska zajed...,Coalition,"Šeks, Vladimir",M,1943.0,M638,P3,5,M638,322.0,2003-2007,130.0,normal,"Šeks, Vladimir",Vladimir,Šeks,HDZ,19430101,1943.0,0.0,Osijek,5,16,4,45.554962,18.695514,P3,2003-2007,Hrvatska demokratska zajed...,1989.0,Ivo Sanader,4.0,1,33.91,66.0,0.0,-,1.0,[Gospodine predsjedavajući...


In [27]:
merged.Meeting.unique()

['-', '1', '2', '3', '4', ..., '102', '103', '104', '105', '87; 88']
Length: 157
Categories (157, object): ['-', '1', '10', '100', ..., '99', '9; 10', '9; 10; 11', '9; 10; 11; 12']