# Debate a base
#### Update script (Parlamint 4.1)


### Functions
- Uploads all Parlamint data to ElasticSearch for the `debates` page

### Don't forget to:
- Make sure that there is nothing else inside the `data/original/EU` and `data/original/EN` folders besides what is mentioned in the step-by-step guide
- Fill in `es_host`, `es_user` and `es_password` so you can connect with your ElasticSearch instance

In [6]:
# Fill in credentials over here!
# These values are consumed by the Elasticsearch client created in the next
# cell: `es_host` is passed in `hosts`, `es_user` / `es_password` in `http_auth`.
# The "CHANGEME" placeholders must be replaced before running the notebook.
es_host = "https://localhost:9200"
es_user = "CHANGEME"
es_password = "CHANGEME"

In [7]:
# Local paths according to the step-by-step guide
# NOTE(review): in the visible source these two paths are only referenced by
# the commented-out extraction cell further down.
original_set = "../data/original/EU"
translated_set = "../data/original/EN"

# Setup elastic host connection
from elasticsearch import Elasticsearch, helpers

# Module-level client; `upload_xmls` hands it to `helpers.bulk` for every
# processed debate file.
es = Elasticsearch(
    hosts=[
        es_host
    ],
    http_auth=(es_user, es_password),
#     use_ssl=True,
    # NOTE(review): certificate verification is switched off here; enable it
    # (together with `ca_certs` below) when connecting to a non-local instance.
    verify_certs=False,
#     ca_certs="./ca.crt"
)

### Delete previous version of debates index on your ElasticSearch instance
- Do this in case your `search` index for the debates page is not empty
- Delete the "search" index manually or execute the following command
- Don't forget to make it a comment again after executing it

In [76]:
# es.indices.delete(index='search')

### Execution
- Click "Kernel" > "Restart & run all"
- At the bottom, below the last MarkDown cell, it should be printing information about files being uploaded to ElasticSearch
- THIS TAKES A VERY LONG TIME

In [46]:
# Imports
import os
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import re
from unidecode import unidecode
from bs4 import BeautifulSoup as bs

from dataclasses import dataclass

import tarfile
import shutil

import time

In [1]:
# Extract data from .tgz
# Folders that hold the extracted corpora. `translated_set_extract` is listed
# by `upload_xmls` to locate the English translation of each country;
# `original_set_extract` is only used by the commented-out code below.
original_set_extract = "../EU"
translated_set_extract = "../EN"

# Removes obsolete extracted files (must be updated to work again)
# def clean_folder(path):
#     os.chdir(path)
    
#     for folder in os.listdir():
#         if ".TEI" not in folder:
#             shutil.rmtree(os.path.join(path, folder))

# # Original data
# os.chdir(original_set)

# if "EU" not in os.listdir("../../preprocessed"):
#     for folder in os.listdir():
#         if folder != "ParlaMint-4.1.tar.gz":
#             folder_extract = tarfile.open(folder)
            
#             folder_extract.extractall("../../preprocessed/EU")
#             folder_extract.close()
#     print("[Info]: EU created")
# else:
#     print("[Info]: EU already exists")

# clean_folder(original_set_extract)
            
# # Translated data
# os.chdir(translated_set)

# if "EN" not in os.listdir("../../preprocessed"):
#     for folder in os.listdir():
#         if folder != "ParlaMint-4.1-en.tar.gz":
#             os.chdir(translated_set)
            
#             folder_extract = tarfile.open(folder)
            
#             folder_extract.extractall("../../preprocessed/EN")
#             folder_extract.close()
            
#             clean_folder(translated_set_extract)
#     print("[Info]: EN created")
# else:
#     print("[Info]: EN already exists")

In [29]:
# Retrieve the file structure
def get_xml_files(country_selection):
    """Collect the XML file names of the selected country folders.

    Every entry of the current working directory whose name occurs in
    ``country_selection`` is walked recursively.

    Args:
        country_selection: Collection of folder names to include
            (e.g. ``["ParlaMint-NL.TEI"]``).

    Returns:
        A list with one dict per selected entry. Each dict maps a walked
        directory path to the list of file names in that directory whose
        name contains ".xml" and no "~". Directories whose path contains
        "Schema" are left out. The country's own root folder is the first
        key of its dict because ``os.walk`` yields it first.
    """
    collected = []

    for entry in os.listdir():
        # only look at the requested countries
        if entry not in country_selection:
            continue

        xml_by_folder = {}
        for folder, _subfolders, names in os.walk(entry):
            # schema folders are noise for the later processing steps
            if "Schema" in folder:
                continue

            # keep XML files only and ignore editor backups ("~")
            xml_by_folder[folder] = [
                name for name in names if ".xml" in name and "~" not in name
            ]

        collected.append(xml_by_folder)

    return collected

In [30]:
# functie testruimte
# selected_countries = ["ParlaMint-NL.TEI"]

# answer = get_xml_files(selected_countries)

# print(answer)

In [49]:
# Retrieve person info
@dataclass
class Person:
    """Speaker metadata built by `extract_info_xml` from a country's listPerson file."""
    # xml:id of the <person> element
    p_id: str
    # "<forename> <surname>", or the xml:id when the person has no forename
    name: str
    # party label looked up via the person's affiliations, or the string "None";
    # NOTE(review): can also be Python None when no affiliation matches a known party
    party: str
    # `value` attribute of the <sex> element, or the string "None" when absent
    sex: str

# Extracts the required information from the overarching files that exist per country
# NOTE: This function will later also return other required info about persons and parties
# TODO: write proper character conversion so that diacritics on special letters are not mangled
def _parse_xml(path):
    """Read the UTF-8 file at ``path`` and parse it with BeautifulSoup's html.parser."""
    with open(path, "r", encoding="utf-8") as handle:
        return bs(handle.read(), "html.parser")


def _party_label(org):
    """Return the display label "<shorter name> (<longer name>)" for an <org> element.

    Returns None when the element has no <orgname> children.
    """
    orgnames = org.find_all("orgname")
    if not orgnames:
        return None

    # the last <orgname> is used as the first candidate name
    name_a = orgnames[-1].text

    # second candidate: prefer the English full name, then any full name,
    # and finally fall back to the first <orgname>
    name_b = next(
        (o.text for o in orgnames if o.get("full") == "yes" and o.get("xml:lang") == "en"),
        None,
    )
    if name_b is None:
        name_b = next((o.text for o in orgnames if o.get("full") == "yes"), None)
    if name_b is None:
        name_b = orgnames[0].text

    # the longer of the two names goes between the parentheses
    if len(name_a) > len(name_b):
        return f"{name_b} ({name_a})"
    return f"{name_a} ({name_b})"


def _surname(person):
    """Return the surname text to display for a <person>, or None if it has none.

    With exactly one <surname> that one is used; with more than one the
    second is used (the last one is not suitable because there can be up
    to four).
    """
    surnames = person.find_all("surname")
    if not surnames:
        return None
    if len(surnames) == 1:
        return surnames[0].text
    return surnames[1].text


def _party_of(person, party_dict):
    """Return the label of the newest affiliation that refers to a known party.

    Affiliations are scanned from last to first. Returns the string "None"
    when no affiliation refers to a key of ``party_dict``, consistent with
    how persons without any affiliation are stored.
    """
    for affiliation in reversed(person.find_all("affiliation")):
        ref = affiliation.get("ref")
        if ref is not None and ref.strip("#") in party_dict:
            return party_dict[ref.strip("#")]
    return "None"


def extract_info_xml(root, org_file, person_file):
    """Build the speaker lookup of one country from its listOrg and listPerson files.

    Args:
        root: Folder that contains both files.
        org_file: File name of the organisation list (parties).
        person_file: File name of the person list (speakers).

    Returns:
        dict mapping each person's xml:id to a ``Person``. Fields that cannot
        be determined are stored as the string "None"; a person without a
        forename gets the xml:id as name.
    """
    # party xml:id -> display label
    party_dict = {}
    for org in _parse_xml(os.path.join(root, org_file)).find_all("org"):
        label = _party_label(org)
        if label is not None:  # an <org> without any <orgname> cannot be labelled
            party_dict[org.get("xml:id")] = label

    person_dict = {}
    for person in _parse_xml(os.path.join(root, person_file)).find_all("person"):
        person_id = person.get("xml:id")
        forename = person.find("forename")
        has_affiliation = person.find("affiliation") is not None

        # neither a name nor a party is known: use the xml:id as name
        if forename is None and not has_affiliation:
            person_dict[person_id] = Person(person_id, person_id, "None", "None")
            continue

        if forename is None:
            # previously crashed on `None.text`; fall back to the xml:id
            name = person_id
        else:
            surname = _surname(person)
            name = forename.text if surname is None else f"{forename.text} {surname}"

        party = _party_of(person, party_dict) if has_affiliation else "None"

        sex_tag = person.find("sex")
        sex = sex_tag.get("value") if sex_tag is not None else "None"

        person_dict[person_id] = Person(person_id, name, party, sex)

    return person_dict

In [51]:
# functie testruimte
# root, file_org, file_person = "ParlaMint-NL.TEI", "ParlaMint-NL-listOrg.xml", "ParlaMint-NL-listPerson.xml"
# root, file_org, file_person = "ParlaMint-TR.TEI", "ParlaMint-TR-listOrg.xml", "ParlaMint-TR-listPerson.xml"

# test_person_dict = extract_info_xml(root, file_org, file_person)

# print(test_person_dict)

In [12]:
# Retrieve the date of a file
def extract_file_date(file_name):
    """Return the first YYYY-MM-DD date found in ``file_name``.

    Args:
        file_name: File name or path that contains a date such as
            ``ParlaMint-BE_2019-01-16-54-commissie-ic1017x.xml``.

    Returns:
        Tuple ``(year, month, day)`` of zero-padded strings.

    Raises:
        ValueError: If ``file_name`` contains no YYYY-MM-DD date.
    """
    match = re.search(r"\d{4}-\d{2}-\d{2}", file_name)
    if match is None:
        # fail with a message that names the offending file instead of an
        # opaque "'NoneType' object is not subscriptable"
        raise ValueError(f"No YYYY-MM-DD date found in file name: {file_name!r}")

    year, month, day = match.group(0).split("-")

    return year, month, day

In [73]:
# Extract the required data from the XML and put it in a list of dicts
def extract_debate_xml(root, file, country_info_dict, cty, translation_df=None, count=0, index="search"):
    """Turn one debate XML file into a list of Elasticsearch bulk actions.

    Args:
        root: Folder that contains the debate file.
        file: File name of the debate XML.
        country_info_dict: Mapping of speaker xml:id to ``Person`` (as
            returned by ``extract_info_xml``).
        cty: Country label stored in every document (e.g. "NL").
        translation_df: Optional mapping of utterance xml:id to translated
            text; ``None`` when no translation is available.
        count: Last document id that was handed out; every emitted
            utterance gets the next integer as ``_id``.
        index: Name of the Elasticsearch index to write to.

    Returns:
        Tuple ``(actions, count)``: the list of bulk actions (dicts with
        ``_index``, ``_id`` and ``_source``) and the updated id counter.
    """
    path = os.path.join(root, file)

    # read and parse the file
    with open(path, "r", encoding="utf-8") as handle:
        bs_content = bs(handle.read(), "html.parser")

    # the date is taken from the path of the file
    year, month, day = extract_file_date(path)

    content_dict_list = []

    # walk through all utterances
    for line in bs_content.find_all("u"):
        seg = line.get("xml:id")

        # only the <seg> children are merged, which leaves notes out
        cnt = "".join(f"{part.text} " for part in line.find_all("seg"))
        cnt_s = unidecode(cnt)

        # resolve the speaker, if the utterance names one
        who = line.get("who")
        if who is None:
            person = person_simplified = party = gender = "None"
        else:
            speaker_id = who.replace('#', '')
            prs = country_info_dict.get(speaker_id)
            if prs is None:
                # unknown reference: keep the upload running and store the id
                # as name, like persons without a forename
                print(f"[Unknown speaker]: {speaker_id} ({seg})")
                person, person_simplified = speaker_id, unidecode(speaker_id)
                party = gender = "None"
            else:
                person, person_simplified = prs.name, unidecode(prs.name)
                party, gender = prs.party, prs.sex

        content_dict = {
            "person": person,
            "person_simplified": person_simplified,
            "party": party,
            "gender": gender,
            "file": file,
            "segment": seg,
            "year": year,
            "month": month,
            "day": day,
            "position": line.get("ana"),
            "country": cty,
            "content": cnt,
            "content_simplified": cnt_s.replace("\'", ""),
        }

        # add the translated utterance when there is one
        if translation_df is not None and seg in translation_df:
            translated_line = translation_df[seg]

            if len(translated_line) != 0:
                content_dict["content_translated"] = translated_line

            # utterance without spoken text: skip it entirely
            # NOTE(review): this only matches a single character of content
            # (e.g. one empty <seg>); confirm whether fully empty utterances
            # should be skipped as well
            elif len(cnt) == 1:
                continue

            # spoken text is present but the translation is empty
            else:
                content_dict["content_translated"] = "None"
                print(f"[Missing translation]: {seg}")

        # store English as its own translation so the site needs no exception
        elif cty == "GB":
            content_dict["content_translated"] = cnt

        # wrap the document in the structure expected by bulk uploads
        count += 1
        content_dict_list.append({
            "_index": index,
            "_id": count,
            "_source": content_dict,
        })

    return content_dict_list, count

In [None]:
# # functie testruimte
# root, file = os.path.join(original_set_extract, "ParlaMint-BE.TEI\\2019"), "ParlaMint-BE_2019-01-16-54-commissie-ic1017x.xml"
# country_info_dict = extract_info_xml("ParlaMint-BE.TEI", "ParlaMint-BE-listOrg.xml", "ParlaMint-BE-listPerson.xml")
# # test_translated_df = get_translated_csv("BE", file.split(".xml")[0] + ".csv")

# # TODO: Check of bij sommige COUNTRY\\JAARTAL staat ipv COUNTRY
# country = root.replace("ParlaMint-", "").replace(".TEI", "")
# test_json = extract_debate_xml(root, file, country_info_dict, country)#, extract_translated_csv(country))

# print(test_json[0])

In [59]:
# Build a dict with k (utterance id) : v (translated sentence)
def get_translated_dict(path):
    """Read a translated debate file and return its utterances as plain text.

    Args:
        path: Path of the translated ``.ana.xml`` file.

    Returns:
        dict mapping the xml:id of every ``<u>`` element to the text of its
        ``<seg>`` children, with line breaks turned into spaces and the
        space in front of ``. , ? ! ) ;`` removed.
    """
    with open(path, "r", encoding="utf-8") as handle:
        bs_content = bs(handle.read(), "html.parser")

    # one space directly in front of a punctuation mark
    space_before_punctuation = re.compile(r" ([.,?!);])")

    segments = {}
    for line in bs_content.find_all("u"):
        seg = line.get("xml:id")

        # merge the words into one line
        combined_line = ""
        for part_of_line in line.find_all("seg"):
            combined_line += part_of_line.text
            combined_line = combined_line.replace("\n\n", " ").replace("\n", " ").replace("  ", " ")

        # attach punctuation to the preceding word; a single substitution keeps
        # the text that follows the last punctuation mark, which the previous
        # slice-based loop silently dropped
        punctuated_line = space_before_punctuation.sub(r"\1", combined_line)

        # trailing spaces carry no content; trimming them keeps lines that end
        # in punctuation identical to what was produced before
        punctuated_line = punctuated_line.rstrip(" ")

        # remove the space at the start
        if punctuated_line.startswith(" "):
            punctuated_line = punctuated_line[1:]

        # store the sentence with the utterance id as key
        segments[seg] = punctuated_line.replace("  ", " ")

    return segments

In [None]:
# functie testruimte
# translate_path_test = r"C:\Users\Asher\Documents\School\_Scriptie\Data\Parlamint_3.0\Overkoepelende_taal\translated_extract\ParlaMint-NL-en.TEI.ana\2014\ParlaMint-NL-en_2014-04-16-tweedekamer-2.ana.xml"
# translate_path_test = r"C:\Users\Asher\Documents\School\_Scriptie\Data\Parlamint_3.0\Overkoepelende_taal\translated_extract\ParlaMint-BA-en.TEI.ana\2009\ParlaMint-BA-en_2009-09-16-0.ana.xml"
# translate_path_test = r"C:\Users\Asher\Documents\School\_Scriptie\Data\Parlamint_3.0\Overkoepelende_taal\translated_extract\ParlaMint-BG-en.TEI.ana\2014\ParlaMint-BG-en_2014-11-05.ana.xml"

# for k, v in get_translated_dict(translate_path_test).items():
#     print(k, "\n<" + v + ">\n\n")

In [89]:
# Upload xml files
def _select_translated_folder(cty, folders):
    """Pick the translated folder that belongs to country label ``cty``.

    Prefers a folder named ``ParlaMint-<cty>-en...`` so that "ES" is not
    confused with "ES-CT", "ES-GA" or "ES-PV". Returns None when no folder
    name contains ``cty``.
    """
    candidates = [folder for folder in folders if cty in folder]
    if not candidates:
        return None

    # presumably translated folders are named "ParlaMint-<cty>-en.TEI.ana";
    # verify against the extracted EN set
    exact = [folder for folder in candidates if folder.startswith(f"ParlaMint-{cty}-en")]
    if exact:
        return exact[0]

    # fall back to the previous positional rule
    if cty == "ES" and len(candidates) > 1:
        return candidates[1]
    return candidates[0]


def upload_xmls(selected_countries, cnt=0):
    """Process the debates of the selected countries and upload them to Elasticsearch.

    The country folders are looked up in the current working directory (see
    ``get_xml_files``); translations are looked up in
    ``translated_set_extract``.

    Args:
        selected_countries: Folder names of the countries to upload
            (e.g. ``["ParlaMint-NL.TEI"]``).
        cnt: Last document id that is already in use; uploaded documents are
            numbered from ``cnt + 1``.

    Raises:
        FileNotFoundError: If a country's root folder has no listOrg or
            listPerson file.
    """
    # request all XML files of the given countries
    countries = get_xml_files(selected_countries)

    for country in countries:
        if not country:
            # selected entry without any walkable folder
            continue

        # split the root folder from the year sub folders
        root_folder = next(iter(country))
        root_files = country.pop(root_folder)

        # reset per country so a missing file can never reuse the previous
        # country's file name
        organizations = persons = None
        for root_file in root_files:
            if "listOrg" in root_file:
                organizations = root_file
            elif "listPerson" in root_file:
                persons = root_file

        if organizations is None or persons is None:
            raise FileNotFoundError(
                f"listOrg/listPerson file not found in {root_folder!r}"
            )

        # collect person info
        country_info = extract_info_xml(root_folder, organizations, persons)

        # country label (e.g. GB, NL or BE)
        cty = root_folder.replace("ParlaMint-", "").replace(".TEI", "")
        translated_folder = _select_translated_folder(cty, os.listdir(translated_set_extract))

        for year_folder, files in country.items():

            # second path component (the year folder); splitting on the
            # platform separator keeps this working outside Windows
            sub_folder = os.path.normpath(year_folder).split(os.sep)[1]

            for file in files:

                # look up the matching translated file (None if there is none)
                translated_dict = None
                if translated_folder is not None:
                    path = os.path.join(
                        translated_set_extract,
                        translated_folder,
                        sub_folder,
                        file.replace(".xml", ".ana.xml").replace("_", "-en_"),
                    )
                    if os.path.isfile(path):
                        translated_dict = get_translated_dict(path)
                    else:
                        print(f"[Missing translation file]: {path}")

                # process one debate file and update the id counter
                processed_xml, cnt = extract_debate_xml(year_folder, file, country_info, cty, translated_dict, cnt)

                # report only after the upload went through, so the printed id
                # is safe to resume from
                helpers.bulk(es, processed_xml)
                print(f"[Uploaded]: {file} (id: {cnt})")

        print("---------------------------------------------------------------\n\n")
        print(cty + " is uploaded to elastic\n\n")
        print("---------------------------------------------------------------\n\n")

    return

### Everything is according to plan if the cell below shows print statements of files being uploaded

In [None]:
# os.listdir("../data/original/EU")

# Country folders to upload. `get_xml_files` matches these names against the
# entries of the current working directory.
selected_countries = ['ParlaMint-AT.TEI',
                    'ParlaMint-BA.TEI',
                    'ParlaMint-BE.TEI',
                    'ParlaMint-BG.TEI',
                    'ParlaMint-CZ.TEI',
                    'ParlaMint-DK.TEI',
                    'ParlaMint-EE.TEI',
                    'ParlaMint-ES-CT.TEI',
                    'ParlaMint-ES-GA.TEI',
                    'ParlaMint-ES-PV.TEI',
                    'ParlaMint-ES.TEI',
                    'ParlaMint-FI.TEI',
                    'ParlaMint-FR.TEI',
                    'ParlaMint-GB.TEI',
                    'ParlaMint-GR.TEI',
                    'ParlaMint-HR.TEI',
                    'ParlaMint-HU.TEI',
                    'ParlaMint-IS.TEI',
                    'ParlaMint-IT.TEI',
                    'ParlaMint-LV.TEI',
                    'ParlaMint-NL.TEI',
                    'ParlaMint-NO.TEI',
                    'ParlaMint-PL.TEI',
                    'ParlaMint-PT.TEI',
                    'ParlaMint-RS.TEI',
                    'ParlaMint-SE.TEI',
                    'ParlaMint-SI.TEI',
                    'ParlaMint-TR.TEI',
                    'ParlaMint-UA.TEI']


# NOTE: If this process gets interrupted:
#   1. take the id printed for the last .xml file of the last country that was
#      uploaded completely and pass it as `cnt` below (document ids continue
#      from that number);
#   2. remove all completed countries from the list above.
start_time = time.time()
upload_xmls(selected_countries, cnt=0)
# total wall-clock duration of the upload
print("--- %s seconds ---" % (time.time() - start_time))