In [None]:
# Imports
import os

import warnings
warnings.filterwarnings('ignore')

from bs4 import BeautifulSoup as bs
from elasticsearch import Elasticsearch

import re
import pandas as pd

In [None]:
# Elasticsearch connection.
# NOTE(review): credentials should not be stored in the notebook. Host, user and
# password can be supplied through the ES_HOST / ES_USER / ES_PASSWORD environment
# variables; the literals below are kept only as fallbacks so existing runs keep
# working. Rotate this password and remove the fallback once the environment is set.
es = Elasticsearch(
    hosts=[
            os.environ.get("ES_HOST", "https://localhost:9200")
    ],
    basic_auth=(
        os.environ.get("ES_USER", "elastic"),
        os.environ.get("ES_PASSWORD", "NES9DZ-QwhanXAQf9caV"),
    ),
#     use_ssl=True,
    # NOTE(review): TLS certificate verification is switched off here.
    verify_certs=False,
#     ca_certs="./ca.crt"
)

# Data directories. Override with PARLAMINT_CSV_DIR / PARLAMINT_XML_DIR to run the
# notebook on another machine; the fallbacks are the original local paths.
translated_csv_dir = os.environ.get(
    "PARLAMINT_CSV_DIR",
    r"C:\Users\Asher\Documents\School\_Scriptie\Data\Data_Translated_CSV",
)
xml_dir = os.environ.get(
    "PARLAMINT_XML_DIR",
    r"C:\Users\Asher\Documents\School\_Scriptie\Data\Data_Extract",
)

In [None]:
def get_csv_files(country_selection):
    """List the files below each selected folder of ``translated_csv_dir``.

    Parameters
    ----------
    country_selection : container of str
        Names of the top-level entries (country folders) to include.

    Returns
    -------
    list of dict
        One dict per selected entry, in ``os.listdir()`` order. Each dict maps
        every directory yielded by ``os.walk`` (path relative to
        ``translated_csv_dir``) to the list of file names it contains. No
        filtering on file extension is applied.

    Side effect: changes the process working directory to ``translated_csv_dir``.
    """
    os.chdir(translated_csv_dir)

    # Only the top-level entries that were explicitly asked for.
    selected_entries = (entry for entry in os.listdir() if entry in country_selection)

    # One {directory: [file names]} mapping per selected entry.
    return [
        {folder: list(filenames) for folder, _subdirs, filenames in os.walk(entry)}
        for entry in selected_entries
    ]

def get_xml_files(country_selection):
    """List the XML files below each selected folder of ``xml_dir``.

    Same result layout as ``get_csv_files``: a list with one dict per selected
    top-level entry, mapping each walked directory (path relative to
    ``xml_dir``) to the XML file names inside it.

    Parameters
    ----------
    country_selection : container of str
        Names of the top-level entries (corpus folders) to include.

    Returns
    -------
    list of dict
        ``[{directory: [xml file names]}, ...]``. Directories whose path
        contains ``"Schema"`` are left out entirely; directories without any
        matching file are kept with an empty list.

    Side effect: changes the process working directory to ``xml_dir``.
    """
    os.chdir(xml_dir)

    country_return_list = []

    for country in os.listdir():
        # Keep only the explicitly selected folders.
        if country not in country_selection:
            continue

        paths_dict = {}
        for root, _dirs, files in os.walk(country):
            # Schema folders are dropped to keep later processing simple.
            if "Schema" in root:
                continue

            # Match on the extension rather than on a substring, so names such
            # as "x.xml.bak" are not mistaken for XML; names containing "~"
            # are still skipped as before.
            paths_dict[root] = [
                name for name in files
                if name.endswith(".xml") and "~" not in name
            ]

        country_return_list.append(paths_dict)

    return country_return_list

In [None]:
# scratch space for trying out the functions above (calls are commented out)
# selected_countries = ["NL"]

# answer = get_csv_files(selected_countries)

# print(answer)

# selected_countries = ["ParlaMint-NL.TEI"]

# answer = get_xml_files(selected_countries)

# print(answer)

In [None]:
def query(search_dict):
    """Search the "search" index for documents matching every given field.

    Parameters
    ----------
    search_dict : dict
        ``{field_name: phrase, field_name2: phrase2, ...}``. Each pair becomes
        a ``match_phrase`` clause and all clauses must match.

    Returns
    -------
    The response of ``es.search`` (at most 1000 hits are requested).
    """
    # One match_phrase clause per requested field.
    must_clauses = [
        {"match_phrase": {field: phrase}}
        for field, phrase in search_dict.items()
    ]
    bool_query = {"bool": {"must": must_clauses}}

    # TODO: make it possible to retrieve more than 10000 results
    return es.search(index="search", size=1000, query=bool_query)

In [None]:
def update(entry_id, new_line):
    """Partially update one document of the "search" index.

    Parameters
    ----------
    entry_id :
        Id of the document to update.
    new_line : dict
        ``{field: value}`` pairs, sent as the ``doc`` part of the update body.

    Returns nothing; the response of ``es.update`` is discarded.
    """
    partial_update = {"doc": new_line}
    es.update(index="search", id=entry_id, body=partial_update)

In [None]:
def get_seg_combinations(root, file):
    """Map every utterance of one XML file to the segments it consists of.

    Parameters
    ----------
    root : str
        Folder of the file, relative to ``xml_dir``.
    file : str
        File name inside ``root``.

    Returns
    -------
    dict
        ``{<u xml:id>: [<seg xml:id>, ...]}`` for every ``<u>`` element found,
        listing the ``xml:id`` of each ``<seg>`` inside it in document order.

    Side effect: changes the process working directory to ``xml_dir``.
    """
    os.chdir(xml_dir)

    xml_path = os.path.join(root, file)

    # Read the whole document and parse it with BeautifulSoup's "lxml" parser.
    with open(xml_path, "r", encoding="utf-8") as handle:
        soup = bs(handle.read(), "lxml")

    # For each utterance, collect the ids of the segments nested in it.
    return {
        utterance.get("xml:id"): [
            segment.get("xml:id") for segment in utterance.find_all("seg")
        ]
        for utterance in soup.find_all("u")
    }

In [None]:
# Scratch cell: build the utterance -> segments mapping for one sample file and
# show which utterance contains a particular segment.
root = "ParlaMint-NL.TEI\\2014"
file = "ParlaMint-NL_2014-04-16-tweedekamer-2.xml"

test_combis = get_seg_combinations(root, file)

# Print every utterance whose segment list holds the segment of interest.
for k, v in test_combis.items():
    if "ParlaMint-NL_2014-04-16-tweedekamer-2.seg8" not in v:
        continue
    print(k, v)

In [None]:
def merge_csv_segs(seg_combinations, df):
    """Join translated segments back into one text per utterance.

    Parameters
    ----------
    seg_combinations : dict
        ``{utterance_id: [segment_id, ...]}`` as produced by
        ``get_seg_combinations``.
    df : pandas.DataFrame
        Translated segments; the ``"key"`` column is matched against the
        segment ids and the ``"value"`` column supplies the text.

    Returns
    -------
    dict
        ``{utterance_id: text}``. The first matching value is stored as is;
        every further value is appended after a single space, in row order.
        Utterances without any matching row are absent, and rows whose key is
        not listed in ``seg_combinations`` are ignored.
    """
    merged_dict = {}

    # Nothing to merge; an empty frame may not even carry the two columns.
    if df.empty:
        return merged_dict

    # Reverse index (segment id -> utterances containing it), built once so that
    # each row is a dictionary lookup instead of a scan over every utterance.
    # dict.fromkeys de-duplicates segment ids repeated within one utterance.
    userlines_by_seg = {}
    for userline, segs in seg_combinations.items():
        for seg_id in dict.fromkeys(segs):
            userlines_by_seg.setdefault(seg_id, []).append(userline)

    # Plain iteration instead of DataFrame.apply: the loop exists only for its
    # side effect on merged_dict, not to compute a per-row result.
    for seg_id, text in zip(df["key"], df["value"]):
        for userline in userlines_by_seg.get(seg_id, ()):
            if userline in merged_dict:
                merged_dict[userline] = f"{merged_dict[userline]} {text}"
            else:
                merged_dict[userline] = text

    return merged_dict

In [None]:
# Scratch cell: merge the translated segments of one sample file per utterance
# and print the result.
sample_csv_path = translated_csv_dir + r"\NL\ParlaMint-NL_2014-04-16-tweedekamer-2.csv"
df_for_the_test = pd.read_csv(sample_csv_path)

test_merged_dict = merge_csv_segs(test_combis, df_for_the_test)

# One utterance per paragraph: id, merged text, blank line.
for k, v in test_merged_dict.items():
    print(k, v, "\n")

In [None]:
def update_dict_to_es(translations_dict):
    """Write merged translations to their documents in Elasticsearch.

    For every ``{segment: translated_text}`` pair the document is looked up with
    ``query({"segment": ...})`` and the first hit receives the text in its
    ``content_translated`` field via ``update``.

    Parameters
    ----------
    translations_dict : dict
        ``{segment_id: translated_text}``, e.g. the result of ``merge_csv_segs``.

    Raises
    ------
    IndexError
        If the lookup for a segment returns no hits. Pairs processed before the
        failing one have already been written.
    """
    for segment_id, translated_text in translations_dict.items():
        hits = query({"segment": segment_id})["hits"]["hits"]

        # Same exception type as indexing an empty hit list, but the message
        # names the segment so a failed batch can be diagnosed.
        if not hits:
            raise IndexError(f"no indexed document found for segment {segment_id!r}")

        update(hits[0]["_id"], {"content_translated": translated_text})

In [None]:
# Scratch cell: send the merged translations of the sample file to Elasticsearch.
# NOTE(review): update_dict_to_es has no return statement, so this variable is always None.
update_test_one_file = update_dict_to_es(test_merged_dict)

In [None]:
# TODO: update all data of a country based on a list
def update_country_translations(folders):
    """Walk the translated CSV files of each country and resolve their XML source.

    Work in progress: for every CSV file the matching XML file is located and its
    utterance -> segments mapping is printed; nothing is merged or written yet.

    Parameters
    ----------
    folders : dict
        ``{csv_folder: tei_folder}``; the name of a folder with translated CSVs
        mapped to the name of the folder holding the original XML.

    The year folder is derived from the first four-digit match of
    ``[1-3][0-9]{3}`` in the CSV file name; a name without such a match raises
    ``TypeError``.
    """
    year_pattern = re.compile(r'[1-3][0-9]{3}')

    for csv_folder, tei_folder in folders.items():

        # A single folder is requested, so only the first result dict is used.
        files_by_folder = get_csv_files([csv_folder])[0]

        for csv_name in files_by_folder[csv_folder]:

            # TODO: exception for HR
            # TODO: add a year-folder check

            year_match = year_pattern.search(csv_name)
            year_dir = f"{tei_folder}\\{year_match[0]}"
            xml_name = csv_name.split(".csv")[0] + ".xml"

            print(get_seg_combinations(year_dir, xml_name))
#             return

In [None]:
# ---- this is where the real work happens ----

# Maps each translated-CSV folder name to the name of its XML (TEI) folder.
# NOTE(review): the original comment here read "lists must be equally long";
# presumably left over from an earlier list-based version -- confirm and drop.
country_folders = {"NL":"ParlaMint-NL.TEI"}

update_country_translations(country_folders)

In [None]:
# scratch space for inspecting updated lines

# test_result = query({"segment":"ParlaMint-NL_2014-04-16-tweedekamer-2.u6"})

# print("Got %d Hits:" % test_result['hits']['total']['value'])

# for hit in test_result['hits']['hits']:
#     print(hit, "\n")
#     print("%(person)s (%(party)s) \n(%(year)s-%(month)s-%(day)s) %(segment)s:\n %(content)s \n" % hit["_source"])

# Query the index for one segment id and print the raw search response.
test_result = query({"segment":"ParlaMint-NL_2014-04-16-tweedekamer-2.u1"})

print(test_result)

# print("Got %d Hits:" % test_result['hits']['total']['value'])

# for hit in test_result['hits']['hits']:
#     print(hit, "\n")
#     print("%(person)s 