In [1]:
# Imports
import os

import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import re
import string

from unidecode import unidecode
from bs4 import BeautifulSoup as bs

In [2]:
# dir
xml_dir = r"D:\Debate_a_Base\Data\Parlamint_3.0\Originele_taal\original_extract"
word_count_csv = 'D:/Debate_a_Base/Data/xml_word_counts.csv'

df_words_import = pd.read_csv(word_count_csv)
df_words = df_words_import.groupby("country").sum()

In [3]:
# --------------------------------------------------------------------------------------
# Vereist list met folder van landen bv: ['ParlaMint-NL.TEI', "ParlaMint-BE.TEI"]
#
# ---Return list met dict [{Land1}, {Land 1}]---
#
# Voorbeeld voor land MET jaarfolders:
# item 1 --> 'land folder': [land xml] bv: 'ParlaMint-NL.TEI': ['ParlaMint-NL.xml']
# item rest --> 'ParlaMint-NL.TEI\\2014': ['ParlaMint-NL_2014-04-16-tweedekamer-2.xml', '...']
#
# Voorbeeld met land ZONDER jaarfolder:
# items (alle) --> {'ParlaMint-BG.TEI': ['ParlaMint-BG.xml', 'ParlaMint-BG_2014-10-27.xml', '...'}
#
# Ofwel len(dict) > 1 bij landen met jaarfolders, anders zit speciale file inbegrepen als eerste in dict
#
#  Path vinden met os.path.join(key, value[x])
# --------------------------------------------------------------------------------------

def get_xml_files(country_selection):
    os.chdir(xml_dir)
    
    country_return_list = []

    # loop door alle folders die hierboven zijn geprint
    for country in os.listdir():

        # filter op specifiek land (IN BOX 2)
        if country in country_selection:
            paths_dict = {}

            # ga door alle inhoud van de landfolder heen
            for root, dirs, files in os.walk(country):
                file_data = []
                
                # loop door files van een folder
                for file in files:

                    #filter alleen de xml files
                    if ".xml" in file and not "~" in file:
                        file_data.append(file)

                # filter onzin uit de dict voor makkelijkere processing later
                if not "Schema" in root:
                    paths_dict[root] = file_data

            country_return_list.append(paths_dict)
        
    return country_return_list

In [4]:
# haal jaar maand en dag uit filename
def extract_file_date(file_name):
    
    year_month_day = re.search(r"\d{4}-\d{2}-\d{2}", file_name)
#     year, month, day = year_month_day[0].split("-")
    
    if year_month_day != None:
    
        return year_month_day[0]
    
    else:
        
        return "29-12-2001"

In [5]:
def extract_wordcount(root, file):
    os.chdir(xml_dir)
    
    path = os.path.join(root, file)
    
    # lees het bestand
    with open(path, "r", encoding="utf-8") as file:
        # read each line in the file, readlines() returns a list of lines
        content = file.readlines()
        # combine the lines in the list into a string
        content = "".join(content)
        bs_content = bs(content, "lxml")
        
    words = bs_content.find_all("measure")[-1].get("quantity")
    
    return words

In [6]:
# haalt nodige informatie uit het overkoepelende bestand dat per land bestaat
# NOTE: Deze functie gaat later overige nodige info over personen en partijen ook meegeven
# goede characteromvorming schrijven zodat puntjes op speciale letters niet vervormen
def extract_info_xml(root, org_file, person_file):
    os.chdir(xml_dir)
    
    path = os.path.join(root, org_file)
    
    # lees het organisation bestand
    with open(path, "r", encoding="utf-8") as file:
        # read each line in the file, readlines() returns a list of lines
        content = file.readlines()
        # combine the lines in the list into a string
        content = "".join(content)
        
        bs_content = bs(content, "lxml")
        
        parties = 0
        for party in bs_content.find_all("org"):
            
            if party.get("role") == "parliamentaryGroup":
                parties += 1
    
    path = os.path.join(root, person_file)
    
    # lees het person bestand
    with open(path, "r", encoding="utf-8") as file:
        # read each line in the file, readlines() returns a list of lines
        content = file.readlines()
        # combine the lines in the list into a string
        content = "".join(content)
        
        bs_content = bs(content, "lxml")
        
        persons = len(bs_content.find_all("person"))
    
    return persons, parties

In [7]:
# root, file = "ParlaMint-NL.TEI", "ParlaMint-NL.xml"
# country = root.replace("ParlaMint-", "").replace(".TEI", "")
# test_persons, test_parties = extract_info_xml(root, file)

# print(test_persons, test_parties)

In [7]:
# haal de nodige data uit de xml en zet het in een list dict
def extract_debate_xml(root, file):
    os.chdir(xml_dir)
    
    path = os.path.join(root, file)
    content_list = []
    file_dummy = file
    
    # lees het bestand
    with open(path, "r", encoding="utf-8") as file:
        # read each line in the file, readlines() returns a list of lines
        content = file.readlines()
        # combine the lines in the list into a string
        content = "".join(content)
        bs_content = bs(content, "lxml")
        
    # ga door alle segmenten info heen
    userlines = bs_content.find_all("u")
    for line in userlines:
        
        # definieer segment
        seg = line.get("xml:id")

        # definieer content (deze nieuwe manier negeert notes)
        mergable_lines = line.find_all("seg")
        cnt = ""
        for mergable_line in mergable_lines:
            
            cnt += f"{mergable_line.text} "
    
        # add the line to the list of lines
        content_list.append(unidecode(cnt.lower().translate(str.maketrans('', '', string.punctuation))).translate(str.maketrans('', '', string.digits)).split(" "))
    
    return_list = []
    
    for u in content_list:
        
        return_list += u
    
    return return_list, len(content_list)

In [9]:
# root, file = "ParlaMint-BE.TEI\\2014", "ParlaMint-BE_2014-11-18-54-commissie-ic018x.xml"
# country = root.replace("ParlaMint-", "").replace(".TEI", "")
# test_words, test_u_length = extract_debate_xml(root, file)

# print(test_u_length)
# print(test_words)

In [14]:
def get_wordcount_df(selected_countries):
    
    # zet de df op voor de wordcount
    wordcount_df = pd.DataFrame({'country': [],
                                 'years': [],
                                 'debates': [],
                                 'speeches': [],
                                 'total_days': [],
                                 'total_words_en': [],
                                 'total_words': [],
                                 'unique_words': [],
                                 'unique_speakers': [],
                                 'unique_parties': []
                                })
    
    # vraag alle xml files op van de gegeven land(en)
    countries = get_xml_files(selected_countries)
    
    # process door elk land
    for country in countries:
        
        # split hoofdfolder van mogelijke subfolders van jaren
        rootfolder = (k := next(iter(country)), country.pop(k))
        for misleading_item in rootfolder[1]:
            if "listOrg" in misleading_item:
                organizations = misleading_item # normaal werkte dit ook rootfolder[1][0]
            elif "listPerson" in misleading_item:
                persons = misleading_item # rootfolder[1][1]
        
        # country label (bv: GB of NL of BE)
        cty = rootfolder[0].replace("ParlaMint-", "").replace(".TEI", "")
        
        #definieer variabelen om te tellen
        unique_persons, unique_parties = extract_info_xml(rootfolder[0], organizations, persons)
        
        debates = 0
        speeches = 0 #wip
        total_days = []
        total_word_count = 0
        unique_word_count = [] #wip

        for year_folder in country:

            for file in country[year_folder]:
                words, file_speeches = extract_debate_xml(year_folder, file)

                debates += 1
                speeches += file_speeches
                total_days.append(extract_file_date(file))
                total_word_count += int(extract_wordcount(year_folder, file))
                unique_word_count += words
                
        total_word_count_en = int(df_words[df_words.index == cty]["words"])
        years = len(country)
            
        wordcount_df.loc[len(wordcount_df.index)] = [cty,
                                                     years,
                                                     debates,
                                                     speeches,
                                                     len(list(dict.fromkeys(total_days))),
                                                     total_word_count_en,
                                                     total_word_count,
                                                     len(list(set(unique_word_count))),
                                                     unique_persons,
                                                     unique_parties]
        
    return wordcount_df

In [15]:
countries = ['ParlaMint-AT.TEI',
              'ParlaMint-BA.TEI',
              'ParlaMint-BE.TEI',
              'ParlaMint-BG.TEI',
              'ParlaMint-CZ.TEI',
              'ParlaMint-DK.TEI',
              'ParlaMint-EE.TEI',
              'ParlaMint-ES-CT.TEI',
              'ParlaMint-ES-GA.TEI',
              'ParlaMint-FR.TEI',
              'ParlaMint-GB.TEI',
              'ParlaMint-GR.TEI',
              'ParlaMint-HR.TEI',
              'ParlaMint-HU.TEI',
              'ParlaMint-IS.TEI',
              'ParlaMint-IT.TEI',
              'ParlaMint-LV.TEI',
              'ParlaMint-NL.TEI',
              'ParlaMint-NO.TEI',
              'ParlaMint-PL.TEI',
              'ParlaMint-PT.TEI',
              'ParlaMint-RS.TEI',
              'ParlaMint-SE.TEI',
              'ParlaMint-SI.TEI',
              'ParlaMint-TR.TEI',
              'ParlaMint-UA.TEI']

# countries = ['ParlaMint-LV.TEI']

display(get_wordcount_df(countries))

Unnamed: 0,country,years,debates,speeches,total_days,total_words_en,total_words,unique_words,unique_speakers,unique_parties
0,AT,27,1197,227991,755,67282187,59916338,610985,853,9
1,BA,25,743,126252,672,21737189,18307498,187614,603,40
2,BE,9,2349,199305,943,42595150,44372160,271482,787,66
3,BG,9,921,210018,921,30180386,26471533,195729,1033,19
4,CZ,10,6328,181310,683,33760443,27933321,224604,592,19
5,DK,9,947,398610,862,43331948,40797597,211575,383,19
6,EE,12,1317,227872,1317,31552302,22874712,413979,488,6
7,ES-CT,8,286,50327,281,16163784,15972976,155083,364,21
8,ES-GA,8,302,83078,289,18843085,17837709,136326,227,7
9,FR,6,1564,714860,739,47013491,49629745,173170,908,26


In [None]:
# csv_loc = 'D:/Debate_a_Base/Data/xml_word_counts.csv'

# df_words_import = pd.read_csv(csv_loc)

# summed = df_words_import.groupby("country").sum()

# print(int(summed[summed.index == "LV"]["words"]))

In [15]:
def extract_debate_xml(root, file):
    os.chdir(xml_dir)
    
    path = os.path.join(root, file)
    
    word_count = 0
    
    # lees het bestand
    with open(path, "r", encoding="utf-8") as file:
        # read each line in the file, readlines() returns a list of lines
        content = file.readlines()
        # combine the lines in the list into a string
        content = "".join(content)
        bs_content = bs(content, "lxml")
        
    # ga door alle segmenten info heen
    userlines = bs_content.find_all("u")
    for line in userlines:
        
        # definieer segment
        seg = line.get("xml:id")

        # definieer content (deze nieuwe manier negeert notes)
        mergable_lines = line.find_all("seg")
        cnt = ""
        for mergable_line in mergable_lines:
            
            cnt += f"{mergable_line.text} "
    
        # tel word count per speech
        word_count += len(cnt.split(" "))
    
    return word_count

In [22]:
countries = ['ParlaMint-AT.TEI',
              'ParlaMint-BA.TEI',
              'ParlaMint-BE.TEI',
              'ParlaMint-BG.TEI',
              'ParlaMint-CZ.TEI',
              'ParlaMint-DK.TEI',
              'ParlaMint-EE.TEI',
              'ParlaMint-ES-CT.TEI',
              'ParlaMint-ES-GA.TEI',
              'ParlaMint-FR.TEI',
              'ParlaMint-GB.TEI',
              'ParlaMint-GR.TEI',
              'ParlaMint-HR.TEI',
              'ParlaMint-HU.TEI',
              'ParlaMint-IS.TEI',
              'ParlaMint-IT.TEI',
              'ParlaMint-LV.TEI',
              'ParlaMint-NL.TEI',
              'ParlaMint-NO.TEI',
              'ParlaMint-PL.TEI',
              'ParlaMint-PT.TEI',
              'ParlaMint-RS.TEI',
              'ParlaMint-SE.TEI',
              'ParlaMint-SI.TEI',
              'ParlaMint-TR.TEI',
              'ParlaMint-UA.TEI']

counts_dict = {}
for country, country_name in zip(get_xml_files(countries), countries):
    rootfolder = (k := next(iter(country)), country.pop(k))
    
    country_word_count = 0
    for year_folder in country:
        
        for file in country[year_folder]:
#             print(year_folder)
#             root = os.path.join(rootfolder[0], year_folder)
            country_word_count += extract_debate_xml(year_folder, file)
    
    counts_dict[country_name] = country_word_count

In [26]:
for k, v in counts_dict.items():
    print(k.replace("ParlaMint-", "").replace(".TEI", ""), v)

AT 65556247
BA 18456149
BE 43396828
BG 26875735
CZ 28217347
DK 41348781
EE 23143792
ES-CT 15563367
ES-GA 17922327
FR 48875696
GB 125393511
GR 50324704
HR 87968016
HU 27572200
IS 31339093
IT 31421613
LV 9468425
NL 67678348
NO 89278647
PL 36086410
PT 17624012
RS 85101931
SE 29108063
SI 70302609
TR 45053313
UA 18603807
