# Data Processing

In [9]:
import pandas as pd
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
import os

from bs4 import BeautifulSoup

def _extract_party(speaker):
    """Return the trailing parenthesised part of a speaker label, or 'N/A'.

    The party is everything from the last "(" to the end of the label, so
    "Peter Boehringer (AfD):" gives "(AfD):". Labels without "(" give 'N/A'.
    """
    start = speaker.rfind("(")
    return speaker[start:] if start != -1 else 'N/A'


def parse_data_to_df(file):
    """Parse one plenary-protocol XML file into a DataFrame with one row per speech.

    Parameters
    ----------
    file : str or path-like
        Path to the XML file of a single plenary session.

    Returns
    -------
    pandas.DataFrame or None
        Columns ``date``, ``thema``, ``speaker``, ``party``, ``speech`` (all
        strings), in document order. ``None`` is returned when the file yields
        no speech at all, so callers must be prepared to skip it.

    Raises
    ------
    AttributeError
        If the document has no ``datum`` element.
    KeyError
        If the ``datum`` element has no ``date`` attribute.
    """
    # Use a context manager so the handle is closed, and read as UTF-8 explicitly
    # instead of relying on the platform default encoding.
    with open(file, 'r', encoding='utf-8') as handle:
        xml_data = handle.read()
    soup = BeautifulSoup(xml_data, 'xml')

    # Date of the plenary session; shared by every row of this file
    date = soup.find("datum").attrs["date"]

    # <p> classes that do not belong to the spoken text
    ignore_tags = ("redner",)

    records = []
    for punkt in soup.find_all('tagesordnungspunkt'):
        reden = punkt.find_all('rede')
        thema = punkt.find('p', {"klasse": "T_fett"})
        # Skip agenda items without any speech or without a bold topic line
        if not reden or thema is None:
            continue
        thema_txt = thema.get_text()

        for rede in reden:
            # Speech text: all paragraphs of the speech except the speaker line,
            # joined into a single string
            speech = ' '.join(
                paragraph.get_text()
                for paragraph in rede.find_all('p')
                if paragraph.get('klasse') not in ignore_tags
            )

            # The speaker label is the text of the last child of the
            # <p klasse="redner"> paragraph. A speech without such a paragraph
            # is kept with an empty speaker instead of aborting the whole file.
            redner = rede.find('p', {"klasse": "redner"})
            children = redner.contents if redner is not None else []
            speaker = children[-1].get_text() if children else ''

            records.append({
                'date': date,
                'thema': thema_txt,
                'speaker': speaker,
                'party': _extract_party(speaker),
                'speech': speech,
            })

    # Keep the existing contract: no usable content means no DataFrame
    if not records:
        return None
    return pd.DataFrame(records, columns=['date', 'thema', 'speaker', 'party', 'speech'])

In [10]:
# Directories holding the plenary-protocol XML files, one per legislative period
directories = ['Plenarprotokolle_Wahlperiode_19', 'Plenarprotokolle_Wahlperiode_20']

# Parse every file in every directory and collect the per-file frames
dfs = []
for directory in directories:
    # os.listdir returns entries in arbitrary order; sort them so the row
    # order of final_df is the same on every run and every machine
    for filename in sorted(os.listdir(directory)):
        f = os.path.join(directory, filename)
        # only regular files are parsed; sub-directories are skipped
        if not os.path.isfile(f):
            continue
        get_df = parse_data_to_df(f)
        # parse_data_to_df returns None when a file contains no usable speech
        if get_df is not None:
            dfs.append(get_df)

final_df = pd.concat(dfs, ignore_index=True)

final_df

Unnamed: 0,date,thema,speaker,party,speech
0,29.06.2018,Finanzhilfen zugunsten Griechenlands:,"Olaf Scholz, Bundesminister der Finanzen:",,Herr Präsident! Meine Damen und Herren! Griech...
1,29.06.2018,Finanzhilfen zugunsten Griechenlands:,Peter Boehringer (AfD):,(AfD):,Herr Präsident! Liebe Kolleginnen! Liebe Kolle...
2,29.06.2018,Finanzhilfen zugunsten Griechenlands:,,,Herr Präsident! Liebe Kolleginnen und Kollegen...
3,29.06.2018,Finanzhilfen zugunsten Griechenlands:,Christian Dürr (FDP):,(FDP):,Herr Präsident! Meine sehr verehrten Kolleginn...
4,29.06.2018,Finanzhilfen zugunsten Griechenlands:,Fabio De Masi (DIE LINKE):,(DIE LINKE):,Herr Präsident! Sehr geehrte Damen und Herren!...
...,...,...,...,...,...
22429,18.03.2022,Haltung der Bundesregierung zu den Vorwürfen g...,Julia Klöckner (CDU/CSU):,(CDU/CSU):,Frau Präsidentin! Liebe Kolleginnen und Kolleg...
22430,18.03.2022,Haltung der Bundesregierung zu den Vorwürfen g...,Dr. Till Steffen (BÜNDNIS 90/DIE GRÜNEN):,(BÜNDNIS 90/DIE GRÜNEN):,Frau Präsidentin! Meine Damen und Herren! Ich ...
22431,18.03.2022,Haltung der Bundesregierung zu den Vorwürfen g...,Sandra Bubendorfer-Licht (FDP):,(FDP):,Sehr geehrte Frau Präsidentin! Meine sehr geeh...
22432,18.03.2022,Haltung der Bundesregierung zu den Vorwürfen g...,Mario Czaja (CDU/CSU):,(CDU/CSU):,Sehr geehrte Frau Präsidentin! Liebe Kolleginn...


In [None]:
#import sys
#!/Library/Developer/CommandLineTools/usr/bin/python3 -m pip install textblob
#!python3 -m textblob.download_corpora
#from textblob_de import TextBlobDE as TextBlob
#!pip install textblob_de
#!pip install --user -U nltk
#!/Library/Developer/CommandLineTools/usr/bin/python3 -m pip install spacy
#!/Library/Developer/CommandLineTools/usr/bin/python3 -m pip install --upgrade pip setuptools wheel
#!pip install spacy-sentiws
#import spacy
#!/Library/Developer/CommandLineTools/usr/bin/python3 -m spacy download de_core_news_sm

# TextBlob (German)

In [None]:
# TextBlobDE is imported under the name TextBlob, so `TextBlob(...)` below is the German variant
from textblob_de import TextBlobDE as TextBlob
from pandarallel import pandarallel  # parallelization
# Set up pandarallel once; `parallel_apply` is used further down in this cell
pandarallel.initialize()

def get_polarity(input_speech):
    """Return the polarity value TextBlob reports for the given speech text."""
    return TextBlob(input_speech).sentiment.polarity

# `df` is the working frame that collects all sentiment columns. Derive it
# explicitly from the parsed protocols so this cell also runs on a fresh kernel
# and final_df itself stays unmodified.
df = final_df.copy()
# Only the speech column is needed, so apply over that Series instead of whole rows
df["polarity_textblob"] = df["speech"].parallel_apply(get_polarity)
df

# VADER

In [None]:
# Translate the speeches to English first, because VADER only works properly on English text
from deep_translator import GoogleTranslator

#translate text to english language
def translate_en(text):
    """Translate `text` to English with GoogleTranslator, using source language 'auto'."""
    translator = GoogleTranslator(source='auto', target='en')
    return translator.translate(text)


def translate_speech(text, max_chars=4999):
    """Translate a speech to English, splitting it into request-sized chunks.

    A single translation request is limited to fewer than 5000 characters, so
    longer speeches are cut into as many chunks of at most `max_chars`
    characters as needed. Each chunk ends right after the last space inside its
    window so that words stay intact; if a window contains no usable space the
    chunk is cut hard at `max_chars`. The translated chunks are joined with a
    single space.

    Parameters
    ----------
    text : str
        The speech to translate.
    max_chars : int, optional
        Maximum number of characters sent per request (default 4999).

    Returns
    -------
    str
        The English translation of the whole speech.

    Raises
    ------
    ValueError
        If `max_chars` is smaller than 1.
    """
    if max_chars < 1:
        raise ValueError("max_chars must be at least 1")

    # Short speeches fit into a single request
    if len(text) <= max_chars:
        return translate_en(text)

    translated_parts = []
    start = 0
    while start < len(text):
        end = min(start + max_chars, len(text))
        if end < len(text):
            # Not the final chunk: move the cut to just after the last space in
            # the window (the space stays with the left-hand chunk)
            last_space = text.rfind(" ", start, end)
            if last_space > start:
                end = last_space + 1
        chunk = text[start:end]
        start = end
        # Whitespace-only chunks carry nothing to translate
        if chunk.strip():
            translated_parts.append(translate_en(chunk))

    # Drop empty/None results so the join cannot fail on them
    return " ".join(part for part in translated_parts if part)

# A single translation request is limited to 5000 characters, so speeches are
# routed through translate_speech (which splits long texts) rather than being
# passed to translate_en directly.
# Map over the speech column only: row-wise DataFrame.apply is slower and
# returns a DataFrame instead of a Series when the frame is empty.
df["translated_speech"] = df["speech"].apply(translate_speech)
df

In [None]:
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

# Shared analyzer instance, created on first use by get_polarity_vader
_VADER_ANALYZER = None


def get_polarity_vader(input_speech):
    """Return VADER's compound polarity score for an (English) text.

    The SentimentIntensityAnalyzer is built once and reused across calls;
    constructing a new one for every row repeats its setup work (it loads its
    lexicon) thousands of times when this function is applied to a DataFrame.

    Parameters
    ----------
    input_speech : str
        Text to score.

    Returns
    -------
    float
        The ``"compound"`` entry of ``polarity_scores(input_speech)``.
    """
    global _VADER_ANALYZER
    if _VADER_ANALYZER is None:
        _VADER_ANALYZER = SentimentIntensityAnalyzer()
    return _VADER_ANALYZER.polarity_scores(input_speech)["compound"]
    
# NOTE(review): only the first 4999 characters of each translation are scored.
# This is presumably carried over from the translator's request limit — confirm
# whether truncating the VADER input is actually intended.
# Map over the translated column only: row-wise DataFrame.apply is slower and
# returns a DataFrame instead of a Series when the frame is empty.
df["polarity_vader"] = df["translated_speech"].apply(
    lambda speech: get_polarity_vader(speech[:4999])
)
df