In [109]:
from bs4 import BeautifulSoup
from datetime import datetime
import requests
import os
import unicodedata, re
import pandas as pd

# Notebook-wide configuration read by the cells below.

# Directory the protocol XML files are written to and read back from.
protocol_dir = './data/bundestagsprotokolle'

# Party labels that are searched for (as substrings) inside <kommentar> text.
parties = [
    'GRÜNE',
    'LINKE',
    'FDP',
    'CDU/CSU',
    'SPD',
    'AfD',
]

# Result-column prefix -> German keyword that marks this kind of interaction
# inside a <kommentar> element.
interaction_types = {
    'applause': 'Beifall',
    'shout': 'Zurufe',
}

In [31]:
def download_parlament_protocols(protocol_dir: str):
    """Download the plenary protocol XML files listed on bundestag.de.

    The open-data filter list is requested page by page (offsets 0, 10, ...,
    490). Every link whose ``href`` ends in ``.xml`` is fetched and stored in
    ``protocol_dir`` as ``YYYY-MM-DD-protocol.xml``; the date is taken from the
    ``date`` attribute of the document's ``datum`` element. Paging stops at the
    first page that contains no links at all.

    Args:
        protocol_dir: Target directory; it is created if it does not exist.

    Raises:
        requests.RequestException: If a request fails, times out, or comes back
            with an HTTP error status.
    """
    from urllib.parse import urljoin  # function-scope import keeps the cell self-contained

    os.makedirs(protocol_dir, exist_ok=True)
    root_url = "https://www.bundestag.de"
    url = "https://www.bundestag.de/ajax/filterlist/de/services/opendata/866354-866354?offset="
    request_timeout = 60  # seconds; without a timeout a stalled connection blocks forever

    for offset in range(0, 500, 10):
        print(f"Current offset {offset}")
        listing = requests.get(url + f"{offset}", timeout=request_timeout)
        listing.raise_for_status()  # do not parse an HTTP error page as a listing
        soup = BeautifulSoup(listing.text, 'html.parser')

        tags = soup.find_all('a')
        if not tags:
            # Stop paging as soon as a page contains no links.
            print(f'Break at offset {offset}')
            break

        for link in tags:
            href = link.get('href', '')
            if not href.endswith('.xml'):
                continue

            # urljoin copes with root-relative, relative and absolute hrefs alike.
            response = requests.get(urljoin(root_url, href), timeout=request_timeout)
            response.raise_for_status()
            xml_protocol = response.text

            # Parsed only to extract the protocol date for the file name.
            xml_soup = BeautifulSoup(xml_protocol, 'lxml')
            datum = xml_soup.find("datum")
            raw_date = datum.get("date") if datum is not None else None
            if raw_date is None:
                # One document without a date should not abort the whole crawl.
                print(f"Skipping {href}: no <datum date=...> element found")
                continue
            german_date = datetime.strptime(raw_date, '%d.%m.%Y')

            # NOTE(review): the file name depends on the date only, so two
            # protocols carrying the same date overwrite each other.
            target_path = os.path.join(protocol_dir, f'{german_date.strftime("%Y-%m-%d")}-protocol.xml')
            with open(target_path, 'w', encoding='utf-8') as protocol_file:
                protocol_file.write(xml_protocol)

In [106]:
def nrmlz(s):
    """Return ``s`` converted to Unicode normalization form NFKD."""
    normalized = unicodedata.normalize("NFKD", s)
    return normalized

def get_party_interactions(speech) -> dict:
    """Count party mentions in the ``kommentar`` elements of one speech.

    For every interaction type in the module-level ``interaction_types`` whose
    German keyword occurs in a comment, the number of occurrences of each name
    in the module-level ``parties`` within that comment is added to the counter
    ``"<interaction_type>_<party>"``. A comment that contains several keywords
    is counted once per matching interaction type.

    Args:
        speech: Parsed element offering ``find_all('kommentar')``; each result
            must expose its text as ``.text``.

    Returns:
        dict: One integer counter per (interaction type, party) pair; pairs
        that never occur stay at 0.
    """
    interactions = {
        f"{interaction_type}_{party}": 0
        for interaction_type in interaction_types
        for party in parties
    }

    for comment in speech.find_all('kommentar'):
        comment_text = comment.text  # read once instead of once per lookup
        for interaction_type, keyword in interaction_types.items():
            if keyword not in comment_text:
                continue
            for party in parties:
                # str.count matches the name literally; the previous
                # re.findall(party, ...) interpreted it as a regex pattern, which
                # would miscount for any name containing regex metacharacters.
                interactions[f"{interaction_type}_{party}"] += comment_text.count(party)

    return interactions

def load_all_speeches() -> list:
    """Parse every stored protocol into a flat list of speech records.

    Reads each ``*-protocol.xml`` file in the module-level ``protocol_dir`` (in
    sorted file-name order), takes the sitting date from the file name and
    emits one record per ``rede`` element.

    Returns:
        list: Dicts with the keys ``speaker`` (first name + last name),
        ``party`` (text of ``fraktion``, else of ``rolle``, else ``None``),
        ``speech`` (NFKD-normalized, space-joined text of the speech's
        ``<p klasse="J">`` paragraphs), ``date`` (``datetime``) and the counters
        returned by ``get_party_interactions``.
    """
    speeches = []
    # Sorted so the record order is reproducible; os.listdir order is arbitrary.
    for protocol in sorted(os.listdir(protocol_dir)):
        if not protocol.endswith('-protocol.xml'):
            # Ignore stray entries (e.g. .ipynb_checkpoints) that would break the date parse.
            continue
        print(f"Current protcol: {protocol}")
        with open(os.path.join(protocol_dir, protocol), 'r', encoding='utf-8') as protocol_file:
            soup = BeautifulSoup(protocol_file.read(), 'xml')
        date = datetime.strptime(protocol.split('-protocol.xml')[0], '%Y-%m-%d')

        for speech in soup.find_all('rede'):
            # Collect the paragraphs of THIS speech. The previous code read them
            # from the first <rede> of the file, so every record of a protocol
            # carried the same text.
            paragraphs = speech.find_all('p', {'klasse': 'J'})
            speech_text = nrmlz(" ".join(paragraph.getText() for paragraph in paragraphs))

            speaker = speech.find('redner')
            fraktion = speaker.find('fraktion')
            rolle = speaker.find('rolle')
            if fraktion is not None:
                party = fraktion.text
            elif rolle is not None:
                # No parliamentary group given: fall back to the <rolle> text.
                party = rolle.text
            else:
                party = None

            speeches.append({
                'speaker': f"{speaker.find('vorname').text} {speaker.find('nachname').text}",
                'party': party,
                'speech': speech_text,
                'date': date,
                **get_party_interactions(speech)
            })

    return speeches

In [107]:
# Fetch the protocols only when there is no non-empty local copy yet.
if not (os.path.exists(protocol_dir) and os.listdir(protocol_dir)):
    download_parlament_protocols(protocol_dir)

# Parse whatever is on disk; `speeches` is only bound when files are present.
if os.path.exists(protocol_dir) and os.listdir(protocol_dir):
    speeches = load_all_speeches()

Current protcol: 2021-10-26-protocol.xml
Current protcol: 2021-11-11-protocol.xml
Current protcol: 2021-11-18-protocol.xml
Current protcol: 2021-12-07-protocol.xml
Current protcol: 2021-12-08-protocol.xml
Current protcol: 2021-12-09-protocol.xml
Current protcol: 2021-12-10-protocol.xml
Current protcol: 2021-12-15-protocol.xml
Current protcol: 2021-12-16-protocol.xml
Current protcol: 2022-01-12-protocol.xml
Current protcol: 2022-01-13-protocol.xml
Current protcol: 2022-01-14-protocol.xml
Current protcol: 2022-01-26-protocol.xml
Current protcol: 2022-01-27-protocol.xml
Current protcol: 2022-01-28-protocol.xml
Current protcol: 2022-02-16-protocol.xml
Current protcol: 2022-02-17-protocol.xml
Current protcol: 2022-02-18-protocol.xml
Current protcol: 2022-02-27-protocol.xml
Current protcol: 2022-03-16-protocol.xml
Current protcol: 2022-03-17-protocol.xml
Current protcol: 2022-03-18-protocol.xml
Current protcol: 2022-03-22-protocol.xml
Current protcol: 2022-03-23-protocol.xml
Current protcol:

In [112]:
# One row per speech record; the dict keys of each record become the columns.
df = pd.DataFrame(speeches)
# Lower-case all column labels (e.g. 'applause_GRÜNE' -> 'applause_grüne').
df.columns = df.columns.str.lower()
# Persist the table as CSV without writing the row index.
df.to_csv('./data/all_bundestagsprotokolle.csv', index=False)

In [116]:
# Print a summary of the frame: row count, columns, non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 22214 entries, 0 to 22213
Data columns (total 16 columns):
 #   Column            Non-Null Count  Dtype         
---  ------            --------------  -----         
 0   speaker           22214 non-null  object        
 1   party             22213 non-null  object        
 2   speech            22214 non-null  object        
 3   date              22214 non-null  datetime64[ns]
 4   applause_grüne    22214 non-null  int64         
 5   applause_linke    22214 non-null  int64         
 6   applause_fdp      22214 non-null  int64         
 7   applause_cdu/csu  22214 non-null  int64         
 8   applause_spd      22214 non-null  int64         
 9   applause_afd      22214 non-null  int64         
 10  shout_grüne       22214 non-null  int64         
 11  shout_linke       22214 non-null  int64         
 12  shout_fdp         22214 non-null  int64         
 13  shout_cdu/csu     22214 non-null  int64         
 14  shout_spd         2221