# 1. Imports

In [157]:
# standard imports
import pandas as pd
import numpy as np
import pickle

#regex imports
import re
import string

#graph imports
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

#scraping imports
import requests
from bs4 import BeautifulSoup
import re
import time
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
import os

#NLP imports
import nltk
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import NMF

#Cluster analysis imports
from sklearn.cluster import KMeans

# 2. Get URLs for XML files on Bundestag website

In [158]:
# run Chrome with Selenium
chromedriver = "/Applications/chromedriver"
os.environ["webdriver.chrome.driver"] = chromedriver
url = "https://www.bundestag.de/services/opendata"
driver = webdriver.Chrome(chromedriver)

driver.get(url)

In [159]:
def add_to_link_list ():
    '''finds all XML links and adds them to a list'''
    soup_links = BeautifulSoup(driver.page_source, 'html.parser')
    
    regex = re.compile('.*XML .*')
    documents = soup_links.find_all(title=regex)
    
    for i in documents:
        link_list.append(i['href'])

In [160]:
#click through all pages on Bundestag website with parliament transcripts for current term of office 
link_list = []

for i in range(21):
    button = driver.find_element_by_xpath("//button[@type='button'][@class='slick-next slick-arrow']")
    button.click()
    time.sleep(2)
    
add_to_link_list()

In [161]:
#create list with complete URLs

url_list = []
url_start = 'https://www.bundestag.de'

for i in link_list:
    url_list.append(url_start + i)

# 3. Scraping and parsing of XML files 

In [162]:
# create empty dictionary for data scraping
keys = ['speech_ID','date','speaker_ID','first_name','last_name','party','speech','comments']
data_dict = {}

for i in keys:
    data_dict[i] = []

In [163]:
#run scraping and parsing

fail_list = []

for url_ in url_list:
    try:
        url = url_
        user_agent = {'User-Agent' : 'Mozilla/5.0'}

        response = requests.get(url, headers=user_agent)
        
        page = response.text
        soup = BeautifulSoup(page, "lxml")
        
        speeches = soup.find_all('rede')
        
        for i in speeches:
    
            #ID speech
            speech_ID = i['id']
        
            #date
            date = soup.find('datum').attrs['date']

            #ID speaker
            speaker_ID = i.find(klasse='redner').find('redner').attrs['id']

            #first name
            first_name = i.find(klasse='redner').find('redner').find('vorname').text

            #last name
            last_name = i.find(klasse='redner').find('redner').find('nachname').text

            #party/ role
            try:
                party = i.find(klasse='redner').find('redner').find('rolle_lang').text
            except:
                party = i.find(klasse='redner').find('redner').find('fraktion').text

            #speech, exclude first item in list as it is ID, name etc.
            speech_list = i.find_all('p')[1:]
            speech = ''
            for j in speech_list:
                speech += j.text

            #comments
            comments = []
            comments_list = i.find_all('kommentar')
            for i in comments_list:
                comments.append(i.text)

            #populate dictionary
            for i in keys:
                data_dict[i].append(eval(i))
    except:
        fail_list.append(i)

In [164]:
#create dataframe
df = pd.DataFrame(data_dict)

# 4. Data cleaning

## 4.1. Clean information on parliament member

### 4.1.a Preparation

In [165]:
#add full name column
df['full_name'] = df['last_name'] + ', ' + df['first_name']

In [166]:
#lists with parliament roles

CDU_secretaries = (['Merkel, Angela','Scheuer, Andreas','Klöckner, Julia','Altmaier, Peter','Karliczek, Anja',
                    'Spahn, Jens','Seehofer, Horst','Müller, Gerd','Leyen, Ursula','Kramp-Karrenbauer, Annegret'])

SPD_secretaries = (['Schulze, Svenja','Heil, Hubertus','Giffey, Franziska','Barley, Katarina','Lambrecht, Christine',
                    'Scholz, Olaf','Maas, Heiko'])


CDU_undersecretaries = (['Barthle, Norbert','Bär, Dorothee','Braun, Helge', 'Bareiß, Thomas','Bilger, Steffen',
                         'Ferlemann, Enak','Flachsbarth, Maria','Fuchtel, Hans-Joachim', 'Gebhart, Thomas',
                         'Grütters, Monika','Hirte, Christian','Hoppenstedt, Hendrik','Krings, Günter',
                         'Mayer, Stephan','Meister, Michael','Rachel, Thomas','Stübgen, Michael','Tauber, Peter',
                         'Wanderwitz, Marco','Weiss, Sabine','Widmann-Mauz, Annette','Wittke, Oliver'])

SPD_undersecretaries = (['Annen, Niels','Hagedorn, Bettina','Hagl-Kehl, Rita','Griese, Kerstin','Kramme, Anette',
                         'Lange, Christian','Lambrecht, Christine','Marks, Caren','Müntefering, Michelle',
                         'Pronold, Florian','Roth, Michael','Ryglewski, Sarah','Schwarzelühr-Sutter, Rita',
                         'Silberhorn, Thomas'])

CDU_other_roles = ['Brauksiepe, Ralf','Spahn, Jens','Schäuble, Dr. Wolfgang','Karliczek, Anja']

independent_members = (['Bülow, Marco', 'Mieruch, Mario', 'Petry, Frauke','Kamann, Uwe'])

secretary_list = CDU_secretaries + SPD_secretaries

undersecretary_list = CDU_undersecretaries + SPD_undersecretaries

cabinet_list = secretary_list + undersecretary_list

In [167]:
#create party dictionary
party_dict = {'CDU/CSU': 'CDU/CSU','AfD': 'AfD', 'SPD': 'SPD','FDP': 'FDP','DIE LINKE':'DIE LINKE',
               'BÜNDNIS 90/DIE GRÜNEN':'BÜNDNIS 90/DIE GRÜNEN'}

In [168]:
#dictionary with resorts

resort_dict = ({'Merkel, Angela':'Bundeskanzlerin','Scholz, Olaf':'Finanzen','Seehofer, Horst':'Inneres, Bau und Heimat',
                'Maas, Heiko':'Auswärtiges','Altmaier, Peter':'Wirtschaft und Energie','Barley, Katarina':'Justiz und Verbraucherschutz',
                'Lambrecht, Christine':'Justiz und Verbraucherschutz/ Finanzen','Heil, Hubertus':'Arbeit und Soziales',
                'Leyen, Ursula':'Verteidigung','Kramp-Karrenbauer, Annegret':'Verteidigung','Klöckner, Julia':'Ernährung und Landwirtschaft',
                'Giffey, Franziska':'Familie, Senioren, Frauen und Jugend','Spahn, Jens':'Gesundheit',
                'Scheuer, Andreas':'Verkehr und digitale Infrastruktur', 'Schulze, Svenja':'Umwelt, Naturschutz und nukleare Sicherheit',
                'Karliczek, Anja':'Bildung und Forschung','Müller, Gerd':'Wirtschaftliche Zusammenarbeit und Entwicklung',
                'Braun, Helge':'Bundeskanzleramt',
                'Grütters, Monika':'Kultur und Medien','Hoppenstedt, Hendrik':'Bürokratieabbau und bessere Rechtsetzung, Bund-Länder-Beziehungen',
                'Widmann-Mauz, Annette':'Migration, Flüchtlinge und Integration','Bär, Dorothee':'Digitalisierung',
                'Hagedorn, Bettina':'Finanzen','Ryglewski Sarah':'Finanzen','Krings, Günter':'Inneres, Bau und Heimat',
                'Wanderwitz, Marco':'Inneres, Bau und Heimat','Mayer, Stephan':'Inneres, Bau und Heimat',
                'Annen, Niels':'Auswärtiges','Müntefering, Michelle':'Auswärtiges - Internationale Kulturpolitik',
                'Roth, Michael':'Auswärtiges - Europa, Deutsch-französische Zusammenarbeit',
                'Bareiß, Thomas':'Wirtschaft und Energie - Tourismus','Hirte, Christian':'Wirtschaft und Energie - Neue Bundesländer',
                'Wittke, Oliver':'Wirtschaft und Energie - EITI','Hagl-Kehl, Rita':'Justiz und Verbraucherschutz',
                'Lange, Christian':'Justiz und Verbraucherschutz','Griese, Kerstin':'Arbeit und Soziales',
                'Kramme, Anette':'Arbeit und Soziales','Tauber, Peter':'Verteidigung','Silberhorn, Thomas':'Verteidigung',
                'Fuchtel, Hans-Joachim':'Ernährung und Landwirtschaft','Stübgen, Michael':'Ernährung und Landwirtschaft',
                'Marks, Caren':'Familie, Senioren, Frauen und Jugend','Zierke, Stefan':'Familie, Senioren, Frauen und Jugend',
                'Gebhart, Thomas':'Gesundheit','Weiss, Sabine':'Gesundheit','Bilger, Steffen':'Verkehr und digitale Infrastruktur',
                'Ferlemann, Enak':'Verkehr und digitale Infrastruktur','Pronold, Florian':'Umwelt, Naturschutz und nukleare Sicherheit',
                'Schwarzelühr-Sutter, Rita':'Umwelt, Naturschutz und nukleare Sicherheit','Meister, Michael':'Bildung und Forschung',
                'Rachel, Thomas':'Bildung und Forschung','Barthle, Norbert':'Wirtschaftliche Zusammenarbeit und Entwicklung',
                'Flachsbarth, Maria':'Wirtschaftliche Zusammenarbeit und Entwicklung'})

In [169]:
#dictionary with duplicate names
duplicates_dict = ({'Bluhm, Heidrun':'Bluhm-Förster, Heidrun','Cotar, Joana Eleonora':'Cotar, Joana',
                    'Gauland, Alexander':'Gauland, Eberhardt Alexander',
                    'Elsner von Gronow, Berengar':'Gronow, Berengar Elsner von',
                    'Kuhle, Konstantin':'Kuhle, Konstantin Elias',
                    'Schreiber, Eva-Maria':'Schreiber, Eva-Maria Elisabeth','Weiler, Albert':'Weiler, Albert H.'})

### 4.1.b Create functions

In [170]:
def get_party_1(row):
    '''adjusts party for parliament members with official positions or without party membership'''
    
    #CDU/CSU
    if row['full_name'] in CDU_secretaries:
        return 'CDU/CSU'
    elif row['full_name'] in CDU_undersecretaries:
        return 'CDU/CSU'
    elif row['full_name'] in CDU_other_roles:
        return 'CDU/CSU'
    
    #SPD
    elif row['full_name']  in SPD_secretaries:
        return 'SPD'    
    elif row['full_name']  in SPD_undersecretaries:
        return 'SPD'
    
    #independent
    elif row['full_name']  in independent_members:
        return 'Fraktionslos'
    
    #all other
    else:
        return row['party']

In [171]:
def get_party_2(string):
    '''identifies guest speakers who are not parliament members as other'''
    try:
        party = party_dict[string]
    except:
        party = 'Other'
    return party

In [172]:
def get_cabinet(name):
    '''get role in parliament'''
    
    if name in cabinet_list:
        return 'Minister'
    elif name in secretary_list:
        return 'Staatssekretär/-minister'
    else:
        return 'Parlament'

In [173]:
def get_resort(name):
    '''add resort information'''
    
    if name in list(resort_dict.keys()):
        return resort_dict[name]
    else:
        return 'n/a'

In [174]:
def adjust_duplicates(string):
    '''adjusts duplicate names'''
    if string in list(duplicates_dict.keys()):
        return duplicates_dict[string]
    else:
        return string

### 4.1.c Apply functions

In [175]:
df['party'] = df.apply(get_party_1, axis=1)
df['party'] = df['party'].apply(get_party_2)
df['cabinet'] = df['full_name'].apply(get_cabinet)
df['resort'] = df['full_name'].apply(get_resort)
df['full_name'] = df['full_name'].apply(adjust_duplicates)

## 4.2. Clean speeches

In [176]:
# Create baseline 
df['speech_clean'] = df['speech']

#Remove numbers
alphanumeric = lambda x: re.sub('\w*\d\w*', ' ', x)
df['speech_clean'] = df['speech_clean'].apply(alphanumeric)

#Remove punctuation
punctuation = lambda x: re.sub('[%s]' % re.escape(string.punctuation), ' ', x)
df['speech_clean'] = df['speech_clean'].apply(punctuation)

#Remove non-breaking space
xa0 = lambda x: re.sub('\xa0', ' ', x)
xad = lambda x: re.sub('\xad', ' ', x)
df['speech_clean'] = df['speech_clean'].apply(xa0).apply(xad)

#Remove dash
dash = lambda x: re.sub('–', ' ', x)
df['speech_clean'] = df['speech_clean'].apply(dash)

#Make lower case
lower = lambda x: x.lower()
df['speech_clean'] = df['speech_clean'].apply(lower)

#Replace hyphens
hyphen_1 = lambda x: re.sub('“', ' ', x)
hyphen_2 = lambda x: re.sub('„', ' ', x)
df['speech_clean'] = df['speech_clean'].apply(hyphen_1).apply(hyphen_2)

#Remove doubles space
spaces = lambda x: ' '.join(x.split())
df['speech_clean'] = df['speech_clean'].apply(spaces)

## 4.3. Remove speeches with less than 50 words

In [177]:
def word_count(string):
    '''counts number of words in speech'''
    count = len(string.split())
    return count

In [178]:
df['word_count'] = df['speech_clean'].apply(word_count)

In [179]:
df = df[df['word_count']>49]

# 5. Word tokenization and vectorization

## 5.1. Tokenization and vectorization preparation

In [180]:
#load stopwords
stop_words = stopwords.words('german')

# add manual stopwords
manual_stop_words = (['werd','herr','kolleg','mehr','woll','wer', 'gut', 'wichtig','uber','konn','sag','frag', 'sag',
                      'frau', 'herr''schon', 'wurd', 'gibt', 'thema', 'ganz', 'mal', 'konn', 'glaub', 'gesagt', 'mach',
                      'geht', 'stell', 'all', 'and', 'red', 'hatt', 'debatt','mocht', 'dank', 'word', 'lieb', 'letzt',
                      'find', 'darub', 'darauf', 'desweg','eigent', 'vielleicht', 'genau', 'gar','bundesregier','deutsch'])

stop_words = stop_words + manual_stop_words

#load stemmer
stemmer = SnowballStemmer('german')

In [181]:
#create tokenizer and stemmer function
def tokenize_and_stem(text):
    '''applies tokenization and stemming'''
    tokens = [word for sent in nltk.sent_tokenize(text) for word in nltk.word_tokenize(sent)]
    filtered_tokens = []
    for token in tokens:
        token = stemmer.stem(token)
        if len(token) > 2:
            filtered_tokens.append(token)
    return filtered_tokens

## 5.2. Tfidf Vectorizer

In [182]:
tfidf = TfidfVectorizer(stop_words=stop_words,tokenizer=tokenize_and_stem,max_df=0.9)
sparse_tfidf = tfidf.fit_transform(df['speech_clean'])
tfidf_feature_names = tfidf.get_feature_names()

  'stop_words.' % sorted(inconsistent))


# 6. Topic modeling - NMF (35)

## 6.1. Topic modeling preparation

In [183]:
def display_topics(model, feature_names, no_top_words, topic_names=None):
    '''shows topics identified in topic modeling'''
    for ix, topic in enumerate(model.components_):
        if not topic_names or not topic_names[ix]:
            print("\nTopic ", ix)
        else:
            print("\nTopic: '",topic_names[ix],"'")
        print(", ".join([feature_names[i]
                        for i in topic.argsort()[:-no_top_words - 1:-1]]))

## 6.2. Topic modeling

In [184]:
nmf_35 = NMF(35,random_state=1)
nmf_35_topic = nmf_35.fit_transform(sparse_tfidf)

## 6.3. Topic identification

In [185]:
topic_names = (['Füllwörter','Europa/EU-Politik','Bundeswehr','Haushalt','Familienpolitik','Parlamentsabstimmung',
                   'Gesetzgebung','Bildungspolitik','Arbeitsmarkt','Miete/Wohnen','Umwelt/Energie','Sudaneinsatz',
                   'NATO/Sicherheitspolitik','Pflegepolitik','Studium','Parlamentsdebatten','Organspende',
                   'Automobilverkehr','Ländlicher Raum','Wirtschaft','Förderalismus','Grundgesetz/Demokratie',
                   'Syrienkonflikt/Naher Osten','Kosovoeinsatz','Nahostkonflikt','Innenpolitik/Rechtstaat',
                   'Flüchtlingspolitik','Seenotrettung','Gesundheitspolitik','Afghanistaneinsatz','Brexit',
                   'Digitaliserung','Finanzmarktpolitik','Malieinsatz','Landwirtschaft'])

In [186]:
#display identified topics
display_topics(nmf_35,tfidf_feature_names,20,topic_names = topic_names)


Topic: ' Füllwörter '
haushalt, geld, minist, bereich, investition, neu, mittel, koalition, zukunft, jahr, gross, schon, einzelplan, dafur, richtig, scholz, projekt, etat, beim, aufgab

Topic: ' Europa/EU-Politik '
europa, union, gemeinsam, national, europas, mitgliedstaat, stark, brauch, deutschland, kommission, bank, staat, eben, interess, zukunft, sozial, aussenpolit, zusammenarbeit, fried, griechenland

Topic: ' Bundeswehr '
gesetzentwurf, gesetz, regel, verfahr, moglich, richtlini, berat, weit, schon, vorlieg, bundesverfassungsgericht, fall, anhor, entwurf, punkt, entsprech, betroff, erst, heut, jahr

Topic: ' Haushalt '
einsatz, mandat, mali, soldat, mission, afghanistan, soldatinn, irak, malisch, region, operation, guardian, sea, kosovo, militar, mittelme, ausbild, atalanta, engagement, minusma

Topic: ' Familienpolitik '
kind, famili, elt, kinderzuschlag, kindergeld, kinderarmut, familienpolit, alleinerzieh, leb, leistung, armut, gesellschaft, mutt, gesetz, jugend, elterngeld,

In [187]:
#create dataframe with topic values
H_35 = pd.DataFrame(nmf_35_topic.round(5), columns = topic_names)

#merge dataframes
df = pd.concat([df.reset_index(drop=True), H_35.reset_index(drop=True)], axis=1)

# 7. Aggregation of speeches py parliament member

## 7.1. Aggregation preparation

In [188]:
# get sum of topics per speech
df['topic_sum']= 0
for topic in topic_names:
    df['topic_sum'] += df[topic]
    
#get percentage share of topic per speech
aggregation_columns = topic_names + ['topic_sum']

for topic in aggregation_columns:
    df[topic] = df[topic]/df['topic_sum']

In [189]:
#remove speeches of non-parliament members
df_parties = df[df['party']!='Other']

## 7.2. Speech aggregation

In [190]:
#sum word count and topic columns for each parliament member
sum_columns = ['word_count'] + aggregation_columns

df_speaker_parties = df_parties.groupby(['full_name','party'])[sum_columns].apply(lambda x: x.sum()).reset_index()

#get percentage share of topic per parliament members
for topic in topic_names:
    df_speaker_parties[topic] = df_speaker_parties[topic]/df_speaker_parties['topic_sum']
    
df_speaker_parties.reset_index(inplace=True, drop=True)

# 8. Cluster analysis (k=15)

In [191]:
#run cluster analysis
km_15 = KMeans(n_clusters=15,random_state=44)

X_speaker = df_speaker_parties[topic_names].values
km_15.fit(X_speaker)

KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=300,
       n_clusters=15, n_init=10, n_jobs=None, precompute_distances='auto',
       random_state=44, tol=0.0001, verbose=0)

In [192]:
#explore cluster centers
df_clusters_15 = pd.DataFrame(np.round(km_15.cluster_centers_,decimals=5),columns=topic_names)
df_clusters_15

for i in range(0,15):
    cluster_values = df_clusters_15.iloc[i,:].values
    top_3_idx = np.argsort(cluster_values)[::-1][:8]
    top_3 = []
    print ('Cluster {}'.format(i))
    for i in top_3_idx:
        print('{}: {}%'.format(topic_names[i],round(cluster_values[i]*100,0)))
    print ('\n')

Cluster 0
Seenotrettung: 9.0%
Innenpolitik/Rechtstaat: 8.0%
Füllwörter: 6.0%
Studium: 5.0%
Parlamentsdebatten: 5.0%
Gesundheitspolitik: 5.0%
Bildungspolitik: 5.0%
Familienpolitik: 3.0%


Cluster 1
Finanzmarktpolitik: 32.0%
Brexit: 7.0%
Seenotrettung: 6.0%
Bundeswehr: 5.0%
Innenpolitik/Rechtstaat: 5.0%
Umwelt/Energie: 4.0%
Parlamentsdebatten: 4.0%
Wirtschaft: 4.0%


Cluster 2
Seenotrettung: 29.0%
Bundeswehr: 6.0%
Parlamentsdebatten: 5.0%
Syrienkonflikt/Naher Osten: 5.0%
Gesundheitspolitik: 4.0%
Innenpolitik/Rechtstaat: 3.0%
Parlamentsabstimmung: 3.0%
Europa/EU-Politik: 3.0%


Cluster 3
Umwelt/Energie: 22.0%
Seenotrettung: 8.0%
Finanzmarktpolitik: 6.0%
Parlamentsdebatten: 6.0%
Bundeswehr: 5.0%
Ländlicher Raum: 5.0%
Europa/EU-Politik: 5.0%
Wirtschaft: 4.0%


Cluster 4
Familienpolitik: 30.0%
Seenotrettung: 7.0%
Bundeswehr: 6.0%
Parlamentsdebatten: 5.0%
Innenpolitik/Rechtstaat: 5.0%
Bildungspolitik: 4.0%
Arbeitsmarkt: 4.0%
Parlamentsabstimmung: 3.0%


Cluster 5
NATO/Sicherheitspolitik: 10.0

In [193]:
#name cluster based on topic distribution in centers
cluster_dict = {0:'Sicherheitspolitik', 1:'Umwelt-/Energiepolitik',2:'Europapolitik',3:'Gesundheitspolitik',
                4:'Arbeitsmarktpolitik',5:'Wirtschafts-/Finanzpolitik', 6:'Digitalisierung',7:'Demokratie/Rechtstaatlichkeit',
                8:'Innen-/Justizpolitik',9:'Verkehr/Infrastruktur',10:'Familienpolitik', 11:'Außenpolitik',12:'Pflegepolitik',
                13:'Bildungspolitik',14:'Landwirtschaftspolitik'}

In [194]:
#add clusters to dataframe
df_speaker_parties['cluster'] = km_15.labels_
df_speaker_parties['cluster_name'] = df_speaker_parties['cluster'].map(cluster_dict)