# Interjection Analysis

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import re
import gender_classification as gc

In [2]:
# load data
df = pd.read_pickle('C:/Users/Ana/OneDrive - Hochschule Düsseldorf/MA/data_topics.pkl')

In [3]:
# add year column
df['year'] = df['date'].dt.year

In [4]:
df.head()

Unnamed: 0,text,date,legislative_period,speaker_name,speaker_gender,speaker_role,speaker_party,comments,id,speaker_id,speech_length,topic_distribution,year
0,Sehr geehrter Herr Alterspräsident! So muss ...,2021-10-26,20,Gabriele Katzmarek,female,,SPD,[(Beifall bei der SPD sowie bei Abgeordneten d...,SP-2021-0,1299,2593,{68: 0.35882655},2021
1,Herr Präsident! Liebe Kolleginnen und Kolleg...,2021-10-26,20,Stefan Müller,male,,CDU/CSU,[(Beifall bei der CDU/CSU sowie bei Abgeordnet...,SP-2021-1,4028,3796,{68: 0.46776655},2021
2,Sehr geehrter Herr Alterspräsident Wolfgang ...,2021-10-26,20,Britta Haßelmann,female,,BÜNDNIS 90/DIE GRÜNEN,[(Beifall beim BÜNDNIS 90/DIE GRÜNEN und bei d...,SP-2021-2,451,4301,{68: 0.26328164},2021
3,Sehr geehrter Herr Präsident! Meine lieben K...,2021-10-26,20,Marco Buschmann,male,,FDP,"[(Jan Korte [DIE LINKE]: Oder Jugendweihe!), (...",SP-2021-3,3083,4555,{},2021
4,Herr Alterspräsident! Lassen Sie mich zunäch...,2021-10-26,20,Stephan Brandner,male,,AfD,"[(Beifall bei der AfD), (Beifall bei Abgeordne...",SP-2021-4,4055,5996,{68: 0.3120498},2021


In [5]:
# filter dataframe for speeches with comments
df_comments = df[df['comments'].apply(lambda x: len(x) > 0)]

In [213]:
# number of speeches with comments
len(df_comments)

371023

## Matching and counting comments

In [195]:
# function for cleaning and classifying speaker names
PRE_1 = re.compile(r'(((Weiterer )?Gegenruf (?P<article>des|der) Abg\.))(?P<name>[\w,\s\.]*)')
PRE_2 = re.compile(r'((Zuruf (?P<article>des|der) Abg\.)|(Zuruf von))(?P<name>[\w,\s\.]*)')
PRE_3 = re.compile(r'(.*?)\s*--\s*(?P<name>[\w\s\.-]+)$')

PREAMBLES = [PRE_1, PRE_2, PRE_3]

def get_speaker_gender(name):

    for PREAMBLE in PREAMBLES:
        match = re.search(PREAMBLE, name)
        if(match):
            name = match.groupdict()["name"]
            if 'article' in match.groupdict():
                article = match.groupdict()["article"]
                #print(article)
                return 'female' if article == 'der' else 'male'
    
    name = name.strip()
    name_length = len(name.split())

    if name_length > 1:
        #print("größer als 1")

        # remove 'Abgeordnete(r)' from name
        name = re.sub('Abg\.?', '', name).strip()

        # if name 'Frau X', classify as female
        if name.split()[0] == 'Frau':
            #print("Frau Bla")
            return 'female'
        
        # if first substring is title, take second substring or return unknown
        if "." in name.split()[0]:
            if name_length > 2 :
                #print("Dr. Bla Bli")
                first_name = name.split()[1]
            else:
                #print("Dr. Bli", name)
                return 'unkown'
        else:
            first_name = name.split()[0]
        #print(first_name)
        return gc.get_gender(first_name)
    
    # return unknown if only last name is given
    else:
        #print(name)
        return 'unknown'
    
    

In [208]:
# test classification
get_speaker_gender("Beifall bei der SPD. -- Abg. Dr. Marx")

'unknown'

In [175]:
# dict of faulty characters and names to replace
chars_to_replace = {
    '\xa0': ' ',
    '…': '...',
    '\u202f': '',
    '„': '"',
    '”': '"',
    '“': '"',
    '§': 'Paragraph',
    '\n                   ': '',
    'Prinz zu Sayn-Wittgenstein-Hohenstein': 'Botho Prinz zu Sayn-Wittgenstein-Hohenstein',
    'Dr.Kohl': 'Dr. Kohl',
    'Dr. -Ing.': 'Dr.-Ing.'
}

### Counting comments and identifying speakers and their gender

In [202]:
# recognizes comments of named speakers
RE_SPEAKER_DIRECTLY = re.compile(r'(?P<speaker>[\w]+[\w\s\.-]*) \[(?P<party>[\w,/\s]*)\]: (?P<comment>[\w,!\s\.\?\"]*)( - )*')

RES = [RE_SPEAKER_DIRECTLY]

comment_counts = {}
comment_speakers = {}
comment_fem_speakers = {}
comment_male_speakers = {}

f = open('reference files/comments_per_speech.txt', 'w', encoding='utf-8')

for index, speech in df_comments.iterrows():
    comment_count = 0
    speakers = []
    fem_speakers = 0
    male_speakers = 0

    for comment in speech['comments']:
        for key, value in chars_to_replace.items():
            comment = comment.replace(key, value)
        for RE in RES:
            matches = RE.finditer(comment)
            for match in matches:
                speaker = match.groupdict()['speaker']
                print(f"{index}: {match.groupdict()}", file=f)
                comment_count += 1

                if speaker != "":
                    speakers.append(speaker)

    for speaker in speakers:
        if get_speaker_gender(speaker) == 'female':
            fem_speakers += 1
        if get_speaker_gender(speaker) == 'male':
            male_speakers += 1

    comment_counts[index] = comment_count
    comment_speakers[index] = speakers
    comment_fem_speakers[index] = fem_speakers
    comment_male_speakers[index] = male_speakers

f.close()

In [207]:
len(comment_counts)

371023

In [164]:
# get comment count by speech id
comment_counts[14]

4

In [None]:
# add columns for comment- and speaker gender count
df_comments['comment_count'] = comment_counts.values()
df_comments['comment_fem_speakers'] = comment_fem_speakers.values()
df_comments['comment_male_speakers'] = comment_male_speakers.values()

In [217]:
df_comments.head()

Unnamed: 0,text,date,legislative_period,speaker_name,speaker_gender,speaker_role,speaker_party,comments,id,speaker_id,speech_length,topic_distribution,year,comment_count,comment_fem_speakers,comment_male_speakers
0,Sehr geehrter Herr Alterspräsident! So muss ...,2021-10-26,20,Gabriele Katzmarek,female,,SPD,[(Beifall bei der SPD sowie bei Abgeordneten d...,SP-2021-0,1299,2593,{68: 0.35882655},2021,0,0,0
1,Herr Präsident! Liebe Kolleginnen und Kolleg...,2021-10-26,20,Stefan Müller,male,,CDU/CSU,[(Beifall bei der CDU/CSU sowie bei Abgeordnet...,SP-2021-1,4028,3796,{68: 0.46776655},2021,4,3,1
2,Sehr geehrter Herr Alterspräsident Wolfgang ...,2021-10-26,20,Britta Haßelmann,female,,BÜNDNIS 90/DIE GRÜNEN,[(Beifall beim BÜNDNIS 90/DIE GRÜNEN und bei d...,SP-2021-2,451,4301,{68: 0.26328164},2021,3,1,2
3,Sehr geehrter Herr Präsident! Meine lieben K...,2021-10-26,20,Marco Buschmann,male,,FDP,"[(Jan Korte [DIE LINKE]: Oder Jugendweihe!), (...",SP-2021-3,3083,4555,{},2021,1,0,1
4,Herr Alterspräsident! Lassen Sie mich zunäch...,2021-10-26,20,Stephan Brandner,male,,AfD,"[(Beifall bei der AfD), (Beifall bei Abgeordne...",SP-2021-4,4055,5996,{68: 0.3120498},2021,4,3,1


In [218]:
# test comment count per speech
RE_SPEAKER_DIRECTLY = re.compile(r'(?P<speaker>[\w]+[\w\s\.-]*) \[(?P<party>[\w,/\s]*)\]: (?P<comment>[\w,!\s\.\?\"]*)( - )*')

RES = [RE_SPEAKER_DIRECTLY]

for comment in df_comments["comments"][15]: # insert speech id here
    for RE in RES:
        matches = RE.finditer(comment)
        for match in matches:
            if match.groupdict()["speaker"] == "":
                print("no name")
            else:
                print(match.groupdict())

{'speaker': 'Gabriele Katzmarek', 'party': 'SPD', 'comment': 'Herr Kollege, jetzt wird es aber komisch!'}
{'speaker': 'Andrea Lindholz', 'party': 'CDU/CSU', 'comment': 'Absolut!'}
{'speaker': 'Andrea Lindholz', 'party': 'CDU/CSU', 'comment': 'Ja, genau!'}
{'speaker': 'Andrea Lindholz', 'party': 'CDU/CSU', 'comment': 'So ist es!\xa0'}
{'speaker': 'Dr.\xa0Marco Buschmann', 'party': 'FDP', 'comment': 'Ihr lehnt doch auch unseren kleinen Wunsch bei der Sitzordnung ab!'}
{'speaker': 'Dr.\xa0Marco Buschmann', 'party': 'FDP', 'comment': 'Ja, das ist doch genau das, was du thematisierst!'}
{'speaker': 'Dr.\xa0Marco Buschmann', 'party': 'FDP', 'comment': 'Ach, viel hilft viel? Deswegen habt ihr das Wahlrecht so schlecht geändert!'}
{'speaker': 'Jan Korte', 'party': 'DIE LINKE', 'comment': 'Na ja!'}


## Calculating Means per Gender

In [220]:
# calculate average comment count per speech by gender
average_comments_by_gender = df_comments.groupby('speaker_gender')['comment_count'].mean()
average_comments_by_gender

speaker_gender
female    1.626886
male      1.618065
Name: comment_count, dtype: float64

In [221]:
# calculate average count of female commentators per speech by gender
female_commentators_by_gender = df_comments.groupby('speaker_gender')['comment_fem_speakers'].mean()
female_commentators_by_gender

speaker_gender
female    0.356641
male      0.340016
Name: comment_fem_speakers, dtype: float64

In [222]:
# calculate average count of male commentators per speech by gender
male_commentators_by_gender = df_comments.groupby('speaker_gender')['comment_male_speakers'].mean()
male_commentators_by_gender

speaker_gender
female    0.966006
male      0.797386
Name: comment_male_speakers, dtype: float64