# Interjection Analysis

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import re
import gender_classification as gc
from collections import Counter

In [2]:
# load data
df = pd.read_pickle('C:/Users/Ana/OneDrive - Hochschule Düsseldorf/MA/data_topics.pkl')

In [3]:
# add year column
df['year'] = df['date'].dt.year

In [4]:
df.head()

Unnamed: 0,text,date,legislative_period,speaker_name,speaker_gender,speaker_role,speaker_party,comments,id,speaker_id,speech_length,topic_distribution,year
0,Sehr geehrter Herr Alterspräsident! So muss ...,2021-10-26,20,Gabriele Katzmarek,female,,SPD,[(Beifall bei der SPD sowie bei Abgeordneten d...,SP-2021-0,1299,2593,{68: 0.35882655},2021
1,Herr Präsident! Liebe Kolleginnen und Kolleg...,2021-10-26,20,Stefan Müller,male,,CDU/CSU,[(Beifall bei der CDU/CSU sowie bei Abgeordnet...,SP-2021-1,4028,3796,{68: 0.46776655},2021
2,Sehr geehrter Herr Alterspräsident Wolfgang ...,2021-10-26,20,Britta Haßelmann,female,,BÜNDNIS 90/DIE GRÜNEN,[(Beifall beim BÜNDNIS 90/DIE GRÜNEN und bei d...,SP-2021-2,451,4301,{68: 0.26328164},2021
3,Sehr geehrter Herr Präsident! Meine lieben K...,2021-10-26,20,Marco Buschmann,male,,FDP,"[(Jan Korte [DIE LINKE]: Oder Jugendweihe!), (...",SP-2021-3,3083,4555,{},2021
4,Herr Alterspräsident! Lassen Sie mich zunäch...,2021-10-26,20,Stephan Brandner,male,,AfD,"[(Beifall bei der AfD), (Beifall bei Abgeordne...",SP-2021-4,4055,5996,{68: 0.3120498},2021


In [5]:
# filter dataframe for speeches with comments
df_comments = df[df['comments'].apply(lambda x: len(x) > 0)]

In [6]:
# number of speeches with comments
len(df_comments)

371023

## Matching and counting comments

### Classifying speaker gender

In [7]:
# function for cleaning and classifying speaker names
PRE_1 = re.compile(r'(((Weiterer )?Gegenruf (?P<article>des|der) Abg\.))(?P<name>[\w,\s\.]*)')
PRE_2 = re.compile(r'(((Zuruf|Zustimmung) (?P<article>des|der) Abg\.)|(Zuruf von))(?P<name>[\w,\s\.]*)')
PRE_3 = re.compile(r'(.*?)\s*--\s*(?P<name>[\w\s\.-]+)$')

PREAMBLES = [PRE_1, PRE_2, PRE_3]

def get_speaker_gender(name):

    for PREAMBLE in PREAMBLES:
        match = re.search(PREAMBLE, name)
        if(match):
            name = match.groupdict()["name"]
            if 'article' in match.groupdict():
                article = match.groupdict()["article"]
                #print(article)
                return 'female' if article == 'der' else 'male'
    
    name = name.strip()
    name_length = len(name.split())

    if name_length > 1:
        #print("größer als 1")

        # remove 'Abgeordnete(r)' from name
        name = re.sub('Abg\.?', '', name).strip()

        # if name 'Frau X', classify as female
        if name.split()[0] == 'Frau':
            #print("Frau Bla")
            return 'female'
        
        # if first substring is title, take second substring or return unknown
        if "." in name.split()[0]:
            if name_length > 2 :
                #print("Dr. Bla Bli")
                first_name = name.split()[1]
            else:
                #print("Dr. Bli", name)
                return 'unkown'
        else:
            first_name = name.split()[0]
        #print(first_name)
        return gc.get_gender(first_name)
    
    # return unknown if only last name is given
    else:
        #print(name)
        return 'unknown'
    
    

In [8]:
# test classification
get_speaker_gender("Zustimmung der Abg. Frau Kalinke.")

'female'

### Cleaning comments

In [9]:
# dict of faulty characters and names to replace
chars_to_replace = {
    '\xa0': ' ',
    '…': '...',
    '\u202f': '',
    '„': '"',
    '”': '"',
    '“': '"',
    '§': 'Paragraph',
    '\n                   ': '',
    'Prinz zu Sayn-Wittgenstein-Hohenstein': 'Botho Prinz zu Sayn-Wittgenstein-Hohenstein',
    'Dr.Kohl': 'Dr. Kohl',
    'Dr. -Ing.': 'Dr.-Ing.',
}

In [10]:
# function for replacing in-comment hyphens (that are not used for separation between comments)
def replace_hyphens(string):
    hyphen = re.compile(r'[\w,!\s\.\?\":]*(?P<hyphen>-)[\w,!\s\.\?\":]*')

    matches = hyphen.finditer(string)
    hyphens_pos = []

    for match in matches:
        if 'hyphen' in match.groupdict():
            pos = match.start(1)
            hyphens_pos.append(pos)

    cleaned_str = string 
    for pos in hyphens_pos:
        cleaned_str = cleaned_str[: pos] + ' ' + cleaned_str[pos + 1:]

    return cleaned_str

In [11]:
# test hyphen replacement
test_string = "(Rawe [CDU/CSU] : Er hat den-Kanzler gemeint! — Seiters [CDU/CSU]: Meinen Sie die Dr.-Ing SPD-Flügel? — Franke [Osnabrück] [CDU/CSU]: Meinen Sie Herrn-Bangemann? — Weitere Zurufe von der CDU/CSU)"

replace_hyphens(test_string)

'(Rawe [CDU/CSU] : Er hat den Kanzler gemeint! — Seiters [CDU/CSU]: Meinen Sie die Dr. Ing SPD Flügel? — Franke [Osnabrück] [CDU/CSU]: Meinen Sie Herrn Bangemann? — Weitere Zurufe von der CDU/CSU)'

In [12]:
# function for cleaning unmatched interjections in multiple-comment-string
to_remove = [' ', '']
separators = r'—|–'

def clean_rest_comments(rest_string):
    rest_string = re.sub('[()]', '', rest_string)
    comments = re.split(separators, rest_string)
    comments = [c for c in comments if c not in to_remove]

    for index, comment in enumerate(comments):
        comments[index] = comment.strip()

    return comments

### Classifying reactions

In [91]:
# function for classifying reaction interjections
def check_reaction(interjection):
    cheerful_reactions = ['Heiterkeit', 'Heiterkeit und Beifall', 'Beifall und Heiterkeit']
    disgust_pattern = re.compile(r'(Pfui Rufe?)|(Pfui)')

    for reaction in cheerful_reactions:
        if reaction in interjection:
            return 'cheerfulness'
    if 'Beifall' in interjection:
        return 'applause'
    if 'Unruhe' in interjection:
        return 'disquiet'
    if 'Lachen' in interjection:
        return 'laughter'
    if 'Zustimmung' in interjection:
        return 'approval'
    if disgust_pattern.search(interjection) is not None:
        return 'disgust'
    else:
        return

In [193]:
all_reactions = ['applause', 'cheerfulness', 'laughter', 'approval', 'disgust', 'disquiet']

In [14]:
def get_reactions(comment):
    reactions = []
    unmatched_comments = clean_rest_comments(comment)
    
    for unmatched_comment in unmatched_comments:
        reaction = check_reaction(unmatched_comment)
        if(reaction): 
            reactions.append(reaction) 

    return reactions

### Counting comment types and identifying speakers and their gender

In [92]:
# recognizes comments of named speakers
RE_SPEAKER_DIRECTLY = re.compile(r'(?P<speaker>[\w]+[\w\s\.-]*) \[(?P<party>[\w,/\s]*)\]: (?P<comment>[\w,!\s\.\?\":]*)( - )*')

RES = [RE_SPEAKER_DIRECTLY]

comment_counts = {}
comment_speakers = {}
comment_fem_speakers = {}
comment_male_speakers = {}
reaction_counts = {}

#f = open('reference files/matches.txt', 'w', encoding='utf-8')

for index, speech in df_comments.iterrows():
    comment_count = 0
    speakers = []
    fem_speakers = 0
    male_speakers = 0
    reactions = []

    for comment in speech['comments']:
        # clean comments
        for key, value in chars_to_replace.items():
            comment = comment.replace(key, value)
        comment = replace_hyphens(comment)

        # search for regex
        for RE in RES:
            rest_str = comment

            if re.search(RE, comment) is not None:
                matches = RE.finditer(comment)

                for match in matches:
                    if 'speaker' in match.groupdict():
                        speaker = match.groupdict()['speaker']
                        #print(f"Speech {index}: {match.groupdict()}", file=f)
                        comment_count += 1

                        if speaker != "":
                            speakers.append(speaker)
                    
                    # get unmatched comments
                    rest_str = rest_str.replace(match.group(), '')
                    reactions_in_comment = get_reactions(rest_str)
                    reactions.extend(reactions_in_comment)
            
            else:
                reactions_in_comment = get_reactions(comment)
                reactions.extend(reactions_in_comment)

    # get speaker gender
    for speaker in speakers:
        if get_speaker_gender(speaker) == 'female':
            fem_speakers += 1
        if get_speaker_gender(speaker) == 'male':
            male_speakers += 1

    comment_counts[index] = comment_count
    comment_speakers[index] = speakers
    comment_fem_speakers[index] = fem_speakers
    comment_male_speakers[index] = male_speakers
    reaction_counts[index] = Counter(reactions)

#f.close()

In [93]:
reaction_counts

{0: Counter({'applause': 2}),
 1: Counter({'applause': 7}),
 2: Counter({'applause': 9}),
 3: Counter({'cheerfulness': 1, 'applause': 3}),
 4: Counter({'applause': 10, 'laughter': 1}),
 5: Counter({'applause': 10, 'laughter': 1}),
 6: Counter({'applause': 1}),
 7: Counter({'applause': 1}),
 8: Counter({'applause': 1}),
 9: Counter({'applause': 1}),
 10: Counter({'applause': 1}),
 11: Counter({'applause': 1}),
 12: Counter({'applause': 1}),
 13: Counter({'applause': 1}),
 14: Counter({'applause': 3}),
 15: Counter({'cheerfulness': 1, 'applause': 7}),
 16: Counter({'applause': 4}),
 17: Counter({'applause': 5, 'laughter': 1}),
 18: Counter({'applause': 6}),
 19: Counter({'applause': 9, 'laughter': 1}),
 20: Counter({'applause': 15}),
 21: Counter({'applause': 2}),
 22: Counter({'applause': 7}),
 23: Counter({'applause': 11}),
 24: Counter({'applause': 11}),
 25: Counter({'applause': 8}),
 26: Counter({'applause': 9}),
 27: Counter({'applause': 11}),
 28: Counter({'applause': 8}),
 29: Co

In [17]:
len(comment_counts)

371023

In [18]:
# get comment count by speech id
comment_counts[2]

3

In [None]:
# add columns for comment- and speaker gender count
df_comments['comment_count'] = comment_counts.values()
df_comments['comment_fem_speakers'] = comment_fem_speakers.values()
df_comments['comment_male_speakers'] = comment_male_speakers.values()
df_comments['reaction_count'] = reaction_counts.values()

In [95]:
df_comments.head()

Unnamed: 0,text,date,legislative_period,speaker_name,speaker_gender,speaker_role,speaker_party,comments,id,speaker_id,speech_length,topic_distribution,year,comment_count,comment_fem_speakers,comment_male_speakers,reaction_count
0,Sehr geehrter Herr Alterspräsident! So muss ...,2021-10-26,20,Gabriele Katzmarek,female,,SPD,[(Beifall bei der SPD sowie bei Abgeordneten d...,SP-2021-0,1299,2593,{68: 0.35882655},2021,0,0,0,{'applause': 2}
1,Herr Präsident! Liebe Kolleginnen und Kolleg...,2021-10-26,20,Stefan Müller,male,,CDU/CSU,[(Beifall bei der CDU/CSU sowie bei Abgeordnet...,SP-2021-1,4028,3796,{68: 0.46776655},2021,4,3,1,{'applause': 7}
2,Sehr geehrter Herr Alterspräsident Wolfgang ...,2021-10-26,20,Britta Haßelmann,female,,BÜNDNIS 90/DIE GRÜNEN,[(Beifall beim BÜNDNIS 90/DIE GRÜNEN und bei d...,SP-2021-2,451,4301,{68: 0.26328164},2021,3,1,2,{'applause': 9}
3,Sehr geehrter Herr Präsident! Meine lieben K...,2021-10-26,20,Marco Buschmann,male,,FDP,"[(Jan Korte [DIE LINKE]: Oder Jugendweihe!), (...",SP-2021-3,3083,4555,{},2021,1,0,1,"{'cheerfulness': 1, 'applause': 3}"
4,Herr Alterspräsident! Lassen Sie mich zunäch...,2021-10-26,20,Stephan Brandner,male,,AfD,"[(Beifall bei der AfD), (Beifall bei Abgeordne...",SP-2021-4,4055,5996,{68: 0.3120498},2021,4,3,1,"{'applause': 10, 'laughter': 1}"


In [96]:
df_comments["reaction_count"][15742] #["applause"]

Counter({'disgust': 1})

In [41]:
# test comment count and speaker classification per speech
RE_SPEAKER_DIRECTLY = re.compile(r'(?P<speaker>[\w]+[\w\s\.-]*) \[(?P<party>[\w,/\s]*)\]: (?P<comment>[\w,!\s\.\?\"]*)( - )*')

RES = [RE_SPEAKER_DIRECTLY]

for comment in df_comments["comments"][15742]: # insert speech id here
    for RE in RES:
        if re.search(RE, comment) is not None:
            for match in re.finditer(RE, comment):
                print(match.groupdict())
        else:
            print(f"no direct speaker comment: {comment}")

no direct speaker comment: (Zuruf.)
no direct speaker comment: (Widerspruch in der Mitte und rechts.)
no direct speaker comment: (Zurufe rechts und Gegenrufe links.)
no direct speaker comment: (Hört! Hört! bei der SPD.)
no direct speaker comment: (Hört! Hört! — Abg. Hilbert: Pfui! - Weitere Pfui-Rufe rechts. — Glocke des Präsidenten.)


## Calculating Means per Gender

In [206]:
comment_stats = {
    'female': [],
    'male': []
    }

In [149]:
# calculate average comment count per speech by gender
average_comments_by_gender = df_comments.groupby('speaker_gender')['comment_count'].mean()
average_comments_by_gender

speaker_gender
female    1.575616
male      1.580447
Name: comment_count, dtype: float64

In [150]:
# calculate average count of female commentators per speech by gender
female_commentators_by_gender = df_comments.groupby('speaker_gender')['comment_fem_speakers'].mean()
female_commentators_by_gender

speaker_gender
female    0.343319
male      0.329058
Name: comment_fem_speakers, dtype: float64

In [151]:
# calculate average count of male commentators per speech by gender
male_commentators_by_gender = df_comments.groupby('speaker_gender')['comment_male_speakers'].mean()
male_commentators_by_gender

speaker_gender
female    0.916745
male      0.760770
Name: comment_male_speakers, dtype: float64

In [218]:
# calculate average count of reactions per speech by gender
reactions_by_gender = df_comments.groupby('speaker_gender')['reaction_count']

avg_reactions_male_speeches = pd.json_normalize(reactions_by_gender.get_group("male")).mean()
avg_reactions_fem_speeches = pd.json_normalize(reactions_by_gender.get_group("female")).mean()

print(f"    female speeches:\n{avg_reactions_fem_speeches}\n    male speeches:\n{avg_reactions_male_speeches}")

    female speeches:
applause        3.388908
disquiet        1.140974
laughter        1.226942
cheerfulness    1.210445
approval        1.285630
disgust         1.080292
dtype: float64
    male speeches:
applause        3.616296
cheerfulness    1.495913
laughter        1.350136
approval        1.355791
disgust         1.217164
disquiet        1.254886
dtype: float64


### Assemble Dataframe for Stats

In [None]:
for gender in ['male', 'female']:
    comment_stats[gender].extend([average_comments_by_gender[gender], female_commentators_by_gender[gender], male_commentators_by_gender[gender]])

for reaction in all_reactions: # use list of reactions to ensure correct order
    comment_stats['male'].append(avg_reactions_male_speeches[reaction])
    comment_stats['female'].append(avg_reactions_fem_speeches[reaction])

comment_stats

In [219]:
# get column names
def get_columns():
    comment_stats = ['avg_comments_per_speech', 'avg_female_commentators', 'avg_male_commentators']
    reaction_stats = []

    for reaction in all_reactions:
        reaction_stats.append(f'avg_{reaction}')

    comment_stats.extend(reaction_stats)

    return comment_stats

get_columns()

['avg_comments_per_speech',
 'avg_female_commentators',
 'avg_male_commentators',
 'avg_applause',
 'avg_cheerfulness',
 'avg_laughter',
 'avg_approval',
 'avg_disgust',
 'avg_disquiet']

In [220]:
# create dataframe
df_comment_stats = pd.DataFrame.from_dict(comment_stats, orient='index', columns=get_columns())
df_comment_stats

Unnamed: 0,avg_comments_per_speech,avg_female_commentators,avg_male_commentators,avg_applause,avg_cheerfulness,avg_laughter,avg_approval,avg_disgust,avg_disquiet
female,1.575616,0.343319,0.916745,3.388908,1.210445,1.226942,1.28563,1.080292,1.140974
male,1.580447,0.329058,0.76077,3.616296,1.495913,1.350136,1.355791,1.217164,1.254886


## Plot Interjection Stats

In [None]:
# TODO