In [1]:
import pandas as pd
from xml.etree import ElementTree as ET
import re
import pickle

In [2]:
# load data
df = pd.read_pickle('C:/Users/Ana/OneDrive - Hochschule Düsseldorf/MA/data_merged.pkl')

df.head()

Unnamed: 0,text,date,legislative_period,speaker_name,speaker_gender,speaker_role,speaker_party,comments,id,speaker_id,speech_length
0,Sehr geehrter Herr Alterspräsident! So muss ...,2021-10-26,20,Gabriele Katzmarek,female,,SPD,[(Beifall bei der SPD sowie bei Abgeordneten d...,SP-2021-0,1299,2593
1,Herr Präsident! Liebe Kolleginnen und Kolleg...,2021-10-26,20,Stefan Müller,male,,CDU/CSU,[(Beifall bei der CDU/CSU sowie bei Abgeordnet...,SP-2021-1,4028,3796
2,Sehr geehrter Herr Alterspräsident Wolfgang ...,2021-10-26,20,Britta Haßelmann,female,,BÜNDNIS 90/DIE GRÜNEN,[(Beifall beim BÜNDNIS 90/DIE GRÜNEN und bei d...,SP-2021-2,451,4301
3,Sehr geehrter Herr Präsident! Meine lieben K...,2021-10-26,20,Marco Buschmann,male,,FDP,"[(Jan Korte [DIE LINKE]: Oder Jugendweihe!), (...",SP-2021-3,3083,4555
4,Herr Alterspräsident! Lassen Sie mich zunäch...,2021-10-26,20,Stephan Brandner,male,,AfD,"[(Beifall bei der AfD), (Beifall bei Abgeordne...",SP-2021-4,4055,5996


In [3]:
# gender counts
df.groupby('speaker_gender')['speaker_id'].nunique()

speaker_gender
female    1057
male      3656
Name: speaker_id, dtype: int64

In [6]:
# parse XML file and save member data to pickle file for later use
filepath = 'C:/Users/Ana/OneDrive - Hochschule Düsseldorf/MA/data/MdB-Stammdaten/MDB_STAMMDATEN.XML'
member_data = []

data = ET.parse(filepath)
root = data.getroot()

# iterate over the <MDB> elements and extract their data
for member in root.iter('MDB'):

    # get speaker name(s)
    last_names = []
    full_names = []
    titles = []

    for name in member.iterfind('.//NAME'):
        last_name = name.find('NACHNAME').text
        first_name = name.find('VORNAME').text
        full_name = first_name + ' ' + last_name

        last_names.append(last_name)
        full_names.append(full_name)

        title = name.find('ANREDE_TITEL').text
        ac_title = name.find('AKAD_TITEL').text

        titles.append(title) if title is not None else None
        titles.append(ac_title) if title is not None else None

        
    # get gender and party
    gender = member.find('.//GESCHLECHT').text
    party = member.find('.//PARTEI_KURZ').text

    # get election periods
    periods = []
    for period in member.iterfind('.//WAHLPERIODE'):
        periods.append(int(period.find('WP').text))   
    
    # append all data to member_data
    member_data.append({
        'full_name': full_names,
        'first_name': first_name,
        'last_name': last_names,
        'gender': 'male' if gender == 'männlich' else 'female',
        'party': party if party is not None else None,
        'periods': periods if periods is not None else None,
        'titles': titles
    })
    
with open('data/member_data.pkl', 'wb') as f:
    pickle.dump(member_data, f)

member_data

[{'full_name': ['Manfred Abelein'],
  'first_name': 'Manfred',
  'last_name': ['Abelein'],
  'gender': 'male',
  'party': 'CDU',
  'periods': [5, 6, 7, 8, 9, 10, 11],
  'titles': ['Dr.', 'Prof. Dr.']},
 {'full_name': ['Ernst Achenbach'],
  'first_name': 'Ernst',
  'last_name': ['Achenbach'],
  'gender': 'male',
  'party': 'FDP',
  'periods': [3, 4, 5, 6, 7],
  'titles': ['Dr.', 'Dr.']},
 {'full_name': ['Annemarie Ackermann'],
  'first_name': 'Annemarie',
  'last_name': ['Ackermann'],
  'gender': 'female',
  'party': 'CDU',
  'periods': [2, 3, 4],
  'titles': []},
 {'full_name': ['Else Ackermann'],
  'first_name': 'Else',
  'last_name': ['Ackermann'],
  'gender': 'female',
  'party': 'CDU',
  'periods': [11, 12],
  'titles': ['Dr.', 'Dr.']},
 {'full_name': ['Ulrich Adam'],
  'first_name': 'Ulrich',
  'last_name': ['Adam'],
  'gender': 'male',
  'party': 'CDU',
  'periods': [12, 13, 14, 15, 16],
  'titles': []},
 {'full_name': ['Rudolf Adams'],
  'first_name': 'Rudolf',
  'last_name': ['

In [100]:
# number of unique speakers
df['speaker_id'].nunique()

4713

In [110]:
# compare members in original data with new member data and find deviations
no_name_matches = []
no_gender_matches = []
name_matches = []

faulty_names = [961, 348, 790]

for speaker_id in df['speaker_id'].unique():
    name = df.loc[df['speaker_id'] == speaker_id, 'speaker_name'].iloc[0]

    if speaker_id in faulty_names:
        without_ws = re.sub('(\s){1,}', '', name)
        name = re.sub(r"(\w)([A-Z])", r"\1 \2", without_ws)

    splitted_name = name.split()

    gender = df.loc[df['speaker_id'] == speaker_id, 'speaker_gender'].iloc[0]

    found_matching_member = False

    # iterate through member data
    for member in member_data:
        real_names = member['full_name']
        real_gender = member['gender']
        periods = member['periods']
        
        split_names = [name.split() for name in real_names]
        flat_names = [item for sublist in split_names for item in sublist]
        unique_names = list(set(flat_names))

        if len(set(unique_names) & set(splitted_name)) >= 2:
            found_matching_member = True
            print(f"Match found for df_speaker {name} and real speaker {real_names}")
            name_matches.append(speaker_id)

            if not gender == real_gender:
                #print(f"No correct gender for {name}")
                no_gender_matches.append(speaker_id)
    
    if not found_matching_member:
        no_name_matches.append(speaker_id)
        print(f"No match found for df_speaker {name} and real speaker {real_names}")

Match found for df_speaker Gabriele Katzmarek and real speaker ['Gabriele Katzmarek']
Match found for df_speaker Stefan Müller and real speaker ['Stefan Müller']
Match found for df_speaker Britta Haßelmann and real speaker ['Britta Haßelmann']
Match found for df_speaker Marco Buschmann and real speaker ['Marco Buschmann']
Match found for df_speaker Stephan Brandner and real speaker ['Stephan Brandner']
Match found for df_speaker Jan Korte and real speaker ['Jan Korte']
Match found for df_speaker Rolf Mützenich and real speaker ['Rolf Mützenich']
Match found for df_speaker Bärbel Bas and real speaker ['Bärbel Bas']
Match found for df_speaker Fabian Jacobi and real speaker ['Fabian Jacobi']
Match found for df_speaker Aydan Özoğuz and real speaker ['Aydan Özoğuz']
Match found for df_speaker Yvonne Magwas and real speaker ['Yvonne Magwas']
Match found for df_speaker Claudia Roth and real speaker ['Claudia Roth']
Match found for df_speaker Wolfgang Kubicki and real speaker ['Wolfgang Kubick

In [112]:
print(f'''
      Matched Parliament Members: {len(name_matches)}
      Not Matched: {len(no_name_matches)}
      Members with wrong Gender Classification: {len(no_gender_matches)}
      ''')


      Matched Parliament Members: 4338
      Not Matched: 486
      Members with wrong Gender Classification: 13
      


In [None]:
# print not-matched member names
for speaker in no_name_matches:
    name = df.loc[df['speaker_id'] == speaker, 'speaker_name'].iloc[0]
    print (f"{speaker}: {name}")

In [155]:
# print members with falsely classified gender
for speaker in no_gender_matches:
    print (df.loc[df['speaker_id'] == speaker, 'speaker_name'].iloc[0])

Gökay Akbulut
Gerrit Huy
Esra Limbacher
Merle Spellerberg
Misbah Khan
Erni Finselberger
Honor Funk
Cornelie Sonntag-Wolgast
Christina Schenk
Kersten Steinke
Hilde Mattheis
Ulli Nissen
Elvan Korkmaz


In [169]:
# replace false genders in dataframe
for speaker in no_gender_matches:
    rows = df[df['speaker_id'] == speaker]

    gender = df.loc[df['speaker_id'] == speaker, 'speaker_gender'].iloc[0]
    new_gender = 'male' if gender == 'female' else 'female'

    for index, row in rows.iterrows():
        df.loc[index, 'speaker_gender'] = new_gender

In [174]:
# check if successful
for speaker in no_gender_matches:
    print (df.loc[df['speaker_id'] == speaker, 'speaker_gender'].iloc[0])

female
female
male
female
female
female
male
female
male
female
female
female
female


In [176]:
# new gender counts
df.groupby('speaker_gender')['speaker_id'].nunique()

speaker_gender
female    1064
male      3649
Name: speaker_id, dtype: int64

In [175]:
# save corrected dataframe
df.to_pickle('C:/Users/Ana/OneDrive - Hochschule Düsseldorf/MA/data_merged_revised.pkl')