In [64]:
from pathlib import Path
from typing import List, Union

import numpy as np
import pandas as pd

# Notebook display: never truncate cell text, the column list or the row list.
for option_name in ("display.max_colwidth", "display.max_columns", "display.max_rows"):
    pd.set_option(option_name, None)

# Directory that the corrected tables are written to at the end of the notebook.
directory = "../BiH"

# NOTE(review): the two spreadsheets are read from absolute, user-specific
# paths while `directory` is relative -- confirm whether both point at the
# same folder before running this on another machine.
mpdf = pd.read_excel("/home/rupnik/parlamint/BiH/BiH_MPs_1998-2022_v1.xlsx")
partiesdf = pd.read_excel("/home/rupnik/parlamint/BiH/BiH_electoral_lists_1998-2022_v1.xlsx")

# Remove leading and trailing whitespace from the text columns that are
# compared against each other later on.
mpdf["party"] = mpdf["party"].str.strip()
partiesdf["party"] = partiesdf["party"].str.strip()
partiesdf["full_name"] = partiesdf["full_name"].str.strip()

## Check and impute missing values:

In [65]:
# Row mask: True where a row of the parties table has at least one missing value.
partiesnas = partiesdf.isna().any(axis=1)

# Show the affected rows; the recorded output is empty, i.e. no NaNs in parties.
partiesdf.loc[partiesnas]

Unnamed: 0,codeparty,term1,term2,party,full_name,entity,ethnic_affiliation,ideology_lr,party_family,election_result,no_seats,coalition,coalition_composition,ruling


In [None]:
# Row mask: True where a row of the MP table has at least one missing value.
mpnas = mpdf.isna().any(axis=1)

# Show the affected rows.
mpdf.loc[mpnas]

In [107]:
from utils import parse_meta_file

# Absolute paths of every `*_meta.tsv` file found directly under ../BiH.
files = [str(meta_path.absolute()) for meta_path in Path("../BiH").glob("*_meta.tsv")]

# Distinct values of the `Speaker_party` and `Speaker_party_name` columns
# across all metadata files.
parties_in_meta = set()
fullnames = set()
for meta_file in files:
    # Parse each file once and read both columns from the same result,
    # instead of parsing the file separately for each column.
    # Assumes parse_meta_file returns the same table for the same path.
    meta = parse_meta_file(meta_file)
    parties_in_meta |= set(meta.Speaker_party)
    fullnames |= set(meta.Speaker_party_name)


In [108]:
# Normalise the labels collected from the metadata: missing values become the
# literal string "none", everything else loses surrounding whitespace.
parties_in_meta = {
    "none" if pd.isna(label) else label.strip()
    for label in parties_in_meta
}

In [155]:
# Distinct party labels in the parties table (p) and in the MP table (m).
p = set(partiesdf["party"])
m = set(mpdf["party"])

# Labels that occur in the metadata but are unknown to each of the two tables.
print("In metadata - m", parties_in_meta.difference(m))
print("In metadata - p", parties_in_meta.difference(p))

In metadata - m {'Samostalni poslanik', 'KCM BiH', 'NS Radom za boljitak', 'HDZ 1990 - HSPBiH', 'none', 'DF – GS'}
In metadata - p {'Samostalni poslanik', 'KCM BiH', 'GDS', 'NS Radom za boljitak', 'HSS', 'HDZ 1990 - HSPBiH', 'none', 'DF – GS'}


In [157]:
# Print the set of party labels found in the parties table (`p`).
# The two commented-out lines print the metadata set and the MP-table set.
# print("Parties in metadata: ", parties_in_meta, "\n\n")
# print("Parties in MP table: ", m, "\n\n")
print("Parties in parties table: ", p, "\n\n")

Parties in parties table:  {'Koalicija', 'DNS', 'HK', 'NS', 'SDS', 'PDP', 'NHI', 'NB', 'Sloga', 'DF', 'EB', 'BPS', 'DSP BiH', 'SBB', 'SPRS', 'PDA', 'KCD BiH', 'K-SNSD', 'NS RB', 'BOSS', 'PDP-NDP', 'RSRS', 'HZ', 'HDZ-HK-HNZ', 'SNSD', 'HDZ 1990', 'HDZ BiH', 'SPUBiH', 'SRSRS', 'NHI-HKDU', 'DNZ BiH', 'A-SDA', 'SNS', 'SDP BiH', 'SDUBiH', 'SBiH', 'SD BiH', 'SP', 'SDA'} 




In [158]:
# Print the set of party labels found in the MP table (`m`).
print("Parties in MP table: ", m, "\n\n")

Parties in MP table:  {nan, 'Koalicija', 'DNS', 'NS', 'SDS', 'GDS', 'PDP', 'NHI', 'NB', 'Sloga', 'DF', 'EB', 'BPS', 'SBB', 'HSS', 'SPRS', 'DSP BiH', 'PDA', 'KCD BiH', 'K-SNSD', 'NS RB', 'BSP', 'BOSS', 'PDP-NDP', 'RSRS', 'HDZ-HK-HNZ', 'SNSD', 'HDZ 1990', 'HDZ BiH', 'SPUBiH', 'SRSRS', 'NHI-HKDU', 'DNZ BiH', 'A-SDA', 'SNS', 'SDP BiH', 'SDUBiH', 'SBiH', 'SD BiH', 'SP', 'SDA'} 




In [159]:
# Labels used in the MP table that have no entry in the parties table.
print("MP - Partiesdf: ", m.difference(p))

MP - Partiesdf:  {nan, 'BSP', 'GDS', 'HSS'}


In [154]:


# Harmonise the party labels in `mpdf` with the abbreviations used in
# `partiesdf`.
#
# Every exact-match relabelling is listed in PARTY_RELABELS and applied in a
# single loop; the one conditional case (HSS in term 7) is handled separately
# below. No replacement label is itself a key of the mapping, so the order in
# which the entries are applied does not matter.
#
# Missing parties are written as `np.nan`. The former `pd.np.nan` spelling
# relied on the `pandas.np` alias, which was deprecated in pandas 1.0 and
# removed in pandas 2.0.
PARTY_RELABELS = {
    # Socijaldemokrati BiH have the full name instead of the abbreviation.
    "Socijaldemokrati BiH": "SD BiH",

    # Again the same problem:
    'Koalicija za cjelovitu i demokratsku BiH (SDA BiH, SBiH, Liberali BiH, GDS)': "KCD BiH",

    # `Koalicija ` [sic]: can be resolved by removing the whitespace.
    # (A no-op when the column has already been whitespace-stripped.)
    "Koalicija ": "Koalicija",

    # `Demokratska fronta - Željko Komšić` is replaced by DF (Demokratska fronta).
    "Demokratska fronta - Željko Komšić": "DF",

    # 'NHI - HKDU' should be 'NHI-HKDU'.
    'NHI - HKDU': "NHI-HKDU",

    # `Samostalni poslanik` is set to NaN.
    'Samostalni poslanik': np.nan,

    # Same for `none`.
    'none': np.nan,

    # Stjepan Krešić is the only instance of Hrvatska stranka prava, but the
    # only mention of this party is in Koalicija. He is therefore added to
    # the Koalicija.
    "HSP": "Koalicija",

    # For 'SBB - Fahrudin Radončić', swap it for SBB.
    'SBB - Fahrudin Radončić': "SBB",

    # Same for 'BPS - Sefer Halilović'.
    'BPS - Sefer Halilović': "BPS",

    # HDZ 1990 – HSPBiH: 2 MPs; like Stjepan Krešić they are set to Koalicija.
    # NOTE(review): this key uses an en dash, whereas the metadata diff output
    # prints the label with a plain hyphen -- confirm the spelling used in
    # the MP table.
    "HDZ 1990 – HSPBiH": "Koalicija",

    # 'NS Radom za boljitak' is simple, it is just the full name for "NS RB".
    'NS Radom za boljitak': "NS RB",

    # 'Koalicija SNSD - DSP': only one instance in the MP table. It is imputed
    # with 'K-SNSD' based on the coalition composition attribute of this
    # party, on the matching terms, and on the Wiki page of Željko Mirjanić.
    'Koalicija SNSD - DSP': "K-SNSD",

    # Based on https://en.wikipedia.org/wiki/Economic_Bloc,
    # 'Ekonomski blok - HDU - Za boljitak' becomes Ekonomski blok.
    'Ekonomski blok - HDU - Za boljitak': "EB",

    'HDZ BiH - Hrvatska koalicija - HNZ': "HDZ-HK-HNZ",

    'DF – GS, Željko Komšić: BiH pobjeđuje!': "DF",

    'Koalicija HDZ BiH, HSS, HKDU BiH, HSP DR. Ante Starčević, HSP Herceg-Bosne': "Koalicija",
}

for raw_label, canonical_label in PARTY_RELABELS.items():
    mpdf.loc[mpdf.party == raw_label, "party"] = canonical_label

# With the same argument as for HSP above, HSS is set to Koalicija. This only
# works for one of the 2 instances (the one in term 7). For the other
# (Šimić, Ilija) HSS will likely have to be added manually.
hss_in_term_7 = (mpdf.party == "HSS") & (mpdf.term2 == 7)
mpdf.loc[hss_in_term_7, "party"] = "Koalicija"

# Open cases that are deliberately left untouched here:
#
# GDS: will have to be entered manually
# (see: https://en.wikipedia.org/wiki/Civic_Democratic_Party_(Bosnia_and_Herzegovina))
#
# BSP: Halil Mehtić is the only one with party BSP. On parlament.ba it is
# listed as BSP, so it can't be a typo for Bosanska Patriotska stranka.
# Unsure what to do here. The only hit found is Bosanskohercegovačka stranka
# prava 1861
# (https://en.wikipedia.org/wiki/Party_of_Rights_of_Bosnia_and_Herzegovina_1861);
# likely it will need to be added manually.
# mpdf[mpdf.party=='BSP']

# Later: manually add HSS, BSP, GDS

  mpdf.loc[c, "party"] = pd.np.nan
  mpdf.loc[c, "party"] = pd.np.nan


# Before we continue

Let's make sure that the abbreviations are formatted nicely and in the same way:

In [160]:
# Distinct party labels currently present in the MP table.
mpdf["party"].unique()

array(['SDA', 'SDP BiH', 'HDZ BiH', 'SNS', 'SPRS', 'SRSRS', 'SNSD',
       'SBiH', 'SDS', 'GDS', 'NS RB', 'DNS', 'HSS', 'NHI', 'PDP',
       'HDZ 1990', 'Koalicija', 'DF', 'SBB', 'KCD BiH', 'RSRS', 'Sloga',
       'SD BiH', 'DNZ BiH', 'NHI-HKDU', 'BSP', 'DSP BiH', 'K-SNSD',
       'SDUBiH', 'SPUBiH', 'BOSS', 'EB', 'BPS', 'HDZ-HK-HNZ', nan,
       'A-SDA', 'PDP-NDP', 'NB', 'SP', 'NS', 'PDA'], dtype=object)

In [163]:
def nicefy_abbreviations(s: str):
    """Return `s` with every space replaced by an underscore.

    Values that are not `str` instances (for example the NaN used for a
    missing party) are returned unchanged.
    """
    return s.replace(" ", "_") if isinstance(s, str) else s
# Give the party key the same underscore formatting in both tables.
for frame in (mpdf, partiesdf):
    frame["party"] = frame["party"].apply(nicefy_abbreviations)

# Persist the corrected tables next to the corpus files.
mpdf.to_pickle(f"{directory}/mpdf_corrected.pickle")
partiesdf.to_pickle(f"{directory}/partiesdf_corrected.pickle")