In [6]:
import pandas as pd
from pathlib import Path
from typing import List, Union

# Never truncate displayed frames: show full cell contents, all columns, all rows.
for option in ("display.max_colwidth", "display.max_columns", "display.max_rows"):
    pd.set_option(option, None)

# Location of the Serbian input tables, relative to the notebook.
directory = "../SRB"
mp_path = Path(directory, "Serbia_MPs_final_20221007.xlsx")
parties_path = Path(directory, "Serbia_parties_final_20221007.xlsx")

In [7]:
# Load the MP table and the party table from their Excel sheets.
# Rows with missing values are deliberately kept; they are inspected below.
mpdf = pd.read_excel(io=str(mp_path))
partiesdf = pd.read_excel(io=str(parties_path))

## Check and impute missing values:

In [8]:
# Mark party rows that contain at least one missing value. The mask is taken
# before imputing, so the display below shows the affected row as patched.
partiesnas = partiesdf.isna().any(axis=1)

# Only hit: SDPO, term 7.
# According to Wikipedia the list was "Coalition of Democratic Party of Serbia -
# New Serbia - dr Vojislav Koštunica" (DSS, NS, JS, SDPO). It does not say
# whether SDPO entered the government, but the partners can be checked with:
#     c1 = partiesdf.term2.astype(int) == 7
#     c2 = partiesdf.party.isin(["DSS", "NS", "JS", "SDPO"])
#     partiesdf[c1 & c2]
# All the others have ruling=1, so ruling=1 is imputed for SDPO as well.
partiesdf.loc[57, "ruling"] = 1

partiesdf.loc[partiesnas]

Unnamed: 0,codeparty,term1,term2,party,full_name,established,chairman,ideology_LR,party_family,election_result,no_seats,coalition,coalition_composition,ruling
57,P58,2007-2008,7,SDPO,Srpski demokratski pokret obnove,2005,Vojislava Mihailovića/Veroljuba Stevanovića,4,6;7,16.55,2,1,JS-DSS-NS,1.0


In [9]:
# Mark MP rows that contain at least one missing value. The mask is taken
# before the fixes, so the display below shows the affected rows as patched.
mpnas = mpdf.isna().any(axis=1)

# Nadije Bećiri sits in "Poslanička grupa Ujedinjena dolina-SDA Sandžaka",
# which has been identified as party SDA.
mpdf.loc[2194, "party"] = "SDA"

# Vladan Glišić is listed under "Poslanički klub: Narodni poslanici koji nisu
# članovi poslaničkih grupa". No party can be imputed; left as NaN for now.

# Row 113: reuse the year of birth as the date of birth.
birth_year = mpdf.loc[113, "year_of_birth"]
mpdf.loc[113, "date_of_birth"] = birth_year

mpdf.loc[mpnas]

Unnamed: 0,codemp,order_id,term1,term2,term_id,type_of_list,fullname,firstname,lastname,party,date_of_birth,year_of_birth,gender,place_of_birth,field_of_study,education_y,bp_lat,bp_lon
113,M179,114,1997-2000,4,114,various,"Kovačević, Dejan",Dejan,Kovačević,LK,1937,1937,0,Beograd,2,16,44.688045,20.22171
2194,M1315,2195,2020-2022,12,17,normal,"Bećiri, Nadije",Nadije,Bećiri,SDA,1961,1961,1,"Bujanovac, Lučane",4,16,42.448444,21.705894
2226,M1331,2227,2020-2022,12,49,normal,"Glišić, Vladan",Vladan,Glišić,,1970,1970,0,Priština,5,16,42.629566,21.108718


## Checking if the names are correctly segmented:

They mostly are, but a few pathological entries turned up.

I found the entry

```
1957	Jovanović	Jovanović1, Nataša
```
Upon further inspection I also found the following pathological entries:

```
M103    Kovačević, Borisav        #-> Exists: https://otvoreniparlament.rs/poslanik/8455
        Kovačević, Borislav       #-> Only 2 hits in metadata. No hits on otvoreni parlament. Probably misspelling.
M1141   Nikolić Vukajlović, Vesna #-> Obviously wrong. No hits in the metadata or otvoreni parlament.
        Nikolić-Vukajlović, Vesna
M67     Hasanović Korać, Biljana  #-> Obviously wrong. No hits in the metadata or otvoreni parlament.
        Hasanović-Korać, Biljana
M781    Jovanović, Nataša         #-> On otvoreni parlament there are three hits, 
        Jovanović1, Nataša        #-> according to pictures two are the same person
M782    Jovanović2, Nataša        #-> 
```
The first three cases will be corrected, in the metadata as well as in the MP table.

### Nataša
So Nataša Jovanović (1966) always has the same codeMP, but in one instance is listed as `Jovanović1, Nataša`.

The other Nataša Jovanović (1967) has two codeMPs with the same birth date; I think I'll merge these. One entry (index `2250`) has the wrong gender (`0`), and the other (index `2176`) has no birth date or location.

### Aleksandar:

|     | codemp   | term1     | fullname             |   year_of_birth |
|----:|:---------|:----------|:---------------------|----------------:|
|  30 | M6       | 1997-2000 | Đorđević, Aleksandar |            1950 |
| 106 | M7       | 1997-2000 | Đorđević, Aleksandar |            1950 |
| 538 | M7       | 2003-2007 | Đorđević, Aleksandar |            1959 | 

Only the last one can be found on Otvoreni parlament. Something fishy seems to be happening here. For now, drop the first two rows and keep only the last instance.


### Other

`Nikolić Dragan`, `Lazić, Nikola`, `Juhas, Atila` and `Stojanović-Plavšić Snežana` are clearly different people (all attributes differ).

`Ranđelović, Nebojša`, `Jevtović Vukojičić, Milanka`, `JefićBranković Sanja`, `Žarić Kovačević, Jelena` and `Đukić-Dejanović, Slavica` each refer to a single person (all attributes match across their entries).



In [10]:
# Look up MPs whose last name is Pop-Lazić (no rows when this was last run).
mpdf[mpdf.lastname == "Pop-Lazić"]

Unnamed: 0,codemp,order_id,term1,term2,term_id,type_of_list,fullname,firstname,lastname,party,date_of_birth,year_of_birth,gender,place_of_birth,field_of_study,education_y,bp_lat,bp_lon


In [11]:
%%bash
# Replace all the wrong names and superseded MP codes in the metadata.
# Abort if the data directory is missing, so sed never edits files elsewhere.
cd ../SRB || exit 1

# Glob directly instead of parsing `ls` output, and quote "$file" so that
# file names containing whitespace are handled correctly.
for file in *_meta.tsv
do
    # An unmatched glob stays as the literal pattern; skip it.
    [ -e "$file" ] || continue

    # One sed pass per file; the expressions run in the listed order.
    # NOTE(review): the code patterns are not anchored, so e.g. 'M793' would
    # also match inside a longer code starting with M793 - confirm none exist.
    sed -i \
        -e 's/Kovačević, Borislav/Kovačević, Borisav/g' \
        -e 's/Nikolić Vukajlović, Vesna/Nikolić-Vukajlović, Vesna/g' \
        -e 's/Hasanović Korać, Biljana/Hasanović-Korać, Biljana/g' \
        -e 's/Vojić-Marković, Milica/Vojić Marković, Milica/g' \
        -e 's/M793/M794/g' \
        -e 's/M967/M966/g' \
        -e 's/M940/M1338/g' \
        -e 's/M784/M1380/g' \
        -e 's/M1401/M70/g' \
        -e 's/M1463/M441/g' \
        "$file"

    # NOTE(review): absolute, machine-specific script path - the notebook only
    # runs on the original author's machine until this is made relative.
    python /home/rupnik/parlamint/task16/003_edit_natasas.py "$file"
done


In [12]:
# Manual corrections to the MP table, mirroring the metadata fixes.
# Every step can be repeated safely, so the cell may be re-run.

# --- Misspelled / inconsistently hyphenated names --------------------------
# (wrong fullname, corrected fullname, name column to patch, corrected value)
name_fixes = [
    ("Kovačević, Borislav", "Kovačević, Borisav", "firstname", "Borisav"),
    ("Nikolić Vukajlović, Vesna", "Nikolić-Vukajlović, Vesna", "lastname", "Nikolić-Vukajlović"),
    ("Hasanović Korać, Biljana", "Hasanović-Korać, Biljana", "lastname", "Hasanović-Korać"),
]
for wrong_name, right_name, name_column, name_value in name_fixes:
    hit = mpdf.fullname == wrong_name
    mpdf.loc[hit, "fullname"] = right_name
    mpdf.loc[hit, name_column] = name_value

# --- Nataša Jovanović (1966): one spelling for every M781 row --------------
mpdf.loc[mpdf.codemp == "M781", "fullname"] = "Jovanović, Nataša"

# --- Aleksandar Đorđević: drop the two suspicious 1997-2000 rows -----------
# errors="ignore" keeps the cell re-runnable once the rows are gone.
mpdf = mpdf.drop(index=[30, 106], errors="ignore")

# --- Nataša Jovanović (1967): merge her two entries under M782 -------------
# Row 2176 takes its birthplace data from row 2250; row 2250 gets the shared
# code, the corrected gender and the disambiguated name.
birthplace_columns = ["place_of_birth", "bp_lat", "bp_lon"]
mpdf.loc[2176, birthplace_columns] = mpdf.loc[2250, birthplace_columns]
mpdf.loc[2250, "codemp"] = "M782"
mpdf.loc[2250, "gender"] = 1
mpdf.loc[2250, "fullname"] = "Jovanović2, Nataša"

# --- Namesakes: tag the second person with a "2" suffix --------------------
# Row index -> disambiguated fullname.
namesake_fullnames = {
    66: "Nikolić2, Dragan",
    67: "Nikolić2, Zoran",
    979: "Stojanović-Plavšić2, Snežana",
    1585: "Todorović2, Dragan",
    1858: "Todorović2, Dragan",
    2278: "Krstić2, Nenad",
    2284: "Lazić2, Nikola",
    2410: "Urošević2, Milan",
    2434: "Juhas2, Atila",
    2439: "Marković2, Predrag",
}
for row, fullname in namesake_fullnames.items():
    mpdf.loc[row, "fullname"] = fullname

# --- Same person under two codes: reassign the stray rows ------------------
# Row index -> code the row is moved to.
unified_codemps = {
    955: "M794",    # Ranđelović, Nebojša
    2240: "M617",   # Jevtović Vukojičić, Milanka
    2023: "M1380",  # Mihailović Vacić, Nataša
    2337: "M70",    # Pantić Pilja, Biljana
    2425: "M441",   # Žarić Kovačević, Jelena
}
for row, codemp in unified_codemps.items():
    mpdf.loc[row, "codemp"] = codemp

# JefićBranković Sanja: new code and a date of birth.
mpdf.loc[1379, ["codemp", "date_of_birth"]] = "M1338", "19840101"

# Milica Vojić Marković: unify both spellings under one name and code.
c1 = mpdf.fullname == "Vojić Marković, Milica"
c2 = mpdf.fullname == "Vojić-Marković, Milica"
mpdf.loc[c1 | c2, ["fullname", "lastname", "codemp"]] = "Vojić Marković, Milica", "Vojić Marković", "M637"

# Slavica Đukić-Dejanović
mpdf.loc[1041, "codemp"] = "M966"


In [13]:
# Show every "Jovanović…, Nataša" variant to verify the corrections above.
natasa_mask = mpdf.fullname.str.casefold().str.contains("jovanović.*, nataša", regex=True)
mpdf[natasa_mask]

Unnamed: 0,codemp,order_id,term1,term2,term_id,type_of_list,fullname,firstname,lastname,party,date_of_birth,year_of_birth,gender,place_of_birth,field_of_study,education_y,bp_lat,bp_lon
577,M781,578,2003-2007,6,57,normal,"Jovanović, Nataša",Nataša,Jovanović,SRS,19660414,1966,1,Kragujevac,-,12,43.965392,20.823169
827,M781,828,2007-2008,7,58,normal,"Jovanović, Nataša",Nataša,Jovanović,SRS,19660414,1966,1,Kragujevac,-,12,43.965392,20.823169
1081,M781,1082,2008-2012,8,62,normal,"Jovanović, Nataša",Nataša,Jovanović,SRS,19660414,1966,1,Kragujevac,-,12,43.965392,20.823169
1957,M781,1958,2016-2020,11,59,normal,"Jovanović, Nataša",Nataša,Jovanović,SRS,19660414,1966,1,Kragujevac,-,12,43.965392,20.823169
2176,M782,2177,2016-2020,11,278,normal,"Jovanović2, Nataša",Nataša,Jovanović,SNS,19671016,1967,1,Kragujevac,4,16,43.965392,20.823169
2250,M782,2251,2020-2022,12,73,normal,"Jovanović2, Nataša",Nataša,Jovanović,SNS,19671016,1967,1,Kragujevac,4,18,43.965392,20.823169


### Let's turn to parties next

In [14]:
# Party abbreviations present in each table:
# p comes from the party table, m from the MP table.
p = set(partiesdf["party"])
m = set(mpdf["party"])


In [15]:
from utils import parse_meta_file

# Absolute paths of all metadata files in the data directory.
files = [str(path.absolute()) for path in Path("../SRB").glob("*_meta.tsv")]

# Collect the distinct Speaker_party and Speaker_party_name values found in
# the metadata. Each file is parsed once and feeds both sets.
parties_in_meta = set()
fullnames = set()
for meta_file in files:
    meta = parse_meta_file(meta_file)
    parties_in_meta.update(meta.Speaker_party)
    fullnames.update(meta.Speaker_party_name)

In [16]:
# Parties used in the metadata but missing from the MP table (m)
# and from the party table (p).
for label, known_parties in (("In metadata - m", m), ("In metadata - p", p)):
    print(label, parties_in_meta - known_parties)

In metadata - m {'JUL', 'KAPD'}
In metadata - p {nan}


In [17]:
# Every distinct party label in the MP table, in order of first appearance.
mpdf["party"].unique()

array(['SRS', 'LK', 'LS', 'SPO', 'SVM', 'DA', 'SPS', '-', 'KV', 'DKPB',
       'DOS', 'DHSS', 'SSJ', 'LSV', 'DS', 'DSS', 'BDSS', 'G17 Plus', 'NS',
       'SDU', 'SDP', 'GSS', 'SLS', 'SLPS', 'URS', 'PDD', 'LDP', 'DSHV',
       'JS', 'RP', 'G17', 'PUPS', 'DLR', 'PVS', 'PZDD', 'ZZŠ', 'NL',
       'SNS', 'LSD', 'BDZ', 'SDA', 'LDP4', 'PŽK', 'PSSBK', 'ZS', 'BNS',
       'PPPS', 'BI', 'PVU', 'APPS', 'NSS', 'DPM', 'PS', 'BS', 'NОPO',
       'UG', 'SDS', 'USS', 'Nova', 'SNP', 'ZZS', 'PREOKRET', 'DJB', 'SPP',
       'Dveri', 'KP', 'SSP', 'ZES', 'AZP', nan, 'SNS '], dtype=object)

In [18]:
# Which abbreviation does the party table use for "G17 Plus"?
partiesdf.loc[partiesdf["full_name"].eq("G17 Plus")]

Unnamed: 0,codeparty,term1,term2,party,full_name,established,chairman,ideology_LR,party_family,election_result,no_seats,coalition,coalition_composition,ruling
39,P40,2003-2007,6,G17,G17 Plus,2002,Miroljub Labus,3,3,11.46,31,1,G17-SDP,1.0
54,P55,2007-2008,7,G17,G17 Plus,2002,Mlađan Dinkić,3,3,6.82,19,0,-,1.0
75,P76,2008-2012,8,G17,G17 Plus,2002,Mlađan Dinkić,3,3,38.42,24,1,DS-G17-LSV-SDP-SPO-DSHV,1.0
96,P97,2012-2014,9,G17,G17 Plus,2002,Mlađan Dinkić,3,3,5.51,10,1,G17-ZZS-NP,1.0


In [19]:
# Harmonise party labels in the MP table.

# Strip stray surrounding whitespace first: the label 'SNS ' would otherwise
# be kept as a party distinct from 'SNS'. Missing values stay missing.
mpdf["party"] = mpdf["party"].str.strip()

# The party table abbreviates "G17 Plus" as "G17"; use that abbreviation here.
c = mpdf.party == "G17 Plus"
mpdf.loc[c, "party"] = "G17"


In [20]:
# Persist both corrected tables as pickles next to the notebook.
for frame, target in (
    (mpdf, "mpdf_corrected.pickle"),
    (partiesdf, "partiesdf_corrected.pickle"),
):
    frame.to_pickle(target)

In [21]:
# Distinct full party names that begin with "L".
c = partiesdf["full_name"].str.startswith("L")
partiesdf.loc[c, "full_name"].unique()

array(['Liga socijaldemokrata Vojvodine', 'Lista za Sandžak',
       'Liga za Šumadiju', 'Liberalno-demokratska partija'], dtype=object)

In [22]:
# Is "LK" one of the party abbreviations in the party table?
# `in` on a Series tests the index labels, so compare against the values.
"LK" in partiesdf.party.values

False

In [23]:
# Does any MP carry the placeholder party "-"?
bool((mpdf["party"] == "-").any())

True