# Processing Data for SB2408 Votes

In [None]:
# imports
from pandasql import sqldf
import pandas as pd
from fuzzywuzzy import fuzz

In [95]:
# Every input below is tab-separated; text following a '#' is treated as a
# comment by the parser and skipped.
tsv_options = {'sep': '\t', 'comment': "#"}

# Raw SB2408 vote exports, one file per chamber.
ceja_votes_reps = pd.read_csv(
    '../data/government/illinois/legislature/ceja/sb2408-votes-house-raw.tsv',
    **tsv_options,
)
ceja_votes_sens = pd.read_csv(
    '../data/government/illinois/legislature/ceja/sb2408-votes-senate-raw.tsv',
    **tsv_options,
)

# Member rosters, one file per chamber (read later for 'name' and 'district').
reps = pd.read_csv(
    '../data/government/illinois/legislature/reps-102.tsv',
    **tsv_options,
)
sens = pd.read_csv(
    '../data/government/illinois/legislature/sens-102.tsv',
    **tsv_options,
)

In [97]:
# Use fuzzy string matching to pair each roster name with a roll-call row.
# NOTE: fuzzy matching does not catch every legislator. Between this cell and
# the next, the remaining names are filled in by hand in the TSVs written below.
# NOTE: re-running this cell rewrites those TSVs, so any manual fixes made to
# them afterwards are overwritten.
# The roster spelling (rep/sen name TSV) is written as the source of truth.
# https://towardsdatascience.com/fuzzy-string-matching-in-python-68f240d910fe


def _parse_roll_call_name(name_raw):
    """Turn a raw roll-call entry into "First Last" order.

    The first five characters are dropped (presumably a vote-marker prefix;
    confirm against the raw TSV), as is everything from the first '[' on.
    What remains must have the form "Last, First".

    Raises:
        ValueError: if the remaining text does not contain exactly one ', '.
    """
    parsed_name = name_raw[5:].split('[')[0]
    ln, fn = parsed_name.split(', ')
    return f"{fn} {ln}"


def _match_names(votes, roster_names, threshold=90):
    """Fill ``votes['name']`` in place with fuzzy-matched roster names.

    Args:
        votes: DataFrame with a 'name_raw' column; its 'name' column is
            (re)created here and left as None where nothing matched.
        roster_names: iterable of canonical legislator names.
        threshold: a roster name is accepted for a row only when
            ``fuzz.partial_ratio`` is strictly greater than this value.

    Each roster name is assigned to the first row (in index order) that
    clears the threshold. A later roster name may overwrite an earlier
    assignment on the same row.
    """
    votes['name'] = None

    # Parse every raw name once instead of once per roster entry.
    candidates = [_parse_roll_call_name(raw) for raw in votes['name_raw']]

    for roster_name in roster_names:
        for row_label, candidate in zip(votes.index, candidates):
            if fuzz.partial_ratio(roster_name, candidate) > threshold:
                # .at writes directly into the frame. The chained form
                # votes['name'][row] = ... may write to a temporary copy and
                # is a no-op under pandas Copy-on-Write.
                votes.at[row_label, 'name'] = roster_name
                break


_match_names(ceja_votes_reps, reps['name'])
_match_names(ceja_votes_sens, sens['name'])

ceja_votes_reps.to_csv('../data/government/illinois/legislature/ceja/sb2408-votes-house.tsv', sep="\t", index=False)
ceja_votes_sens.to_csv('../data/government/illinois/legislature/ceja/sb2408-votes-senate.tsv', sep="\t", index=False)

In [107]:
# NOTE: fuzzy matching does not catch every name; the remaining 'name' entries
# are filled in by hand in the TSVs before this join is run.
# This might be sacrilege, so noting it here: John D'Amico resigned, and his
# parsed "name" entry is filled with the person now in his seat (Michael Kelly).


def _attach_districts(votes, roster):
    """Return ``votes`` with an int32 'district' column looked up by 'name'.

    Args:
        votes: DataFrame with a fully populated 'name' column.
        roster: DataFrame with 'name' and 'district' columns.

    Raises:
        ValueError: if any vote row has no name, or a name has no district.
        pandas.errors.MergeError: if a name appears more than once in
            ``roster`` (which would otherwise silently duplicate vote rows).
    """
    unnamed = votes[votes['name'].isnull()]
    if not unnamed.empty:
        print(unnamed)
        raise ValueError("not all names matched")

    # If `votes` already has a 'district' column (e.g. this cell was run
    # before and the file re-read), the incoming one is suffixed '_DROP' and
    # filtered out, so the existing column is kept.
    merged = votes.merge(
        roster[['name', 'district']],
        on='name',
        how='left',
        suffixes=('', '_DROP'),
        validate='many_to_one',
    ).filter(regex='^(?!.*_DROP)')

    # A hand-entered name that is absent from the roster leaves NaN here;
    # report it explicitly rather than failing inside the integer cast.
    no_district = merged[merged['district'].isnull()]
    if not no_district.empty:
        print(no_district)
        raise ValueError("not all names have a district in the roster")

    return merged.astype({'district': 'int32'})


ceja_votes_reps = pd.read_csv('../data/government/illinois/legislature/ceja/sb2408-votes-house.tsv', sep='\t', comment="#")
ceja_votes_sens = pd.read_csv('../data/government/illinois/legislature/ceja/sb2408-votes-senate.tsv', sep='\t', comment="#")

ceja_votes_reps = _attach_districts(ceja_votes_reps, reps)
ceja_votes_sens = _attach_districts(ceja_votes_sens, sens)

ceja_votes_reps.to_csv('../data/government/illinois/legislature/ceja/sb2408-votes-house.tsv', sep="\t", index=False)
ceja_votes_sens.to_csv('../data/government/illinois/legislature/ceja/sb2408-votes-senate.tsv', sep="\t", index=False)