# Parsing Environmental Scorecard Data

In [None]:
# imports
from pandasql import sqldf
import pandas as pd
from fuzzywuzzy import fuzz

In [None]:
# load data
# Every input is a tab-separated file; pandas does not parse anything that
# follows a "#" on a line (comment="#").
def _read_tsv(path):
    """Read one tab-separated data file into a DataFrame."""
    return pd.read_csv(path, sep='\t', comment="#")


# scorecard rows (one file per chamber)
iec_scores_reps = _read_tsv('../data/government/illinois/legislature/iec/iec-scores-reps-2022.tsv')
iec_scores_sens = _read_tsv('../data/government/illinois/legislature/iec/iec-scores-sens-2022.tsv')

# legislator rosters (one file per chamber)
reps = _read_tsv('../data/government/illinois/legislature/reps-102.tsv')
sens = _read_tsv('../data/government/illinois/legislature/sens-102.tsv')

In [None]:
# use fuzzy strings to match politician names
# NOTE: the matching here doesn't catch every name - between this cell and the next,
# the remaining names are matched by hand in the TSV files written below
# the spelling used in the rep/sen roster is written out as the source of truth
# https://towardsdatascience.com/fuzzy-string-matching-in-python-68f240d910fe

def _fuzzy_match_names(scorecard, roster_names, threshold=90):
    """Fill ``scorecard['name']`` with roster names that fuzzily match ``name_raw``.

    The ``name`` column is reset to None first. Then, for each roster name,
    the scorecard rows are scanned in order and the first row whose
    ``name_raw`` has ``fuzz.partial_ratio`` strictly above ``threshold`` is
    assigned that roster name. Rows that nothing matches keep ``name`` as None.

    Args:
        scorecard: DataFrame with a ``name_raw`` column; modified in place.
        roster_names: iterable of canonical legislator names.
        threshold: partial-ratio score a pair must exceed to count as a match.

    NOTE: a row that already received a name can be overwritten if a later
    roster name also matches it first; the results are expected to be
    reviewed by hand afterwards.
    """
    scorecard['name'] = None

    for roster_name in roster_names:
        for row_label, scorecard_name in scorecard['name_raw'].items():
            if fuzz.partial_ratio(roster_name, scorecard_name) > threshold:
                # Single-step .at assignment: the chained form
                # scorecard['name'][row_label] = ... may write to a temporary
                # copy (SettingWithCopyWarning) and is a no-op under pandas
                # Copy-on-Write.
                scorecard.at[row_label, 'name'] = roster_name
                break


_fuzzy_match_names(iec_scores_reps, reps['name'])
_fuzzy_match_names(iec_scores_sens, sens['name'])

# overwrite the scorecard files so the unmatched names can be completed by hand
iec_scores_reps.to_csv('../data/government/illinois/legislature/iec/iec-scores-reps-2022.tsv', sep="\t", index=False)
iec_scores_sens.to_csv('../data/government/illinois/legislature/iec/iec-scores-sens-2022.tsv', sep="\t", index=False)

In [None]:
# NOTE: not all names were matched by fuzzy matching, so the remaining name entries
# are filled in by hand in the TSV files before this join is performed
# this might be sacrilege, I'll note it here -> John D'Amico resigned, I'll fill his parsed "name" entry with the person now in his seat (Michael Kelly)

def _attach_district(scores, district_lookup):
    """Return ``scores`` with an int32 ``district`` column joined in by ``name``.

    Args:
        scores: scorecard DataFrame with a fully populated ``name`` column.
        district_lookup: DataFrame with ``name`` and ``district`` columns.

    Returns:
        A new DataFrame: ``scores`` left-merged with ``district_lookup`` on
        ``name``. Any column the lookup would duplicate is dropped, so an
        existing ``district`` column in ``scores`` is kept as is.

    Raises:
        Exception: if any row of ``scores`` has a null ``name``.
        ValueError: if any row ends up without a ``district`` (for example a
            hand-entered name that does not appear in the lookup).
    """
    unmatched = scores['name'].isnull()
    if unmatched.any():
        # show the offending rows so they can be fixed in the TSV
        print(scores[unmatched])
        raise Exception("not all names matched")

    # Colliding right-hand columns get the '_DROP' suffix and are then
    # filtered out by the negative-lookahead regex.
    merged = scores.merge(
        district_lookup, on='name', how='left', suffixes=('', '_DROP')
    ).filter(regex='^(?!.*_DROP)')

    # A left merge leaves NaN where a name has no roster entry; report those
    # rows explicitly instead of failing inside the integer cast below.
    no_district = merged['district'].isnull()
    if no_district.any():
        print(merged[no_district])
        raise ValueError("not all names have a district; check that each name matches the roster exactly")

    return merged.astype({'district': 'int32'})


# re-read the scorecards so the manual edits made on disk are picked up
iec_scores_reps = pd.read_csv('../data/government/illinois/legislature/iec/iec-scores-reps-2022.tsv', sep='\t', comment="#")
iec_scores_sens = pd.read_csv('../data/government/illinois/legislature/iec/iec-scores-sens-2022.tsv', sep='\t', comment="#")

reps_district_join = reps[['name', 'district']]
iec_scores_reps = _attach_district(iec_scores_reps, reps_district_join)

sens_district_join = sens[['name', 'district']]
iec_scores_sens = _attach_district(iec_scores_sens, sens_district_join)

# files are only rewritten once both chambers joined successfully
iec_scores_reps.to_csv('../data/government/illinois/legislature/iec/iec-scores-reps-2022.tsv', sep="\t", index=False)
iec_scores_sens.to_csv('../data/government/illinois/legislature/iec/iec-scores-sens-2022.tsv', sep="\t", index=False)