# Prepare Datasets for CEJA sponsorship

In [None]:
# imports
from pandasql import sqldf
import pandas as pd
from fuzzywuzzy import fuzz

In [None]:
# Load the raw inputs. Every file is tab-separated and may carry '#' comments,
# so the shared read options live in one small reader.
def _read_tsv(path):
    """Read a tab-separated file into a DataFrame, treating '#' as a comment marker."""
    return pd.read_csv(path, sep='\t', comment="#")


# CEJA sponsor name lists, one per chamber.
ceja_supporters_reps = _read_tsv('../data/government/illinois/legislature/ceja/ceja-sponsors-reps.tsv')
ceja_supporters_sens = _read_tsv('../data/government/illinois/legislature/ceja/ceja-sponsors-sens.tsv')

# Legislator lists (reps-102 / sens-102); the cells below read their
# `name` and `district` columns.
reps = _read_tsv('../data/government/illinois/legislature/reps-102.tsv')
sens = _read_tsv('../data/government/illinois/legislature/sens-102.tsv')

In [None]:
# Use fuzzy string matching to reconcile politician names between datasets.
# The spelling in the rep/sen name file is treated as the source of truth and
# is written over the matching name in the sponsor list, so that the exact
# equality joins in the SQL cells below can line the two tables up.
# https://towardsdatascience.com/fuzzy-string-matching-in-python-68f240d910fe
# TODO sen join seems wrong? there's 34 sens in the list of names of sponsors, but less than that in the join
# maybe some sponsors are no longer in office, so not in that list of sens?


def _canonicalize_names(canonical_names, supporters, threshold=90):
    """Overwrite fuzzy-matched names in ``supporters`` with the canonical spelling.

    For each canonical name, the rows of ``supporters`` are scanned in index
    order; the first row whose ``name`` scores above ``threshold`` with
    ``fuzz.partial_ratio`` is rewritten to the canonical name, and the scan
    moves on to the next canonical name.

    Args:
        canonical_names: Iterable of source-of-truth name strings.
        supporters: DataFrame with a ``name`` column; modified in place.
            Assumes a unique index (the default produced by ``read_csv``).
        threshold: A match requires a partial-ratio score strictly greater
            than this value.
    """
    for canonical in canonical_names:
        for idx in supporters.index:
            current = supporters.at[idx, 'name']
            if fuzz.partial_ratio(canonical, current) > threshold:
                print(f"matched {canonical} to {current}")
                # Write through .at instead of chained indexing
                # (df['name'][i] = ...), which raises SettingWithCopyWarning
                # and does not update the frame under pandas Copy-on-Write.
                supporters.at[idx, 'name'] = canonical
                break


_canonicalize_names(reps['name'], ceja_supporters_reps)
_canonicalize_names(sens['name'], ceja_supporters_sens)

In [None]:
# Build one row per representative with a CEJA sponsorship flag.
#   - First SELECT: reps whose name exactly equals a name in
#     ceja_supporters_reps are tagged 'True'.
#   - Second SELECT: reps with no equal name in ceja_supporters_reps are
#     tagged 'False'.
# supports_ceja is the string literal 'True'/'False', not a boolean.
# UNION (rather than UNION ALL) drops duplicate rows, so a rep that appears
# more than once in the sponsor list still yields a single output row.
# The comparison is exact string equality, so it relies on the sponsor names
# having been rewritten to the spelling used in `reps` beforehand.
q_reps = """
SELECT reps.name, reps.district, 'True' as supports_ceja
FROM reps, ceja_supporters_reps
WHERE reps.name = ceja_supporters_reps.name
UNION
SELECT reps.name, reps.district, 'False' as supports_ceja
FROM reps
WHERE NOT EXISTS (
    SELECT NULL
    FROM ceja_supporters_reps
    WHERE reps.name = ceja_supporters_reps.name
)
"""
# globals() is passed so sqldf can find the DataFrames named in the query
# (reps, ceja_supporters_reps) -- NOTE(review): confirm against pandasql docs.
ceja_support_reps = sqldf(q_reps, globals())

In [None]:
# Build one row per senator with a CEJA sponsorship flag.
#   - First SELECT: sens whose name exactly equals a name in
#     ceja_supporters_sens are tagged 'True'.
#   - Second SELECT: sens with no equal name in ceja_supporters_sens are
#     tagged 'False'.
# supports_ceja is the string literal 'True'/'False', not a boolean.
# UNION (rather than UNION ALL) drops duplicate rows, so a senator that appears
# more than once in the sponsor list still yields a single output row.
# The comparison is exact string equality, so it relies on the sponsor names
# having been rewritten to the spelling used in `sens` beforehand.
q_sens = """
SELECT sens.name, sens.district, 'True' as supports_ceja
FROM sens, ceja_supporters_sens
WHERE sens.name = ceja_supporters_sens.name
UNION
SELECT sens.name, sens.district, 'False' as supports_ceja
FROM sens
WHERE NOT EXISTS (
    SELECT NULL
    FROM ceja_supporters_sens
    WHERE sens.name = ceja_supporters_sens.name
)
"""
# globals() is passed so sqldf can find the DataFrames named in the query
# (sens, ceja_supporters_sens) -- NOTE(review): confirm against pandasql docs.
ceja_support_sens = sqldf(q_sens, globals())

In [None]:
# Write the per-chamber sponsorship tables as tab-separated files, leaving
# out the DataFrame index column.
_tsv_write_opts = {"sep": "\t", "index": False}

ceja_support_reps.to_csv(
    '../data/government/illinois/legislature/ceja/ceja-sponsorship-reps.tsv',
    **_tsv_write_opts,
)
ceja_support_sens.to_csv(
    '../data/government/illinois/legislature/ceja/ceja-sponsorship-sens.tsv',
    **_tsv_write_opts,
)

In [95]:
# Load the raw SB2408 roll-call files and re-read the legislator lists.
# Every file is tab-separated and may carry '#' comments, so the shared read
# options live in one small reader.
def _read_tsv(path):
    """Read a tab-separated file into a DataFrame, treating '#' as a comment marker."""
    return pd.read_csv(path, sep='\t', comment="#")


# NOTE(review): this holds the SB2408 *House vote* file, yet it is named
# iec_scores_reps while the Senate counterpart is ceja_votes_sens -- confirm
# whether it should be ceja_votes_reps before renaming (later cells may use it).
iec_scores_reps = _read_tsv('../data/government/illinois/legislature/ceja/sb2408-votes-house-raw.tsv')
ceja_votes_sens = _read_tsv('../data/government/illinois/legislature/ceja/sb2408-votes-senate-raw.tsv')

reps = _read_tsv('../data/government/illinois/legislature/reps-102.tsv')
sens = _read_tsv('../data/government/illinois/legislature/sens-102.tsv')