# Create datasets for congressional CEJA stances

In [21]:
# imports
from pandasql import sqldf
import pandas as pd
from fuzzywuzzy import fuzz

In [22]:
# Load the four tab-separated inputs: the raw CEJA sponsor name lists and
# the representative / senator rosters. Lines starting with "#" in the
# files are treated as comments and skipped by the parser.
ceja_supporters_reps = pd.read_csv(
    'data/government/illinois/votes/ceja-sponsors-reps-raw.tsv',
    sep='\t',
    comment="#",
)
ceja_supporters_sens = pd.read_csv(
    'data/government/illinois/votes/ceja-sponsors-sens-raw.tsv',
    sep='\t',
    comment="#",
)

reps = pd.read_csv(
    'data/government/illinois/votes/reps-102.tsv',
    sep='\t',
    comment="#",
)
sens = pd.read_csv(
    'data/government/illinois/votes/sens-102.tsv',
    sep='\t',
    comment="#",
)

In [23]:
# Number of rows in the raw senate sponsor list.
len(ceja_supporters_sens.index)

34

In [24]:
# Use fuzzy string matching to line up politician names between the sponsor
# lists and the rosters. The spelling in the rep/sen roster is treated as the
# source of truth and is written over the spelling in the sponsor list, so the
# exact-equality joins further down can find the match.
# https://towardsdatascience.com/fuzzy-string-matching-in-python-68f240d910fe
# TODO sen join seems wrong? there are 34 sens in the list of sponsor names,
# but fewer than that in the join.
# Maybe some sponsors are no longer in office, so they are not in that list of sens?


def _adopt_roster_names(roster_names, supporters):
    """Overwrite fuzzy-matched names in ``supporters`` with the roster spelling.

    For each roster name, the first row of ``supporters`` whose ``name``
    scores above 90 on ``fuzz.partial_ratio`` is rewritten in place to the
    roster spelling; at most one row is rewritten per roster name.

    Args:
        roster_names: iterable of canonical name strings.
        supporters: DataFrame with a ``name`` column; modified in place.
    """
    for roster_name in roster_names:
        for row_label, listed_name in supporters['name'].items():
            if fuzz.partial_ratio(roster_name, listed_name) > 90:
                print(f"matched {roster_name} to {listed_name}")
                # Write through .at on the frame itself. The chained form
                # supporters['name'][i] = ... assigns into a temporary Series
                # and is not guaranteed to update the DataFrame.
                supporters.at[row_label, 'name'] = roster_name
                break


_adopt_roster_names(reps['name'], ceja_supporters_reps)
_adopt_roster_names(sens['name'], ceja_supporters_sens)

matched Carol Ammons to Carol Ammons
matched Jaime M. Andrade, Jr. to Jaime M. Andrade, Jr.
matched Kambium Buckner to Kambium Buckner
matched Kelly M. Burke to Kelly M. Burke
matched Jonathan Carroll to Jonathan Carroll
matched Kelly M. Cassidy to Kelly M. Cassidy
matched Lakesia Collins to Lakesia Collins
matched Deb Conroy to Deb Conroy
matched Terra Costa Howard to Terra Costa Howard
matched Anthony DeLuca to Anthony DeLuca
matched Daniel Didech to Daniel Didech
matched Robyn Gabel to Robyn Gabel
matched Jennifer Gong-Gershowitz to Jennifer Gong-Gershowitz
matched Edgar Gonzalez, Jr. to Edgar Gonzalez, Jr.
matched Will Guzzardi to Will Guzzardi
matched Michael Halpin to Michael Halpin
matched Sonya M. Harper to Sonya M. Harper
matched Barbara Hernandez to Barbara Hernandez
matched Elizabeth Hernandez to Elizabeth Hernandez
matched Frances Ann Hurley to Frances Ann Hurley
matched Lindsey LaPointe to Lindsey LaPointe
matched Theresa Mah to Theresa Mah
matched Natalie A. Manley to Nat

In [25]:
# Build one row per representative with a supports_ceja flag.
#  - First SELECT: reps whose name equals a name in ceja_supporters_reps,
#    tagged with the string 'True'.
#  - Second SELECT: reps with no equal name in ceja_supporters_reps,
#    tagged with the string 'False'.
# UNION concatenates both halves and drops duplicate rows.
# Names are compared with exact equality, and the flag is stored as text
# ('True' / 'False'), not as a boolean.
q_reps = """
SELECT reps.name, reps.district, 'True' as supports_ceja
FROM reps, ceja_supporters_reps
WHERE reps.name = ceja_supporters_reps.name
UNION
SELECT reps.name, reps.district, 'False' as supports_ceja
FROM reps
WHERE NOT EXISTS (
    SELECT NULL
    FROM ceja_supporters_reps
    WHERE reps.name = ceja_supporters_reps.name
)
"""
# globals() is passed so the query can refer to the DataFrames by variable name.
ceja_support_reps = sqldf(q_reps, globals())

In [26]:
# Build one row per senator with a supports_ceja flag.
#  - First SELECT: sens whose name equals a name in ceja_supporters_sens,
#    tagged with the string 'True'.
#  - Second SELECT: sens with no equal name in ceja_supporters_sens,
#    tagged with the string 'False'.
# UNION concatenates both halves and drops duplicate rows.
# Names are compared with exact equality, and the flag is stored as text
# ('True' / 'False'), not as a boolean.
q_sens = """
SELECT sens.name, sens.district, 'True' as supports_ceja
FROM sens, ceja_supporters_sens
WHERE sens.name = ceja_supporters_sens.name
UNION
SELECT sens.name, sens.district, 'False' as supports_ceja
FROM sens
WHERE NOT EXISTS (
    SELECT NULL
    FROM ceja_supporters_sens
    WHERE sens.name = ceja_supporters_sens.name
)
"""
# globals() is passed so the query can refer to the DataFrames by variable name.
ceja_support_sens = sqldf(q_sens, globals())

In [27]:
# Count the senators whose supports_ceja flag is the string "False".
int((ceja_support_sens['supports_ceja'] == "False").sum())

32

In [28]:
# Write the flagged rep and senator tables as tab-separated files,
# without the DataFrame index column.
ceja_support_reps.to_csv(
    path_or_buf='data/government/illinois/ceja-sponsors-reps.tsv',
    sep="\t",
    index=False,
)
ceja_support_sens.to_csv(
    path_or_buf='data/government/illinois/ceja-sponsors-sens.tsv',
    sep="\t",
    index=False,
)

# Processing coordinate data

In [29]:
import re

def convertDMSToDD(degrees: str, minutes: str, seconds: str, direction: str):
    """Convert a degrees/minutes/seconds reading to signed decimal degrees.

    Args:
        degrees: whole or decimal degrees; anything ``float()`` accepts.
        minutes: arc-minutes; anything ``float()`` accepts.
        seconds: arc-seconds; anything ``float()`` accepts.
        direction: hemisphere letter. Exactly "S" or "W" (upper case)
            produces a negated result; any other value leaves it as is.

    Returns:
        The coordinate as a float in decimal degrees.
    """
    magnitude = float(degrees) + (float(minutes) / 60) + (float(seconds) / (60 * 60))

    # Southern and western hemispheres are negative in decimal notation.
    if direction in ("S", "W"):
        return magnitude * -1
    return magnitude

def castDMS(input: str):
    r"""Convert one coordinate component from DMS notation to decimal degrees.

    The start of ``input`` is tested against three layouts, most specific
    first:

    * ``38°16′40.2″N`` -- degrees, minutes and seconds
    * ``38°16.12′N``   -- degrees and minutes
    * ``38.204°N``     -- degrees only

    Minutes must be marked with the prime character (′) and seconds with the
    double-prime character (″); the hemisphere must be one of ``N E S W``.
    Text after the hemisphere letter is ignored.

    Args:
        input: a single latitude or longitude string.

    Returns:
        The value in decimal degrees as a float (negative for S and W) when
        ``input`` starts with one of the layouts above; otherwise ``input``
        itself, unchanged.

    old regex: /\d+(\.\d+)?°(\d+(\.\d+)?′)?(\d+(\.\d+)?″)?[NESW]/
    """
    # Raw strings keep the regex escapes (\d, \.) out of Python's own
    # string-escape processing.
    number = r"(\d+(?:\.\d+)?)"
    layouts = (
        rf"{number}°{number}′{number}″([NESW])",  # 38°16′40.2″N
        rf"{number}°{number}′([NESW])",  # 38°16.12′N
        rf"{number}°([NESW])",  # 38.204°N
    )

    for layout in layouts:
        found = re.match(layout, input)
        if found is None:
            continue
        # Take the pieces from the capture groups so that text following the
        # hemisphere letter can never end up inside the direction.
        *magnitudes, direction = found.groups()
        # Absent minutes / seconds count as zero.
        degrees, minutes, seconds = (magnitudes + ["0", "0"])[:3]
        return convertDMSToDD(degrees, minutes, seconds, direction)

    # it's something else, just return original input
    return input

In [43]:
import pandas as pd

csv_path = "data/energy/illinois/natural_gas_plants.tsv"
df = pd.read_csv(csv_path, delimiter="\t")


def _coordinate_component(coordinates, position):
    """Convert one space-separated token of a coordinates cell via castDMS.

    Args:
        coordinates: the raw cell value; the first token is used for
            yCoordinate and the second for xCoordinate.
        position: index of the token to convert (0 or 1).

    Returns:
        Whatever castDMS returns for the selected token. A non-string cell
        (for example the NaN pandas uses for an empty cell) is returned
        unchanged, and None is returned when the cell has no token at
        ``position``.
    """
    if not isinstance(coordinates, str):
        # Empty cells are read as float NaN, which has no .split().
        return coordinates
    tokens = coordinates.split(' ')
    if position >= len(tokens):
        return None
    return castDMS(tokens[position])


if 'coordinates' in df:
    # First token -> yCoordinate, second token -> xCoordinate.
    df['yCoordinate'] = df['coordinates'].map(lambda cell: _coordinate_component(cell, 0))
    df['xCoordinate'] = df['coordinates'].map(lambda cell: _coordinate_component(cell, 1))
    # The input file is overwritten in place with the two new columns added.
    df.to_csv(csv_path, sep="\t", index=False)
else:
    print("'coordinates' column not found in the file")