# Prepare the DataFrame (if needed)

In [None]:
%%script false --no-raise-error

# Loading in Data (skip this cell if already done)
import re
import pandas as pd

def preprocess_quote(quote):
    """Insert missing spaces into a scraped answer string.

    A single space is added at every position where a lowercase ASCII letter
    is immediately followed by an uppercase ASCII letter, e.g.
    ``"taxesWe"`` becomes ``"taxes We"``. Note that this also splits
    legitimate mixed-case words such as ``"McDonald"``.

    Args:
        quote: The raw answer text. Values that are not ``str`` (for example
            the ``NaN`` pandas uses for an empty spreadsheet cell, or
            ``None``) are returned unchanged instead of raising ``TypeError``
            inside ``re.sub``.

    Returns:
        The cleaned string, or ``quote`` itself when it is not a string.
    """
    # This function is applied to a whole DataFrame column, where missing
    # cells are floats/None rather than strings; let those pass through.
    if not isinstance(quote, str):
        return quote

    # Zero-width match between a lowercase and an uppercase letter.
    preprocessed_quote = re.sub(r"(?<=[a-z])(?=[A-Z])", " ", quote)

    # ...
    # add more if necessary

    return preprocessed_quote

# Load the raw Q&A export.
# NOTE(review): assumes the sheet has "username" and "answer" columns, with
# "username" formatted as "<name> - <party>" -- confirm against the data file.
df = pd.read_excel("data/qadata.xlsx")

# Split "<name> - <party>" into two columns and trim surrounding whitespace.
# Rows without the " - " separator end up with a missing party.
df[["name", "party"]] = df["username"].str.split(" - ", expand=True).apply(
    lambda col: col.str.strip()
)
df = df.drop(["username"], axis=1)

df["answer"] = df["answer"].apply(preprocess_quote)

# Label missing or empty parties as "Other".
# A truthiness test (`p if p else "Other"`) is not enough here: NaN is truthy,
# so NaN parties would be left in place. fillna() covers both None and NaN,
# and the exact-value replace() keeps the empty-string case handled.
df["party"] = df["party"].fillna("Other").replace("", "Other")

# Decode the HTML entity for a double quote; literal (non-regex) replacement.
df["name"] = df["name"].str.replace("&quot;", '"', regex=False)

# Persist the prepared frame for the later cells (pickle) and for inspection (xlsx).
df.to_pickle("./data/mod_qadata")
df.to_excel("./data/mod_qadata.xlsx", index=False)
df.sample(5)

# Setup: Vector Store and People Data

In [None]:
import os
# Extract the vector store archive only when ./chroma_qadata_db is absent, so
# re-running this cell does not unzip again.
# NOTE(review): assumes the archive unpacks into ./chroma_qadata_db -- confirm.
if not os.path.exists("./chroma_qadata_db"):
    !unzip chroma_qadata_db.zip

In [1]:
import pandas as pd
import os

from vote_easy_genie_lib import *

# Names of the 50 U.S. states in alphabetical order, one initial letter per
# row. The list is joined into a regex alternation when filtering titles.
US_STATES = [
    "Alabama", "Alaska", "Arizona", "Arkansas",
    "California", "Colorado", "Connecticut",
    "Delaware",
    "Florida",
    "Georgia",
    "Hawaii",
    "Idaho", "Illinois", "Indiana", "Iowa",
    "Kansas", "Kentucky",
    "Louisiana",
    "Maine", "Maryland", "Massachusetts", "Michigan",
    "Minnesota", "Mississippi", "Missouri", "Montana",
    "Nebraska", "Nevada", "New Hampshire", "New Jersey",
    "New Mexico", "New York", "North Carolina", "North Dakota",
    "Ohio", "Oklahoma", "Oregon",
    "Pennsylvania",
    "Rhode Island",
    "South Carolina", "South Dakota",
    "Tennessee", "Texas",
    "Utah",
    "Vermont", "Virginia",
    "Washington", "West Virginia", "Wisconsin", "Wyoming",
]

# Directory of the vector store; passed to get_df_results in later cells.
vectorstore_path = "./chroma_qadata_db"

# Use the pre-built people table when it exists. Otherwise derive it from the
# prepared Q&A pickle, keeping the first row seen for each distinct name.
if not os.path.exists("./data/all_people"):
    df = pd.read_pickle("./data/mod_qadata")
    df_people = df.loc[~df['name'].duplicated(), ['name', 'party', 'usertitle']]
else:
    df_people = pd.read_pickle("./data/all_people")
print("# of people:", len(df_people))
df_people.sample(3)

# of people: 4281


Unnamed: 0,name,party,usertitle
32546,Angie Chen Button,R,State Representative District 112
36242,Bob Hasegawa,D,State Senator District 11
30087,Jessica Castleberry,R,State Senator District 35


# Questions

In [2]:
# Policy questions posed for each person; the comment above each entry is its
# topic tag.
questions = [
    # social security and medicare
    "Should the government reduce spending on medicare or social security programs?",
    # education
    "Should the government forgive student loan?",
    # education
    "Should the government regulate what is being taught in school?",
    # abortion
    "Should abortion be legal?",
    # gun control
    "Should there be stricter gun control laws?",
    # foreign policy
    "Should the U.S. government maintain and possibly expand spending on foreign aid?",
    # immigration
    "Should there be stricter border control measures?",
    # LGBTQ, education
    "Should LGBTQ issues be included in school curricula?",
    # LGBTQ
    "Do you support transgender individuals' access to gender-affirming healthcare?",
    # crime
    "Do you support qualified immunity for police officers?",
]

# Senators

In [10]:
# Titles of the exact form "<State> Senator", e.g. "Alaska Senator".
# Case-insensitive matching is requested through `case=False` rather than an
# inline `(?i)` placed after `^`: a global inline flag that is not at the very
# start of the pattern is deprecated since Python 3.6 and raises re.error on
# Python 3.11+.
pattern = r'^(?:' + '|'.join(US_STATES) + r')\s+senator$'

# Keep rows whose title matches the pattern; rows with a missing title are
# treated as non-matching (na=False).
df_senators = df_people[
    df_people['usertitle'].str.match(pattern, case=False, na=False)
]

print(df_senators.shape[0])
df_senators.iloc[:5]

95


Unnamed: 0,name,party,usertitle
661,Katie Britt,R,Alabama Senator
1126,Lisa Murkowski,R,Alaska Senator
1249,Dan Sullivan,R,Alaska Senator
1411,Mark Kelly,D,Arizona Senator
1416,Kyrsten Sinema,D,Arizona Senator


In [6]:
# Settings for the senator run: model identifier and the two result paths.
model_name = "gpt-3.5-turbo"
output_csv, output_xlsx = "./result/senator.csv", "./result/senator.xlsx"

# Rebinds `questions` to its first two entries; every later cell that reads
# `questions` sees only this shortened list.
questions = questions[:2]

In [8]:
df_prompts = get_df_prompts(df_senators, questions)
df_results = await get_df_results(df_prompts, vectorstore_path, output_csv, output_xlsx, model_name)
df_results.iloc[:3]

Number of rows to be processed: 10
Estimated cost: $0.016
Genie Master initialized at: 2023-08-28 02:52:07.868325-04:00


Processing batches (101 rows per batch): 100%|██████████| 1/1 [00:07<00:00,  7.74s/it]

Cost exceeded 0.01: 0.0111
Total cost: 0.0110545





Unnamed: 0,name,party,usertitle,question,answer,reasoning,evidence,source_content,source_category,source_sub_category,cost
0,Richard Durbin,D,Illinois Senator,should the government reduce spending on medic...,no,Richard Durbin opposes efforts to privatize So...,[I have opposed efforts to partially privatize...,I have opposed efforts to partially privatize ...,"Seniors, Social Security, Medicare, Pensions",Privatizing Social Security,0.001277
1,Richard Durbin,D,Illinois Senator,should the government forgive student loan?,unknown,"Based on the given context, it is unclear whet...",[Richard Durbin has advocated for lowering stu...,that all students can receive a college educat...,Education & Schools,Higher Education,0.001231
2,Edward J. Markey,D,Massachusetts Senator,should the government reduce spending on medic...,no,Edward J. Markey believes that we must fight a...,[In addition to providing affordable health ca...,the middle class in history. The massive benef...,"Seniors, Social Security, Medicare, Pensions",Privatizing Social Security,0.001309


### clean up leftovers

In [None]:
# Read a header-less results CSV and label its columns with the names
# returned by get_result_column_names().
# NOTE(review): presumably the partial output of an earlier run -- confirm.
df_finished_prompts = pd.read_csv(
    "result/senators_08_26.csv",
    header=None,
)
df_finished_prompts.columns = get_result_column_names()
print("Finished rows number:", len(df_finished_prompts))

In [11]:
df_prompts = get_df_prompts(df_senators, questions)
df_remaining = get_df_remaining_prompts(df_prompts, df_finished_prompts)
print("Remaining rows number:", df_remaining.shape[0])
df_results = await get_df_results(df_remaining, vectorstore_path, output_csv, output_xlsx, model_name)

120


Unnamed: 0,name,party,usertitle,question
1,Katie Britt,R,Alabama Senator,should the government forgive student loan?
2,Lisa Murkowski,R,Alaska Senator,should the government reduce spending on medic...
3,Lisa Murkowski,R,Alaska Senator,should the government forgive student loan?


# Congress

# Governor

# 

# Delete Database to save space

In [None]:
# Recursively delete the vector store directory to free disk space. The unzip
# cell near the top runs `unzip chroma_qadata_db.zip` when this directory is
# missing.
!rm -rf ./chroma_qadata_db