In [1]:
import os
# Extract the bundled Chroma database archive only when the target directory is
# missing, so re-running this cell does not unzip over an existing copy.
if not os.path.exists("./chroma_qadata_db"):
    !unzip chroma_qadata_db.zip

Archive:  chroma_qadata_db.zip
   creating: chroma_qadata_db/
  inflating: chroma_qadata_db/.DS_Store  
   creating: chroma_qadata_db/f48ec261-1a58-4676-9b1e-c9ed0526597a/
  inflating: chroma_qadata_db/f48ec261-1a58-4676-9b1e-c9ed0526597a/data_level0.bin  
  inflating: chroma_qadata_db/f48ec261-1a58-4676-9b1e-c9ed0526597a/length.bin  
  inflating: chroma_qadata_db/f48ec261-1a58-4676-9b1e-c9ed0526597a/link_lists.bin  
  inflating: chroma_qadata_db/f48ec261-1a58-4676-9b1e-c9ed0526597a/header.bin  
  inflating: chroma_qadata_db/f48ec261-1a58-4676-9b1e-c9ed0526597a/index_metadata.pickle  
  inflating: chroma_qadata_db/chroma.sqlite3  


In [2]:
from genie_master import GenieMaster
import pandas as pd
import asyncio
import aiohttp
import time

# Names of the 50 U.S. states. Used further down to build the regular expression
# that matches "<State> Senator" titles.
US_STATES = [
    "Alabama", "Alaska", "Arizona", "Arkansas", "California", "Colorado", "Connecticut", "Delaware",
    "Florida", "Georgia", "Hawaii", "Idaho", "Illinois", "Indiana", "Iowa", "Kansas", "Kentucky",
    "Louisiana", "Maine", "Maryland", "Massachusetts", "Michigan", "Minnesota", "Mississippi",
    "Missouri", "Montana", "Nebraska", "Nevada", "New Hampshire", "New Jersey", "New Mexico",
    "New York", "North Carolina", "North Dakota", "Ohio", "Oklahoma", "Oregon", "Pennsylvania",
    "Rhode Island", "South Carolina", "South Dakota", "Tennessee", "Texas", "Utah", "Vermont",
    "Virginia", "Washington", "West Virginia", "Wisconsin", "Wyoming"
]

# Directory of the unzipped Chroma database, handed to GenieMaster.
db_path = "./chroma_qadata_db"
gm = GenieMaster(db_path=db_path)

# NOTE(review): read_pickle can execute arbitrary code while loading; only point
# it at a file from a trusted source.
df = pd.read_pickle("./data/mod_qadata")
df['output'] = None  # add an empty 'output' column to the loaded frame
# One row per distinct name (first occurrence kept), restricted to the identifying columns.
df_people = df.drop_duplicates(subset=['name'])[['name', 'party', 'usertitle']]
print("# of people:", df_people.shape[0])
df_people.sample(3)  # random preview; no seed is set, so the rows shown differ between runs

Genie Master initialized at: 2023-08-26 00:54:53.694433-04:00
# of people: 4281


Unnamed: 0,name,party,usertitle
10164,Susan M. &quot;Sue&quot; Rezin,R,State Senator District 38
18362,Bill Kinkade,R,State Representative District 52
30628,Susan M. Lynn,R,State Representative District 57


In [8]:
async def process_row(session, genie, row, output_path):
    """Ask the genie one question and append the answered row to a CSV file.

    Args:
        session: Not used by this function; kept so existing callers'
            positional arguments still line up.
        genie: Object exposing an awaitable ``async_ask(question)`` that returns
            a mapping with the keys ``"result"``, ``"source_documents"`` and
            ``"total_cost"``.
        row: Mapping-like record (e.g. a pandas Series or dict) containing a
            ``"question"`` entry. It is modified in place.
        output_path: CSV file the finished row is appended to, without header
            and without index.

    Returns:
        The same ``row`` object, extended with ``answer``, ``reasoning``,
        ``evidence``, ``source_content``, ``source_category``,
        ``source_sub_category`` and ``cost``.
    """
    output = await genie.async_ask(row["question"])
    result = output["result"]

    for field in ("answer", "reasoning", "evidence"):
        row[field] = result.get(field, "")

    # Always populate the source columns. The row is appended to a header-less
    # CSV, so a row that lacks these fields would be written with fewer columns
    # and its 'cost' value would end up under the wrong column.
    source_fields = ("source_content", "source_category", "source_sub_category")
    source_documents = output["source_documents"]
    if source_documents:
        source_doc = source_documents[0]  # only the first document is recorded
        for field in source_fields:
            row[field] = source_doc[field]
    else:
        for field in source_fields:
            row[field] = ""

    row["cost"] = output["total_cost"]

    # Append immediately so finished rows are on disk even if a later task fails.
    new_row_df = pd.DataFrame([row])
    new_row_df.to_csv(output_path, mode='a', header=False, index=False)

    return row

async def process_rows(df, genies, output_path, include_header=False):
    """Answer every row of ``df`` concurrently and save the combined result as CSV.

    Args:
        df: DataFrame whose rows each carry a ``"name"`` used to pick a genie.
        genies: Mapping from name to the genie that answers for that name.
        output_path: CSV path passed to every ``process_row`` call and then
            rewritten at the end with the full result (header row, no index).
        include_header: Accepted but not used by this function.

    Returns:
        A DataFrame built from the rows returned by ``process_row``.
    """
    async with aiohttp.ClientSession() as session:
        # One coroutine per row; each gets its own copy of the row.
        pending = [
            process_row(session, genies[row["name"]], row.copy(), output_path)
            for _, row in df.iterrows()
        ]
        processed_rows = await asyncio.gather(*pending)

    processed_df = pd.DataFrame(processed_rows)
    processed_df.to_csv(output_path, index=False)  # Save as CSV
    return processed_df
            
async def main(df, model_name="gpt-3.5-turbo", output_file='./result/output.csv', clear_file=True):
    """Build the genies needed for ``df`` and answer all of its rows concurrently.

    Args:
        df: DataFrame with a ``name`` column (plus whatever ``process_rows`` reads).
        model_name: Model name forwarded to ``gm.get_genie``.
        output_file: CSV path the results are written to.
        clear_file: When true, truncate ``output_file`` before processing starts.

    Returns:
        Whatever ``process_rows`` returns for ``df``.
    """
    # Request one genie per distinct name; the same person appears once per
    # question, so iterating over every row would repeat the call needlessly.
    genies = {name: gm.get_genie(name, model_name=model_name) for name in df.name.unique()}
    if clear_file:
        # Opening in 'w' mode truncates the file; the context manager closes it.
        with open(output_file, 'w'):
            pass
    return await process_rows(df, genies, output_file)

In [3]:
def seq_process_row(row, genie):
    """Ask ``genie`` the row's question synchronously and store the reply on the row.

    Args:
        row: Mapping-like record with a ``"question"`` entry; modified in place.
        genie: Object exposing ``ask(question)`` that returns a mapping with the
            keys ``"result"``, ``"source_documents"`` and ``"total_cost"``.

    Returns:
        The same ``row`` object with ``answer``, ``reasoning``, ``evidence`` and
        ``cost`` set, plus the three ``source_*`` fields when at least one
        source document came back.
    """
    output = genie.ask(row["question"])
    result = output["result"]

    # Missing result fields fall back to an empty string.
    for key in ("answer", "reasoning", "evidence"):
        row[key] = result.get(key, "")

    documents = output["source_documents"]
    if documents:
        top_document = documents[0]  # only the first document is recorded
        for key in ("source_content", "source_category", "source_sub_category"):
            row[key] = top_document[key]

    row["cost"] = output["total_cost"]
    return row

def seq_main(df, batch_size=10):
    """Answer every row of ``df`` one at a time and save the results to Excel.

    Args:
        df: DataFrame with a ``name`` column (plus whatever ``seq_process_row`` reads).
        batch_size: Write a checkpoint of the rows answered so far to
            ``./output.xlsx`` every ``batch_size`` rows. A value of 0 disables
            the intermediate checkpoints.

    Returns:
        A DataFrame with one record per input row, as returned by
        ``seq_process_row``. The same frame is written to ``./output.xlsx``.
    """
    output_excel_path = './output.xlsx'
    # One genie per distinct name rather than one request per row.
    genies = {name: gm.get_genie(name) for name in df.name.unique()}

    # Collect the answered rows in a list and build the frame from it. Calling
    # pd.concat on a DataFrame and a Series does not append the Series as a
    # record, and growing a frame inside the loop is quadratic.
    answered_rows = []
    for _, row in df.iterrows():
        answered_rows.append(seq_process_row(row, genies[row['name']]))
        if batch_size and len(answered_rows) % batch_size == 0:
            pd.DataFrame(answered_rows).to_excel(output_excel_path)

    df_results = pd.DataFrame(answered_rows)
    df_results.to_excel(output_excel_path)
    # Return the results; rebinding the local name `df` never reached the caller.
    return df_results


# Questions

In [4]:
# Policy questions; each one is paired with every person further down to form
# the full list of rows to ask.
questions = [
    "Should abortion be legal?",
    "Should the government play a major role in regulating the economy?",
    "Should there be more restrictions on the current process of purchasing a gun?",
    "Should refugees be allowed to enter the United States?",
    "Should the U.S. government continue to provide assistance to Ukraine in the war with Russia?",
    "Should the government forgive student loan?"
]

In [5]:
# Pair every person with every question, grouped by question: all people for the
# first question, then all people for the second, and so on.
people_tuples = df_people.values.tolist()
combinations = []
for question in questions:
    combinations.extend(person + [question] for person in people_tuples)
df_all = pd.DataFrame(combinations, columns=['name', 'party', 'usertitle', 'question'])

print("Number of rows:", len(df_all))
df_all.head(5)

Number of rows: 25686


Unnamed: 0,name,party,usertitle,question
0,Joe Biden,D,US President,Should abortion be legal?
1,Kamala Harris,D,US Vice President,Should abortion be legal?
2,Katie Britt,R,Alabama Senator,Should abortion be legal?
3,Jerry Carl,R,Alabama Congressional District 1,Should abortion be legal?
4,Barry Moore,R,Alabama Congressional District 2,Should abortion be legal?


# Senators

In [6]:
# Match titles of the form "<State> Senator", ignoring case. The global inline
# flag (?i) has to be the very first thing in the pattern: placing it after '^'
# is deprecated and raises re.error on Python 3.11+.
pattern = r'(?i)^(' + '|'.join(US_STATES) + r')\s+senator$'

# Filter rows with values that match the pattern
df_senators = df_all[df_all['usertitle'].str.match(pattern, na=False)].sample(5).copy(deep=True)
df_senators

Unnamed: 0,name,party,usertitle,question
7833,John Cornyn,R,Texas Senator,Should the government play a major role in reg...
17472,John Hickenlooper,D,Colorado Senator,Should the U.S. government continue to provide...
10749,Pete Ricketts,R,Nebraska Senator,Should there be more restrictions on the curre...
8091,Bernie Sanders,I,Vermont Senator,Should the government play a major role in reg...
16396,Ted Cruz,R,Texas Senator,Should refugees be allowed to enter the United...


In [9]:
model_name="gpt-3.5-turbo"
output_file="./result/senators.csv"
# Time the whole concurrent run with a high-resolution clock.
s = time.perf_counter()
# Top-level await, as supported by the notebook; df_senators is rebound to the
# answered frame returned by main().
df_senators = await main(df_senators, model_name=model_name, output_file=output_file, clear_file=True)
elapsed = time.perf_counter() - s
# The \033[1m ... \033[0m escape codes render the message in bold.
print("\033[1m" + f"Concurrent executed in {elapsed:0.2f} seconds." + "\033[0m")

[1mConcurrent executed in 9.71 seconds.[0m


In [11]:
# Load the CSV written by the run above to inspect the saved answers.
pd.read_csv(output_file)

Unnamed: 0,name,party,usertitle,question,answer,reasoning,evidence,source_content,source_category,source_sub_category,cost
0,John Cornyn,R,Texas Senator,Should the government play a major role in reg...,no,John Cornyn believes in limited government int...,['The Obama Administration’s economic agenda o...,Our nation continues to struggle through a sta...,"Jobs, Economy, Trade, Business, Industry & Agr...",Jobs,0.001317
1,John Hickenlooper,D,Colorado Senator,Should the U.S. government continue to provide...,unknown,There is no evidence in the provided context t...,[],"Affordable College, Apprenticeships, and Skill...",Reasons & Objectives,Goals If Elected,0.001164
2,Pete Ricketts,R,Nebraska Senator,Should there be more restrictions on the curre...,no,Pete Ricketts opposes further restrictions on ...,['Pete will work hard in the Senate with his f...,"include gun registration, and support judges w...",Gun & Property Rights,Gun Legislation and Control,0.001191
3,Bernie Sanders,I,Vermont Senator,Should the government play a major role in reg...,yes,Bernie Sanders believes that the government sh...,['He has also proposed barring banks’ chief ex...,saying the six biggest ones wield too much con...,"Labor, Wages & Unions",Wages,0.001525
4,Ted Cruz,R,Texas Senator,Should refugees be allowed to enter the United...,unknown,There is no direct quote from Ted Cruz regardi...,[],expected.www.ontheissues.org/International/Ted...,"Immigration, Border Security, Terrorism & Home...",Citizenship for Illegal Immigrants,0.000972


# Congress

# Governor

# 

# Delete Database to save space

In [3]:
# Remove the unzipped database directory; the first cell extracts it again from
# the zip archive when the directory is missing.
!rm -rf ./chroma_qadata_db