In [None]:
from genie_master import GenieMaster

In [None]:
# Build the project's GenieMaster helper, pointing it at the ./chroma_qadata_db directory.
gm = GenieMaster(db_path="./chroma_qadata_db")

In [None]:
import chromadb
# Open a persistent Chroma client on the same path that GenieMaster was given and
# fetch the collection named "langchain".
# NOTE(review): `collection` is not referenced by any of the visible cells — confirm it is still needed.
client = chromadb.PersistentClient(path="./chroma_qadata_db")
collection = client.get_collection(name="langchain")

# Prepping vector store

In [None]:
# Loading in Data (skip this cell if already done)
import re
import pandas as pd

def preprocess_quote(quote):
    """Clean up a single quote before it is stored.

    Currently this only re-inserts a space wherever a lowercase letter is
    immediately followed by an uppercase letter (e.g. ``"endNext"`` becomes
    ``"end Next"``).

    Args:
        quote: The raw quote text. Non-string values (e.g. ``None`` or the
            ``NaN`` pandas uses for an empty cell) are accepted.

    Returns:
        The cleaned string, or ``quote`` unchanged when it is not a string.
    """
    # Missing cells are not strings; re.sub would raise a TypeError on them,
    # so hand them back untouched instead of aborting the whole .apply().
    if not isinstance(quote, str):
        return quote

    # Zero-width match between a lowercase and an uppercase letter -> insert a space
    preprocessed_quote = re.sub(r"(?<=[a-z])(?=[A-Z])", " ", quote)

    # ...
    # add more clean-up steps here if necessary

    return preprocessed_quote

# Load the raw export; the code below relies on its "username" and "answer" columns.
df = pd.read_excel("data/qadata.xlsx")

# Split "username" on " - " into a name part and a party part, stripping whitespace from both.
df[["name", "party"]] = df.username.str.split(" - ", expand=True).apply(
    lambda x: x.str.strip()
)
df = df.drop(columns=["username"])

df["answer"] = df["answer"].apply(preprocess_quote)

# Label every row without a party as "Other". A plain truthiness test is not
# enough here: NaN is truthy, so missing values represented as NaN would slip
# through. Treat None, NaN and the empty string all as "no party".
df["party"] = df["party"].where(df["party"].notna() & (df["party"] != ""), "Other")
df.to_pickle("./data/mod_qadata")
df.sample(5)

In [None]:
import pandas as pd
# Reload the cleaned data that the loading cell above pickled to ./data/mod_qadata.
# Pickle files can execute code on load, so only read files produced by this notebook.
df = pd.read_pickle("./data/mod_qadata")
df

In [None]:
# `sample_df` used to be assigned only in a commented-out line, so this cell
# raised a NameError on a fresh kernel. Define it explicitly: 20 random rows.
# Use `sample_df = df` instead to add every row to the vector store.
sample_df = df.sample(20)
gm.transform_and_add_data(sample_df, page_content_column="answer")

In [None]:
# Archive the database directory recursively into chroma_qadata_db.zip
!zip -r chroma_qadata_db.zip ./chroma_qadata_db

In [None]:
# Delete the database directory from disk.
# Destructive: only run this once the zip cell above has produced the archive.
!rm -rf ./chroma_qadata_db

In [None]:
# Restore the database directory from the archive
!unzip chroma_qadata_db.zip

In [None]:
# Distinct politician names in the dataset, and how many of them there are
names = df["name"].unique()
len(names)

# Retrieval QA

In [2]:
import random
import numpy as np
import pandas as pd

# Summoning the genie master :D
from genie_master import GenieMaster
# Re-create the GenieMaster on the stored database and ask whether its model is ready
# (the recorded output of this cell is True).
gm = GenieMaster(db_path="./chroma_qadata_db")
gm.model_is_ready()

True

In [3]:
df = pd.read_pickle("./data/mod_qadata")

# Number of answers recorded per politician, largest count first
df_frequency = (
    df.groupby(["name"])["answer"]
    .count()
    .reset_index(name="Count")
    .sort_values(["Count"], ascending=False)
)
# Show the ten politicians with the most answers
df_frequency.head(10)


Unnamed: 0,name,Count
1229,Elizabeth Warren,614
1894,Joe Biden,419
279,Bernie Sanders,349
85,Amy Klobuchar,292
1072,Dianne Feinstein,279
2190,Kamala Harris,242
3361,Richard Durbin,181
2623,Marco Rubio,164
2527,Lindsey Graham,163
3285,Rand Paul,155


In [None]:
# Draw two politicians at random from the frequency table.
# (Earlier variant, kept for reference: random.sample(df.name.unique().tolist(), 5))
sample_politicians = df_frequency.sample(2)["name"].tolist()
sample_politicians

In [None]:
# Policy questions; each one is later paired with every sampled politician.
questions = [
    "Should abortion be legal?",
    "Should the government play a major role in regulating the economy?",
    "Should there be more restrictions on the current process of purchasing a gun?",
    "Should refugees be allowed to enter the United States?",
    "Should the U.S. government continue to provide assistance to Ukraine in the war with Russia?",
    "Should the government forgive student loan?"
]

In [None]:
# Pair every question with every sampled politician (questions vary slowest)
combinations = [
    (question, politician)
    for question in questions
    for politician in sample_politicians
]

# One row per (question, politician) pair
df_answer = pd.DataFrame(combinations, columns=["question", "name"])

# Preview the first rows and report how many pairs there are
print(df_answer.head(5))
print("Number of rows:", len(df_answer))

In [None]:
from tqdm import tqdm
from tqdm.notebook import tqdm, tqdm_notebook
# Hook tqdm into pandas; a later cell calls `progress_apply` on df_answer.
tqdm_notebook.pandas()

# One genie per sampled politician, keyed by the politician's name
genies = {name: gm.get_genie(name) for name in sample_politicians}

In [None]:
import time
import asyncio

async def async_generate(row, genies):
    """Ask one politician's genie one question asynchronously and print the reply.

    Args:
        row: Mapping-like record with "name" and "question" entries.
        genies: Mapping from politician name to an object exposing ``async_ask``.
    """
    politician, question = row["name"], row["question"]
    reply = await genies[politician].async_ask(question)
    print(" | ".join((politician, question)))
    verdict = reply["result"]
    print(": ".join((verdict["answer"], verdict["reasoning"])))

async def generate_concurrently():
    """Start one ``async_generate`` call per row of ``df_answer`` and await them all together."""
    coroutines = (async_generate(record, genies) for _, record in df_answer.iterrows())
    await asyncio.gather(*coroutines)

def generate_serially():
    """Ask every question in ``df_answer`` one after another, printing each reply."""
    for _, record in df_answer.iterrows():
        politician, question = record["name"], record["question"]
        reply = genies[politician].ask(question)
        print(" | ".join((politician, question)))
        verdict = reply["result"]
        print(": ".join((verdict["answer"], verdict["reasoning"])))

# Time the concurrent variant. The bare top-level `await` relies on the cell running under IPython.
s = time.perf_counter()
await generate_concurrently()
elapsed = time.perf_counter() - s
# "\033[1m" / "\033[0m" are ANSI escape codes that switch bold text on and off
print("\033[1m" + f"Concurrent executed in {elapsed:0.2f} seconds." + "\033[0m")

print("\n\n")
# Time the serial variant over the same rows for comparison
s = time.perf_counter()
generate_serially()
elapsed = time.perf_counter() - s
print("\033[1m" + f"Serial executed in {elapsed:0.2f} seconds." + "\033[0m")

In [None]:
import time
import asyncio

# NOTE(review): `loop` is not referenced by any of the visible cells — confirm whether it is still needed.
loop = asyncio.get_event_loop()

async def async_generate(row, genies):
    """Ask the genie registered under ``row["name"]`` the row's question.

    Args:
        row: Mapping-like record with "name" and "question" entries.
        genies: Mapping from politician name to an object exposing ``async_ask``.

    Returns:
        Whatever ``async_ask`` resolves to, unmodified.
    """
    # TODO (original author): also write to an Excel sheet while doing this
    return await genies[row["name"]].async_ask(row["question"])

async def main():
    """Query all rows of ``df_answer`` concurrently and store the replies in "response_async"."""
    pending = []
    for _, record in df_answer.iterrows():
        pending.append(async_generate(record, genies))
    # gather() returns the results in the order the awaitables were passed in
    responses = await asyncio.gather(*pending)
    df_answer["response_async"] = responses

# https://stackoverflow.com/questions/67944791/fastest-way-to-apply-an-async-function-to-pandas-dataframe

# Time the concurrent run; main() stores its results in df_answer["response_async"].
s = time.perf_counter()
await main()
elapsed = time.perf_counter() - s
print("\033[1m" + f"Concurrent executed in {elapsed:0.2f} seconds." + "\033[0m")


print("\n\n")
# Time the serial run: one ask() call per row via progress_apply, stored in "response_serial".
s = time.perf_counter()
df_answer["response_serial"] = df_answer.progress_apply(
    lambda row: genies[row["name"]].ask(row["question"]),
    axis=1
)
elapsed = time.perf_counter() - s
print("\033[1m" + f"Serial executed in {elapsed:0.2f} seconds." + "\033[0m")

In [None]:
# The cell above stores the raw responses under "response_async" and
# "response_serial"; it never creates a "response" column, so reading that
# column raised a KeyError on a fresh run. Keep using "response" if it exists,
# otherwise fall back to the concurrent results (use "response_serial" to
# inspect the serial run instead).
response_column = "response" if "response" in df_answer.columns else "response_async"
responses = df_answer[response_column]

# Flatten the fields of each response dict into their own columns
df_answer["answer"] = responses.apply(lambda res: res["result"]["answer"])
df_answer["reasoning"] = responses.apply(lambda res: res["result"]["reasoning"])
# source documents need further parsing
df_answer["source_documents"] = responses.apply(lambda res: res["source_documents"])
df_answer["cost"] = responses.apply(lambda res: res["total_cost"])
df_answer.sample(5)

In [None]:
# Sum of the "cost" column across all rows
df_answer["cost"].sum()

In [None]:
# Persist the complete results frame as a pickle, and export only the
# name/question/answer/reasoning columns to an Excel sheet.
df_answer.to_pickle("./vote_easy_sample_dtf")
df_answer[["name", "question", "answer", "reasoning"]].to_excel("vote_easy_sample_test.xlsx")