In [1]:
from genie_master import GenieMaster

In [2]:
# Person whose genie is requested later via gm.get_genie(name).
name = "Joe Biden"

# Master object configured with the on-disk database directory.
gm = GenieMaster(db_path="./chroma_qadata_db")

In [None]:
from chromaviz import visualize_collection

# Hand the collection behind the vector store to chromaviz for visualisation.
# NOTE(review): `_collection` is a private attribute of the vector store object;
# this may break if the vector store implementation changes.
vectorstore = gm.get_vectorstore()
visualize_collection(vectorstore._collection)

In [None]:
# Loading in Data (skip this cell if already done)
import re
import pandas as pd

def preprocess_quote(quote):
    """Insert a space wherever a lowercase letter is directly followed by an uppercase one.

    This repairs words that were run together in the source text, e.g.
    ``"violentCrime"`` becomes ``"violent Crime"``.

    Args:
        quote: Text to clean. Non-string values (for example ``None`` or NaN
            coming from an empty spreadsheet cell) are returned unchanged
            instead of raising ``TypeError`` inside ``re.sub``.

    Returns:
        The cleaned string, or ``quote`` itself when it is not a string.

    Note:
        Only ASCII ``[a-z]`` / ``[A-Z]`` boundaries are matched, and genuine
        mixed-case words (e.g. "McDonald") are split as well.
    """
    # Missing cells reach this function as None/NaN when it is applied over a
    # DataFrame column; pass them through so one empty answer does not abort
    # the whole load.
    if not isinstance(quote, str):
        return quote

    # Zero-width match between a lowercase and an uppercase letter; the
    # replacement inserts the missing space without consuming either letter.
    preprocessed_quote = re.sub(r"(?<=[a-z])(?=[A-Z])", " ", quote)

    # Add further clean-up steps here if necessary.

    return preprocessed_quote

# Read the raw question/answer sheet.
df = pd.read_excel("data/qadata.xlsx")

# "username" holds "<name> - <party>". Split on the FIRST " - " only (n=1) so a
# value containing the separator more than once still yields exactly two
# columns instead of raising on the two-column assignment.
# NOTE(review): with n=1 any further " - " stays inside "party" — confirm that
# is the desired reading for such rows.
df[["name", "party"]] = (
    df["username"]
    .str.split(" - ", n=1, expand=True)
    .apply(lambda col: col.str.strip())
)
df = df.drop(columns=["username"])

# taking only a portion of the data for now
category_list = [
    "Abortion, Pro-Life & Genetic Engineering",
    "Crime, Police & Imprisonment",
    "Environment & Climate Change",
    "Gun & Property Rights",
    "Immigration, Border Security, Terrorism & Homeland Security",
    "Jobs, Economy, Trade, Business, Industry & Agriculture",
    "Education & Schools",
]
# .copy() gives an independent frame, so the column assignments below do not
# write into a slice of the full sheet (avoids SettingWithCopyWarning).
df = df.loc[df["parent_question"].isin(category_list)].copy()
df["answer"] = df["answer"].apply(preprocess_quote)

# Replace missing or empty "party" values with "Other". A truthiness test is
# not enough here: NaN is truthy, so it would be left in place.
party = df["party"]
df["party"] = party.mask(party.isna() | party.eq(""), "Other")

# Hand the cleaned frame to the master object, using "answer" as page content.
gm.transform_and_store_data(df, page_content_column="answer")

In [3]:
# Policy question used by the cells below.
question = (
    "Do you support limiting police unions' collective bargaining power "
    "for cases involving misconduct?"
)
# Request the genie for `name` (set in an earlier cell) from the master object.
genie = gm.get_genie(name)

In [4]:
# Ask the genie the question; the returned value is shown as the cell output.
genie.ask(question)

{'query': "Do you support limiting police unions' collective bargaining power for cases involving misconduct?",
 'result': {'answer': 'unknown',
  'reasoning': "There is no explicit statement in the given context that indicates whether the person supports or opposes limiting police unions' collective bargaining power for cases involving misconduct."},
 'source_documents': [{'source_content': "pattern-or-practice investigations and consent decrees to address circumstances of “systemic police misconduct” and to “restore trust between police and communities” in cities such as Ferguson. Yet, the Trump Administration’s Justice Department has limited thejoebiden.com/justice/ (08/07/2019)In the 1990s, the Biden Crime Bill added 100,000 cops to America's streets. As a result, murder and violent crime rates went down eight years in a row. George Bush's cuts to the program have put America at",
   'source_category': 'Crime, Police & Imprisonment',
   'source_sub_category': 'Police Reform'},
  {'

In [None]:
# Display the documents the genie returns as relevant to the question.
genie.get_relevant_documents(question)

In [None]:
# *** TESTING DOCUMENT TRANSFORMATION ***
# TODO: Test different text splits
# Default: Recursively split by characters

# Split by characters
# Split by tokens

In [None]:
# *** TESTING EMBEDDING ***
# TODO: Test different embedding methods

In [None]:
# *** CHANGING VECTOR STORAGE ***
# TODO: Production ready vectorstores

In [None]:
# *** TESTING RETRIEVER (relevant document) ***
# TODO: Change k for retrievers
# k = 4 (default)
# k = 2
# k = 8

# TODO: Testing MultiQueryRetriever
# https://python.langchain.com/docs/modules/data_connection/retrievers/MultiQueryRetriever
# Default MultiQuery
# Custom prompt

# TODO: Test compression retrievers
# vanilla (no compressor)
# LLMChainExtractor: run each doc through llm to extract sentences related to query
# LLMChainFilter: run each doc through llm and only keep docs that are relevant
# EmbeddingsFilter: embed each doc and only keep docs with sim score above threshold
# Stringing compressors and doc transformers: e.g. split text first then run filter

# TODO: Test Ensemble retriever
# https://python.langchain.com/docs/modules/data_connection/retrievers/ensemble
# Combines results from several retrievers instead of relying on similarity search alone (more complex)


In [None]:
# *** TESTING QA CHAIN ***
# TODO: Test different qa chain types
# https://docs.langchain.com/docs/components/chains/index_related_chains

In [None]:
# *** TESTING POST RESULT ***
# TODO: Test different output parsers

# Author's note: the default chain type is "stuff"; the genies below use the alternatives.
# NOTE(review): the rankings in the trailing comments are expectations, not measured results.
genie_refine = gm.get_genie(name, qa_chain_type="refine") # expected to be second best
genie_reduce = gm.get_genie(name, qa_chain_type="map_reduce") # expected to be best
genie_rerank = gm.get_genie(name, qa_chain_type="map_rerank") # probably not applicable here — TODO confirm

# TODO: Cite sources


In [None]:
# *** OTHERS ***
# TODO: Fuzzy match

# TODO: Async

# TODO: Local LLMs

# TODO: Logging
# https://python.langchain.com/docs/modules/callbacks/filecallbackhandler

# TODO: WebResearchRetriever (low priority, for later)
# https://python.langchain.com/docs/modules/data_connection/retrievers/web_research