<a href="https://colab.research.google.com/github/FMurray/hyperdemocracy/blob/main/hyper_democracy_v2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Install Dependencies

In [111]:
import os
import random
import re

from datasets import load_dataset
from rich import print

# Setup Keys

In [2]:
# Read the OpenAI API key interactively so it is never written into the notebook.
# A key that is already present in the environment is kept, so re-running this
# cell (or running the notebook with the key exported) does not prompt again.
import getpass

if not os.environ.get("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass.getpass("OpenAI API key: ")

In [3]:
# Enable IPython's autoreload extension in mode 2, so edits to imported
# modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

# Load Demo Records

We are going to use a small subset of records provided by https://assembled.app/.

For the purposes of this workshop, we have created a Hugging Face dataset, https://huggingface.co/datasets/hacdc/hyperdemocracy, which we can load using the `load_dataset` function; it returns a Hugging Face dataset object. See the [datasets](https://huggingface.co/docs/datasets/index) package documentation for more info.

In [4]:
from hyperdemocracy import load_assembly_records

In [15]:
# Load the demo records into `df`; the three flags are passed straight
# through to hyperdemocracy's loader.
df = load_assembly_records(
    process=True,
    strip_html=True,
    remove_empty_body=True,
)

Found cached dataset parquet (/Users/forrestmurray/.cache/huggingface/datasets/hacdc___parquet/hacdc--hyperdemocracy-0481830ea620e91d/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)


In [16]:
# Preview the first rows of the loaded records.
df.head()

Unnamed: 0,key,name,sponsors,summary,body,themes,index,actions,amendments,committees,relatedbills,cosponsors,subjects,text,titles,congress_num,legis_class,legis_num,congress_gov_url
0,118HCONRES1,Regarding consent to assemble outside the seat...,"[[C001053, Rep. Cole, Tom [R-OK-4], sponsor]]",This concurrent resolution authorizes the Spe...,[Congressional Bills 118th Congress]\n[From th...,"[Congress, Congressional operations and organi...","{'bill': {'actions': {'count': 7, 'url': 'http...","{'actions': [{'actionCode': None, 'actionDate'...","{'amendments': [], 'pagination': {'count': 0},...","{'committees': [], 'request': {'billNumber': '...","{'pagination': {'count': 0}, 'relatedBills': [...","{'cosponsors': [], 'pagination': {'count': 0, ...","{'pagination': {'count': 2}, 'request': {'bill...","{'pagination': {'count': 1}, 'request': {'bill...","{'pagination': {'count': 2}, 'request': {'bill...",118,HCONRES,1,https://www.congress.gov/bill/118th-congress/h...
1,118HCONRES10,Expressing the sense of Congress that the Unit...,"[[T000165, Rep. Tiffany, Thomas P. [R-WI-7], s...",This concurrent resolution calls on the Presi...,[Congressional Bills 118th Congress]\n[From th...,[International Affairs],"{'bill': {'actions': {'count': 4, 'url': 'http...","{'actions': [{'actionCode': 'H11100', 'actionD...","{'amendments': [], 'pagination': {'count': 0},...",{'committees': [{'activities': [{'date': '2023...,"{'pagination': {'count': 0}, 'relatedBills': [...","{'cosponsors': [{'bioguideId': 'P000605', 'dis...","{'pagination': {'count': 1}, 'request': {'bill...","{'pagination': {'count': 1}, 'request': {'bill...","{'pagination': {'count': 2}, 'request': {'bill...",118,HCONRES,10,https://www.congress.gov/bill/118th-congress/h...
2,118HCONRES11,Providing for a joint session of Congress to r...,"[[S001176, Rep. Scalise, Steve [R-LA-1], spons...",This concurrent resolution provides for a joi...,[Congressional Bills 118th Congress]\n[From th...,"[Congress, Congressional operations and organi...","{'bill': {'actions': {'count': 10, 'url': 'htt...","{'actions': [{'actionCode': None, 'actionDate'...","{'amendments': [], 'pagination': {'count': 0},...","{'committees': [], 'request': {'billNumber': '...","{'pagination': {'count': 0}, 'relatedBills': [...","{'cosponsors': [], 'pagination': {'count': 0, ...","{'pagination': {'count': 3}, 'request': {'bill...","{'pagination': {'count': 3}, 'request': {'bill...","{'pagination': {'count': 2}, 'request': {'bill...",118,HCONRES,11,https://www.congress.gov/bill/118th-congress/h...
3,118HCONRES12,Expressing the sense of Congress that all dire...,"[[C001039, Rep. Cammack, Kat [R-FL-3], sponsor...",This concurrent resolution expresses the sens...,[Congressional Bills 118th Congress]\n[From th...,"[Foreign Trade and International Finance, Agri...","{'bill': {'actions': {'count': 5, 'url': 'http...","{'actions': [{'actionCode': 'H11000', 'actionD...","{'amendments': [], 'pagination': {'count': 0},...",{'committees': [{'activities': [{'date': '2023...,"{'pagination': {'count': 0}, 'relatedBills': [...","{'cosponsors': [{'bioguideId': 'K000380', 'dis...","{'pagination': {'count': 6}, 'request': {'bill...","{'pagination': {'count': 1}, 'request': {'bill...","{'pagination': {'count': 2}, 'request': {'bill...",118,HCONRES,12,https://www.congress.gov/bill/118th-congress/h...
4,118HCONRES13,Supporting the Local Radio Freedom Act.,"[[W000809, Rep. Womack, Steve [R-AR-3], sponso...",This concurrent resolution declares that Cong...,[Congressional Bills 118th Congress]\n[From th...,"[Science, Technology, Communications, Congress]","{'bill': {'actions': {'count': 3, 'url': 'http...","{'actions': [{'actionCode': 'H11100', 'actionD...","{'amendments': [], 'pagination': {'count': 0},...",{'committees': [{'activities': [{'date': '2023...,"{'pagination': {'count': 1}, 'relatedBills': [...","{'cosponsors': [{'bioguideId': 'C001066', 'dis...","{'pagination': {'count': 2}, 'request': {'bill...","{'pagination': {'count': 1}, 'request': {'bill...","{'pagination': {'count': 2}, 'request': {'bill...",118,HCONRES,13,https://www.congress.gov/bill/118th-congress/h...


In [18]:
# (rows, columns) of the loaded records.
df.shape

(51, 19)

# Sponsor Graph Sidequest

[Notebook here:](https://github.com/FMurray/hyperdemocracy/blob/main/sidequests/sponsor_graph.ipynb)

# From Pandas Dataframe to Langchain Document

TODO: Document me!

TODO: Try loading directly from source with langchain HTML Document loader

Langchain makes [UnstructuredText](https://unstructured-io.github.io/unstructured/examples.html)

In [19]:
from langchain.schema import Document 

In [20]:
# Convert every DataFrame row into a langchain Document: the bill text goes
# into `page_content`, and a flat dict of scalar fields goes into `metadata`.
#
# `doc` is intentionally left bound to the last Document after the loop;
# the cells that follow inspect it.
docs = []
for irow, row in df.iterrows():
    doc = Document(
        page_content=row['body'],
        metadata={
            'key': row['key'],
            'congress_num': row['congress_num'],
            'legis_class': row['legis_class'],
            'legis_num': row['legis_num'],
            'name': row['name'],
            'summary': row['summary'],
            'source': row['congress_gov_url'],

            # Note: chroma can only filter on float, str, or int
            # https://docs.trychroma.com/usage-guide#using-where-filters

            # First field of the first sponsor entry (the member id in the
            # preview above). Falls back to "" when a record has no sponsors
            # instead of raising IndexError.
            'sponsor': row['sponsors'][0][0] if len(row['sponsors']) > 0 else "",

            # TODO: figure out how to break theme list up in a better way
            # len() works for both NumPy arrays and plain lists, unlike `.size`.
            'theme0': row['themes'][0] if len(row['themes']) > 0 else "",
        },
    )
    docs.append(doc)

In [22]:
# `print` is rich's print (`from rich import print` in the first cell). The
# `rich` module name itself is never imported, so `rich.print` would raise
# NameError on a fresh kernel.
print(doc)

In [23]:
# Show the raw text of the last Document. `print` is already rich's print;
# the bare `rich` module name is not imported anywhere in the notebook.
print(doc.page_content)

In [24]:
# Number of Documents built from the DataFrame.
print(len(docs))

51


In [25]:
# Metadata dict of the last Document created by the loop above.
doc.metadata

{'key': '118HR2905',
 'congress_num': '118',
 'legis_class': 'HR',
 'legis_num': '2905',
 'name': 'End Prison Gerrymandering Act',
 'summary': " End Prison Gerrymandering Act This bill requires the Bureau of the Census, beginning with the 2030 decennial census, to attribute an individual incarcerated in a correctional facility or detention center to the individual's last place of residence before incarceration. Further, a state must treat such an individual's last place of residence in the state before incarceration as the individual's place of residence for purposes of congressional redistricting.",
 'source': 'https://www.congress.gov/bill/118th-congress/house-bill/2905',
 'sponsor': 'R000305',
 'theme0': 'Government Operations and Politics'}

# Document QA Quickstart

https://python.langchain.com/en/latest/modules/indexes/getting_started.html

TODO: What is DocumentQA? 

# Introducing indexes

TODO: What are indexes?

In [26]:
from langchain.indexes import VectorstoreIndexCreator

In [27]:
# Build an index over all documents using VectorstoreIndexCreator's defaults
# (no splitter, embedding, or vector store is specified here).
index = VectorstoreIndexCreator().from_documents(docs)

In [28]:
# Display the index wrapper to see which vector store backs it.
index

VectorStoreIndexWrapper(vectorstore=<langchain.vectorstores.chroma.Chroma object at 0x28eba6950>)

In [29]:
# Question reused by the query cells that follow.
query = "What are the primary themes around energy policy?"

Try some other questions for yourself!

In [30]:
# Ask the question against the index and display the answer.
out = index.query(query)
out

' The primary themes around energy policy are reducing carbon emissions, embracing and accepting nuclear power as a clean baseload energy source, boosting the renewable energy economy, and avoiding overly restrictive regulations on the exploration, production, or marketing of energy resources.'

In [31]:
# Same question, but also ask for the sources; `out` is rebound to this result.
out = index.query_with_sources(query)

In [33]:
# `print` is rich's print (imported in the first cell); `rich.print` would
# raise NameError because the `rich` module name is never imported.
print(out)

In [34]:
# Show only the 'sources' entry of the result.
print(out['sources'])

https://www.congress.gov/bill/118th-congress/house-concurrent-resolution/37
https://www.congress.gov/bill/118th-congress/house-concurrent-resolution/17
https://www.congress.gov/bill/118th-congress/house-concurrent-resolution/26


In [35]:
# Try a second question; `query` and `out` are both rebound here.
query = "Describe in 100 words the proposed solutions to climate change?"
out = index.query_with_sources(query)
out

{'question': 'Describe in 100 words the proposed solutions to climate change?',
 'answer': ' Solutions to climate change proposed by the United States Congress include a just phase-out of the use of oil, gas, and coal, rapid and immediate acceleration and proliferation of far-reaching, multilevel, and cross-sectoral climate mitigation, and equitable policies to address the climate crisis. These solutions are intended to limit warming through emissions reduction and carbon sequestration, and to address the consequences of climate change such as a significant rise in sea levels, extraordinary loss of biodiversity, and intensifying droughts, floods, wildfires, and other extreme weather events.\n',
 'sources': 'https://www.congress.gov/bill/118th-congress/house-concurrent-resolution/37'}

# Step by step explanation of the DocumentQA

## Langchain Text Splitters

https://python.langchain.com/en/latest/modules/indexes/text_splitters.html

https://simonwillison.net/2023/Jun/8/gpt-tokenizers/

In [42]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.text_splitter import TokenTextSplitter

In [49]:
# Split every document two ways with the same chunk_size / chunk_overlap
# arguments: once with the recursive character splitter, once with the
# token splitter.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=512, chunk_overlap=128)
split_docs = text_splitter.split_documents(docs)
split_docs_token = TokenTextSplitter(chunk_size=512, chunk_overlap=128).split_documents(docs)
# Alternative kept for reference: token-split the already character-split chunks.
# split_docs_token = TokenTextSplitter().split_documents(split_docs)

In [50]:
# Compare how many items exist before splitting and after each splitter.
print("The length before splitting: ", len(docs))
print("The length after splitting: ", len(split_docs))
print("Token based splitter: ", len(split_docs_token))

The length before splitting:  51
The length after splitting:  761
Token based splitter:  248


In [55]:
# Character length (len of page_content) of the first chunk from each splitter.
print("a single doc chunk content length from recursive char based splitter: ", len(split_docs[0].page_content))
print("a single doc chunk content length from token based splitter: ", len(split_docs_token[0].page_content))

a single doc chunk from recursive char based splitter:  363
a single doc chunk from token based splitter:  1383


TODO drill in on chunk size and chunk overlap

## Embed and Index Doc Chunks

# Intro To Embeddings

[Link to Notebook](https://github.com/FMurray/hyperdemocracy/blob/main/sidequests/embeddings.ipynb)


## Index Embeddings in a Vector Database

In [61]:
from langchain.vectorstores import Chroma

In [62]:
# `embeddings` is not defined by any earlier cell, so create the embedding
# model here before indexing.
# NOTE(review): OpenAIEmbeddings is assumed to be the intended model (it is
# the one used in the VectorstoreIndexCreator cell later on) - confirm.
from langchain.embeddings import OpenAIEmbeddings

embeddings = OpenAIEmbeddings()

# Embed every character-split chunk and index it in a Chroma vector store.
db = Chroma.from_documents(split_docs, embeddings)

In [63]:
# Display the vector store object.
db

<langchain.vectorstores.chroma.Chroma at 0x287c78310>

In [84]:
# Explain similarity types: cosine, inner product, squared L2.
# Looks like chroma uses hnswlib, which supports 3 distances (default cosine) [TODO confirm default]
# https://github.com/hwchase17/langchain/blob/master/langchain/vectorstores/chroma.py
# https://docs.trychroma.com/usage-guide#changing-the-distance-function
# https://github.com/nmslib/hnswlib/tree/master#supported-distances

# In addition, langchain offers maximal marginal relevance on top of cosine
# https://github.com/hwchase17/langchain/blob/master/langchain/vectorstores/utils.py#L10

# Ask for up to 10 hits, restricted by a metadata filter to chunks of one bill.
ret_docs = db.similarity_search_with_score(
    "nuclear power",
    k=10,
    filter={"source": "https://www.congress.gov/bill/118th-congress/house-concurrent-resolution/17"},
)

# Each hit is a (Document, score) tuple. A dedicated loop name is used so the
# notebook-level `doc` (a Document from the conversion loop) is not rebound
# to a tuple, which would break re-running the earlier `doc.*` cells.
for scored_doc in ret_docs:
    print(scored_doc)

(Document(page_content='petroleum products under the Energy Policy and Conservation Act \n        (42 U.S.C. 6201 et seq.).\n                                 <all>', metadata={'key': '118HCONRES17', 'congress_num': '118', 'legis_class': 'HCONRES', 'legis_num': '17', 'name': 'Expressing the sense of Congress that the Federal Government should not impose any restrictions on the export of crude oil or other petroleum products.', 'summary': '', 'source': 'https://www.congress.gov/bill/118th-congress/house-concurrent-resolution/17', 'sponsor': 'G000558', 'theme0': 'Energy'}), 0.43378937244415283)
(Document(page_content='of the United States as a global energy superpower: Now, therefore, be \n        it\n    Resolved by the House of Representatives (the Senate concurring), \nThat it is the sense of Congress that the Federal Government should not \nimpose--\n            (1) overly restrictive regulations on the exploration, \n        production, or marketing of energy resources; or\n         

In [85]:
# Count the chunks whose source matches the filter used above, to confirm
# that the filtered search returned all of them.
sum(
    1
    for chunk in split_docs
    if chunk.metadata['source'] == 'https://www.congress.gov/bill/118th-congress/house-concurrent-resolution/17'
)

8

# What are retrievers?

TODO: TL;DR 

In [86]:
# Set langchain's module-level `verbose` flag to False for the chains below.
import langchain
langchain.verbose = False

In [87]:
from langchain.chains import RetrievalQA
from langchain.chains import RetrievalQAWithSourcesChain
from langchain.llms import OpenAI

In [88]:
# Wrap the vector store as a retriever, passing k=10 as a search keyword argument.
retriever = db.as_retriever(search_kwargs={'k':10})

In [89]:
# Display the retriever to see its search type and search kwargs.
retriever

VectorStoreRetriever(vectorstore=<langchain.vectorstores.chroma.Chroma object at 0x287c78310>, search_type='similarity', search_kwargs={'k': 10})

Compare the chains in the original DocumentQA quickstart with the chains here

In [102]:
# Build a RetrievalQA chain over the retriever using the "stuff" chain type.
# return_source_documents=True adds a 'source_documents' entry to the result
# (see the keys printed further down).
qa = RetrievalQA.from_chain_type(
    llm=OpenAI(), 
    chain_type="stuff", 
    retriever=retriever, 
    return_source_documents=True,
)

# Questions

* what are the components of the RetrievalQA chain?
* what is the QA prompt?
* how would you modify the QA prompt?
* what is the difference between the following qa chain types?
    * stuff
    * map_reduce
    * map_rerank
    * refine

# Resources

* https://github.com/hwchase17/langchain/tree/master/langchain/chains/retrieval_qa
* https://github.com/hwchase17/langchain/tree/master/langchain/chains/question_answering

In [None]:
# WARNING! Do not commit the outputs of this cell if it contains your API key

# `print` is rich's print (imported in the first cell); `rich.print` would
# raise NameError because the `rich` module name is never imported.
print(qa)

## How many ways can we print a prompt? 

In [103]:
# Pull the prompt template out of the QA chain's inner LLM chain and display it.
prompt_template = qa.combine_documents_chain.llm_chain.prompt
prompt_template

PromptTemplate(input_variables=['context', 'question'], output_parser=None, partial_variables={}, template="Use the following pieces of context to answer the question at the end. If you don't know the answer, just say that you don't know, don't try to make up an answer.\n\n{context}\n\nQuestion: {question}\nHelpful Answer:", template_format='f-string', validate_template=True)

In [104]:
# Print just the template string, with its {context} and {question} placeholders.
print(prompt_template.template)

Use the following pieces of context to answer the question at the end. If you don't know the answer, just say that you don't know, don't try to make up an answer.

{context}

Question: {question}
Helpful Answer:


In [105]:
import textwrap

In [107]:
# Render the prompt with placeholder values to see its final shape.
# `print` is already rich's print; the `rich` module name is never imported,
# so `rich.print` would raise NameError on a fresh kernel.
print(prompt_template.format(context='[CONTEXT]', question='[QUESTION]'))

In [108]:
# Run the RetrievalQA chain on a question.
answer = qa("What is the solution to climate change?")

In [109]:
# Inspect which entries the chain returned.
answer.keys()

dict_keys(['query', 'result', 'source_documents'])

In [113]:
# Show only the generated answer text.
print(answer['result'])

In [114]:
# Same setup as the RetrievalQA chain, but with the chain class that also
# reports sources.
qaws = RetrievalQAWithSourcesChain.from_chain_type(
    llm=OpenAI(), 
    chain_type="stuff", 
    retriever=retriever, 
    return_source_documents=True,
)

In [None]:
# WARNING! Do not commit the outputs of this cell if it contains your API key
# Prints the repr of the whole with-sources chain.
print(qaws)

In [116]:
# Prompt template used by the with-sources chain's inner LLM chain.
pt = qaws.combine_documents_chain.llm_chain.prompt

In [117]:
# Render the prompt with placeholders; this template takes `summaries`
# where the RetrievalQA prompt takes `context`.
print(pt.format(summaries='[SUMMARIES]', question='[QUESTION]'))

In [118]:
# Run the with-sources chain on the same question; `answer` is rebound.
answer = qaws("What is the solution to climate change?")

In [119]:
# Inspect which entries the with-sources chain returned.
answer.keys()

dict_keys(['question', 'answer', 'sources', 'source_documents'])

In [122]:
# Show the generated answer followed by the sources it cites.
print(answer['answer'])
print(answer['sources'])

# Prompt Construction Sidequest

# TODO

Try alternatives to stuff

Figure out how to pass all the options to the high level constructor. 

https://github.com/hwchase17/langchain/blob/master/langchain/indexes/vectorstore.py

In [None]:
# Pass the vector store class, embedding model, and text splitter explicitly
# to the high-level constructor instead of relying on its defaults.
# OpenAIEmbeddings is imported here because no earlier cell imports it.
from langchain.embeddings import OpenAIEmbeddings

index_creator = VectorstoreIndexCreator(
    vectorstore_cls=Chroma,
    embedding=OpenAIEmbeddings(),
    text_splitter=RecursiveCharacterTextSplitter(chunk_size=512, chunk_overlap=128),
)


Sticking this here to decide if we want to use this in the course content

https://xml.house.gov/

TODO: Sidequest on implementing a langchain document loader using this XML schema ^^^

https://www.everycrsreport.com/

# Let's make it a conversation

https://python.langchain.com/en/latest/modules/chains/index_examples/chat_vector_db.html

In [123]:
from langchain.chains import ConversationalRetrievalChain

In [124]:
# TODO cover serializing the db to disk
# Display the vector store that the conversational chain below retrieves from.
db

<langchain.vectorstores.chroma.Chroma at 0x287c78310>

In [135]:
# Conversation memory for the chat chain, stored under the "chat_history" key.
from langchain.memory import ConversationBufferMemory
memory = ConversationBufferMemory(memory_key="chat_history", return_messages=True)

In [136]:
# Conversational retrieval chain: an OpenAI LLM at temperature 0, a retriever
# over `db` with default settings, and the memory object created above.
qachat = ConversationalRetrievalChain.from_llm(
    OpenAI(temperature=0), 
    db.as_retriever(), 
    memory=memory
)

In [137]:
# First turn of the conversation.
query = "What is the solution to climate change?"
answer = qachat(query)

In [138]:
# Show the full result of the first turn.
print(answer)

In [139]:
# Second turn: only the new question is passed in.
# Presumably the chain fills in the chat history from `memory` - confirm.
follow_up = "How certain is the 350 number?"
result = qachat({"question": follow_up})

In [140]:
# Show the full result of the follow-up turn.
print(result)