In [1]:

# import libraries
import os
from dotenv import load_dotenv
from langchain_community.document_loaders.csv_loader import CSVLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_core.prompts import ChatPromptTemplate
#from langchain.embeddings import OpenAIEmbeddings
from langchain_openai import OpenAIEmbeddings
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain.chains import create_retrieval_chain
from langchain.chains import create_history_aware_retriever
from langchain_core.prompts import MessagesPlaceholder
from langchain_core.messages import AIMessage, HumanMessage
from langchain_community.chat_message_histories import ChatMessageHistory
from langchain_core.chat_history import BaseChatMessageHistory
from langchain_core.runnables.history import RunnableWithMessageHistory


from langchain_openai import ChatOpenAI
#from langchain.vectorstores import Chroma
from langchain_chroma import Chroma
from tqdm import tqdm

load_dotenv()  # Load OPENAI_API_KEY from .env

True

In [2]:
## log traces with Langsmith
# Tracing itself is switched on outside the notebook, e.g. in the shell:
##export LANGSMITH_TRACING="true"
LANGSMITH_PROJECT = "consumer-complaint-chatbot"
# A plain Python variable is not visible to LangSmith, which reads its settings
# from the process environment, so publish the project name there as well.
# setdefault keeps any value that .env or the shell has already provided.
os.environ.setdefault("LANGSMITH_PROJECT", LANGSMITH_PROJECT)
# openai and langsmith api keys set in .env

In [5]:
# Load complaint data
## define function to load data from CSV file
def csv_loader(file_path):
    """Load a CSV file with LangChain's CSVLoader.

    Args:
        file_path: Path of the CSV file to read.

    Returns:
        The documents returned by ``CSVLoader.load()``.
    """
    return CSVLoader(file_path=file_path).load()


# relative path of the complaints CSV
file_path = "../data/complaints.csv"

# read the whole file into memory and report how many documents came back
documents = csv_loader(file_path)
print(f"Loaded {len(documents)} documents.")



Loaded 3472988 documents.


In [6]:
# Split the documents with a chunk size of 1000 and a chunk overlap of 500.
# NOTE(review): the overlap is half the chunk size, which increases the number
# of chunks that later have to be embedded; confirm these values are intended.
splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=500)
chunks = splitter.split_documents(documents)

print(f"Split into {len(chunks)} chunks.")


Split into 4704862 chunks.


In [None]:
# create (or reopen) the Chroma vector store
embedding = OpenAIEmbeddings(model="text-embedding-3-small")
vectorstore_path = "../chroma_db"

# Open the persisted collection first so that a restarted kernel reuses the
# existing index instead of re-embedding (and re-inserting) every chunk.
vectorstore = Chroma(embedding_function=embedding, persist_directory=vectorstore_path)

# Only embed when the collection is still empty.
# NOTE: a build that was interrupted part-way leaves a non-empty but incomplete
# index; delete the `vectorstore_path` directory to force a clean rebuild.
if not vectorstore.get(limit=1)["ids"]:
    # Insert in batches so progress is visible and each call stays small.
    # NOTE(review): 5000 is assumed to be below Chroma's maximum batch size,
    # which varies between builds; lower it if inserts are rejected.
    batch_size = 5000
    for start in tqdm(range(0, len(chunks), batch_size), desc="Embedding chunks"):
        vectorstore.add_documents(chunks[start : start + batch_size])


In [9]:
# chat model passed to the chains built in the cells below
llm = ChatOpenAI(model="gpt-4o-mini")

# expose the vector store through the retriever interface (default settings)
retriever = vectorstore.as_retriever()

NameError: name 'vectorstore' is not defined

In [None]:
# incorporate the retriever into a question-answering chain.
# System instructions for the answering step: the instruction paragraph,
# a blank line, then the {context} placeholder for the retrieved text.
system_prompt = "\n\n".join(
    [
        "You are an assistant for question-answering tasks. "
        "Use the following pieces of retrieved context to answer the question. "
        "If you don't know the answer, say that you don't know. "
        "Use three sentences maximum and keep the answer concise.",
        "{context}",
    ]
)

# Single-turn prompt: the system instructions followed by the user's question.
prompt = ChatPromptTemplate.from_messages(
    [("system", system_prompt), ("human", "{input}")]
)

In [None]:
## contextualize the question: reformulate it so that any references to the
# chat history are resolved and it can be understood on its own.

# Instructions for the rewriting step: turn the latest question into a
# standalone one, without answering it.
contextualize_q_system_prompt = " ".join(
    (
        "Given a chat history and the latest user question",
        "which might reference context in the chat history,",
        "formulate a standalone question which can be understood",
        "without the chat history. Do NOT answer the question,",
        "just reformulate it if needed and otherwise return it as is.",
    )
)

# Prompt for the rewriting step: instructions, then the prior chat messages,
# then the latest user input.
_contextualize_messages = [
    ("system", contextualize_q_system_prompt),
    MessagesPlaceholder(variable_name="chat_history"),
    ("human", "{input}"),
]
contextualize_q_prompt = ChatPromptTemplate.from_messages(_contextualize_messages)
# make the retriever history aware by combining it with the LLM and the
# question-rewriting prompt
history_aware_retriever = create_history_aware_retriever(
    llm=llm,
    retriever=retriever,
    prompt=contextualize_q_prompt,
)

In [None]:
# Answering prompt for multi-turn use: system instructions, then the prior
# chat messages, then the latest user input.
_qa_messages = [
    ("system", system_prompt),
    MessagesPlaceholder(variable_name="chat_history"),
    ("human", "{input}"),
]
qa_prompt = ChatPromptTemplate.from_messages(_qa_messages)



In [None]:

# chain that feeds the retrieved documents and the question to the LLM
question_answer_chain = create_stuff_documents_chain(llm=llm, prompt=qa_prompt)

# full RAG chain: history-aware retrieval followed by the answering chain
rag_chain = create_retrieval_chain(
    retriever=history_aware_retriever,
    combine_docs_chain=question_answer_chain,
)

In [None]:
# in-memory registry of chat histories, keyed by session id
store = {}


def get_session_history(session_id: str) -> BaseChatMessageHistory:
    """Return the chat history for ``session_id``, creating it on first use.

    Histories are kept in the module-level ``store`` dict, so re-running this
    cell resets them.
    """
    try:
        return store[session_id]
    except KeyError:
        history = ChatMessageHistory()
        store[session_id] = history
        return history


# Wrap the RAG chain with per-session message history: the question is read
# from "input", history is passed as "chat_history", and the reply is taken
# from "answer".
conversational_rag_chain = RunnableWithMessageHistory(
    runnable=rag_chain,
    get_session_history=get_session_history,
    input_messages_key="input",
    history_messages_key="chat_history",
    output_messages_key="answer",
)

In [None]:
# First turn of session "1"; get_session_history adds the "1" entry to `store`
# the first time this session id is seen.
first_response = conversational_rag_chain.invoke(
    {"input": "Are there complaints involving bank of america mortgages?"},
    config={"configurable": {"session_id": "1"}},
)
first_response["answer"]

In [None]:
# Follow-up question in the same session "1", so it shares that session's history.
followup_response = conversational_rag_chain.invoke(
    {"input": "What issues are people complaining about?"},
    config={"configurable": {"session_id": "1"}},
)
followup_response["answer"]