In [1]:

# import libraries
import os
from dotenv import load_dotenv
from langchain_community.document_loaders.csv_loader import CSVLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_core.prompts import ChatPromptTemplate
#from langchain.embeddings import OpenAIEmbeddings
from langchain_openai import OpenAIEmbeddings
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain.chains import create_retrieval_chain
from langchain.chains import create_history_aware_retriever
from langchain_core.prompts import MessagesPlaceholder
from langchain_core.messages import AIMessage, HumanMessage


from langchain_openai import ChatOpenAI
#from langchain.vectorstores import Chroma
from langchain_chroma import Chroma
from tqdm import tqdm

# Read key/value pairs from a local .env file into the process environment,
# so credentials stay out of the notebook itself.
load_dotenv()  # Load OPENAI_API_KEY from .env

True

### Set up LangSmith for observability

In [2]:
# Send LangSmith traces to a named project.
# LangSmith picks up its configuration from environment variables, so binding
# the name to a plain Python variable alone has no effect; it must be exported
# through os.environ to be seen by the tracing client.
# Tracing is only active when LANGSMITH_TRACING="true" is also present in the
# environment (not set here; presumably supplied via .env together with the
# OpenAI and LangSmith API keys -- confirm).
LANGSMITH_PROJECT = "consumer-complaint-chatbot"
os.environ["LANGSMITH_PROJECT"] = LANGSMITH_PROJECT

### load document

In [3]:
# Read the complaint CSV into LangChain documents.
complaints_csv_path = "../data/complaints.csv"
loader = CSVLoader(file_path=complaints_csv_path)
# Optional CSVLoader settings, currently left disabled:
#   source_column="Issue"
#   metadata_columns=["Product", "Company", "Consumer complaint narrative"]

documents = loader.load()
print(f"Loaded {len(documents)} documents.")



Loaded 10000 documents.


### split document into chunks

In [4]:
# Cut every document into overlapping windows; neighbouring chunks share
# CHUNK_OVERLAP units so text spanning a boundary is not lost to retrieval.
CHUNK_SIZE = 500
CHUNK_OVERLAP = 100

splitter = RecursiveCharacterTextSplitter(
    chunk_size=CHUNK_SIZE,
    chunk_overlap=CHUNK_OVERLAP,
)
chunks = splitter.split_documents(documents)

print(f"Split into {len(chunks)} chunks.")


Split into 28021 chunks.


### embed and store document

In [5]:
# Build the Chroma vector store once and reuse it on later runs.
#
# Calling Chroma.from_documents against an already-populated persist directory
# embeds every chunk again and adds it to the same collection a second time,
# which costs another full embedding pass and makes the retriever return the
# same passage several times. To avoid that, the persisted store is opened
# as-is when the directory already holds data, and only built from `chunks`
# when it does not.
#
# To force a rebuild (e.g. after changing the chunking parameters, or if a
# previous build was interrupted), delete the `vectorstore_path` directory.
embedding = OpenAIEmbeddings()
vectorstore_path = "../chroma_db"

if os.path.isdir(vectorstore_path) and os.listdir(vectorstore_path):
    # Existing on-disk collection: open it without re-embedding anything.
    vectorstore = Chroma(
        persist_directory=vectorstore_path,
        embedding_function=embedding,
    )
else:
    # First run: embed all chunks and persist them.
    vectorstore = Chroma.from_documents(
        documents=chunks,
        embedding=embedding,
        persist_directory=vectorstore_path,
    )


### Retrieve context and build the prompt

In [6]:
# Retriever (default settings) over the vector store, and the chat model.
retriever = vectorstore.as_retriever()
llm = ChatOpenAI(model="gpt-4o-mini")

# System instructions for the question-answering step. The {context}
# placeholder is where the retrieved passages are inserted.
system_prompt = "".join(
    [
        "You are an assistant for question-answering tasks. ",
        "Use the following pieces of retrieved context to answer ",
        "the question. If you don't know the answer, say that you ",
        "don't know. Use three sentences maximum and keep the ",
        "answer concise.",
        "\n\n",
        "{context}",
    ]
)

# Two-message chat template: the system instructions, then the user's input.
prompt = ChatPromptTemplate.from_messages(
    [("system", system_prompt), ("human", "{input}")]
)

In [7]:
# Answering step: passes the retrieved documents and the prompt to the LLM.
question_answer_chain = create_stuff_documents_chain(llm=llm, prompt=prompt)
# Full pipeline: retrieve first, then hand the results to the answering step.
rag_chain = create_retrieval_chain(
    retriever=retriever,
    combine_docs_chain=question_answer_chain,
)

In [8]:
# Single-turn smoke test of the chain; the cell displays only the answer text.
response = rag_chain.invoke(
    {"input": "which mortgage company have poor customer service"}
)
response["answer"]

'The complaints mention issues with "XXXX XXXX XXXX" and "MY LOAN CAREXXXX" regarding high bills, unreceived payments, and late fees. However, specific names are not provided in the context. Therefore, I cannot definitively state which mortgage company has poor customer service based on the information given.'

### Add chat history

In [9]:
# Make retrieval conversation-aware: before searching, the LLM rewrites the
# latest user question into a standalone one using the chat history.
chat_history = []

contextualize_q_system_prompt = (
    "Given a chat history and the latest user question which might reference "
    "context in the chat history, formulate a standalone question which can be "
    "understood without the chat history. Do NOT answer the question, just "
    "reformulate it if needed and otherwise return it as is."
)

# Template order: rewrite instructions, prior turns, then the new question.
contextualize_q_prompt = ChatPromptTemplate.from_messages(
    [
        ("system", contextualize_q_system_prompt),
        MessagesPlaceholder(variable_name="chat_history"),
        ("human", "{input}"),
    ]
)

# Wrap the plain retriever so it goes through the question-rewriting prompt.
history_aware_retriever = create_history_aware_retriever(
    llm=llm,
    retriever=retriever,
    prompt=contextualize_q_prompt,
)

In [10]:
# Answering prompt that also carries the prior conversation turns.
qa_messages = [
    ("system", system_prompt),
    MessagesPlaceholder(variable_name="chat_history"),
    ("human", "{input}"),
]
qa_prompt = ChatPromptTemplate.from_messages(qa_messages)

# Rebind the chain names to the history-aware versions; from this cell on,
# `rag_chain` expects a "chat_history" entry alongside "input".
question_answer_chain = create_stuff_documents_chain(llm=llm, prompt=qa_prompt)
rag_chain = create_retrieval_chain(
    retriever=history_aware_retriever,
    combine_docs_chain=question_answer_chain,
)

In [11]:
# Two-turn conversation with the history managed by hand.
chat_history = []

question = "Are there complaints involving bank of america mortgages?"
ai_msg_1 = rag_chain.invoke({"input": question, "chat_history": chat_history})

# Record the first exchange so the follow-up can be resolved against it.
chat_history += [
    HumanMessage(content=question),
    AIMessage(content=ai_msg_1["answer"]),
]

# The follow-up does not name the bank; it relies on the recorded history.
second_question = "What issues are people complaining about?"
ai_msg_2 = rag_chain.invoke(
    {"input": second_question, "chat_history": chat_history}
)

print(ai_msg_2["answer"])

People are primarily complaining about loan modification, collection, and foreclosure issues related to their mortgages. Additionally, there are complaints about loan servicing, payments, and escrow accounts.


### Persist chat history and inject it automatically with RunnableWithMessageHistory

In [12]:
from langchain_community.chat_message_histories import ChatMessageHistory
from langchain_core.chat_history import BaseChatMessageHistory
from langchain_core.runnables.history import RunnableWithMessageHistory


In [13]:
# In-memory registry of chat histories, keyed by session id.
store = {}


def get_session_history(session_id: str) -> BaseChatMessageHistory:
    """Return the chat history for ``session_id``.

    An empty ``ChatMessageHistory`` is created and registered in ``store``
    the first time a session id is seen; later calls return that same object.
    """
    history = store.get(session_id)
    if history is None:
        history = ChatMessageHistory()
        store[session_id] = history
    return history


# Wrap the RAG chain so the history is looked up per session via
# get_session_history instead of being passed by hand on every call.
# The three keys name where the user input, the history, and the answer
# live in the chain's input/output dictionaries.
conversational_rag_chain = RunnableWithMessageHistory(
    runnable=rag_chain,
    get_session_history=get_session_history,
    input_messages_key="input",
    history_messages_key="chat_history",
    output_messages_key="answer",
)

### invoke the chain

In [14]:
# First turn of session "1"; get_session_history creates the "1" entry in
# `store` the first time this session id is used.
first_turn_config = {"configurable": {"session_id": "1"}}
conversational_rag_chain.invoke(
    {"input": "Are there complaints involving bank of america mortgages?"},
    config=first_turn_config,
)["answer"]

"Yes, there are complaints involving Bank of America's mortgages, specifically related to issues like loan modification, collection, and foreclosure. Multiple complaints have been received, particularly in California and Wisconsin. Some of these complaints were closed with explanations or non-monetary relief."

In [15]:
# Follow-up turn using session id "1" again, so the history stored under
# that id is supplied to the chain.
followup_config = {"configurable": {"session_id": "1"}}
conversational_rag_chain.invoke(
    {"input": "What issues are people complaining about?"},
    config=followup_config,
)["answer"]

'People are primarily complaining about loan servicing issues, including problems with loan modifications, payment processing, and escrow account management. Other common issues involve collection practices and foreclosure proceedings. Customers have also reported difficulties with communication and customer service responses.'

### Now let's build an agent

#### Create a retrieval tool for the agent

In [16]:
from langchain.tools.retriever import create_retriever_tool

# Expose the retriever as a named tool; the description is the text the agent
# sees when deciding whether to call it.
tool = create_retriever_tool(
    retriever=retriever,
    name="consumer_complaints_retriever",
    description="Search for information on what issues consumers are having about companies' products",
)
# The agent takes a list of tools; this notebook registers just this one.
tools = [tool]

In [17]:
# Call the retriever tool directly with a sample query to inspect what it returns.
# NOTE(review): the recorded output repeats one passage several times, which
# may indicate duplicate entries in the vector store -- confirm.
tool.invoke("mortgage fraud company")

"and/or add years to they 're mortgages. The way the company ( XXXX XXXX XXXX based in NY ) presents themselves is a scam.\n\nand/or add years to they 're mortgages. The way the company ( XXXX XXXX XXXX based in NY ) presents themselves is a scam.\n\nand/or add years to they 're mortgages. The way the company ( XXXX XXXX XXXX based in NY ) presents themselves is a scam.\n\nof mortgage fraud involves collusion by industry insiders, such as bank officers, appraisers, mortgage brokers, attorneys, loan originators, and other professionals engaged in the industry. Fraud for profit aims not to secure housing, but rather to misuse the mortgage lending process to steal cash and equity from lenders or homeowners. The FBI prioritizes fraud for profit cases. '' We owned enough points as we were already XXXX members with XXXX points, but XXXX XXXX presented a program in"

### Create Agent

In [66]:
from langgraph.checkpoint.memory import MemorySaver
from langgraph.prebuilt import create_react_agent

# In-memory checkpointer handed to the agent for saving its state.
memory = MemorySaver()

# ReAct-style agent built from the chat model and the retriever tool list.
agent_executor = create_react_agent(
    model=llm,
    tools=tools,
    checkpointer=memory,
)