In [10]:
import numpy as np
import pandas as pd

from langchain_community.document_loaders.dataframe import DataFrameLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain import hub
from langchain.vectorstores import FAISS
from langchain_huggingface import HuggingFaceEmbeddings, ChatHuggingFace, HuggingFacePipeline
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain.chains.retrieval import create_retrieval_chain
from langchain.chains.history_aware_retriever import create_history_aware_retriever
from langchain.schema import HumanMessage, AIMessage

import transformers
from transformers import AutoTokenizer, AutoModelForCausalLM,  BitsAndBytesConfig, TextStreamer

import torch

import os
from dotenv import load_dotenv

load_dotenv()

True

In [2]:
# Embeddings
model_name = "BAAI/bge-base-en-v1.5"
model_kwargs = {'device': 'cpu'}
encode_kwargs = {'normalize_embeddings': False}
embeddings = HuggingFaceEmbeddings(
    model_name=model_name,
    #model_kwargs=model_kwargs,
    #encode_kwargs=encode_kwargs,
    cache_folder = './hf'
)


In [3]:
db = FAISS.load_local("faiss_index", embeddings=embeddings, allow_dangerous_deserialization=True)
retriever = db.as_retriever(search_type="similarity", search_kwargs={"k": 4})

In [15]:
rephrase_prompt = hub.pull("langchain-ai/chat-langchain-rephrase", api_key=os.environ['LANGSMITH_API_KEY'], api_url="https://api.smith.langchain.com/")
retrieval_qa_chat_prompt = hub.pull("langchain-ai/retrieval-qa-chat", api_key=os.environ['LANGSMITH_API_KEY'], api_url="https://api.smith.langchain.com/")
print("Rephrase prompt: ", rephrase_prompt)
print("\n")
print("Retrieval QA prompt: ", retrieval_qa_chat_prompt)


Rephrase prompt:  input_variables=['chat_history', 'input'] input_types={} partial_variables={} metadata={'lc_hub_owner': 'langchain-ai', 'lc_hub_repo': 'chat-langchain-rephrase', 'lc_hub_commit_hash': 'fb7ddb56be11b2ab10d176174dae36faa2a9a6ba13187c8b2b98315f6ca7d136'} template='Given the following conversation and a follow up question, rephrase the follow up question to be a standalone question.\n\nChat History:\n{chat_history}\nFollow Up Input: {input}\nStandalone Question:'


Retrieval QA prompt:  input_variables=['context', 'input'] optional_variables=['chat_history'] input_types={'chat_history': list[typing.Annotated[typing.Union[typing.Annotated[langchain_core.messages.ai.AIMessage, Tag(tag='ai')], typing.Annotated[langchain_core.messages.human.HumanMessage, Tag(tag='human')], typing.Annotated[langchain_core.messages.chat.ChatMessage, Tag(tag='chat')], typing.Annotated[langchain_core.messages.system.SystemMessage, Tag(tag='system')], typing.Annotated[langchain_core.messages.funct

In [16]:
model_name = "meta-llama/Llama-3.1-8B-Instruct"
config = BitsAndBytesConfig(
    load_in_4bit=True, bnb_4bit_use_double_quant=True, bnb_4bit_quant_type="nf4", bnb_4bit_compute_dtype=torch.bfloat16
)

model = AutoModelForCausalLM.from_pretrained(model_name, quantization_config=config, device_map="auto", token=os.environ['hf_token'])
tokenizer = AutoTokenizer.from_pretrained(model_name, token=os.environ['hf_token'])
streamer = TextStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)

pipe = transformers.pipeline("text-generation", model=model, tokenizer=tokenizer, max_new_tokens=200, streamer=streamer)

llm = HuggingFacePipeline(model_id=model_name, pipeline=pipe)
llm = ChatHuggingFace(llm=llm)

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Device set to use cuda:1


In [17]:
# Create Chain
# Create Document Stuff Chain
stuff_documents_chain = create_stuff_documents_chain(llm, retrieval_qa_chat_prompt)
# Create History aware Chain
history_aware_retriever = create_history_aware_retriever(llm=llm, retriever=retriever, prompt=rephrase_prompt)
# Create Retrieval Chain
qa = create_retrieval_chain(retriever=history_aware_retriever, combine_docs_chain=stuff_documents_chain)

In [18]:
query = "What is Chain of Thought?"
chat_history = []

res = qa.invoke(input={'input': query, 'chat_history': chat_history})

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Chain-of-Thought (CoT) is a problem-solving approach where a large language model is prompted to generate its answer following a step-by-step explanation. This approach encourages the model to articulate a reasoning process leading to the final answer, supported by in-context demonstrations. It involves decomposing intricate problems into manageable steps, simplifying the overall reasoning process, and creating a linkage among the reasoning steps to ensure no important conditions are overlooked.


In [19]:
def extract_answer(response):
    parts = response.split("<|start_header_id|>assistant<|end_header_id|>")
    return parts[-1].strip() if len(parts) > 1 else response.strip()


In [20]:
chat_history.append(HumanMessage(content=query)) 
chat_history.append(AIMessage(content=extract_answer(res['answer']))) 
chat_history

[HumanMessage(content='What is Chain of Thought?', additional_kwargs={}, response_metadata={}),
 AIMessage(content='Chain-of-Thought (CoT) is a problem-solving approach where a large language model is prompted to generate its answer following a step-by-step explanation. This approach encourages the model to articulate a reasoning process leading to the final answer, supported by in-context demonstrations. It involves decomposing intricate problems into manageable steps, simplifying the overall reasoning process, and creating a linkage among the reasoning steps to ensure no important conditions are overlooked.', additional_kwargs={}, response_metadata={})]

In [21]:
query = "What are the benefits?"
res = qa.invoke(input={'input': query, 'chat_history': chat_history})

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


What are the benefits of Chain of Thought (CoT)?


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


According to the text, the benefits of Chain-of-Thought (CoT) prompting include:

1. Dramatically improved performance on reasoning-heavy tasks.
2. Ability to elicit superior reasoning abilities in a few-shot setting.
3. Capable of performing well in zero-shot setting with minimal modifications to the prompt.
4. Enables the model to articulate a step-by-step reasoning process, providing a clear explanation for the predicted answer.
5. Improves the interpretability of the model's reasoning process.

However, the text also mentions that the intermediate steps in traditional CoT prompting hardly provide meaningful information due to deficiencies in prompt design.


In [22]:
query = "What did i ask you before?"
res = qa.invoke(input={'input': query, 'chat_history': chat_history})

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


What was the topic of the previous conversation?


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


You asked "What is Chain of Thought?"


In [23]:
query = "What is Tree of Thought?"
res = qa.invoke(input={'input': query, 'chat_history': chat_history})

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


What is the difference between Chain of Thought and Tree of Thought approaches in problem-solving?


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Tree-of-Thought (ToT) is an approach that enables large language models to explore multiple reasoning pathways through a decision-making tree structure. This structure incorporates backtracking via a search algorithm to identify the globally optimal reasoning path. It allows the model to explore different possibilities and select the best thinking path to arrive at the solution.
