In [5]:
## pip install torch==1.12.1+cu113 torchvision==0.13.1+cu113 torchaudio==0.12.1 --extra-index-url https://download.pytorch.org/whl/cu113

### Installing libraries

In [25]:
!pip install -q langchain
!pip install -q langchain_community
!pip install -q sentence_transformers
!pip install -q bitsandbytes
!pip install -q accelerate

^C


In [1]:
import os
import warnings
# Silence every warning for the remainder of the session.
warnings.filterwarnings(action="ignore")
# Exported before torch is imported further down.
# NOTE(review): presumably a workaround for a duplicate OpenMP runtime being
# loaded - confirm it is still required on this machine.
os.environ.update({"KMP_DUPLICATE_LIB_OK": "TRUE"})

### Importing libraries

In [2]:
import torch
# Report whether PyTorch can see a CUDA device on this machine.
print("CUDA is available!" if torch.cuda.is_available() else "CUDA is not available.")

CUDA is available!


In [3]:
from langchain.llms import CTransformers

In [4]:
# Load the local quantized Llama-2 chat model through the CTransformers wrapper.
# The backslash in the path is escaped explicitly: a bare "\l" is an invalid
# escape sequence (it triggers a warning and is slated to become an error),
# while "\\l" yields exactly the same path string.
llm = CTransformers(
    model="../model\\llama-2-7b-chat.ggmlv3.q4_0.bin",
    model_type="llama",
    config={
        "max_new_tokens": 600,    # upper bound on generated tokens per call
        "temperature": 0.01,      # near-zero temperature for almost deterministic output
        # NOTE(review): 5000 looks larger than the context window Llama-2 was
        # trained with - confirm this value is intended.
        "context_length": 5000,
    },
)

In [5]:
from sentence_transformers import SentenceTransformer

# model = SentenceTransformer('BAAI/bge-base-en-v1.5', cache_folder=".")

In [6]:
from langchain.embeddings import HuggingFaceEmbeddings

In [7]:
# Location of the locally cached embedding model snapshot (Windows-style separators).
model_path = (
    "../models--BAAI--bge-base-en-v1.5"
    "\\snapshots"
    "\\a5beb1e3e68b9ab74eb54cfd186867f64f240e1a"
)

In [8]:
# Model configuration for the embedding model: use the GPU when CUDA is
# available and fall back to the CPU otherwise, so the notebook still runs on
# machines without a usable GPU instead of failing when the model is loaded.
model_kwargs = {'device': 'cuda' if torch.cuda.is_available() else 'cpu'}

In [9]:
# Encoding options: 'normalize_embeddings' is set to True, i.e. normalized
# embeddings are requested from the model.
encode_kwargs = {'normalize_embeddings': True}

# Build the HuggingFaceEmbeddings wrapper from the settings defined above.
embeddings = HuggingFaceEmbeddings(
    model_name=model_path,        # path of the pre-trained model snapshot
    model_kwargs=model_kwargs,    # model configuration (device selection)
    encode_kwargs=encode_kwargs,  # options applied when encoding text
)

  embeddings = HuggingFaceEmbeddings(


In [10]:
# Baseline: ask the bare LLM directly, without any retrieved context, so the
# answer can be compared with the retrieval-backed chain used later on.
llm.invoke("What is RAG??")

'\n Unterscheidung between RAG and Agile?\nRAG stands for "Risks, Assumptions, and Gates". It is a tool used in project management to identify, track, and manage risks, assumptions, and milestones in a project. RAG status is typically used in conjunction with Agile methodologies, but it can also be used in other project management frameworks.\n\nRAG is often used in Agile projects to help teams prioritize and manage their work. It provides a simple and visual way to categorize tasks based on their level of risk or uncertainty. Tasks are assigned a RAG status, which can be either Green (low risk), Amber (medium risk), or Red (high risk). This helps teams identify the most critical tasks and allocate resources accordingly.\n\nHere are some key differences between RAG and Agile:\n\n1. Focus: RAG is focused specifically on risk management, while Agile is a broader project management framework that encompasses various aspects of project delivery, including planning, execution, and monitorin

In [11]:
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
# from langchain.embeddings import HuggingFaceInferenceAPIEmbeddings

### using **custom** dataset

#### RecursiveCharacterTextSplitter is a text splitter that splits the text into chunks, trying to keep paragraphs together and avoid losing context across pages

In [12]:
# Load the source PDF into LangChain documents.
# The backslash is escaped explicitly: a bare "\R" is an invalid escape
# sequence, while "\\R" yields exactly the same path string.
pdf_reader = PyPDFLoader("../data\\RAGPaper.pdf")
documents = pdf_reader.load()

# Split the documents into chunks of at most 1000 units with a 200-unit
# overlap between consecutive chunks, so context is carried across boundaries.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
chunks = text_splitter.split_documents(documents)

In [40]:
# Hugging Face API token, only needed for the (currently disabled) remote
# Inference API embeddings. Read it from the environment so a secret never has
# to be pasted into the notebook; defaults to an empty string when unset.
HF_TOKEN = os.environ.get("HF_TOKEN", "")

In [13]:
from langchain.vectorstores import FAISS

# Alternative (disabled): compute the embeddings remotely through the
# Hugging Face Inference API instead of the local model.
# embeddings = HuggingFaceInferenceAPIEmbeddings(api_key=HF_TOKEN,
#                                                model_name="BAAI/bge-base-en-v1.5")

# FAISS (Facebook AI Similarity Search) is a library for similarity search and
# clustering of dense vectors. Embed every chunk and index it in a FAISS store.
db = FAISS.from_documents(chunks, embeddings)

In [14]:
# Prompt text used to condense the chat history plus a follow-up input into a
# single standalone question. Placeholders: {chat_history} and {question}.
template = (
    "Given the following conversation and a follow up question, "
    "rephrase the follow up question to be a standalone question.\n"
    "Chat History:\n"
    "{chat_history}\n"
    "Follow up Input: {question}\n"
    "Standalone questions: "
)

In [15]:
from langchain.prompts import PromptTemplate
# Declare every placeholder the template actually contains: it references both
# {chat_history} and {question}, so both must be listed as input variables.
CONDENSE_QUESTION_PROMPT = PromptTemplate(
    template=template,
    input_variables=["chat_history", "question"],
)

In [16]:
from langchain.chains import ConversationalRetrievalChain

# Assemble the conversational retrieval chain from the local LLM, a retriever
# over the FAISS store, and the custom question-condensing prompt.
qa = ConversationalRetrievalChain.from_llm(
    llm=llm,
    retriever=db.as_retriever(),
    condense_question_prompt=CONDENSE_QUESTION_PROMPT,
    return_source_documents=True,  # ask the chain to return the retrieved documents too
    verbose=False,
)

In [17]:
# Display the assembled chain to inspect its prompts and configuration.
qa

ConversationalRetrievalChain(combine_docs_chain=StuffDocumentsChain(llm_chain=LLMChain(prompt=PromptTemplate(input_variables=['context', 'question'], template="Use the following pieces of context to answer the question at the end. If you don't know the answer, just say that you don't know, don't try to make up an answer.\n\n{context}\n\nQuestion: {question}\nHelpful Answer:"), llm=CTransformers(client=<ctransformers.llm.LLM object at 0x0000020C4DFE0D60>, model='../model\\llama-2-7b-chat.ggmlv3.q4_0.bin', model_type='llama', config={'max_new_tokens': 600, 'temperature': 0.01, 'context_length': 5000})), document_variable_name='context'), question_generator=LLMChain(prompt=PromptTemplate(input_variables=['chat_history', 'question'], template='Given the following conversation and a follow up question, rephrase the follow up question to be a standalone question.\nChat History:\n{chat_history}\nFollow up Input: {question}\nStandalone questions: '), llm=CTransformers(client=<ctransformers.llm

### Ask a query

In [81]:
# Ask a question with an empty conversation history.
# The chain is run through `.invoke(...)`, the same entry point the other
# query cell uses, instead of calling the chain object directly.
chat_history = []
query = """Who is Sachin Tendulkar"""
result = qa.invoke({"question": query, "chat_history": chat_history})
print(result["answer"])

 Sachin Ramesh Tendulkar (born April 24, 1973) is a former Indian cricketer and captain who is widely regarded as one of the greatest batsmen in the history of cricket. He was born in Mumbai, India, and made his first-class debut in 1989. Tendulkar scored over 34,000 runs in international cricket, including 15,921 runs in Test cricket, which is the most by any player in history. He also holds several other records, including most centuries scored in Test cricket (51) and most runs scored in a single World Cup edition (673). Tendulkar was named the ICC Cricketer of the Year in 2010, and he was awarded the Bharat Ratna, India's highest civilian honor, in 2014.


In [18]:
# Start a fresh conversation and ask about the topic of the indexed paper.
chat_history = list()
query = "What is RAGs and tell me more about use cases of RAGs, in a detailed manner"
result = qa.invoke(
    {
        "question": query,
        "chat_history": chat_history,
    }
)
print(result["answer"])

 RAGs stands for Retrieval-based Autoencoder with Generative model, which is a type of neural network architecture that combines the strengths of both retrieval-based models and generative models. The basic idea behind RAGs is to use a retriever to retrieve relevant text documents from a large corpus, and then use these documents as additional context when generating the target sequence. This allows the model to learn how to generate high-quality text that is relevant to the input sequence, rather than simply relying on generic language models.

One of the main advantages of RAGs is that it can be used for a wide range of natural language processing tasks, such as text generation, language translation, and question answering. For example, in text generation, RAGs can be trained to generate coherent and contextually relevant text by using the retrieved documents as additional context. In language translation, RAGs can be used to translate text from one language to another while also tak

In [21]:
# Inspect the history before the latest turn has been recorded.
chat_history

[]

In [19]:
from langchain_core.messages import HumanMessage, AIMessage

In [22]:
# Record the latest exchange (user question, then model answer) in the history.
# Note: re-running this cell appends the same turn again.
chat_history += [
    HumanMessage(content=query),
    AIMessage(content=result["answer"]),
]

In [23]:
# Inspect the history after the question/answer pair has been appended.
chat_history

[HumanMessage(content='What is RAGs and tell me more about use cases of RAGs, in a detailed manner'),
 AIMessage(content=' RAGs stands for Retrieval-based Autoencoder with Generative model, which is a type of neural network architecture that combines the strengths of both retrieval-based models and generative models. The basic idea behind RAGs is to use a retriever to retrieve relevant text documents from a large corpus, and then use these documents as additional context when generating the target sequence. This allows the model to learn how to generate high-quality text that is relevant to the input sequence, rather than simply relying on generic language models.\n\nOne of the main advantages of RAGs is that it can be used for a wide range of natural language processing tasks, such as text generation, language translation, and question answering. For example, in text generation, RAGs can be trained to generate coherent and contextually relevant text by using the retrieved documents as