In [None]:
!pip install chromadb
!pip install langchain
!pip install pypdf
!pip install sentencepiece
!pip install -U bitsandbytes
!pip install -U git+https://github.com/huggingface/transformers.git
!pip install -U git+https://github.com/huggingface/peft.git
!pip install -U git+https://github.com/huggingface/accelerate.git
!pip install git+https://github.com/UKPLab/sentence-transformers.git
!pip install git+https://github.com/Muennighoff/sentence-transformers.git@sgpt_poolings_specb
!pip install --upgrade git+https://github.com/UKPLab/sentence-transformers.git
!pip install -U sentence-transformers

In [15]:
import os
import torch
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import Chroma
from langchain.llms.huggingface_pipeline import HuggingFacePipeline
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline, BitsAndBytesConfig
from langchain.document_loaders import PyPDFLoader
from langchain.chains.question_answering import load_qa_chain
from langchain.prompts import PromptTemplate

# Hugging Face model id shared by the tokenizer and the causal LM below.
MODEL_ID = "mistralai/Mistral-7B-Instruct-v0.2"

# 4-bit weight loading with bfloat16 as the compute dtype.
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
)

# Embedding model used for indexing and querying; placed on the GPU.
# No model_name is passed, so the library's default embedding model is used.
model_kwargs = {"device": "cuda"}
embeddings = HuggingFaceEmbeddings(model_kwargs=model_kwargs)

# Tokenizer and quantized generator model.
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    device_map="auto",
    quantization_config=quantization_config,
)

# Wrap the transformers text-generation pipeline so LangChain can use it as an LLM.
pipe = pipeline("text-generation", model=model, tokenizer=tokenizer)
llm = HuggingFacePipeline(pipeline=pipe)

  return self.fget.__get__(instance, owner)()


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

In [3]:
!mkdir data
!wget --user-agent "Mozilla" "https://pgcag.files.wordpress.com/2010/01/48lawsofpower.pdf" -O "data/power.pdf"

--2024-01-29 14:38:02--  https://pgcag.files.wordpress.com/2010/01/48lawsofpower.pdf
Resolving pgcag.files.wordpress.com (pgcag.files.wordpress.com)... 192.0.72.25, 192.0.72.24
Connecting to pgcag.files.wordpress.com (pgcag.files.wordpress.com)|192.0.72.25|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 104926 (102K) [application/pdf]
Saving to: 'data/power.pdf'


2024-01-29 14:38:03 (1.43 MB/s) - 'data/power.pdf' saved [104926/104926]



In [22]:
# Location of the source PDF (Kaggle dataset mount).
pdf_link = "/kaggle/input/dataset2/Asimov Issac - The Foundation Trilogy.pdf"
loader = PyPDFLoader(pdf_link, extract_images=False)

# Load the raw documents WITHOUT splitting. The previous `load_and_split()` call
# already chunked the text with the loader's default splitter, so the splitter
# configured below ran on pre-split chunks: its chunk_overlap was effectively
# ignored and start_index was measured against the first-pass chunk rather than
# the loaded document.
pages = loader.load()

# Split data into chunks (the single, explicit splitting step).
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=4000,       # maximum chunk length, measured by length_function
    chunk_overlap=20,      # characters shared between neighbouring chunks
    length_function=len,
    add_start_index=True,  # record each chunk's offset in its metadata
)
chunks = text_splitter.split_documents(pages)

In [23]:
# Build a Chroma store from the chunks using `embeddings`, persisted under "test_index".
# NOTE(review): re-running this cell presumably adds the same chunks to the existing
# index again — confirm, and clear the directory first if duplicates matter.
db = Chroma.from_documents(
    documents=chunks,
    embedding=embeddings,
    persist_directory="test_index",
)
db.persist()

Batches:   0%|          | 0/16 [00:00<?, ?it/s]

In [24]:
# Re-open the persisted index from "test_index" with the same embedding function.
vectordb = Chroma(
    embedding_function=embeddings,
    persist_directory="test_index",
)

# Retriever over the index; search_kwargs requests k=3 documents per query.
retriever = vectordb.as_retriever(search_kwargs={"k": 3})

# Prompt text with {context} and {question} placeholders, wrapped in [INST] tags.
qna_prompt_template = """### [INST] Instruction: You will be provided with questions and related data. Your task is to find the answers to the questions using the given data. If the data doesn't contain the answer to the question, then you must return 'Not enough information.'

{context}

### Question: {question} [/INST]"""

PROMPT = PromptTemplate(
    input_variables=["context", "question"],
    template=qna_prompt_template,
)

# Question-answering chain of type "stuff", driven by the LLM and the prompt above.
chain = load_qa_chain(llm, chain_type="stuff", prompt=PROMPT)

In [25]:
# A utility function for answer generation
def ask(question):
    """Answer `question` from documents fetched out of the vector store.

    Retrieves documents for the question through the module-level `retriever`,
    runs the module-level `chain` on those documents together with the
    question, and returns the chain's 'output_text' value.
    """
    docs = retriever.get_relevant_documents(question)
    chain_inputs = {"input_documents": docs, "question": question}
    result = chain(chain_inputs, return_only_outputs=True)
    return result["output_text"]

In [26]:
# Example query: send one question through ask() and print the reply.
user_question = "Can you tell me about The Empire?"
answer = ask(question=user_question)
print("Answer:", answer)

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Answer:  The Empire is a powerful and vast interstellar civilization in the science fiction novel "Foundation" by Isaac Asimov. It is depicted as an authoritarian and imperialistic society that has reached out to the outermost edge of the Foundation's influence, marking the beginning of a war. The Empire's forces are described as leaving garrisons on various worlds, with men in Imperial uniform bearing the Spaceship-and-Sun insignia. The old men on these worlds remember the tales of their grandfathers' fathers, when the universe was big, rich, and peaceful, and the Empire ruled all. The Empire's territories are represented by gold stars on a map, and its forces have taken over areas marked by deepening blue stars. Bel Riose, the general leading the Empire's forces, reports back to him as each world is knotted into its proper place in the fabric of the Empire.
