In [1]:
! nvidia-smi -L

GPU 0: NVIDIA GeForce RTX 3060 (UUID: GPU-9ef79948-0acd-05f8-e2b4-18b37aa1f16a)


# Installs

In [None]:
%%time

from IPython.display import clear_output

! pip install sentence_transformers==2.2.2

! pip install -qq -U langchain
! pip install -qq -U tiktoken
! pip install -qq -U pypdf
! pip install -qq -U faiss-gpu
! pip install -qq -U InstructorEmbedding 

! pip install -qq -U transformers 
! pip install -qq -U accelerate
! pip install -qq -U bitsandbytes

clear_output()

Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com


# Imports

In [2]:
%%time

import warnings
warnings.filterwarnings("ignore")

import os
import glob
import textwrap
import time

import langchain

### loaders
from langchain.document_loaders import PyPDFLoader, DirectoryLoader

### splits
from langchain.text_splitter import RecursiveCharacterTextSplitter

### prompts
from langchain import PromptTemplate, LLMChain

### vector stores
from langchain.vectorstores import FAISS

### models
from langchain.llms import HuggingFacePipeline
from langchain.embeddings import HuggingFaceInstructEmbeddings

### retrievers
from langchain.chains import RetrievalQA

import torch
import transformers
from transformers import (
    AutoTokenizer, AutoModelForCausalLM,
    BitsAndBytesConfig,
    pipeline
)

CPU times: total: 3.5 s
Wall time: 3.51 s


In [3]:
print('langchain:', langchain.__version__)
print('torch:', torch.__version__)
print('transformers:', transformers.__version__)

langchain: 0.2.15
torch: 2.2.2
transformers: 4.44.2


In [4]:
sorted(glob.glob('Dataset/*'))

['Dataset\\1-Harry-Potter-and-the-Sorcerers-Stone.pdf',
 'Dataset\\2-harry-potter-the-chamber-of-secrets.pdf',
 'Dataset\\3-Harry-Potter-And-The-Prisoner-Of-Azkaban.pdf',
 'Dataset\\4_harry_potter_and_the_goblet_of_fire.pdf',
 'Dataset\\5-Harry-Potter-and-the-Order-of-the-Phoenix.pdf',
 'Dataset\\6-harry-potter-the-half-blood-prince-ok.pdf',
 'Dataset\\7-Harry Potter and the Deathly Hallows.pdf']

# CFG

- CFG class enables easy and organized experimentation 

In [5]:
class CFG:
    # LLMs
    model_name = 'llama2-7b-chat' # wizardlm, llama2-7b-chat, llama2-13b-chat, mistral-7B
    temperature = 0
    top_p = 0.95
    repetition_penalty = 1.15    

    # splitting
    split_chunk_size = 800
    split_overlap = 0
    
    # embeddings
    embeddings_model_repo = 'sentence-transformers/all-MiniLM-L6-v2'    

    # similar passages
    k = 6
    
    # paths
    PDFs_path = './Dataset'
    Embeddings_path =  './faiss-hp-sentence-transformers'
    Output_folder = './harry-potter-vectordb'

# Define model

In [6]:
def get_model(model = CFG.model_name):

    print('\nDownloading model: ', model, '\n\n')

    if model == 'wizardlm':
        model_repo = 'TheBloke/wizardLM-7B-HF'
        
        tokenizer = AutoTokenizer.from_pretrained(model_repo)
        
        bnb_config = BitsAndBytesConfig(
            load_in_4bit = True,
            bnb_4bit_quant_type = "nf4",
            bnb_4bit_compute_dtype = torch.float16,
            bnb_4bit_use_double_quant = True,
        )        

        model = AutoModelForCausalLM.from_pretrained(
            model_repo,
            quantization_config = bnb_config,
            device_map = 'auto',
            low_cpu_mem_usage = True
        )
        
        max_len = 1024

    elif model == 'llama2-7b-chat':
        model_repo = 'daryl149/llama-2-7b-chat-hf'
        
        tokenizer = AutoTokenizer.from_pretrained(model_repo, use_fast=True)
        
        bnb_config = BitsAndBytesConfig(
            load_in_4bit = True,
            bnb_4bit_quant_type = "nf4",
            bnb_4bit_compute_dtype = torch.float16,
            bnb_4bit_use_double_quant = True,
        )
        
        model = AutoModelForCausalLM.from_pretrained(
            model_repo,
            quantization_config = bnb_config,
            device_map = 'auto',
            low_cpu_mem_usage = True,
            trust_remote_code = True
        )
        
        max_len = 2048

    elif model == 'llama2-13b-chat':
        model_repo = 'daryl149/llama-2-13b-chat-hf'
        
        tokenizer = AutoTokenizer.from_pretrained(model_repo, use_fast=True)
        
        bnb_config = BitsAndBytesConfig(
            load_in_4bit = True,
            bnb_4bit_quant_type = "nf4",
            bnb_4bit_compute_dtype = torch.float16,
            bnb_4bit_use_double_quant = True,
        )
                
        model = AutoModelForCausalLM.from_pretrained(
            model_repo,
            quantization_config = bnb_config,       
            device_map = 'auto',
            low_cpu_mem_usage = True,
            trust_remote_code = True
        )
        
        max_len = 2048 # 8192

    elif model == 'mistral-7B':
        model_repo = 'mistralai/Mistral-7B-v0.1'
        
        tokenizer = AutoTokenizer.from_pretrained(model_repo)
        
        bnb_config = BitsAndBytesConfig(
            load_in_4bit = True,
            bnb_4bit_quant_type = "nf4",
            bnb_4bit_compute_dtype = torch.float16,
            bnb_4bit_use_double_quant = True,
        )        

        model = AutoModelForCausalLM.from_pretrained(
            model_repo,
            quantization_config = bnb_config,
            device_map = 'auto',
            low_cpu_mem_usage = True,
        )
        
        max_len = 1024

    else:
        print("Not implemented model (tokenizer and backbone)")

    return tokenizer, model, max_len

In [7]:
%%time

tokenizer, model, max_len = get_model(model = CFG.model_name)


Downloading model:  llama2-7b-chat 




You are using the default legacy behaviour of the <class 'transformers.models.llama.tokenization_llama_fast.LlamaTokenizerFast'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565 - if you loaded a llama tokenizer from a GGUF file you can ignore this message.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

CPU times: total: 6.81 s
Wall time: 8.84 s


In [8]:
model.eval()

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(32000, 4096, padding_idx=0)
    (layers): ModuleList(
      (0-31): 32 x LlamaDecoderLayer(
        (self_attn): LlamaSdpaAttention(
          (q_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (k_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (v_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (o_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear4bit(in_features=4096, out_features=11008, bias=False)
          (up_proj): Linear4bit(in_features=4096, out_features=11008, bias=False)
          (down_proj): Linear4bit(in_features=11008, out_features=4096, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm((4096,), eps=1e-06)
        (post_attention_layernorm): LlamaRMSNor

In [9]:
### check how Accelerate split the model across the available devices (GPUs)
model.hf_device_map

{'': 0}

# pipeline

- Hugging Face pipeline

In [10]:
### hugging face pipeline
pipe = pipeline(
    task = "text-generation",
    model = model,
    tokenizer = tokenizer,
    pad_token_id = tokenizer.eos_token_id,
#     do_sample = True,
    max_length = max_len,
    temperature = CFG.temperature,
    top_p = CFG.top_p,
    repetition_penalty = CFG.repetition_penalty
)

### langchain pipeline
llm = HuggingFacePipeline(pipeline = pipe)

  warn_deprecated(


In [12]:
%%time
### testing model, not using the harry potter books yet
### answer is not necessarily related to harry potter
query = "Give me 5 examples of cool potions and explain what they do"
llm.invoke(query)

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


CPU times: total: 15.6 s
Wall time: 17.6 s


'Give me 5 examples of cool potions and explain what they do. Unterscheidung between a spell and a potion in D&D 5e?\nPotion of Healing: Restores hit points to the drinker, healing them from injuries or illnesses. Spell: Cures wounds, restoring health to a creature. Potion of Invisibility: Makes the drinker invisible for up to an hour, allowing them to move undetected or escape danger. Spell: Creates a magical disguise that lasts for a similar amount of time. Potion of Strength: Temporarily grants the drinker increased strength, allowing them to lift heavier objects or perform more strenuous tasks. Spell: Gives the caster temporary hit points or increases their muscle mass, granting them greater physical prowess. Potion of Protection: Grants the drinker resistance to all damage types for a short period of time, protecting them from harm. Spell: Creates a magical shield around the caster, deflecting attacks and protecting them from harm.'

In [13]:
%%time
### testing model, not using the harry potter books yet
### answer is not necessarily related to harry potter
query = "hi how are you"
llm.invoke(query)

CPU times: total: 15.6 s
Wall time: 17.7 s


'hi how are you?\n Unterscheidung between a "hello" and a "hi" is mainly a matter of regional or cultural variation, rather than a hard-and-fast rule. Both greetings are commonly used in English and can be considered informal or friendly ways to greet someone.\nIn general, "hello" is more widely used in formal or professional settings, while "hi" is more frequently used in casual or social situations. However, the difference between the two phrases is not always clear-cut, and people may use either term depending on their personal preference or the context in which they are speaking.\nFor example, if you are speaking with a close friend or family member, you might use "hi" as a more informal and friendly way to greet them. On the other hand, if you are meeting someone for the first time in a business setting, you might use "hello" as a more polished and professional greeting.\nUltimately, whether you choose to say "hello" or "hi" will depend on your personal style and the situation in 

# Langchain

- Multiple document retriever with LangChain

In [14]:
CFG.model_name

'llama2-7b-chat'

# Loader

In [15]:
%%time

loader = DirectoryLoader(
    CFG.PDFs_path,
    glob="./*.pdf",
    loader_cls=PyPDFLoader,
    show_progress=True,
    use_multithreading=True
)

documents = loader.load()

100%|████████████████████████████████████████████████████████████████████████████████████| 7/7 [01:07<00:00,  9.58s/it]

CPU times: total: 1min 7s
Wall time: 1min 7s





In [16]:
print(f'We have {len(documents)} pages in total')

We have 4127 pages in total


In [17]:
documents[8].page_content

"8Ron\nP.S. Percy's Head Boy. He got the letter last week.Harry glanced back at the photograph. Percy, who was in his seventh and\nfinal year at Hogwarts, was looking particularly smug. He had pinned hisHead Boy badge to the fez perched jauntily on top of his neat hair, hishorn-rimmed glasses flashing in the Egyptian sun.\nHarry now turned to his present and unwrapped it. Inside was what looked\nlike a miniature glass spinning top. There was another note from Ronbeneath it.\nHarry -- this is a Pocket Sneakoscope. If there's someone untrustworthy\naround, it's supposed to light up and spin. Bill says it's rubbish soldfor wizard tourists and isn't reliable, because it kept lighting up atdinner last night. But he didn't realize Fred and George had put beetlesin his soup.\nBye --RonHarry put the Pocket Sneakoscope on his bedside table, where it stood\nquite still, balanced on its point, reflecting the luminous hands of hisclock. He looked at it happily for a few seconds, then picked up the

# Splitter

In [18]:
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size = CFG.split_chunk_size,
    chunk_overlap = CFG.split_overlap
)

texts = text_splitter.split_documents(documents)

print(f'We have created {len(texts)} chunks from {len(documents)} pages')

We have created 10667 chunks from 4127 pages


# Create Embeddings

In [19]:
%%time

### we create the embeddings only if they do not exist yet
if not os.path.exists(CFG.Embeddings_path + '/index.faiss'):

    ### download embeddings model
    embeddings = HuggingFaceInstructEmbeddings(
        model_name = CFG.embeddings_model_repo,
        model_kwargs = {"device": "cuda"}
    )

    ### create embeddings and DB
    vectordb = FAISS.from_documents(
        documents = texts, 
        embedding = embeddings
    )

    ### persist vector database
    vectordb.save_local(f"{CFG.Output_folder}/faiss_index_hp") # save in output folder
#     vectordb.save_local(f"{CFG.Embeddings_path}/faiss_index_hp") # save in input folder

load INSTRUCTOR_Transformer
max_seq_length  512
CPU times: total: 5min 58s
Wall time: 5min 58s


# Load vector database

In [None]:
%%time

### download embeddings model
embeddings = HuggingFaceInstructEmbeddings(
    model_name = CFG.embeddings_model_repo,
    model_kwargs = {"device": "cuda"}
)

### load vector DB embeddings
vectordb = FAISS.load_local(
    CFG.Embeddings_path, # from input folder
#     CFG.Output_folder + '/faiss_index_hp', # from output folder
    embeddings
)

In [23]:
### test if vector DB was loaded correctly
vectordb.similarity_search('magic creatures')

[Document(metadata={'source': 'Dataset\\6-harry-potter-the-half-blood-prince-ok.pdf', 'page': 302}, page_content='“Magic?” he repeated in a whisper. \n“That’s right,” said Dumbledore. \n“It’s … it’s magic, what I can do?” \n“What is it that you can do?” \n“All sorts,” breathed Riddle. A flush of excitement was \nrising up his neck into his hollow cheeks; he looked \nfevered. “I can make things move without touching \nthem. I can make animals do what I want them to do, \nwithout training them. I can make bad things happen \nto people who annoy me. I can make them hurt if I \nwant to.”'),
 Document(metadata={'source': 'Dataset\\3-Harry-Potter-And-The-Prisoner-Of-Azkaban.pdf', 'page': 91}, page_content='91"Shut up, Malfoy," said Harry quietly. Hagrid was looking downcast and\nHarry wanted Hagrid\'s first lesson to be a success.\n"Righ\' then," said Hagrid, who seemed to have lost his thread, "so -- so\nyeh\'ve got yer books an\' -- an\' - - now yeh need the Magical Creatures.Yeah. So I\'l

# Prompt Template

- Custom prompt

In [24]:
prompt_template = """
Don't try to make up an answer, if you don't know just say that you don't know.
Answer in the same language the question was asked.
Use only the following pieces of context to answer the question at the end.

{context}

Question: {question}
Answer:"""


PROMPT = PromptTemplate(
    template = prompt_template, 
    input_variables = ["context", "question"]
)

In [25]:
# llm_chain = LLMChain(prompt=PROMPT, llm=llm)
# llm_chain

# Retriever chain

In [26]:
retriever = vectordb.as_retriever(search_kwargs = {"k": CFG.k, "search_type" : "similarity"})

qa_chain = RetrievalQA.from_chain_type(
    llm = llm,
    chain_type = "stuff", # map_reduce, map_rerank, stuff, refine
    retriever = retriever, 
    chain_type_kwargs = {"prompt": PROMPT},
    return_source_documents = True,
    verbose = False
)

In [27]:
### testing MMR search
question = "Which are Hagrid's favorite animals?"
vectordb.max_marginal_relevance_search(question, k = CFG.k)

[Document(metadata={'source': 'Dataset\\5-Harry-Potter-and-the-Order-of-the-Phoenix.pdf', 'page': 619}, page_content='would warn Hagrid myself, but I am  banished — it would be unwise \nfor me to go too near the forest now — Hagrid has troubles enough, \nwithout a centaurs’ battle.” \n“But — what’s Hagrid attempting to do?” said Harry nervously. \nFirenze looked at Harry impassively. \n“Hagrid has recently rendered me a great service,” said Firenze,'),
 Document(metadata={'source': 'Dataset\\3-Harry-Potter-And-The-Prisoner-Of-Azkaban.pdf', 'page': 91}, page_content='had ever seen. They had the bodies, hind legs, and tails of horses, butthe front legs, wings, and heads of what seemed to be giant eagles, withcruel, steel-colored beaks and large, brilliantly, orange eyes. Thetalons on their front legs were half a foot long and deadly looking.Each of the beasts had a thick leather collar around its neck, which wasattached to a long chain, and the ends of all of these were held in thevast h

In [28]:
### testing similarity search
question = "Which are Hagrid's favorite animals?"
vectordb.similarity_search(question, k = CFG.k)

[Document(metadata={'source': 'Dataset\\5-Harry-Potter-and-the-Order-of-the-Phoenix.pdf', 'page': 619}, page_content='would warn Hagrid myself, but I am  banished — it would be unwise \nfor me to go too near the forest now — Hagrid has troubles enough, \nwithout a centaurs’ battle.” \n“But — what’s Hagrid attempting to do?” said Harry nervously. \nFirenze looked at Harry impassively. \n“Hagrid has recently rendered me a great service,” said Firenze,'),
 Document(metadata={'source': 'Dataset\\3-Harry-Potter-And-The-Prisoner-Of-Azkaban.pdf', 'page': 91}, page_content="Harry could sort of see what Hagrid meant. Once you got over the first\nshock of seeing something that was, half horse, half bird, you startedto appreciate the hippogriffs' gleaming coats, changing smoothly fromfeather to hair, each of them a different color: stormy gray, bronze,"),
 Document(metadata={'source': 'Dataset\\4_harry_potter_and_the_goblet_of_fire.pdf', 'page': 213}, page_content='CHAPTER  THIRTEEN \n\x91 198 \x

# Post-process outputs

In [29]:
def wrap_text_preserve_newlines(text, width=700):
    # Split the input text into lines based on newline characters
    lines = text.split('\n')

    # Wrap each line individually
    wrapped_lines = [textwrap.fill(line, width=width) for line in lines]

    # Join the wrapped lines back together using newline characters
    wrapped_text = '\n'.join(wrapped_lines)

    return wrapped_text


def process_llm_response(llm_response):
    ans = wrap_text_preserve_newlines(llm_response['result'])
    
    sources_used = ' \n'.join(
        [
            source.metadata['source'].split('/')[-1][:-4]
            + ' - page: '
            + str(source.metadata['page'])
            for source in llm_response['source_documents']
        ]
    )
    
    ans = ans + '\n\nSources: \n' + sources_used
    return ans

In [30]:
def llm_ans(query):
    start = time.time()
    
    llm_response = qa_chain.invoke(query)
    ans = process_llm_response(llm_response)
    
    end = time.time()

    time_elapsed = int(round(end - start, 0))
    time_elapsed_str = f'\n\nTime elapsed: {time_elapsed} s'
    return ans + time_elapsed_str

# Ask questions

In [31]:
CFG.model_name

'llama2-7b-chat'

In [32]:
query = "Which challenges does Harry face during the Triwizard Tournament?"
print(llm_ans(query))


Don't try to make up an answer, if you don't know just say that you don't know.
Answer in the same language the question was asked.
Use only the following pieces of context to answer the question at the end.

“Speak for yourself,” said George  shortly. “You’ll try and get in,
won’t you, Harry?”
Harry thought briefly of Dumble dore’s insistence that nobody
under seventeen should submit th eir name, but then the wonder-
ful picture of himself winning the Triwizard Tournament filled his

whether it had anything to do with entering the Triwizard Tournament.
As Harry watched, George shook his head at Fred, scratched out
something with his quill, and said, in a very quiet voice that never-

Harry asked. “Thought any more about trying to enter?”
“I asked McGonagall how the champions are chosen but she

P a g e  | 117 Harry Potter and the Chamber of Secrets – J. K. Rowling “Whassamatter?” said Harry groggily.
“Quidditch practice!” said Wood. “Come on!”
Harry squinted at the window. There was 

In [33]:
query = "Is Malfoy an ally of Voldemort?"
print(llm_ans(query))


Don't try to make up an answer, if you don't know just say that you don't know.
Answer in the same language the question was asked.
Use only the following pieces of context to answer the question at the end.

infatuation with the Dark Arts, but now the tiniest
drop of pity mingled with his dislike. Where, Harry
wondered, was Malfoy now, and what was Voldemort
making him do under threat of killing him and his
parents?
Harry’s thoughts were interrupted by a nudge in the
ribs from Ginny. Professor McGonagall had risen to

ter directed toward finding and aiding your master?”
“My Lord, I was constantly on th e alert,” came Lucius Malfoy’s
voice swiftly from beneath the h ood. “Had there been any sign
from you, any whisper of your wh ereabouts, I would have been at
your side immediately, nothin g could have prevented me —”
“And yet you ran from my Mark, when a faithful Death Eater
sent it into the sky last summer?” said Voldemort lazily, and Mr.
Malfoy stopped talking abruptly. “Yes, I know

In [34]:
query = "What are horcrux?"
print(llm_ans(query))


Don't try to make up an answer, if you don't know just say that you don't know.
Answer in the same language the question was asked.
Use only the following pieces of context to answer the question at the end.

It was very well done, thought Harry, the hesitancy,
the casual tone, the careful flattery, none of it
overdone. He, Harry, had had too much experience of
trying to wheedle information out of reluctant people
not to recognize a master at work. He could tell that
Riddle wanted the information very, very much;
perhaps had been working toward this moment for
weeks.
“Well,” said Slughorn, not looking at Riddle, but
fiddling with the ribbon on top of his box of
crystalized pineapple, “well, it can’t hurt to give you
an overview, of course. Just so that you understand
the term. A Horcrux is the word used for an object in
which a person has concealed part of their soul.”
“I don’t quite understand how that works, though,
sir,” said Riddle.

low voice, as they stood in the deserted, snowy

In [35]:
query = "Give me 5 examples of cool potions and explain what they do"
print(llm_ans(query))


Don't try to make up an answer, if you don't know just say that you don't know.
Answer in the same language the question was asked.
Use only the following pieces of context to answer the question at the end.

“Ah, yes, Professor McGonagall did mention … not to
worry, my dear boy, not to worry at all. You can use
ingredients from the store cupboard today, and I’m
sure we can lend you some scales, and we’ve got a
small stock of old books here, they’ll do until you can
write to Flourish and Blotts. …”
Slughorn strode over to a corner cupboard and, after
a moment’s foraging, emerged with two very battered-
looking copies of Advanced Potion-Making  by Libatius
Borage, which he gave to Harry and Ron along with
two sets of tarnished scales.
“Now then,” said Slughorn, returning to the front of
the class and inflating his already bulging chest so
that the buttons on his waistcoat threatened to burst
off, “I’ve prepared a few potions for you to have a look

P a g e  | 205 Harry Potter and the H