In [None]:
# List the NVIDIA GPUs visible to this runtime (one line per device).
! nvidia-smi -L

GPU 0: Tesla T4 (UUID: GPU-d00b67b4-e3be-1d3b-9ff9-10b7ae9bdb9e)


In [None]:
%%time

from IPython.display import clear_output

! pip install sentence_transformers==2.2.2 -q

! pip install -qq -U langchain -q
! pip install -qq -U tiktoken -q
! pip install -qq -U pypdf -q
! pip install -qq -U faiss-gpu -q
! pip install -qq -U InstructorEmbedding -q

! pip install -qq -U transformers -q
! pip install -qq -U accelerate -q
! pip install -qq -U bitsandbytes -q

clear_output()


CPU times: user 519 ms, sys: 67.1 ms, total: 586 ms
Wall time: 1min 7s


In [None]:
# Colab-specific: mount Google Drive at /content/drive. The PDF folder read
# later in the notebook lives under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
%%time

import warnings
warnings.filterwarnings("ignore")

import os
import glob
import textwrap
import time

import langchain

### loaders
from langchain.document_loaders import PyPDFLoader, DirectoryLoader

### splits
from langchain.text_splitter import RecursiveCharacterTextSplitter

### prompts
from langchain import PromptTemplate, LLMChain

### vector stores
from langchain.vectorstores import FAISS

### models
from langchain.llms import HuggingFacePipeline
from langchain.embeddings import HuggingFaceInstructEmbeddings

### retrievers
from langchain.chains import RetrievalQA

import torch
import transformers
from transformers import (
    AutoTokenizer, AutoModelForCausalLM,
    BitsAndBytesConfig,
    pipeline
)

clear_output()

CPU times: user 8.94 s, sys: 2.43 s, total: 11.4 s
Wall time: 19.8 s


In [None]:
# Report the version of each core library, one "name: version" line per library.
for label, lib in (
    ('langchain:', langchain),
    ('torch:', torch),
    ('transformers:', transformers),
):
    print(label, lib.__version__)

langchain: 0.1.20
torch: 2.2.1+cu121
transformers: 4.40.2


In [None]:
# Sanity check: list (sorted) everything in the Drive folder that holds the book PDFs.
sorted(glob.glob('/content/drive/MyDrive/Harry Potter Books/*'))

['/content/drive/MyDrive/Harry Potter Books/Harry Potter And The Chamber Of Secrets [Harry Potter 2] (J.K. Rowling) (Z-Library) - Copy.pdf',
 '/content/drive/MyDrive/Harry Potter Books/Harry Potter and the Deathly Hallows [Harry Potter 7] (J.K. Rowling) (Z-Library).pdf',
 '/content/drive/MyDrive/Harry Potter Books/Harry Potter and the Half-Blood Prince [Harry Potter 6] (J.K. Rowling) (Z-Library) - Copy.pdf',
 '/content/drive/MyDrive/Harry Potter Books/Harry Potter and the Order of the Phoenix (J.K. Rowling) (Z-Library)(#5) - Copy.pdf',
 '/content/drive/MyDrive/Harry Potter Books/Harry Potter and the Philosophers Stone (J.K. Rowling) (Z-Library) - Copy.pdf',
 '/content/drive/MyDrive/Harry Potter Books/Harry Potter and the Prisoner of Azkaban [Harry Potter 3] (J.K. Rowling) (Z-Library) - Copy.pdf',
 '/content/drive/MyDrive/Harry Potter Books/Rowling, J.K - Harry Potter 04 - The Goblet of Fire (Rowling, J.K [Rowling, J.K]) (Z-Library) - Copy.pdf']

In [None]:
class CFG:
    """Central configuration for the notebook.

    The cells below read these values directly as class attributes
    (e.g. ``CFG.k``); the class is used as a plain namespace.
    """
    # LLMs
    # model_name is the key passed to get_model(); supported keys:
    # wizardlm, llama2-7b-chat, llama2-13b-chat, mistral-7B
    model_name = 'llama2-13b-chat' # wizardlm, llama2-7b-chat, llama2-13b-chat, mistral-7B
    # Generation settings forwarded to the transformers text-generation pipeline.
    # NOTE(review): `do_sample` is commented out in the pipeline cell, so
    # temperature / top_p presumably have no effect on decoding - confirm.
    temperature = 0
    top_p = 0.95
    repetition_penalty = 1.15

    # splitting
    # Passed to RecursiveCharacterTextSplitter as chunk_size / chunk_overlap.
    split_chunk_size = 800
    split_overlap = 0

    # embeddings
    # Model name handed to the embeddings wrapper when building and loading the index.
    embeddings_model_repo = 'sentence-transformers/all-MiniLM-L6-v2'

    # similar passages
    # Number of passages requested from the vector store per query.
    k = 6

    # paths
    # Root folder scanned for PDFs by the DirectoryLoader.
    PDFs_path = '/content/drive/MyDrive/Harry Potter Books/'
    # Presumably the location of a prebuilt FAISS index (called the "input
    # folder" in the index cells) - verify before relying on it.
    Embeddings_path =  '/content/faiss-hp-sentence-transformers'
    # The FAISS index is saved to / loaded from '<Output_folder>/faiss_index_hp'.
    Output_folder = './harry-potter-vectordb'

In [None]:
def get_model(model = CFG.model_name):
    """Load the tokenizer and the 4-bit quantized causal LM for a model key.

    Args:
        model: One of 'wizardlm', 'llama2-7b-chat', 'llama2-13b-chat' or
            'mistral-7B'. Defaults to CFG.model_name (the default is evaluated
            once, when this function is defined).

    Returns:
        A tuple ``(tokenizer, model, max_len)``: the tokenizer, the loaded
        model and the length limit this notebook uses for that model.

    Raises:
        ValueError: If ``model`` is not one of the supported keys. Previously
            this case printed a message and then failed on the ``return`` with
            an UnboundLocalError because nothing had been loaded.
    """
    # key -> (Hub repo, extra tokenizer kwargs, extra model kwargs, max_len)
    # NOTE(review): trust_remote_code=True allows code shipped in the Hub repo
    # to run at load time. It is kept for the two Llama repos to preserve the
    # existing behavior - confirm it is actually required.
    supported = {
        'wizardlm': ('TheBloke/wizardLM-7B-HF', {}, {}, 1024),
        'llama2-7b-chat': (
            'daryl149/llama-2-7b-chat-hf',
            {'use_fast': True},
            {'trust_remote_code': True},
            2048,
        ),
        'llama2-13b-chat': (
            'daryl149/llama-2-13b-chat-hf',
            {'use_fast': True},
            {'trust_remote_code': True},
            2048, # 8192
        ),
        'mistral-7B': ('mistralai/Mistral-7B-v0.1', {}, {}, 1024),
    }

    # Fail fast, before any download starts, on an unknown key.
    if model not in supported:
        raise ValueError(
            f"Not implemented model (tokenizer and backbone): {model!r}. "
            f"Supported models: {', '.join(supported)}"
        )

    print('\nDownloading model: ', model, '\n\n')

    model_repo, tokenizer_kwargs, model_kwargs, max_len = supported[model]

    tokenizer = AutoTokenizer.from_pretrained(model_repo, **tokenizer_kwargs)

    # The same 4-bit NF4 quantization settings are used for every supported model.
    bnb_config = BitsAndBytesConfig(
        load_in_4bit = True,
        bnb_4bit_quant_type = "nf4",
        bnb_4bit_compute_dtype = torch.float16,
        bnb_4bit_use_double_quant = True,
    )

    loaded_model = AutoModelForCausalLM.from_pretrained(
        model_repo,
        quantization_config = bnb_config,
        device_map = 'auto',
        low_cpu_mem_usage = True,
        **model_kwargs
    )

    return tokenizer, loaded_model, max_len

In [None]:
%%time

# Load the tokenizer / model selected in CFG. clear_output() then wipes the
# download progress output of this cell.
tokenizer, model, max_len = get_model(model = CFG.model_name)

clear_output()


Downloading model:  llama2-13b-chat 




ImportError: Using `bitsandbytes` 8-bit quantization requires Accelerate: `pip install accelerate` and the latest version of bitsandbytes: `pip install -i https://pypi.org/simple/ bitsandbytes`

In [None]:
# Put the model in evaluation (inference) mode; the module repr is displayed.
model.eval()

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(32000, 5120, padding_idx=0)
    (layers): ModuleList(
      (0-39): 40 x LlamaDecoderLayer(
        (self_attn): LlamaSdpaAttention(
          (q_proj): Linear4bit(in_features=5120, out_features=5120, bias=False)
          (k_proj): Linear4bit(in_features=5120, out_features=5120, bias=False)
          (v_proj): Linear4bit(in_features=5120, out_features=5120, bias=False)
          (o_proj): Linear4bit(in_features=5120, out_features=5120, bias=False)
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear4bit(in_features=5120, out_features=13824, bias=False)
          (up_proj): Linear4bit(in_features=5120, out_features=13824, bias=False)
          (down_proj): Linear4bit(in_features=13824, out_features=5120, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm()
        (post_attention_layernorm): LlamaRMSNorm()
      )
    )


In [None]:
### check how Accelerate split the model across the available devices (GPUs)
### (the model was loaded with device_map = 'auto' in get_model)
model.hf_device_map

{'': 0}

In [None]:
### hugging face pipeline
### - max_length bounds prompt tokens + generated tokens together, so a long
###   retrieved context leaves less room for the answer.
### - return_full_text = False makes the pipeline return only the newly
###   generated text. Without it every answer starts with a copy of the whole
###   prompt (instructions + retrieved context), which is what the RAG answers
###   further down were showing.
pipe = pipeline(
    task = "text-generation",
    model = model,
    tokenizer = tokenizer,
    pad_token_id = tokenizer.eos_token_id,
#     do_sample = True,
    max_length = max_len,
    temperature = CFG.temperature,
    top_p = CFG.top_p,
    repetition_penalty = CFG.repetition_penalty,
    return_full_text = False
)

### langchain pipeline
llm = HuggingFacePipeline(pipeline = pipe)

In [None]:
# Display the LangChain wrapper to confirm it was created around the pipeline.
llm

HuggingFacePipeline(pipeline=<transformers.pipelines.text_generation.TextGenerationPipeline object at 0x7f6ba02246d0>)

In [None]:
%%time
### testing model, not using the harry potter books yet
### answer is not necessarily related to harry potter
### (the LLM is called directly here, without any retrieval step)
query = "Give me 5 examples of cool potions and explain what they do"
llm.invoke(query)

CPU times: user 36.1 s, sys: 284 ms, total: 36.4 s
Wall time: 39.6 s


'Give me 5 examples of cool potions and explain what they do.\n\nSure thing! Here are five examples of cool potions that you might find in a fantasy world, along with their effects:\n\n1. Potion of Healing: This potion restores health to the drinker, healing wounds and injuries. It might also grant temporary immunity to future damage or disease.\n2. Potion of Strength: This potion grants the drinker increased physical strength and endurance for a short period of time, allowing them to lift heavier objects, run faster, and fight longer.\n3. Potion of Speed: This potion allows the drinker to move at incredible speeds for a short period of time, making it easier to escape danger or chase down enemies.\n4. Potion of Invisibility: This potion makes the drinker temporarily invisible, allowing them to sneak past guards, avoid detection by monsters, or steal valuable items without being caught.\n5. Potion of Flight: This potion gives the drinker the ability to fly for a short period of time, a

In [None]:
# Reminder of which model produced the answer above.
CFG.model_name

'llama2-13b-chat'

In [None]:
%%time

# Load every PDF found under CFG.PDFs_path (the glob pattern also descends
# into sub-folders) using PyPDFLoader, with a progress bar and multithreading.
loader = DirectoryLoader(
    CFG.PDFs_path,
    glob="./**/*.pdf",
    loader_cls=PyPDFLoader,
    show_progress=True,
    use_multithreading=True
)

# The following cells treat each returned document as one PDF page.
documents = loader.load()

100%|██████████| 7/7 [01:57<00:00, 16.71s/it]

CPU times: user 1min 49s, sys: 0 ns, total: 1min 49s
Wall time: 1min 57s





In [None]:
# Each loaded document is counted as one page.
print(f'We have {len(documents)} pages in total')

We have 3079 pages in total


In [None]:
# Peek at the raw extracted text of one document (index 8) to judge extraction quality.
documents[8].page_content

'“You have no wife,” said the cold voice, ve ry quietly. “Nobody knows you are here. You told \nnobody that you were coming. Do not lie to Lord Vo ldemort, Muggle, for he knows… he always \nknows…” \n “Is that right?” said Frank r oughly. “Lord, is it? Well, I don’t think much of your manners, My \nLord. Turn ‘round and face me like a man, why don’t you?”  “But I am not a man, Muggle,” said  the cold voice, bare ly audible now over the crackling of the \nflames. “I am much, much more than a ma n. However… why not? I will face you… Wormtail, \ncome turn my chair around.”  The servant gave a whimper.  “You heard me, Wormtail.”  Slowly, with his face screwed up, as though he w ould rather have done anything than approach \nhis master and the hearth rug where the snake lay, the small man walked forward and began to \nturn the chair. The snake lifted its ugly triangular h ead and hissed slightly as the legs of the chair \nsnagged on its rug.  And then the chair was facing Frank, and he saw 

In [None]:
# Split the loaded pages into smaller chunks; chunk size and overlap come from CFG.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size = CFG.split_chunk_size,
    chunk_overlap = CFG.split_overlap
)

# These chunks (not the whole pages) are what gets embedded and indexed below.
texts = text_splitter.split_documents(documents)

print(f'We have created {len(texts)} chunks from {len(documents)} pages')

We have created 10113 chunks from 3079 pages


In [None]:
%%time

### we create the embeddings only if they do not exist yet
### The check targets the folder this cell saves to (and the loading cell reads
### from). It used to test CFG.Embeddings_path, which this cell never writes,
### so the guard could not detect an index built by a previous run.
faiss_index_dir = f"{CFG.Output_folder}/faiss_index_hp"

if not os.path.exists(os.path.join(faiss_index_dir, 'index.faiss')):

    ### download embeddings model
    ### NOTE(review): CFG.embeddings_model_repo is a sentence-transformers
    ### model but is loaded through the Instructor embeddings wrapper - confirm
    ### this is intended. Whatever is used here must also be used when the
    ### index is loaded.
    embeddings = HuggingFaceInstructEmbeddings(
        model_name = CFG.embeddings_model_repo,
        model_kwargs = {"device": "cuda"}
    )

    ### create embeddings and DB
    vectordb = FAISS.from_documents(
        documents = texts,
        embedding = embeddings
    )

    ### persist vector database (writes index.faiss / index.pkl into the folder)
    vectordb.save_local(faiss_index_dir) # save in output folder

.gitattributes:   0%|          | 0.00/1.23k [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

data_config.json:   0%|          | 0.00/39.3k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

train_script.py:   0%|          | 0.00/13.2k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

load INSTRUCTOR_Transformer
max_seq_length  512
CPU times: user 32.8 s, sys: 549 ms, total: 33.3 s
Wall time: 37.2 s


In [None]:
%%time

### download embeddings model
### (same model name and wrapper class as in the index-building cell, so that
### query vectors are comparable with the stored ones)
embeddings = HuggingFaceInstructEmbeddings(
    model_name = CFG.embeddings_model_repo,
    model_kwargs = {"device": "cuda"}
)

### load vector DB embeddings
### NOTE: allow_dangerous_deserialization=True permits unpickling the stored
### index metadata. That is acceptable here only because the index is written
### by this notebook itself; do not load an index from an untrusted source
### this way.
vectordb = FAISS.load_local(
#    CFG.Embeddings_path, # from input folder
    CFG.Output_folder + '/faiss_index_hp', # from output folder
    embeddings,
    allow_dangerous_deserialization=True
)

clear_output()

CPU times: user 114 ms, sys: 65.3 ms, total: 179 ms
Wall time: 180 ms


In [None]:
### test if vector DB was loaded correctly
### (uses the vector store's default number of results, not CFG.k)
vectordb.similarity_search('magic creatures')

[Document(page_content='be the work of dark wizards or witches unknown.', metadata={'source': '/content/drive/MyDrive/Harry Potter Books/Harry Potter and the Philosophers Stone (J.K. Rowling) (Z-Library).pdf', 'page': 89}),
 Document(page_content='Adalbert Waffling\nA Beginner’s Guide to Transfiguration by \nEmeric Switch\nOne Thousand Magical Herbs and Fungi by \nPhyllida Spore\nMagical Drafts and Potions by \nArsenius Jigger\nFantastic Beasts and Where to Find Them by \nNewt Scamander\nThe Dark Forces: A Guide to Self-Protection by \nQuentin Trimble\n \nOther Equipment\n1 wand\n1 cauldron (pewter, standard size 2)\n1 set glass or crystal phials\n1 telescope\n1 set brass scales\n \nStudents may also bring an owl OR a cat OR a toad\n \nPARENTS ARE REMINDED THAT FIRST-YEARS ARE NOT ALLOWED THEIR OWN\nBROOMSTICKS', metadata={'source': '/content/drive/MyDrive/Harry Potter Books/Harry Potter and the Philosophers Stone (J.K. Rowling) (Z-Library).pdf', 'page': 45}),
 Document(page_content='t

In [None]:
# Prompt used by the QA chain: {context} is filled with the retrieved passages
# and {question} with the user query. The text ends with "Answer:" so the model
# continues from there.
prompt_template = """
Don't try to make up an answer, if you don't know just say that you don't know.
Answer in the same language the question was asked.
Use only the following pieces of context to answer the question at the end.

{context}

Question: {question}
Answer:"""


# input_variables must match the placeholders used in the template above.
PROMPT = PromptTemplate(
    template = prompt_template,
    input_variables = ["context", "question"]
)

In [None]:
# search_type is an argument of as_retriever() itself; search_kwargs only
# carries the options forwarded to the underlying search call (here: k).
# It was previously placed inside search_kwargs, where it did not configure
# the retriever at all.
retriever = vectordb.as_retriever(
    search_type = "similarity",
    search_kwargs = {"k": CFG.k}
)

# "stuff" puts all retrieved passages into a single prompt (PROMPT);
# the source documents are returned so they can be listed with the answer.
qa_chain = RetrievalQA.from_chain_type(
    llm = llm,
    chain_type = "stuff", # map_reduce, map_rerank, stuff, refine
    retriever = retriever,
    chain_type_kwargs = {"prompt": PROMPT},
    return_source_documents = True,
    verbose = False
)

In [None]:
### testing MMR search
### (exploratory only: the retriever built above is not configured for MMR)
question = "Which are Hagrid's favorite animals?"
vectordb.max_marginal_relevance_search(question, k = CFG.k)

[Document(page_content='only too well - he had owned one for a brief pe riod during their first ye ar, a vicious Norwegian \nRidgeback by the name of Norbert. Hagrid simply loved monstrous creatures, the more lethal, \nthe better.  “Well, at least the skrewts are small,” said Ron as  they made their way back up to the castle for \nlunch an hour later.  “They are now,” said Hermione in an exasperated voice, “but once Hagrid’s found out what they eat, I expect they’ll be six feet long.” \n “Well, that won’t matter if they turn out to cure seasickness or something, will it?” said Ron, \ngrinning slyly at her.  “You know perfectly well I only said that to s hut Malfoy up,” said Hermione. “As a matter of \nfact I think he’s right. The best thing to do would be to stamp on th e lot of them before they start', metadata={'source': '/content/drive/MyDrive/Harry Potter Books/Rowling, J.K - Harry Potter 04 - The Goblet of Fire (Rowling, J.K [Rowling, J.K]) (Z-Library).pdf', 'page': 124}),
 Docume

In [None]:
### testing similarity search
### (same question and k as the MMR test above, for a side-by-side comparison)
question = "Which are Hagrid's favorite animals?"
vectordb.similarity_search(question, k = CFG.k)

[Document(page_content='only too well - he had owned one for a brief pe riod during their first ye ar, a vicious Norwegian \nRidgeback by the name of Norbert. Hagrid simply loved monstrous creatures, the more lethal, \nthe better.  “Well, at least the skrewts are small,” said Ron as  they made their way back up to the castle for \nlunch an hour later.  “They are now,” said Hermione in an exasperated voice, “but once Hagrid’s found out what they eat, I expect they’ll be six feet long.” \n “Well, that won’t matter if they turn out to cure seasickness or something, will it?” said Ron, \ngrinning slyly at her.  “You know perfectly well I only said that to s hut Malfoy up,” said Hermione. “As a matter of \nfact I think he’s right. The best thing to do would be to stamp on th e lot of them before they start', metadata={'source': '/content/drive/MyDrive/Harry Potter Books/Rowling, J.K - Harry Potter 04 - The Goblet of Fire (Rowling, J.K [Rowling, J.K]) (Z-Library).pdf', 'page': 124}),
 Docume

In [None]:
def wrap_text_preserve_newlines(text, width=700):
    """Wrap `text` to `width` columns while keeping its existing line breaks.

    Each newline-separated segment is wrapped on its own with textwrap.fill,
    then the segments are joined back with newline characters.
    """
    return '\n'.join(
        textwrap.fill(segment, width=width)
        for segment in text.split('\n')
    )


def process_llm_response(llm_response):
    """Format a RetrievalQA response as answer text followed by its sources.

    Args:
        llm_response: Mapping with a 'result' string and a 'source_documents'
            list whose items expose a `metadata` dict containing 'source'
            (file path) and 'page'.

    Returns:
        The wrapped answer, then a "Sources:" section with one
        "<file name without extension> - page: <page>" line per source document.
    """
    answer = wrap_text_preserve_newlines(llm_response['result'])

    source_lines = []
    for doc in llm_response['source_documents']:
        # Strip the directory and the real extension instead of slicing off a
        # fixed four characters, which only worked for 3-letter extensions.
        file_name = os.path.basename(doc.metadata['source'])
        title = os.path.splitext(file_name)[0]
        source_lines.append(f"{title} - page: {doc.metadata['page']}")

    return answer + '\n\nSources: \n' + ' \n'.join(source_lines)

In [None]:
def llm_ans(query):
    """Answer `query` with the QA chain and append the elapsed wall-clock time.

    Returns the formatted answer (with sources) followed by a
    "Time elapsed: N s" line, where N is rounded to whole seconds.
    """
    started_at = time.time()

    formatted_answer = process_llm_response(qa_chain.invoke(query))

    elapsed_seconds = int(round(time.time() - started_at, 0))
    return formatted_answer + f'\n\nTime elapsed: {elapsed_seconds} s'

In [None]:
# Reminder of which model answers the RAG questions below.
CFG.model_name

'llama2-13b-chat'

In [None]:
# Ask the RAG chain a question and print the formatted answer, sources and timing.
query = "How many challenges does Harry face during the Triwizard Tournament?"
print(llm_ans(query))


Don't try to make up an answer, if you don't know just say that you don't know.
Answer in the same language the question was asked.
Use only the following pieces of context to answer the question at the end.

to station themselves around the maze. Bagman now pointed his wand at  his throat, muttered,
“Sonorus,” and his magically magnifi ed voice echoed into the stands.
 “Ladies and gentlemen, the third and final task of  the Triwizard Tournament is about to begin!
Let me remind you how the points currently stand! Tied in first place, with eighty-five points
each - Mr. Cedric Diggory and Mr. Harry Potter, both of Hogwarts School!” The cheers and applause sent birds from the Forbidden Forest fluttering into the darken ing sky. “In second place,
with eighty points - Mr. Viktor Krum, of Durmstrang In stitute!” More applau se. “And in third
place – Miss Fleur Delacour, of Beauxbatons Academy!”  Harry could just make out Mrs. Weasley, Bill , Ron, and Hermione applauding Fleur politely,

Ron

In [None]:
# Ask the RAG chain a question and print the formatted answer, sources and timing.
query = "What are horcrux?"
print(llm_ans(query))


Don't try to make up an answer, if you don't know just say that you don't know.
Answer in the same language the question was asked.
Use only the following pieces of context to answer the question at the end.

nize a mas ter at work. He could tell that Rid dle want ed the in for ma tion
very, very much; per haps had been work ing to ward this mo ment for weeks.
“Well,” said Slughorn, not look ing at Rid dle, but fid dling with the rib‐
bon on top of his box of crys tal lized pineap ple, “well, it can’t hurt to give
you an overview , of course. Just so that you un der stand t he term. A Hor‐
crux is the word  used for an ob ject in which a per son has con cealed part of
their soul.”
“I don’ t quite un der stand how that works, though, sir ,” said Rid dle.
His voice was care ful ly con trolled, but Har ry could sense his ex cite‐
ment.
“Well, you split your soul, you see,” said Slughorn, “and hide part of it
in an ob ject out side the body . Then, even if one’s body is at tacked or de‐



In [None]:
# Same question as the direct LLM test earlier, now answered through the RAG chain.
query = "Give me 5 examples of cool potions and explain what they do"
print(llm_ans(query))


Don't try to make up an answer, if you don't know just say that you don't know.
Answer in the same language the question was asked.
Use only the following pieces of context to answer the question at the end.

Adalbert Waffling
A Beginner’s Guide to Transfiguration by
Emeric Switch
One Thousand Magical Herbs and Fungi by
Phyllida Spore
Magical Drafts and Potions by
Arsenius Jigger
Fantastic Beasts and Where to Find Them by
Newt Scamander
The Dark Forces: A Guide to Self-Protection by
Quentin Trimble

Other Equipment
1 wand
1 cauldron (pewter, standard size 2)
1 set glass or crystal phials
1 telescope
1 set brass scales

Students may also bring an owl OR a cat OR a toad

PARENTS ARE REMINDED THAT FIRST-YEARS ARE NOT ALLOWED THEIR OWN
BROOMSTICKS

Snape’s eyes flashed. He plunged a hand into the in side of his black robes. For one wild moment.
Harry thought Snape was about to pull out his wa nd and curse him - then he saw that Snape had
drawn out a small crystal bottle of a comp letely c

In [None]:
import locale

# Force the reported preferred encoding to UTF-8.
# NOTE(review): presumably a workaround for shell commands (such as the pip
# install in the next cell) failing when the runtime locale is not UTF-8 -
# confirm it is still needed.
# The replacement accepts the optional `do_setlocale` argument of the real
# function, so callers that pass it no longer raise TypeError.
locale.getpreferredencoding = lambda do_setlocale=True: "UTF-8"

In [None]:
! pip install --upgrade gradio -qq
clear_output()

In [None]:
# Imported here rather than in the main import cell because gradio is only
# installed by the cell directly above.
import gradio as gr
print(gr.__version__)

4.29.0


In [None]:
def predict(message, history):
    """Chat callback for gr.ChatInterface: answer `message` with the QA chain.

    Args:
        message: The latest user message from the chat UI.
        history: Earlier chat turns passed in by the UI. Unused: each question
            is answered independently of the conversation so far.

    Returns:
        The text produced by llm_ans(message) with every newline replaced by
        an HTML line break so it renders correctly in the chat window.
    """
    return str(llm_ans(message)).replace("\n", "<br/>")
# Build the chat UI around predict(); the title shows the configured model name.
demo = gr.ChatInterface(
    fn = predict,
    title = f' Open-Source LLM ({CFG.model_name}) for Harry Potter Question Answering',
)

# Enable request queuing, then start the app.
demo.queue()
demo.launch()

Setting queue=True in a Colab notebook requires sharing enabled. Setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
Running on public URL: https://8c39114452493fb5d7.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from Terminal to deploy to Spaces (https://huggingface.co/spaces)


