In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/raima-new-with-mistral/new 3 (1).pdf
/kaggle/input/raima-new-with-mistral/new2 (1).pdf


In [2]:
! nvidia-smi

Mon Oct 14 06:12:15 2024       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.90.07              Driver Version: 550.90.07      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  Tesla T4                       Off |   00000000:00:04.0 Off |                    0 |
| N/A   46C    P8             10W /   70W |       1MiB /  15360MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
|   1  Tesla T4                       Off |   00

In [3]:
pip install -U langchain-community

Collecting langchain-community
  Downloading langchain_community-0.3.2-py3-none-any.whl.metadata (2.8 kB)
Collecting langchain<0.4.0,>=0.3.3 (from langchain-community)
  Downloading langchain-0.3.3-py3-none-any.whl.metadata (7.1 kB)
Collecting langchain-core<0.4.0,>=0.3.10 (from langchain-community)
  Downloading langchain_core-0.3.10-py3-none-any.whl.metadata (6.3 kB)
Collecting langsmith<0.2.0,>=0.1.125 (from langchain-community)
  Downloading langsmith-0.1.134-py3-none-any.whl.metadata (13 kB)
Collecting pydantic-settings<3.0.0,>=2.4.0 (from langchain-community)
  Downloading pydantic_settings-2.5.2-py3-none-any.whl.metadata (3.5 kB)
Collecting langchain-text-splitters<0.4.0,>=0.3.0 (from langchain<0.4.0,>=0.3.3->langchain-community)
  Downloading langchain_text_splitters-0.3.0-py3-none-any.whl.metadata (2.3 kB)
Collecting packaging<25,>=23.2 (from langchain-core<0.4.0,>=0.3.10->langchain-community)
  Downloading packaging-24.1-py3-none-any.whl.metadata (3.2 kB)
Collecting requests-

In [4]:
%%time

from IPython.display import clear_output

! pip install -qq -U langchain
! pip install -qq -U tiktoken
! pip install -qq -U pypdf
! pip install -qq -U faiss-gpu

! pip install sentence_transformers==2.2.2
! pip install -qq -U InstructorEmbedding

! pip install -qq -U transformers 
! pip install -qq -U accelerate
! pip install -qq -U bitsandbytes

clear_output()

CPU times: user 1.6 s, sys: 389 ms, total: 1.98 s
Wall time: 2min 7s


In [5]:
%%time

import warnings
warnings.filterwarnings("ignore")

import os
import glob
import textwrap
import time
import gc

import langchain

### loaders
from langchain.document_loaders import PyPDFLoader, DirectoryLoader

### splits
from langchain.text_splitter import RecursiveCharacterTextSplitter

### prompts
from langchain import PromptTemplate

### vector stores
from langchain_community.vectorstores import FAISS

### models
from langchain.llms import HuggingFacePipeline
from langchain.embeddings import HuggingFaceInstructEmbeddings

### retrievers
from langchain.chains import RetrievalQA

import torch

import transformers
from transformers import (
    AutoTokenizer, AutoModelForCausalLM,
    BitsAndBytesConfig,
    pipeline
)

CPU times: user 12 s, sys: 1.49 s, total: 13.5 s
Wall time: 19.5 s


In [6]:
print('langchain:', langchain.__version__)
print('torch:', torch.__version__)
print('transformers:', transformers.__version__)

langchain: 0.3.3
torch: 2.4.0
transformers: 4.45.2


In [7]:
len(glob.glob('/kaggle/input/raima-new-with-mistral/*'))

2

In [8]:
class CFG:
    DEBUG = False
    
    # LLM
    model_name = 'microsoft/Phi-3-mini-128k-instruct'
    temperature = 0.4
    top_p = 0.90
    repetition_penalty = 1.15
    max_len = 8192
    max_new_tokens = 512

    # splitting
    split_chunk_size = 800
    split_overlap = 400
    
    # embeddings
    embeddings_model_repo = 'BAAI/bge-base-en-v1.5'

    # similar passages
    k = 6
    
    # paths
    PDFs_path = '/kaggle/input/raima-new-with-mistral/'
    Embeddings_path =  '/kaggle/input/faiss-ml-papers-st'
    Output_folder = './ml-papers-vectordb'

In [9]:
loader = DirectoryLoader(
    CFG.PDFs_path,
    glob = "./*3215v3.pdf" if CFG.DEBUG else "./*.pdf",
    loader_cls = PyPDFLoader,
    show_progress = True,
    use_multithreading = True
)

documents = loader.load()

100%|██████████| 2/2 [00:01<00:00,  1.31it/s]


In [10]:
print(f'We have {len(documents)} pages in total')

We have 15 pages in total


In [11]:
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size = CFG.split_chunk_size,
    chunk_overlap = CFG.split_overlap
)

texts = text_splitter.split_documents(documents)

print(f'We have created {len(texts)} chunks from {len(documents)} pages')

We have created 134 chunks from 15 pages


In [12]:
if not os.path.exists(CFG.Embeddings_path + '/index.faiss'):
    
    print('Creating embeddings...\n\n')

    ### download embeddings model
    embeddings = HuggingFaceInstructEmbeddings(
        model_name = CFG.embeddings_model_repo,
        model_kwargs = {"device": "cuda"}
    )

    ### create embeddings and DB
    vectordb = FAISS.from_documents(
        documents = texts, 
        embedding = embeddings
    )

    ### persist vector database
    vectordb.save_local(f"{CFG.Output_folder}/faiss_index_ml_papers") # save in output folder
#     vectordb.save_local(f"{CFG.Embeddings_path}/faiss_index_ml_papers") # save in input folder

clear_output()

In [13]:
embeddings = HuggingFaceInstructEmbeddings(
    model_name = CFG.embeddings_model_repo,
    model_kwargs = {"device": "cuda"}
)

### load vector DB embeddings
vectordb = FAISS.load_local(
#     CFG.Embeddings_path, # from input folder
    CFG.Output_folder + '/faiss_index_ml_papers', # from output folder
    embeddings,
    allow_dangerous_deserialization = True,
)

clear_output()

In [14]:
%%time

### test if vector DB was loaded correctly
vectordb.similarity_search('scaling laws')

CPU times: user 136 ms, sys: 44.2 ms, total: 180 ms
Wall time: 211 ms


[Document(metadata={'source': '/kaggle/input/raima-new-with-mistral/new 3 (1).pdf', 'page': 0}, page_content='This is an open access article under the CC BY -SA license.  \n \nCorresponding Author:  \nAshwini V. Zadgaonkar  \nDepartment of Information Technology  \nShri Ramdeobaba College of Engineering and Management  \nNagpur, 440013, India  \nEmail: ashwinizadgaonkar24@gmail.com  \n \n \n1. INTRODUCTION   \nNowadays a lot of information is available on the internet in a structured and unstructured form \nstored in multiple documents. This information belongs to different domai ns and needs to be analyzed and \nprocessed to extract the desired piece of information for a particular task. M anual processing and analy sis of \nsuch a large repository of documents demand too much efforts and it will be ve ry much time consuming also.'),
 Document(metadata={'source': '/kaggle/input/raima-new-with-mistral/new 3 (1).pdf', 'page': 3}, page_content='machine learning model. The model is built 

In [15]:
pip install -U bitsandbytes

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Note: you may need to restart the kernel to use updated packages.


In [16]:
def build_model(model_repo = CFG.model_name):

    print('\nDownloading model: ', model_repo, '\n\n')

    ### tokenizer
    tokenizer = AutoTokenizer.from_pretrained(model_repo)

    ### quantization
    bnb_config = BitsAndBytesConfig(
        load_in_4bit = True,
        bnb_4bit_quant_type = "nf4",
        bnb_4bit_compute_dtype = torch.float16,
        bnb_4bit_use_double_quant = True,
    )        

    ### model
    model = AutoModelForCausalLM.from_pretrained(
        model_repo,
        quantization_config = bnb_config,
        device_map = 'auto',
        low_cpu_mem_usage = True,
        trust_remote_code = True,
    )

    return tokenizer, model

In [17]:
tokenizer, model = build_model(model_repo = CFG.model_name)


Downloading model:  microsoft/Phi-3-mini-128k-instruct 




tokenizer_config.json:   0%|          | 0.00/3.44k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.94M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/306 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/599 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/3.48k [00:00<?, ?B/s]

configuration_phi3.py:   0%|          | 0.00/11.2k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/microsoft/Phi-3-mini-128k-instruct:
- configuration_phi3.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


modeling_phi3.py:   0%|          | 0.00/73.2k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/microsoft/Phi-3-mini-128k-instruct:
- modeling_phi3.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


model.safetensors.index.json:   0%|          | 0.00/16.3k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.97G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/2.67G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/181 [00:00<?, ?B/s]

In [18]:
gc.collect()

51

In [19]:
model.eval()

Phi3ForCausalLM(
  (model): Phi3Model(
    (embed_tokens): Embedding(32064, 3072, padding_idx=32000)
    (embed_dropout): Dropout(p=0.0, inplace=False)
    (layers): ModuleList(
      (0-31): 32 x Phi3DecoderLayer(
        (self_attn): Phi3Attention(
          (o_proj): Linear4bit(in_features=3072, out_features=3072, bias=False)
          (qkv_proj): Linear4bit(in_features=3072, out_features=9216, bias=False)
          (rotary_emb): Phi3LongRoPEScaledRotaryEmbedding()
        )
        (mlp): Phi3MLP(
          (gate_up_proj): Linear4bit(in_features=3072, out_features=16384, bias=False)
          (down_proj): Linear4bit(in_features=8192, out_features=3072, bias=False)
          (activation_fn): SiLU()
        )
        (input_layernorm): Phi3RMSNorm()
        (resid_attn_dropout): Dropout(p=0.0, inplace=False)
        (resid_mlp_dropout): Dropout(p=0.0, inplace=False)
        (post_attention_layernorm): Phi3RMSNorm()
      )
    )
    (norm): Phi3RMSNorm()
  )
  (lm_head): Linear(in_fe

In [20]:
model.hf_device_map

{'model.embed_tokens': 0,
 'model.embed_dropout': 0,
 'model.layers.0': 0,
 'model.layers.1': 0,
 'model.layers.2': 0,
 'model.layers.3': 0,
 'model.layers.4': 0,
 'model.layers.5': 0,
 'model.layers.6': 0,
 'model.layers.7': 0,
 'model.layers.8': 0,
 'model.layers.9': 0,
 'model.layers.10': 0,
 'model.layers.11': 0,
 'model.layers.12': 0,
 'model.layers.13': 1,
 'model.layers.14': 1,
 'model.layers.15': 1,
 'model.layers.16': 1,
 'model.layers.17': 1,
 'model.layers.18': 1,
 'model.layers.19': 1,
 'model.layers.20': 1,
 'model.layers.21': 1,
 'model.layers.22': 1,
 'model.layers.23': 1,
 'model.layers.24': 1,
 'model.layers.25': 1,
 'model.layers.26': 1,
 'model.layers.27': 1,
 'model.layers.28': 1,
 'model.layers.29': 1,
 'model.layers.30': 1,
 'model.layers.31': 1,
 'model.norm': 1,
 'lm_head': 1}

In [21]:
terminators = [
    tokenizer.eos_token_id,
    tokenizer.bos_token_id
]


### hugging face pipeline
pipe = pipeline(
    task = "text-generation",
    
    model = model,
    
    tokenizer = tokenizer,
#     pad_token_id = tokenizer.eos_token_id,
    eos_token_id = terminators,
    
    do_sample = True,
#     max_length = CFG.max_len,
    max_new_tokens = CFG.max_new_tokens,
    
    
    temperature = CFG.temperature,
    top_p = CFG.top_p,
    repetition_penalty = CFG.repetition_penalty,
)

### langchain pipeline
llm = HuggingFacePipeline(pipeline = pipe)

  llm = HuggingFacePipeline(pipeline = pipe)


In [24]:
prompt_template = """
<|system|>

You are an expert assistant that answers questions about machine learning and Large Language Models (LLMs).

You are given some extracted parts from machine learning papers along with a question.

If you don't know the answer, just say "I don't know." Don't try to make up an answer.

It is very important that you ALWAYS answer the question in the same language the question is in. Remember to always do that.

Use only the following pieces of context to answer the question at the end.

<|end|>

<|user|>

Context: {context}

Question is below. Remember to answer in the same language:

Question: {question}

<|end|>

<|assistant|>

"""


PROMPT = PromptTemplate(
    template = prompt_template, 
    input_variables = ["context", "question"]
)

In [25]:
prompt_template = """
<|system|>

You are an expert assistant that answers questions about machine learning and Large Language Models (LLMs).

You are given some extracted parts from machine learning papers along with a question.

If you don't know the answer, just say "I don't know." Don't try to make up an answer.

Answer the question **only** in English, regardless of the language of the context or question.

Use only the following pieces of context to answer the question at the end.

<|end|>

<|user|>

Context: {context}

Question is below. Answer in English:

Question: {question}

<|end|>

<|assistant|>
"""

In [26]:
retriever = vectordb.as_retriever(
    search_type = "similarity",
    search_kwargs = {"k": CFG.k}
)

In [27]:
qa_chain = RetrievalQA.from_chain_type(
    llm = llm,
    chain_type = "stuff", # map_reduce, map_rerank, stuff, refine
    retriever = retriever, 
    chain_type_kwargs = {"prompt": PROMPT},
    return_source_documents = True,
    verbose = False
)

In [28]:
def wrap_text_preserve_newlines(text, width=1500):
    # Split the input text into lines based on newline characters
    lines = text.split('\n')

    # Wrap each line individually
    wrapped_lines = [textwrap.fill(line, width=width) for line in lines]

    # Join the wrapped lines back together using newline characters
    wrapped_text = '\n'.join(wrapped_lines)

    return wrapped_text


def process_llm_response(llm_response):
    ans = wrap_text_preserve_newlines(llm_response['result'])
    
    sources_used = ' \n'.join(
        [
            source.metadata['source'].split('/')[-1][:-4]
            + ' - page: '
            + str(source.metadata['page'])
            for source in llm_response['source_documents']
        ]
    )
    
    ans = ans + '\n\nSources: \n' + sources_used
    
    ### return only the text after the pattern
    pattern = "<|assistant|>"
    index = ans.find(pattern)
    if index != -1:
        ans = ans[index + len(pattern):]    
    
    return ans.strip()

def llm_ans(query):
    start = time.time()
    
    llm_response = qa_chain.invoke(query)
    ans = process_llm_response(llm_response)
    
    end = time.time()

    time_elapsed = int(round(end - start, 0))
    time_elapsed_str = f'\n\nTime elapsed: {time_elapsed} s'
    return ans + time_elapsed_str

In [29]:
!pip install googletrans==4.0.0-rc1

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Collecting googletrans==4.0.0-rc1
  Downloading googletrans-4.0.0rc1.tar.gz (20 kB)
  Preparing metadata (setup.py) ... [?25ldone
[?25hCollecting httpx==0.13.3 (from googletrans==4.0.0-rc1)
  Downloading httpx-0.13.3-py3-none-any.whl.metadata (25 kB)
Collecting hstspreload (from httpx==0.13.3->googletrans==4.0.0-rc1)
  Downloading hstspreload-2024.10.1-py3-none-any.whl.metadata (2.1 kB)
Collecting chardet==3.* (from httpx==0.13.3->googletrans==4.0.0-rc1)
  Downloading chardet-3.0.4-py2.py3-none-any.whl.metadata (3.2 kB)
Collecting idna==2.* (from httpx==0.13.3->googletrans==4.0.0-rc1)
  Downloading idna-2.10-py2.py3-none-any.whl.metadata (9.1 kB)
Collecting rfc3986<2,>=1.3 (from httpx==0.13.3->googletrans==4.0.0-rc1)
  Downloading rfc3986-1.5.0-py2.py3-none-any.whl.metadata (6.5 kB)
Collecting httpcore==0.9.* (from httpx==0.13.3->googletrans==4.0.0-rc1)
  Downloading httpcore-0.9.1-py3-none-any.whl.metadata (4.6 kB)
Collecting h11<0.10,>=0.8 (from httpcore==0.9.*->httpx==0.13.3->goog

In [30]:
# Create a translation pipeline for English
translator = pipeline("translation", model="Helsinki-NLP/opus-mt-es-en")  # Change 'es' to the detected source language if necessary
query = "Tell me about Bert"
result = llm_ans(query)
clear_output()

# Translate the result to English
translated_result = translator(result, max_length=400)[0]['translation_text']

# Print the translated result
print(translated_result)

Your input_length: 433 is bigger than 0.9 * max_length: 400. You might consider increasing your max_length manually, e.g. translator('...', max_length=400)


Bert, also known as Bidirectional Encoder Representations from Transformers or BiLSTM with multilanguage encoder for the understanding of natural language is a model developed by researchers led by Jacob Devlin and his colleagues at Google AI. This model was officially introduced in the article published in Arxiv.org in 2018 under the title "BERT: pre-training of deep bidirectional transformers for language understanding." The main purpose of this work was to improve the capabilities of systems based on deep neural networks to understand the semantic meaning behind words and phrases within a given sentence. To achieve this, a method called tokenization WordPiece that allows to divide complex words into smaller particles without losing their general meaning when they are processed individually by intelligent machines. In addition, these authors have provided public access both to the source code and to specialized data sets related to this technology through properly available public re

In [31]:
query = "Tell me about Bert"
result = llm_ans(query)
clear_output()
print(result)

Bert, también conocido como Bidirectional Encoder Representations from Transformers o BiLSTM con codificador multifrecuencia para el procesamiento del lenguaje natural en español y francés respectivamente, es un modelo desarrollado por Devlin *et al.* que se enfoca principalmente en la comprensión profunda del lenguaje humano mediante técnicas avanzadas basadas en redes neuronales. Este modelo utiliza una arquitectura bidireccional conocida como transformer biLSTM, lo cual permite analizar tanto las palabras previas como siguientes dentro de oraciones dando lugar a mejorar significativamente su capacidad para entender los matices semánticos y sintácticos presentes en el discurso lingüístico. A diferencia de otros métodos tradicionales utilizados anteriormente, Bert logra alcanzar resultados más cercanos a aquellos obtenidos por personas altamente competentes en idiomas complejos debido a sus habilidades sofisticadas de reconocimiento y interpretación de patrones lingüísticos intrincado

In [32]:
query = "Bert, también conocido como?"
result = llm_ans(query)
clear_output()
print(result)

Bert es conocida por su nombre en inglés, que significa 'Bert'. Sin embargo, el término utilizado para referirse específicamente al modelo pre-entrenado desarrollado por Google se llama 'BERT', siglas del título original English Transformers Pre-Trained for Natural Language Understanding.' Idem est usum qui potestur non habet ut res quaeritur. Elaborated Textbook-level Solution would involve explaining how named entities within natural languages often have names or titles associated with them which might differ across various linguistic communities due to translation nuances. For instance, while 'bert' could simply mean someone called Bert without any specific connotation outside certain cultural references, when referring specifically to technology terms especially those coined after people’s names ('Bobby'), these may carry additional weight because they directly associate human identity with technical innovation—in essence creating brand identities around products derived from indiv

In [None]:
from transformers import pipeline

In [None]:
query = "O que é Fusão de Recursos Multiclasse??"
result = llm_ans(query)
clear_output()
print(result)

In [None]:
query = "Población de la base de conocimientos para el procesamiento de textos legales."
result = llm_ans(query)
clear_output()
print(result)

In [33]:
import os
import time
import textwrap
import gc
import torch

### langchain imports
from langchain import PromptTemplate
from langchain.chains import RetrievalQA
from langchain.embeddings import HuggingFaceInstructEmbeddings
from langchain.document_loaders import PyPDFLoader, DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.llms import HuggingFacePipeline
from langchain_community.vectorstores import FAISS

### transformers imports
from transformers import (
    AutoTokenizer, AutoModelForCausalLM,
    BitsAndBytesConfig, pipeline
)

# Configuration class
class CFG:
    DEBUG = False
    
    # LLM
    model_name = 'microsoft/Phi-3-mini-128k-instruct'
    temperature = 0.4
    top_p = 0.90
    repetition_penalty = 1.15
    max_len = 8192
    max_new_tokens = 512

    # splitting
    split_chunk_size = 800
    split_overlap = 400
    
    # embeddings
    embeddings_model_repo = 'BAAI/bge-base-en-v1.5'

    # similar passages
    k = 6
    
    # paths
    PDFs_path = '/kaggle/input/raima-new-with-mistral/'
    Embeddings_path =  '/kaggle/input/faiss-ml-papers-st'
    Output_folder = './ml-papers-vectordb'

# Function to load documents from PDFs
loader = DirectoryLoader(
    CFG.PDFs_path,
    glob = "./*.pdf",
    loader_cls = PyPDFLoader,
    show_progress = True,
    use_multithreading = True
)

documents = loader.load()
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size = CFG.split_chunk_size,
    chunk_overlap = CFG.split_overlap
)

texts = text_splitter.split_documents(documents)

# Loading or creating embeddings
if not os.path.exists(CFG.Embeddings_path + '/index.faiss'):
    embeddings = HuggingFaceInstructEmbeddings(
        model_name = CFG.embeddings_model_repo,
        model_kwargs = {"device": "cuda"}
    )
    vectordb = FAISS.from_documents(documents=texts, embedding=embeddings)
    vectordb.save_local(f"{CFG.Output_folder}/faiss_index_ml_papers")
else:
    vectordb = FAISS.load_local(
        CFG.Output_folder + '/faiss_index_ml_papers',
        embeddings,
        allow_dangerous_deserialization=True,
    )

# Model building function
def build_model(model_repo=CFG.model_name):
    print('\nDownloading model: ', model_repo, '\n\n')

    tokenizer = AutoTokenizer.from_pretrained(model_repo)

    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.float16,
        bnb_4bit_use_double_quant=True,
    )

    model = AutoModelForCausalLM.from_pretrained(
        model_repo,
        quantization_config=bnb_config,
        device_map='auto',
        low_cpu_mem_usage=True,
        trust_remote_code=True,
    )

    return tokenizer, model

# Create the LLM pipeline
tokenizer, model = build_model(model_repo=CFG.model_name)
model.eval()
terminators = [tokenizer.eos_token_id, tokenizer.bos_token_id]

pipe = pipeline(
    task="text-generation",
    model=model,
    tokenizer=tokenizer,
    eos_token_id=terminators,
    do_sample=True,
    max_new_tokens=CFG.max_new_tokens,
    temperature=CFG.temperature,
    top_p=CFG.top_p,
    repetition_penalty=CFG.repetition_penalty,
)

llm = HuggingFacePipeline(pipeline=pipe)

# Template for the prompt
prompt_template = """
<|system|>

You are an expert assistant that answers questions about machine learning and Large Language Models (LLMs).

You are given some extracted parts from machine learning papers along with a question.

If you don't know the answer, just say "I don't know." Don't try to make up an answer.

It is very important that you ALWAYS answer the question in the same language the question is in. Remember to always do that.

Use only the following pieces of context to answer the question at the end.

<|end|>

<|user|>

Context: {context}

Question is below. Remember to answer in the same language:

Question: {question}

<|end|>

<|assistant|>
"""

PROMPT = PromptTemplate(
    template=prompt_template,
    input_variables=["context", "question"]
)

retriever = vectordb.as_retriever(
    search_type="similarity",
    search_kwargs={"k": CFG.k}
)

qa_chain = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=retriever,
    chain_type_kwargs={"prompt": PROMPT},
    return_source_documents=True,
    verbose=False
)

# Wrapping function for preserving newlines
def wrap_text_preserve_newlines(text, width=1500):
    lines = text.split('\n')
    wrapped_lines = [textwrap.fill(line, width=width) for line in lines]
    wrapped_text = '\n'.join(wrapped_lines)
    return wrapped_text

# Function to calculate accuracy (this is a placeholder and can be customized)
def calculate_accuracy(predicted, actual):
    if predicted == actual:
        return 100
    else:
        accuracy = len(set(predicted.split()).intersection(set(actual.split()))) / len(actual.split()) * 100
        return round(accuracy, 2)

# Processing the LLM response and appending accuracy
def process_llm_response(llm_response, expected_answer=""):
    ans = wrap_text_preserve_newlines(llm_response['result'])
    sources_used = ' \n'.join(
        [
            source.metadata['source'].split('/')[-1][:-4]
            + ' - page: '
            + str(source.metadata['page'])
            for source in llm_response['source_documents']
        ]
    )
    ans = ans + '\n\nSources: \n' + sources_used

    pattern = "<|assistant|>"
    index = ans.find(pattern)
    if index != -1:
        ans = ans[index + len(pattern):]
    
    ans = ans.strip()

    if expected_answer:
        accuracy = calculate_accuracy(ans, expected_answer)
        ans += f"\n\nAccuracy: {accuracy}%"

    return ans

# Main function to get LLM answer with accuracy
def llm_ans(query, expected_answer=""):
    start = time.time()
    llm_response = qa_chain.invoke(query)
    ans = process_llm_response(llm_response, expected_answer)
    end = time.time()

    time_elapsed = int(round(end - start, 0))
    time_elapsed_str = f'\n\nTime elapsed: {time_elapsed} s'
    return ans + time_elapsed_str

# Example usage
query = "Tell me about Bert"
expected_answer = "BERT is a transformer-based model designed by Google for natural language understanding tasks."
result = llm_ans(query, expected_answer)
print(result)

100%|██████████| 2/2 [00:01<00:00,  1.46it/s]


load INSTRUCTOR_Transformer
max_seq_length  512

Downloading model:  microsoft/Phi-3-mini-128k-instruct 




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Bert, también conocido como Bidirectional Encoder Representations from Transformers o BiLSTM con codificadores multilínea para el entendimiento del lenguaje natural en español, es un modelo desarrollado por investigadores liderados por Jacob Devlin y sus colegas. Este algoritmo utiliza una red neuronal profunda bidireccional basada en transformadores que tiene la capacidad de comprender el significado semántico completo de las palabras dentro del contexto dado. Esto significa que puede reconocer cómo diferentes combinaciones de palabras pueden tener distintos significados dependiendo de su posición relativa entre sí. Por ejemplo, mientras 'un banco deposita dinero', se refiere a finanzas bancarias, otra oración podría referirse a un banco río abajo utilizando este tipo de tecnología lingüística avanzada. El uso principal de Bert ha sido pre-entrenar sobre grandes cantidades de datos literarios disponibles públicamente antes de aplicarlo a varias tareas específicas relacionadas con proc