### Importing packages

In [1]:
from transformers import AutoTokenizer, AutoModelForCausalLM, TextStreamer
import accelerate
import torch
import time
from pprint import pprint

### Declaring text generation model, tokenizer, computational device and optional streamer

In [2]:
# Select the compute device: the GPU with index `gpu` when CUDA is available,
# otherwise fall back to the CPU.
gpu = 0
device = torch.device(f"cuda:{gpu}" if torch.cuda.is_available() else "cpu")
if torch.cuda.is_available():
    torch.cuda.set_device(device)
# Show which device was picked. The name is queried for the selected device
# (not a hard-coded index 0), and CUDA is not touched on CPU-only machines,
# where torch.cuda.get_device_name would raise.
torch.cuda.get_device_name(device) if torch.cuda.is_available() else "cpu"

'NVIDIA GeForce RTX 2070 SUPER'

In [3]:
# Define the model name (Hugging Face Hub repository id of the checkpoint to load)
name = "TheBloke/Llama-2-7b-Chat-GPTQ"
# Alternative checkpoint (disabled):
# name = "TheBloke/Mistral-7B-Instruct-v0.2-GPTQ"

# Hugging Face auth token (disabled): read it from a local credentials file
# file_path = "../../huggingface_credentials.txt"
# with open(file_path, "r") as file:
#     auth_token = file.read().strip()

In [4]:
# Create the tokenizer that matches the checkpoint in `name`.
# No device argument is passed: `device_map` is a model-loading option and a
# tokenizer has no weights to place on a device. Encoded tensors are moved to
# the device at inference time instead.
tokenizer = AutoTokenizer.from_pretrained(
    name,
    # cache_dir='./model/',
    # use_auth_token=auth_token,
)

In [16]:
# Interactive Hugging Face Hub login; renders a login widget as the cell output.
# NOTE(review): this cell sits between the tokenizer and model cells and its
# execution count is out of sequence. If the checkpoint needs authentication,
# the login presumably has to happen before those cells download anything — confirm.
from huggingface_hub import notebook_login
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [6]:
# Define model: load the causal-LM weights directly onto the selected device.
# * `device_map=device` follows the device chosen in the device cell instead of
#   a hard-coded 'cuda', so the same cell also works when CUDA is unavailable.
# * No trailing `.to(device)`: the weights are already placed by `device_map`,
#   and moving a dispatched / quantized model afterwards is redundant and may
#   be rejected by transformers.
# * No machine-specific `cache_dir`: the default Hugging Face cache is used,
#   the same one the tokenizer cell relies on.
model = AutoModelForCausalLM.from_pretrained(
    name,
    device_map=device,
    # cache_dir='./model/',
    # use_auth_token=auth_token,
    # torch_dtype=torch.float16,
    # low_cpu_mem_usage=True,
    # rope_scaling={"type": "dynamic", "factor": 2},
    # load_in_8bit=True,
)


Welcome to bitsandbytes. For bug reports, please submit your error trace to: https://github.com/TimDettmers/bitsandbytes/issues
binary_path: D:\NLP 1\venv\Lib\site-packages\bitsandbytes\cuda_setup\libbitsandbytes_cuda116.dll
CUDA SETUP: Loading binary D:\NLP 1\venv\Lib\site-packages\bitsandbytes\cuda_setup\libbitsandbytes_cuda116.dll...


In [7]:
# Streamer that prints generated text while generation is running. It is
# configured to leave out the prompt (skip_prompt=True) and special tokens
# (skip_special_tokens=True) from what it prints.
streamer = TextStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)

### Declare inference function

In [8]:
def llm_inference(plain_text, model, tokenizer, device, streamer=None, max_length=4000,
                  max_new_tokens=512):
    """Generate text for ``plain_text`` and return the decoded result.

    Args:
        plain_text: Prompt string fed to the model.
        model: Object exposing ``generate(...)`` (a causal language model).
        tokenizer: Callable used to encode the prompt; must also provide
            ``batch_decode`` to decode the generated ids.
        device: Device the encoded prompt is moved to before generation.
        streamer: Optional streamer forwarded to ``model.generate``.
        max_length: Maximum number of prompt tokens; longer prompts are
            truncated by the tokenizer.
        max_new_tokens: Upper bound on the number of generated tokens. Must be
            a finite integer.

    Returns:
        The first sequence returned by ``tokenizer.batch_decode`` with special
        tokens skipped.
    """
    encoded = tokenizer(
        plain_text,
        return_tensors="pt",
        truncation=True,
        max_length=max_length,
    ).to(device)

    output_ids = model.generate(
        encoded["input_ids"],
        # Forward the mask produced by the tokenizer so generate() does not
        # have to infer which positions are real tokens.
        attention_mask=encoded.get("attention_mask"),
        streamer=streamer,
        use_cache=True,
        # A finite integer budget: float('inf') is not a valid token count and
        # would let generation run past the model's context window.
        max_new_tokens=max_new_tokens,
    )
    return tokenizer.batch_decode(output_ids, skip_special_tokens=True)[0]


### Generating text with the loaded pretrained model

In [20]:
# Smoke-test the raw model without retrieval. The streamer prints the
# completion while it is generated; `res` holds the full decoded string
# returned by llm_inference and is displayed as the cell output.
text = "what are the steps to train a machine learning model? explain in less than 100 words"
res = llm_inference(text, model, tokenizer, device, streamer=streamer,)
res

.
To train a machine learning model, you typically follow these steps:
1. Collect and preprocess data.
2. Choose a machine learning algorithm.
3. Split the data into training and validation sets.
4. Train the model on the training set.
5. Evaluate the model's performance on the validation set.
6. Fine-tune the model as needed.
7. Test the final model on new data.


"what are the steps to train a machine learning model? explain in less than 100 words.\nTo train a machine learning model, you typically follow these steps:\n1. Collect and preprocess data.\n2. Choose a machine learning algorithm.\n3. Split the data into training and validation sets.\n4. Train the model on the training set.\n5. Evaluate the model's performance on the validation set.\n6. Fine-tune the model as needed.\n7. Test the final model on new data."

### Setup Vector database

In [19]:
import chromadb
from llama_index import VectorStoreIndex, SimpleDirectoryReader, get_response_synthesizer
from llama_index.vector_stores import ChromaVectorStore
from llama_index.storage.storage_context import StorageContext
from llama_index.prompts.prompts import SimpleInputPrompt
from llama_index.llms import HuggingFaceLLM
from llama_index.embeddings import LangchainEmbedding
from langchain.embeddings.huggingface import HuggingFaceEmbeddings
from llama_index import set_global_service_context
from llama_index import ServiceContext
from llama_index import VectorStoreIndex, download_loader
from llama_index import SimpleDirectoryReader
from llama_index.retrievers import VectorIndexRetriever
from llama_index.query_engine import RetrieverQueryEngine
from llama_index.postprocessor import SimilarityPostprocessor
from llama_index.vector_stores import MilvusVectorStore
from pathlib import Path

### Chroma db

In [21]:
# Chroma db

# Persistent Chroma client backed by the directory ../../vdb
# (the path is relative to the notebook's working directory).
db = chromadb.PersistentClient(path="../../vdb")

# get the "quickstart" collection, creating it if it does not exist yet
chroma_collection = db.get_or_create_collection("quickstart")

# assign chroma as the vector_store to the context
vector_store = ChromaVectorStore(chroma_collection=chroma_collection)

In [22]:
# Wrap the Chroma vector store in a LlamaIndex storage context; it is passed
# to the index constructor further down.
storage_context = StorageContext.from_defaults(vector_store=vector_store)

In [23]:
# Create a system prompt
# The string opens an instruction turn with "<s>[INST]" and wraps the system
# instructions in "<<SYS>> ... <</SYS>>".
# NOTE(review): the literal "<s>" may duplicate the BOS token if the tokenizer
# already adds one — confirm against the tokenizer configuration.
# NOTE(review): these markers look specific to the Llama-2 chat checkpoint;
# verify the format before switching `name` to the Mistral alternative.
system_prompt = """<s>[INST] <<SYS>>
You are a helpful, respectful and honest assistant. Always answer as
helpfully as possible, while being safe.
If a question does not make any sense, or is not factually coherent, explain
why instead of answering something not correct. If you don't know the answer
to a question, please don't share false information.
Try to be exact in information and numbers you tell.
Your goal is to provide answers based on the information provided and your other knowledge.<</SYS>>
"""

# Template applied to each query: the query text followed by " [/INST]",
# which closes the "[INST]" opened in system_prompt.
query_wrapper_prompt = SimpleInputPrompt("{query_str} [/INST]")

In [24]:
# Wrap the already-loaded model and tokenizer as a LlamaIndex LLM, reusing the
# system prompt and query wrapper defined in the previous cell.
# NOTE(review): context_window=4096 presumably matches the checkpoint's context
# length — confirm if `name` is changed.
llm = HuggingFaceLLM(context_window=4096,
                     max_new_tokens=512,
                     system_prompt=system_prompt,
                     query_wrapper_prompt=query_wrapper_prompt,
                     model=model,
                     tokenizer=tokenizer)

# Embedding model ("all-MiniLM-L6-v2") exposed to LlamaIndex through the
# LangChain adapter.
embeddings = LangchainEmbedding(
    HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
)

In [25]:
# Create new service context instance
# It bundles the LLM and the embedding model with the chunking settings
# passed below (chunk_size=1024, chunk_overlap=20; presumably measured in
# tokens — confirm against the LlamaIndex version in use).
service_context = ServiceContext.from_defaults(
    chunk_size=1024,
    chunk_overlap=20,
    llm=llm,
    embed_model=embeddings
)

# And set the service context
# Registering it globally means the later cells do not pass it explicitly.
set_global_service_context(service_context)

### Load Vector DB

In [26]:
# Build the index object on top of the existing Chroma vector store.
# No documents are passed in this call.
index = VectorStoreIndex.from_vector_store(
    vector_store, storage_context=storage_context
)

### Insert a single document into the vector db

In [27]:
# Fetch the PyMuPDFReader loader class and instantiate it.
PyMuPDFReader = download_loader("PyMuPDFReader")
loader = PyMuPDFReader()

# Load documents
# NOTE(review): despite its name, `doc_dir` holds the path of a single PDF
# file, and it is an absolute, machine-specific Windows path. Replace it with
# a path relative to a configurable data directory before sharing the notebook.
doc_dir = r"D:\NLP 1\RAG-webapp\documents_db\Sattelite imagery article scripts.pdf"
document = loader.load(file_path=Path(doc_dir), metadata=False)

# Create indexes (disabled): insert each loaded document into the index
# for doc in document:
#     index.insert(doc, )

### Insert directory of documents into the vector db

In [17]:
# load some documents: read every file found in the documents directory
# NOTE(review): absolute, machine-specific Windows path — make it configurable
# before sharing the notebook.
documents = SimpleDirectoryReader(r"D:\NLP 1\RAG-webapp\documents_db").load_data()

# create your index (disabled): build the index from `documents` and store the
# result through the Chroma-backed storage context
# index = VectorStoreIndex.from_documents(
#     documents, storage_context=storage_context
# )

### Customizing query engine

In [28]:
# configure retriever: fetch the 3 most similar nodes from the index
# NOTE(review): `doc_id` does not appear to be a documented parameter of
# VectorIndexRetriever (the documented name is `doc_ids`), so this argument is
# presumably swallowed by **kwargs and no per-document restriction is applied.
# Verify against the installed LlamaIndex version, and check whether the
# Chroma store honours `doc_ids` or needs metadata `filters` instead.
retriever = VectorIndexRetriever(
    index=index,
    similarity_top_k=3,
    doc_id = ["fe0ab12b-146f-45e2-9161-387ac90f8031"]
)

# configure response synthesizer
# streaming=True is what makes print_response_stream() available on the
# response in the inference cell.
response_synthesizer = get_response_synthesizer(streaming=True)

# assemble query engine
# The postprocessor is configured with similarity_cutoff=0.2, i.e. a minimum
# similarity score for retrieved nodes.
query_engine = RetrieverQueryEngine(
    retriever=retriever,
    response_synthesizer=response_synthesizer,
    node_postprocessors=[SimilarityPostprocessor(similarity_cutoff=0.2)],
)


### Running inference (querying the RAG engine)

In [32]:
# Run one query through the query engine and stream the answer to the cell
# output. Alternative example queries are kept below, commented out.
response = query_engine.query("who studied Master of Science in Management with a background in Civil Engineering?")
# response = query_engine.query("how many gold medals Iranian youth won in 2023 chess competitions?")
# response = query_engine.query("describe key points of 2023 climate change?")
# response = query_engine.query("how much headline inflation increased after storm shock?")
response.print_response_stream()

 Based on the information provided in the PDF, the person who studied Master of Science in Management with a background in Civil Engineering is Farshad Amiri.</s>

In [31]:
# Print the similarity score of every source node attached to the response,
# then display the first node as the cell output.
for node in response.source_nodes:
    print(node.score)
# Guard the lookup: the list can be empty when no retrieved node passes the
# similarity cutoff, and indexing [0] would then raise IndexError.
response.source_nodes[0] if response.source_nodes else None

0.23824048972736303
0.21386191409572436


NodeWithScore(node=TextNode(id_='20ff3158-cfd4-4063-b656-0e8f5b5ac693', embedding=None, metadata={'page_label': '1', 'file_name': 'Iran medals in Asia competitions 2023.pdf'}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=[], relationships={<NodeRelationship.SOURCE: '1'>: RelatedNodeInfo(node_id='5502d6ee-b6f1-4dd5-8406-26e96215d078', node_type=None, metadata={'page_label': '1', 'file_name': 'Iran medals in Asia competitions 2023.pdf'}, hash='3de8a565a78341bb04481d2b423e39bf0923e55979253e643aa0a340d9d04135')}, hash='51e80e575f1121004378dcc6fb6fc45dbb08e7462a1d7fc3c1111e25ecd6eb23', text='Iran wins six medals in 2023 Asian Youth \nChess Championships  \n \nTEHRAN – The Iranian chess players claimed six medals at the 25th Asian Youth Chess \nChampionships.  \nRamtin Kakavand won two gold medals in the boys’  under 10 standard and blitz.  \nRosha Akbari claimed a bronze in the girls’ under 12 blitz.  \nMohammadtaha   Arkak seized a bronze in the boys’ under 8 blitz.  \nKanaa