### Importing packages

In [12]:
from transformers import AutoTokenizer, AutoModelForCausalLM, TextStreamer
import accelerate
import torch
import time
from pprint import pprint

### Declaring text generation model, tokenizer, computational device and optional streamer

In [13]:
# Select the compute device: the GPU with index `gpu` when CUDA is available,
# otherwise fall back to the CPU.
gpu = 0
device = torch.device(f"cuda:{gpu}" if torch.cuda.is_available() else "cpu")
if torch.cuda.is_available():
    # Make the chosen GPU the default for subsequent CUDA allocations.
    torch.cuda.set_device(device)
# Display the name of the selected device. Querying the CUDA device name is
# only valid when a GPU was actually selected, so guard it for CPU-only hosts.
torch.cuda.get_device_name(device) if device.type == "cuda" else "cpu"

'NVIDIA GeForce RTX 2070 SUPER'

In [14]:
# Hugging Face Hub repository id of the model to load
name = "TheBloke/Llama-2-7b-Chat-GPTQ"
# name = "TheBloke/Mistral-7B-Instruct-v0.1-GPTQ"

# Hugging Face auth token, read from a credentials file and stripped of
# surrounding whitespace
file_path = "../../huggingface_credentials.txt"
with open(file_path, "r") as token_file:
    auth_token = token_file.read().strip()

In [15]:
# Create tokenizer
# `device_map` is intentionally not passed here: it is a model-placement
# option and has no role in loading a tokenizer.
# NOTE(review): `use_auth_token` is deprecated in recent transformers releases
# in favour of `token` - confirm the installed version before switching.
tokenizer = AutoTokenizer.from_pretrained(
    name,
    # cache_dir='./model/',
    use_auth_token=auth_token,
)



In [16]:
# Define model
# `device_map` places the weights on the selected device while loading, so no
# additional `.to(device)` call is needed afterwards. Calling `.to()` would
# also be rejected for bitsandbytes 8-bit models if `load_in_8bit` below were
# ever enabled.
# NOTE(review): `cache_dir` is an absolute, machine-specific path - adjust it
# when running this notebook elsewhere.
model = AutoModelForCausalLM.from_pretrained(
    name,
    cache_dir=r"C:\Users\user2\.cache\huggingface\hub",
    # cache_dir='./model/',
    use_auth_token=auth_token,
    device_map=device,
    # torch_dtype=torch.float16,
    # low_cpu_mem_usage=True,
    # rope_scaling={"type": "dynamic", "factor": 2},
    # load_in_8bit=True,
)




Welcome to bitsandbytes. For bug reports, please submit your error trace to: https://github.com/TimDettmers/bitsandbytes/issues
binary_path: D:\NLP 1\venv\Lib\site-packages\bitsandbytes\cuda_setup\libbitsandbytes_cuda116.dll
CUDA SETUP: Loading binary D:\NLP 1\venv\Lib\site-packages\bitsandbytes\cuda_setup\libbitsandbytes_cuda116.dll...




In [17]:
# Optional streamer for generation; configured to skip the prompt and
# special tokens in what it emits.
streamer = TextStreamer(
    tokenizer,
    skip_prompt=True,
    skip_special_tokens=True,
)

### Declare inference function

In [18]:
def llm_inference(plain_text, model, tokenizer, device, streamer=None, max_length=4000, max_new_tokens=512):
    """Generate a completion for ``plain_text`` and return the decoded text.

    Args:
        plain_text: Prompt text passed to the tokenizer.
        model: Object exposing ``generate(input_ids, ...)``.
        tokenizer: Callable tokenizer that also provides ``batch_decode``.
        device: Device the encoded prompt is moved to before generation.
        streamer: Optional streamer forwarded to ``model.generate``.
        max_length: Maximum number of prompt tokens; longer prompts are
            truncated by the tokenizer.
        max_new_tokens: Upper bound on the number of generated tokens. Must
            be an integer; the default matches the 512-token limit used for
            the ``HuggingFaceLLM`` wrapper elsewhere in this notebook.

    Returns:
        The first decoded sequence produced by ``model.generate``, with
        special tokens removed.
    """
    input_ids = tokenizer(
        plain_text,
        return_tensors="pt",
        truncation=True,
        max_length=max_length,
    )["input_ids"].to(device)

    # A finite integer budget keeps generation bounded; the previous
    # float('inf') value was not a valid token count and allowed the model to
    # run on without limit if it never emitted an end-of-sequence token.
    output_ids = model.generate(
        input_ids,
        streamer=streamer,
        use_cache=True,
        max_new_tokens=max_new_tokens,
    )
    return tokenizer.batch_decode(output_ids, skip_special_tokens=True)[0]


### Generating texts using a trained model

In [19]:
# Run a sample prompt through the inference helper and display the full
# decoded answer as the cell result.
text = "what are the steps to train a machine learning model? explain in less than 100 words"
res = llm_inference(
    plain_text=text,
    model=model,
    tokenizer=tokenizer,
    device=device,
    streamer=streamer,
)
res



.
To train a machine learning model, you typically follow these steps:
1. Collect and preprocess data.
2. Choose a machine learning algorithm.
3. Split the data into training and validation sets.
4. Train the model on the training set.
5. Evaluate the model's performance on the validation set.
6. Fine-tune the model as needed.
7. Test the final model on new data.


"what are the steps to train a machine learning model? explain in less than 100 words.\nTo train a machine learning model, you typically follow these steps:\n1. Collect and preprocess data.\n2. Choose a machine learning algorithm.\n3. Split the data into training and validation sets.\n4. Train the model on the training set.\n5. Evaluate the model's performance on the validation set.\n6. Fine-tune the model as needed.\n7. Test the final model on new data."

### Set up the vector database

In [22]:
import chromadb
from llama_index import VectorStoreIndex, SimpleDirectoryReader
from llama_index.vector_stores import ChromaVectorStore
from llama_index.storage.storage_context import StorageContext
from llama_index.prompts.prompts import SimpleInputPrompt
from llama_index.llms import HuggingFaceLLM
from llama_index.embeddings import LangchainEmbedding
from langchain.embeddings.huggingface import HuggingFaceEmbeddings
from llama_index import set_global_service_context
from llama_index import ServiceContext
from llama_index import VectorStoreIndex, download_loader
from llama_index import SimpleDirectoryReader
from pathlib import Path

In [23]:
# Chroma client backed by on-disk storage at ../../vdb
db = chromadb.PersistentClient(
    path="../../vdb",
)

In [24]:
# Fetch the "quickstart" collection from the client (get-or-create)
chroma_collection = db.get_or_create_collection(
    "quickstart",
)

In [25]:
# Wrap the Chroma collection as a vector store and use it as the vector
# store of the storage context
vector_store = ChromaVectorStore(
    chroma_collection=chroma_collection,
)
storage_context = StorageContext.from_defaults(
    vector_store=vector_store,
)

In [27]:
# Create a system prompt
# The text opens an [INST] tag and wraps the instructions in <<SYS>> ... <</SYS>>
# markers (presumably the Llama-2 chat prompt layout - confirm against the model card).
# NOTE(review): the literal "<s>" is part of the prompt text; if the tokenizer
# also prepends its own BOS token this would produce a duplicate - confirm.
system_prompt = """<s>[INST] <<SYS>>
You are a helpful, respectful and honest assistant. Always answer as
helpfully as possible, while being safe. Your answers should not include
any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content.
Please ensure that your responses are socially unbiased and positive in nature.

If a question does not make any sense, or is not factually coherent, explain
why instead of answering something not correct. If you don't know the answer
to a question, please don't share false information.

Your goal is to provide answers based on the information provided and your other knowledges.<</SYS>>
"""

# Template applied to each query: appends the closing [/INST] tag that matches
# the [INST] opened at the start of system_prompt.
query_wrapper_prompt = SimpleInputPrompt("{query_str} [/INST]")

In [29]:
# Wrap the already-loaded model and tokenizer for LlamaIndex, together with
# the prompt templates and the context / generation limits.
llm = HuggingFaceLLM(
    model=model,
    tokenizer=tokenizer,
    system_prompt=system_prompt,
    query_wrapper_prompt=query_wrapper_prompt,
    context_window=4096,
    max_new_tokens=512,
)

# Embedding model used by the index: a LangChain HuggingFace embedding
# adapted to LlamaIndex.
embeddings = LangchainEmbedding(HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2"))

In [31]:
# Bundle the LLM, the embedding model and the chunk size into a service context
service_context = ServiceContext.from_defaults(
    llm=llm,
    embed_model=embeddings,
    chunk_size=1024,
)

# Register it as the global default service context
set_global_service_context(service_context)

In [32]:
# Build the index object on top of the vectors already held by the vector store
index = VectorStoreIndex.from_vector_store(vector_store, storage_context=storage_context)

In [34]:
# load some documents
# NOTE(review): absolute, machine-specific path - adjust when running elsewhere.
documents = SimpleDirectoryReader(r"D:\NLP 1\RAG-webapp\documents_db").load_data()

# create your index
# The Chroma collection is persistent, so indexing the same documents on every
# run would insert them again and duplicate the stored chunks. Ingest only
# when the collection is still empty; otherwise reuse the stored vectors.
# To force a re-ingest (e.g. after adding documents), empty or delete the
# collection first.
if chroma_collection.count() == 0:
    index = VectorStoreIndex.from_documents(
        documents, storage_context=storage_context
    )
else:
    index = VectorStoreIndex.from_vector_store(
        vector_store, storage_context=storage_context
    )

In [38]:
# Re-create the index object from the vector store
index = VectorStoreIndex.from_vector_store(vector_store, storage_context=storage_context)

### Non-streaming answering

In [44]:
# Build a default (non-streaming) query engine, run the query and keep only
# the response text, which is displayed as the cell result.
query_engine = index.as_query_engine()
response = (
    query_engine
    .query("What farshad is currently doing?")
    .response
)
response

' Based on the information provided in the PDF, Farshad Amiri is currently a full-time student and does not have a job. He is living on a scholarship and financial aid from his father, and he is unable to access international money transfer systems such as Visa or Mastercards due to U.S sanctions. He is interested in learning machine learning and deep learning, and he believes that completing this Coursera certificate program will help him improve his CV and enhance his chances of getting a job in the field of AI.'

### Stream answering

In [45]:
# Build a streaming query engine and print the response stream for the query
query_engine = index.as_query_engine(streaming=True)
(
    query_engine
    .query("What farshad is currently doing?")
    .print_response_stream()
)

 Based on the information provided in the PDF, Farshad Amiri is currently a full-time student and does not have a job. He is living in Iran and is unable to access international money transfer systems such as Visa or Mastercards due to U.S sanctions. He is relying on a scholarship and financial aid to support his living expenses. Additionally, he is interested in learning machine learning and deep learning, and has been practicing coding for over six months. He believes that completing this Coursera certificate program will help him enhance his CV and improve his chances of getting a job in the field of AI.</s>