In [None]:
# Import transformer classes for generation
from transformers import AutoTokenizer, AutoModelForCausalLM, TextStreamer,pipeline
# Import torch for datatype attributes 
import torch
from environs import Env
from transformers import StoppingCriteria, StoppingCriteriaList, BitsAndBytesConfig, AutoConfig
from torch import cuda, bfloat16

In [None]:
# Read settings from the local .env file so the access token is not hard-coded
env = Env()
env.read_env(path='.env')
HG_TOKEN = env.str("hugging_face_token")

# Checkpoint to load, and the device used for tensors created in this notebook
MODEL_ID = 'meta-llama/Llama-2-7b-chat-hf'
if cuda.is_available():
    DEVICE = f'cuda:{cuda.current_device()}'
else:
    DEVICE = 'cpu'

In [None]:
# Report which device was selected above (a CUDA device index or 'cpu')
print(f'Using device: {DEVICE}')

In [None]:
# Load the tokenizer for MODEL_ID; downloaded files are cached under ./model/
tokenizer = AutoTokenizer.from_pretrained(
    MODEL_ID,
    token=HG_TOKEN,
    cache_dir='./model/',
)

In [None]:
# Sanity check: display whether PyTorch reports an available CUDA device
torch.cuda.is_available()

In [None]:
# bnb_config = BitsAndBytesConfig(
#     load_in_4bit=True,
#     bnb_4bit_quant_type='nf4',
#     bnb_4bit_use_double_quant=True,
#     bnb_4bit_compute_dtype=bfloat16
# )
# model_config = AutoConfig.from_pretrained(
#     MODEL_ID,
#     use_auth_token=HG_TOKEN
# )

In [None]:
# Create model
# Loads the causal LM for MODEL_ID with 8-bit weights; device_map="auto" lets the
# library decide where the layers are placed.
model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    cache_dir='./model/',
    token=HG_TOKEN,
    torch_dtype=torch.float16,
    # `factor` is written as a float on purpose: transformers releases that
    # validate `rope_scaling` strictly reject an integer factor with a ValueError.
    rope_scaling={"type": "dynamic", "factor": 2.0},
    load_in_8bit=True,
    device_map="auto",
)

# model = AutoModelForCausalLM.from_pretrained(
#     MODEL_ID,
#     cache_dir='./model/',
#     trust_remote_code=True,
#     config=model_config,
#     quantization_config=bnb_config,
#     # rope_scaling={"type": "dynamic", "factor": 2},
#     device_map='auto',
#     token=HG_TOKEN
# )

In [None]:
# # Setup a prompt 
# prompt = "### User:What is the fastest car in  \
#           the world and how much does it cost? \
#           ### Assistant:"

# Build the prompt from adjacent string literals. A backslash continuation inside
# a single literal would embed the next line's indentation into the prompt text.
prompt = (
    "### User:what is Random Forest Model? "
    "### Assistant:"
)
          

# Tokenize the prompt into PyTorch tensors, then move them to the model's device
inputs = tokenizer(prompt, return_tensors="pt")
inputs = inputs.to(model.device)
# Streamer configured to leave out the prompt and any special tokens
streamer = TextStreamer(
    tokenizer,
    skip_prompt=True,
    skip_special_tokens=True,
)

In [None]:
# Text sequences that should end generation once the model emits them
stop_list = ['\nHuman:', '\n```\n']

# Encode each stop string WITHOUT special tokens. With the default settings the
# tokenizer prepends special tokens (a BOS id for Llama), which never appear at
# the tail of generated output, so the stop sequences could never match.
# NOTE(review): SentencePiece tokenizers may still emit a leading whitespace
# piece for each string — compare these ids against real generated ids to confirm.
stop_token_ids = [
    torch.LongTensor(tokenizer(x, add_special_tokens=False)['input_ids']).to(DEVICE)
    for x in stop_list
]


# define custom stopping criteria object
class StopOnTokens(StoppingCriteria):
    """Stopping criterion that fires when the output ends with a stop sequence.

    The stop sequences are read from the module-level ``stop_token_ids`` list
    (one 1-D tensor of token ids per stop string).
    """

    def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs) -> bool:
        """Return True if the first sequence in ``input_ids`` ends with any stop sequence.

        Args:
            input_ids: token ids generated so far; only row 0 is inspected.
            scores: unused; accepted to match the StoppingCriteria call signature.

        Returns:
            True when the tail of row 0 equals one of the stop sequences, else False.
        """
        generated = input_ids[0]
        for stop_ids in stop_token_ids:
            n_stop = len(stop_ids)
            # Skip empty stop sequences and sequences longer than what has been
            # generated so far; comparing mismatched lengths would raise.
            if n_stop == 0 or generated.shape[0] < n_stop:
                continue
            # Align devices before comparing: the stop ids were placed on a fixed
            # device, which may differ from where generation is running.
            if torch.equal(generated[-n_stop:], stop_ids.to(generated.device)):
                return True
        return False

# Wrap the custom criterion in the list container that is handed to the pipeline
stopping_criteria = StoppingCriteriaList([StopOnTokens()])

In [None]:
# Text-generation pipeline built around the already-loaded model and tokenizer
generate_text = pipeline(
    task='text-generation',
    model=model,
    tokenizer=tokenizer,
    # output handling
    streamer=streamer,
    # generation limits and stopping
    max_new_tokens=1024,  # upper bound on the number of tokens generated per call
    stopping_criteria=stopping_criteria,  # without this model rambles during chat
    # sampling behaviour
    temperature=0.7,  # sampling temperature; lower values make output less random
    repetition_penalty=1.1,  # without this output begins repeating
)

In [None]:
generate_text(prompt, num_return_sequences=1)

In [None]:
from llama_index.prompts.prompts import SimpleInputPrompt
# NOTE: this cell only imports the prompt wrapper; the system prompt itself is created in a later cell.

In [None]:
from llama_index.llms import ChatMessage, MessageRole
from llama_index.prompts import ChatPromptTemplate

# Question-answering prompt: a system instruction followed by a user turn that
# carries the {context_str} and {query_str} placeholders.
chat_text_qa_msgs = [
    ChatMessage(
        content="Always answer the question, even if the context isn't helpful.",
        role=MessageRole.SYSTEM,
    ),
    ChatMessage(
        content=(
            "Context information is below.\n"
            "---------------------\n"
            "{context_str}\n"
            "---------------------\n"
            "Given the context information and not prior knowledge, "
            "answer the question: {query_str}\n"
        ),
        role=MessageRole.USER,
    ),
]
text_qa_template = ChatPromptTemplate(chat_text_qa_msgs)

In [None]:
# Import the prompt wrapper...but for llama index
from llama_index.prompts.prompts import SimpleInputPrompt
# Create a system prompt
# The text opens with the <s>[INST] <<SYS>> markers and closes the system section
# with <</SYS>>; the query wrapper below appends the closing [/INST] after the query.
# NOTE(review): the final instruction targets "the financial performance of the
# company", while the document loaded later in this notebook is
# './data/handsOn.pdf' — confirm this wording is intended.
# NOTE(review): system_prompt is referenced only by the commented-out
# HuggingFaceLLM cell further down.
system_prompt = """<s>[INST] <<SYS>>
You are a helpful, respectful and honest assistant. Always answer as 
helpfully as possible, while being safe. Your answers should not include
any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content.
Please ensure that your responses are socially unbiased and positive in nature.

If a question does not make any sense, or is not factually coherent, explain 
why instead of answering something not correct. If you don't know the answer 
to a question, please don't share false information.

Your goal is to provide answers relating to the financial performance of 
the company.<</SYS>>
"""
# Throw together the query wrapper
# The {query_str} placeholder is filled with the user's query.
query_wrapper_prompt = SimpleInputPrompt("{query_str} [/INST]")

In [None]:
# Preview the wrapper: fill the {query_str} placeholder with a sample query
query_wrapper_prompt.format(query_str='hello')

In [None]:
# Import from the `langchain.llms` submodule rather than the package root:
# root-level re-exports of LLM classes are deprecated in langchain, and this
# matches the submodule style used for the embeddings import in this notebook.
from langchain.llms import HuggingFacePipeline

# Wrap the transformers pipeline so it can be used as a LangChain LLM
llm = HuggingFacePipeline(pipeline=generate_text)

In [None]:
# Import the llama index HF Wrapper
# from llama_index.llms import HuggingFaceLLM
# # Create a HF LLM using the llama index wrapper 
# llm = HuggingFaceLLM(context_window=4096,
#                     max_new_tokens=256,
#                     system_prompt=system_prompt,
#                     query_wrapper_prompt=query_wrapper_prompt,
#                     model=model,
#                     tokenizer=tokenizer)

In [None]:
# Bring in embeddings wrapper
from llama_index.embeddings import LangchainEmbedding
# Bring in HF embeddings - need these to represent document chunks
from langchain.embeddings.huggingface import HuggingFaceEmbeddings

In [None]:
# Build the embedding model: a Hugging Face sentence-embedding model
# ("all-MiniLM-L6-v2") wrapped for use by llama_index
embeddings = LangchainEmbedding(HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2"))

In [None]:
# Bring in stuff to change service context
from llama_index import set_global_service_context
from llama_index import ServiceContext

In [None]:
# Bundle the LLM, the embedding model and the chunk size into one service context
service_context = ServiceContext.from_defaults(
    llm=llm,
    embed_model=embeddings,
    chunk_size=1024,
)
# Register it globally so later llama_index calls use it without passing it explicitly
set_global_service_context(service_context)

In [None]:
# Import deps to load documents 
from llama_index import VectorStoreIndex, download_loader

In [None]:
# Obtain the PyMuPDFReader loader class by name
PyMuPDFReader = download_loader("PyMuPDFReader")
# Instantiate the reader
loader = PyMuPDFReader()
# Read the PDF into documents, asking the reader to include metadata
documents = loader.load(
    metadata=True,
    file_path='./data/handsOn.pdf',
)

In [None]:
# Create an index over the loaded documents - the query engine below is built from it
index = VectorStoreIndex.from_documents(documents)

In [None]:
# Setup index query engine using LLM
# Pass the chat QA template defined earlier in this notebook; without this
# argument the template is built but never used, and the engine falls back to
# its default question-answering prompt.
query_engine = index.as_query_engine(text_qa_template=text_qa_template)

In [None]:
# Test out a query in natural language
response = query_engine.query("what is the difference between supervised and unsupervised learning?")

In [None]:
# Display the `response` attribute of the query result
response.response