In [1]:
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
import transformers
import torch

from langchain.embeddings.huggingface import HuggingFaceEmbeddings
from llama_index.embeddings.langchain import LangchainEmbedding
from llama_index.core import SimpleDirectoryReader,ServiceContext,VectorStoreIndex
from llama_index.llms.huggingface import HuggingFaceLLM
import warnings

import gc
warnings.filterwarnings('ignore')

2024-05-05 22:46:11.861031: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-05-05 22:46:11.941482: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.



In [2]:
class Rag_Llama:
    """Retrieval-augmented wrapper that wires a Hugging Face LLM to a vector index.

    The constructor only records configuration. The heavy work (loading the
    model, reading documents, embedding and indexing them) happens in `call`.
    """

    # Persona applied when the caller leaves `system_prompt` empty.
    # Spelled out line by line so the prompt text (including its original
    # leading/trailing whitespace) does not depend on source indentation.
    DEFAULT_SYSTEM_PROMPT = (
        "\n"
        "                You are a human being that is trying to converse with an \n"
        "                Alzheimer's patient. \n"
        "                Use the memories in the data and respond naturally.\n"
        "                "
    )

    def __init__(self,
                context_window=4096,
                max_new_tokens=256,
                generate_kwargs={"temperature": 0.0, "do_sample": False},
                system_prompt="",
                tokenizer_name="meta-llama/Llama-2-7b-chat-hf",
                model_name="meta-llama/Llama-2-7b-chat-hf",
                device_map="cuda:0",
                model_kwargs={"torch_dtype": torch.float16 , "load_in_8bit":True}):
        """Store the settings later forwarded to `HuggingFaceLLM`.

        Parameters:
            context_window: forwarded as `context_window`.
            max_new_tokens: forwarded as `max_new_tokens`.
            generate_kwargs (dict | None): forwarded as `generate_kwargs`.
            system_prompt (str): forwarded as `system_prompt`; when empty,
                `call` substitutes `DEFAULT_SYSTEM_PROMPT`.
            tokenizer_name (str): forwarded as `tokenizer_name`.
            model_name (str): forwarded as `model_name`.
            device_map: forwarded as `device_map`.
            model_kwargs (dict | None): forwarded as `model_kwargs`.
        """
        self.context_window = context_window
        self.max_new_tokens = max_new_tokens
        # Copy the dicts: the defaults in the signature are created once and
        # shared by every call, so storing them by reference would let one
        # instance's edits leak into all later instances.
        self.generate_kwargs = None if generate_kwargs is None else dict(generate_kwargs)
        self.system_prompt = system_prompt
        self.tokenizer_name = tokenizer_name
        self.model_name = model_name
        self.device_map = device_map
        self.model_kwargs = None if model_kwargs is None else dict(model_kwargs)
        # Populated by `call(first=True)`; stays None until then.
        self.query_engine = None

    def load_model(self):
        """Instantiate `HuggingFaceLLM` from the stored settings into `self.llm`."""
        self.llm = HuggingFaceLLM(
            context_window=self.context_window,
            max_new_tokens=self.max_new_tokens,
            generate_kwargs=self.generate_kwargs,
            system_prompt=self.system_prompt,
            tokenizer_name=self.tokenizer_name,
            model_name=self.model_name,
            device_map=self.device_map,
            model_kwargs=self.model_kwargs,
        )

    def load_data(self, data_path = "./data"):
        """Read documents from `data_path` into `self.documents`.

        Parameters:
            data_path (str): directory handed to `SimpleDirectoryReader`.

        Prints a status line indicating whether any documents were returned.
        """
        # Honour the argument; the directory used to be hard-coded to "./data".
        self.documents = SimpleDirectoryReader(data_path).load_data()
        if self.documents:
            print("Documents Loaded")
        else:
            print("No Documents found, please check the path or the document format \n The document format must be in pdf")

    def call(self, query, embedding_model = "sentence-transformers/all-mpnet-base-v2", data_path = "./data", first = True):
        """Build (or fetch) the query engine over the documents in `data_path`.

        Parameters:
            query: currently unused; kept so existing callers keep working.
            embedding_model (str): model name passed to `HuggingFaceEmbeddings`.
            data_path (str): directory passed to `load_data`.
            first (bool): when True, load the model and data and build a new
                index and query engine. When False, skip all loading and
                return the engine built by an earlier call.

        Returns:
            The query engine. With `first=False` this is the cached engine,
            or None if no engine has been built on this instance yet.
        """
        if not first:
            return self.query_engine

        # The persona must be set BEFORE the LLM is constructed, otherwise
        # `load_model` hands the model an empty system prompt. A prompt
        # supplied by the caller takes precedence over the default.
        if not self.system_prompt:
            self.system_prompt = self.DEFAULT_SYSTEM_PROMPT

        self.load_model()
        self.load_data(data_path)
        self.embed_model = LangchainEmbedding(HuggingFaceEmbeddings(model_name=embedding_model))

        self.service_context = ServiceContext.from_defaults(
            chunk_size=1024,
            llm=self.llm,
            embed_model=self.embed_model,
        )

        self.index = VectorStoreIndex.from_documents(self.documents, service_context=self.service_context)
        self.query_engine = self.index.as_query_engine()
        return self.query_engine
            


In [9]:
# obj = Rag_Llama(context_window=4096,
#                 max_new_tokens=256,
#                 generate_kwargs={"temperature": 0.0, "do_sample": False},
#                 system_prompt="""""",
#                 tokenizer_name="meta-llama/Llama-2-7b-chat-hf",
#                 model_name="meta-llama/Llama-2-7b-chat-hf",
#                 device_map="cuda:0",
#                 model_kwargs={"torch_dtype": torch.float16 , "load_in_8bit":True})
# query_engine = obj.call("How did the camping trip go?", embedding_model = "sentence-transformers/all-mpnet-base-v2", data_path = "./data", first = True)



In [6]:
def get_response(prompt, query_engine):
    """Run `prompt` through `query_engine.query` and hand back its result.

    Parameters:
        prompt: the text passed to the engine's `query` method.
        query_engine: any object exposing a `query(prompt)` method.

    Returns:
        Whatever `query_engine.query(prompt)` returns, unmodified.
    """
    return query_engine.query(prompt)

In [10]:
# response=get_response("How is the trip going?", query_engine)
# print(response)

In [8]:
class Plain_Llama:
    """Plain (no retrieval) text-generation wrapper around a Llama checkpoint."""

    def __init__(self,
                model = "meta-llama/Llama-2-7b-chat-hf"):
        """Record the checkpoint name; nothing is loaded until first use.

        Parameters:
            model (str): model identifier passed to `from_pretrained`.
        """
        self.model = model # meta-llama/Llama-2-7b-hf
        # Built on demand by `load_plain_model_pipeline`.
        self.llama_pipeline = None

    def load_plain_model_pipeline(self, device_map="cuda:0"):
        """Load the 8-bit model and tokenizer and wrap them in a pipeline.

        Parameters:
            device_map: forwarded to `AutoModelForCausalLM.from_pretrained`.

        Sets `self.model_8bit`, `self.tokenizer` and `self.llama_pipeline`.
        """
        self.model_8bit = AutoModelForCausalLM.from_pretrained(self.model, device_map = device_map, load_in_8bit=True)
        self.tokenizer = AutoTokenizer.from_pretrained(self.model, use_auth_token=True)

        # The model object is already loaded and placed via `device_map`
        # above, so the pipeline is not given a dtype or device map of its
        # own (the previous `torch_dtype=torch.uint8` and hard-coded
        # "cuda:0" bypassed this method's `device_map` argument).
        self.llama_pipeline = pipeline(
            "text-generation",  # LLM task
            model=self.model_8bit,
            tokenizer=self.tokenizer,
        )

    def get_plain_llama_response(self, prompt: str) -> dict:
        """
        Generate a response from the Llama model.

        Parameters:
            prompt (str): The user's input/question for the model.

        Returns:
            The first element of the pipeline's output list (also stored,
            together with any other elements, in `self.sequences`).
            Nothing is printed.
        """
        self.sequences = self.llama_pipeline(
            prompt,
            do_sample=True,
            top_k=10,
            num_return_sequences=1,
            eos_token_id=self.tokenizer.eos_token_id,
            max_length=256,
        )

        return self.sequences[0]

    def call_plain_model(self, prompt = ""):
        """Generate a response for `prompt`, loading the model on first use.

        Parameters:
            prompt (str): text handed to `get_plain_llama_response`.

        Returns:
            The value returned by `get_plain_llama_response`.
        """
        # Load once and reuse: rebuilding the pipeline on every call would
        # re-read the whole checkpoint each time.
        if getattr(self, "llama_pipeline", None) is None:
            self.load_plain_model_pipeline()

        self.prompt = prompt
        self.plain_response = self.get_plain_llama_response(self.prompt)
        return self.plain_response

In [11]:
# obj2 = Plain_Llama(model = "meta-llama/Llama-2-7b-chat-hf")
# response = obj2.call_plain_model(prompt = "How did the camping trip go?")

# print(response)

In [None]:
# documents=SimpleDirectoryReader("./data").load_data()
# documents

# system_prompt="""
# You are a Q&A assistant. Your goal is to answer questions as
# accurately as possible based on the instructions and context provided.
# """
# ## Default format supportable by LLama2
# # query_wrapper_prompt=SimpleInputPrompt("<|USER|>{query_str}<|ASSISTANT|>")

# # llm = HuggingFaceLLM(
# #     context_window=4096,
# #     max_new_tokens=256,
# #     generate_kwargs={"temperature": 0.0, "do_sample": False},
# #     system_prompt=system_prompt,
# #     # query_wrapper_prompt=query_wrapper_prompt,
# #     tokenizer_name="meta-llama/Llama-2-7b-chat-hf",
# #     model_name="meta-llama/Llama-2-7b-chat-hf",
# #     device_map="cuda:0",
# #     # uncomment this if using CUDA to reduce memory usage
# #     model_kwargs={"torch_dtype": torch.float16 , "load_in_8bit":True},
# #     # llm_int8_enable_fp32_cpu_offload=True
# # )

# embed_model=LangchainEmbedding(HuggingFaceEmbeddings(model_name="sentence-transformers/all-mpnet-base-v2"))

# service_context=ServiceContext.from_defaults(
#     chunk_size=1024,
#     llm=llm,
#     embed_model=embed_model
# )

# index=VectorStoreIndex.from_documents(documents,service_context=service_context)
# query_engine=index.as_query_engine()

# response=query_engine.query("what happened during the camping trip?")

# print(response)