In [1]:
#Prometheus - Knowledge based chatbot
#Configurable backend for different chatbot engines

#Steps: Build vector indexes
#       Initiate chat engine (on the fly)
#       Chat via GRADIO

#------------------------------------------------------------------------------------------------------------------------
#Config - API keys are read from the environment; never paste them into the notebook.
#NOTE(review): the keys that used to be hardcoded in this cell were visible in the notebook
#source and should be treated as exposed - revoke/rotate them.
import os

#Names of the environment variables this notebook reads its credentials from
REQUIRED_API_KEYS = ("OPENAI_API_KEY", "HUGGINGFACEHUB_API_TOKEN")

def missing_api_keys(environ=None):
    """Return the names in REQUIRED_API_KEYS that are unset or empty.

    environ: mapping to inspect; defaults to os.environ.
    Returns a list of variable names, in REQUIRED_API_KEYS order.
    """
    if environ is None:
        environ = os.environ
    return [name for name in REQUIRED_API_KEYS if not environ.get(name)]

#Warn (rather than fail) so the notebook can still be used with a backend that
#does not need the missing key.
for _key_name in missing_api_keys():
    print("WARNING: " + _key_name + " is not set - set it in the environment before using the corresponding backend.")
#------------------------------------------------------------------------------------------------------------------------


In [7]:
#ConstructLLM - gets the selected LLM interface
#make a "const" of HuggingFace_Local, HuggingFace_Hub, OpenAI

from langchain import HuggingFacePipeline
from llama_index import SimpleDirectoryReader, GPTVectorStoreIndex, PromptHelper, LLMPredictor
from langchain.llms import HuggingFacePipeline
from langchain.llms import HuggingFaceHub
from langchain.embeddings.huggingface import HuggingFaceEmbeddings
from llama_index import LangchainEmbedding, ServiceContext
from transformers import T5Tokenizer, T5ForConditionalGeneration, T5Config
from langchain.chat_models import ChatOpenAI
from accelerate import init_empty_weights, infer_auto_device_map
from transformers import pipeline
import torch

#Loads a huggingface model locally and pushes it to GPU (CUDA)
def HuggingFace_Local(hfSubModel):
    """Load a T5 model locally and wrap it as a LangChain HuggingFacePipeline LLM.

    hfSubModel: Hugging Face model id passed to the T5 config/model/tokenizer
                loaders (e.g. 'lmsys/fastchat-t5-3b-v1.0').
    Returns a HuggingFacePipeline around a "text2text-generation" pipeline.
    """
    model_name = hfSubModel

    #Instantiate the architecture without allocating weights, only to work out
    #how its layers can be spread over the available devices.
    config = T5Config.from_pretrained(model_name)
    with init_empty_weights():
        model_layer = T5ForConditionalGeneration(config=config)

    #Memory caps: 12GiB on device 0 and on device 1, nothing on CPU; T5Block modules are kept whole.
    device_map = infer_auto_device_map(model_layer,
                                    max_memory={0: "12GiB",1: "12GiB", "cpu": "0GiB"},
                                    no_split_module_classes=["T5Block"])

    #Load the real weights in half precision, placed according to the device map.
    model = T5ForConditionalGeneration.from_pretrained(model_name,
                                                       torch_dtype=torch.float16,
                                                       device_map=device_map,
                                                       offload_folder="offload",
                                                       offload_state_dict=True)
    tokenizer = T5Tokenizer.from_pretrained(model_name)

    #No `device` argument here: the model has already been placed by `device_map`,
    #and asking the pipeline to move it to one device as well conflicts with that
    #placement (transformers documents `device` and `device_map` as mutually exclusive).
    pipe = pipeline("text2text-generation",
            model=model,
            tokenizer=tokenizer,
            max_length=1536,
            temperature=0,
            top_p = 1,
            num_beams=1,
            early_stopping=True
        )

    return HuggingFacePipeline(pipeline=pipe)

#HuggingFace Prototyping REST endpoints
def HuggingFace_Hub(hfSubModel):
    """Create a LangChain HuggingFaceHub LLM for the repo id `hfSubModel`.

    The model is called with temperature 0.3.
    """
    hub_model_kwargs = {'temperature':0.3}
    return HuggingFaceHub(repo_id=hfSubModel, model_kwargs=hub_model_kwargs)

#Classic ChatGPT
def OpenAIAPI(subModelNotUsed):
    """Create the ChatOpenAI LLM (model "gpt-3.5-turbo", temperature 0.7).

    subModelNotUsed: ignored; present so all LLM factories share one signature.
    max_tokens is not set here (an earlier draft considered limiting it to 512).
    """
    return ChatOpenAI(temperature=0.7, model_name="gpt-3.5-turbo")

#Constructor for LLM
AITypes=["HuggingFace_Local", "HuggingFace_Hub", "OpenAI"]
SubModels=["EleutherAI/gpt-neo-2.7B","lmsys/fastchat-t5-3b-v1.0","tiiuae/falcon-7b-instruct"]
def ConstructLLMInteface(aiType = "HuggingFace_Local",hfSubModel="lmsys/fastchat-t5-3b-v1.0"):
    """Return the LLM object for the requested backend.

    aiType:     one of the names listed in AITypes.
    hfSubModel: model/repo id forwarded to the selected factory (the OpenAI
                factory ignores it).
    Raises ValueError (a subclass of Exception) when aiType is not supported.
    """
    if aiType == "HuggingFace_Local":
        return HuggingFace_Local(hfSubModel)
    if aiType == "HuggingFace_Hub":
        return HuggingFace_Hub(hfSubModel)
    if aiType == "OpenAI":
        return OpenAIAPI(hfSubModel)
    #f-string formatting so a non-string aiType (e.g. None) still produces this
    #error instead of a TypeError from string concatenation.
    raise ValueError(f"Invalid AI Type: {aiType} - supported types are: {AITypes}")



In [4]:
#Index Construction - either OpenAI or HuggingFace
#From: https://levelup.gitconnected.com/connecting-chatgpt-with-your-own-data-using-llamaindex-663844c06653
from llama_index import SimpleDirectoryReader, GPTListIndex, GPTVectorStoreIndex, LLMPredictor, PromptHelper,StorageContext, load_index_from_storage
from langchain.chat_models import ChatOpenAI
from langchain.llms import HuggingFacePipeline
from langchain.llms import HuggingFaceHub
import gradio as gr
import sys
import os


#Build an index using OPENAI to produce the embeddings
def OPENAI_ConstructIndex(directory_path,llm_predictor):
    """Build a vector index over `directory_path` and persist it to "IDX_OpenAI".

    directory_path: folder handed to SimpleDirectoryReader.
    llm_predictor:  either an LLMPredictor or a bare LLM object (as returned by
                    the LLM factories in this notebook); a bare LLM is wrapped.
    Returns the new GPTVectorStoreIndex.

    No embedding model is passed, so the ServiceContext default is used.
    """
    max_input_size = 4096
    num_outputs = 512
    max_chunk_overlap = 20
    chunk_size_limit = 600
    prompt_helper = PromptHelper(max_input_size, num_outputs, max_chunk_overlap, chunk_size_limit=chunk_size_limit)

    #Accept both call styles: an already-built predictor, or the raw LLM that
    #HF_ConstructIndex is given.
    if not isinstance(llm_predictor, LLMPredictor):
        llm_predictor = LLMPredictor(llm=llm_predictor)

    #Hand the predictor and prompt helper over through a ServiceContext, the same
    #way HF_ConstructIndex does, instead of as loose keyword arguments.
    service_context = ServiceContext.from_defaults(llm_predictor=llm_predictor, prompt_helper=prompt_helper)

    documents = SimpleDirectoryReader(directory_path).load_data()
    index = GPTVectorStoreIndex.from_documents(documents, service_context=service_context)
    index.storage_context.persist(persist_dir="IDX_OpenAI")
    return index

def HF_ConstructIndex(directory_path,llm_predictor):
    """Build a vector index over `directory_path` with Hugging Face embeddings and persist it.

    directory_path: folder handed to SimpleDirectoryReader.
    llm_predictor:  the LLM object; it is wrapped in LLMPredictor here.
    Returns the new GPTVectorStoreIndex, which is also persisted to "IDX_HuggingFace".
    """
    #Prompt sizing: input size, output tokens, chunk overlap, then the chunk size limit
    input_size_cap, output_token_count, overlap_cap = 2048, 512, 20
    chunk_cap = 300
    helper = PromptHelper(input_size_cap, output_token_count, overlap_cap)

    #The embeddings model is the same regardless of which LLM was selected
    hf_embeddings = LangchainEmbedding(HuggingFaceEmbeddings())
    context = ServiceContext.from_defaults(
        embed_model=hf_embeddings,
        llm_predictor=LLMPredictor(llm_predictor),
        prompt_helper=helper,
        chunk_size_limit=chunk_cap,
    )

    #Read the documents, index them, and write the index to disk
    docs = SimpleDirectoryReader(directory_path).load_data()
    built_index = GPTVectorStoreIndex.from_documents(docs, service_context=context)
    built_index.storage_context.persist(persist_dir="IDX_HuggingFace")
    return built_index


    # llm = HuggingFaceHub(
    #         #repo_id='google/flan-t5-large',
    #         repo_id='lmsys/fastchat-t5-3b-v1.0',
    #     model_kwargs={'temperature':0.3}
    # )

    # #Prediction Model Selection---------------------------------------------------------------

    # embed_model = LangchainEmbedding(HuggingFaceEmbeddings())

    # # set maximum input size
    # max_input_size = 2048
    # # set number of output tokens
    # num_outputs = 512
    # # set maximum chunk overlap
    # max_chunk_overlap = 20
    # # set chunk size limit
    # chunk_size_limit = 300
    # prompt_helper = PromptHelper(max_input_size, num_outputs, max_chunk_overlap)

    # service_context = ServiceContext.from_defaults(embed_model=embed_model, llm_predictor=LLMPredictor(llm), prompt_helper=prompt_helper, chunk_size_limit=chunk_size_limit)

    # # build index
    # documents = SimpleDirectoryReader('Docs').load_data()

    # new_index = GPTVectorStoreIndex.from_documents(documents, service_context=service_context)


    # query_engine = new_index.as_query_engine(
    #     verbose=True,
    #     similarity_top_k=2
    # )


In [13]:
#Test Code (non-interactive)--------------------------------------------------------------------------------------------------------

#Create AI
# llm=ConstructLLMInteface("HuggingFace_Local","lmsys/fastchat-t5-3b-v1.0")

# #Build Index
# index=HF_ConstructIndex("Docs",llm_predictor=llm)

#Create the LLM (the OpenAI factory ignores the sub-model argument)
llm = ConstructLLMInteface("OpenAI", "")

#Build the index over the "Docs" folder
index = HF_ConstructIndex("Docs", llm_predictor=llm)

#Run a single question through a query engine and print the answer text
prompt = "What were his accomplishments?"
test_engine = index.as_query_engine()
response = test_engine.query(prompt)
print(response.response)

In [6]:
#Web Interface-----------------------------------------------------------------------------------------------------------------------
import gradio as gr
#Shared state used by the handlers below
global llm, index

#Gradio action handlers
def build_indexes(input, model, docfolder):
    """Create the LLM for `model` and (re)build the index over `docfolder`.

    input:     text from the UI; not used by this action.
    model:     backend name forwarded to ConstructLLMInteface.
    docfolder: folder forwarded to HF_ConstructIndex.
    Stores the results in the module-level `llm` and `index` and returns a status string.
    """
    global llm, index
    llm = ConstructLLMInteface(model)
    index = HF_ConstructIndex(docfolder, llm_predictor=llm)
    return "Indexes built successfully using " + model

def chat(input, model):
    """Answer `input` using the module-level `index`.

    input: the question text.
    model: backend name from the UI; not used when answering.
    Returns the response text, or a hint message when no index has been built
    yet (instead of failing with a NameError).
    """
    current_index = globals().get("index")
    if current_index is None:
        return "No index has been built yet - run the 'Build Indexes' action first."
    response = current_index.as_query_engine().query(input)
    return response.response

def main_interface(input, model, docfolder, action):
    """Route a UI request to the handler for `action`.

    "Build Indexes" -> build_indexes(input, model, docfolder)
    "Chat"          -> chat(input, model)
    Any other action returns None.
    """
    if action == "Build Indexes":
        return build_indexes(input, model, docfolder)
    if action == "Chat":
        return chat(input, model)
    return None

sources = ["Docs", "DocsCountries"]

#Input widgets, listed in the positional order main_interface receives them
interface_inputs = [
    gr.components.Textbox(lines=2, label="Input", value="What were his accomplishments?"),
    gr.components.Dropdown(choices=AITypes, label="AI Model", value=AITypes[0]),
    gr.components.Dropdown(choices=sources, label="Document Folder", value=sources[0]),
    gr.components.Radio(choices=['Build Indexes', 'Chat'], label="Action", value="Build Indexes"),
]

iface = gr.Interface(
    fn=main_interface,
    inputs=interface_inputs,
    outputs="text",
    title="Knowledge Chatbot",
    description="Select an AI model and an action to perform. For 'Chat', enter a message to get the model's response.",
)

iface.launch()


Running on local URL:  http://127.0.0.1:7862

To create a public link, set `share=True` in `launch()`.




In [None]:
#zzz------------------------------------------------------------------------------------------------------------------------------
#OpenAI Chatbot - note no direct reference to model or API (higher order API)

def chatbot(input_text):
    """Answer `input_text` from the index stored under the "IDX" directory.

    The index is loaded from storage on every call.
    NOTE(review): the index builders in this notebook persist to "IDX_OpenAI" and
    "IDX_HuggingFace", not "IDX" - confirm which directory is intended.
    """
    persisted = StorageContext.from_defaults(persist_dir="IDX")
    engine = load_index_from_storage(persisted).as_query_engine()
    return engine.query(input_text).response

#Single-textbox UI around chatbot()
#NOTE(review): share=True asks Gradio to create a public link - confirm that exposure is intended
question_box = gr.components.Textbox(lines=7, label="Enter your text")
iface = gr.Interface(
    fn=chatbot,
    inputs=question_box,
    outputs="text",
    title="Custom-trained AI Chatbot",
)

iface.launch(share=True)




In [None]:
#zzz------------------------------------------------------------------------------------------------------------------------------
#From: https://levelup.gitconnected.com/connecting-chatgpt-with-your-own-data-using-llamaindex-663844c06653
from llama_index import StorageContext, load_index_from_storage

#Vector store index (index in this case)
#https://gpt-index.readthedocs.io/en/latest/reference/indices/vector_store.html

#https://gpt-index.readthedocs.io/en/latest/reference/service_context.html#llama_index.indices.service_context.ServiceContext

#service_context = service_context

# def my_chatGPT_bot(input_text):
    # load the index from vector_store.json
#Load whatever index is persisted in the current directory
storage_context = StorageContext.from_defaults(
    persist_dir=".",
)
index = load_index_from_storage(storage_context)
index  #bare expression so the notebook displays the loaded index

    # # create a query engine to ask question
    # query_engine = index.as_query_engine()
    # response = query_engine.query(input_text)
    # return response.response
  

#my_chatGPT_bot("Hello world!")

In [None]:
#zzz------------------------------------------------------------------------------------------------------------------------------
from llama_index import SimpleDirectoryReader, GPTVectorStoreIndex, PromptHelper, LLMPredictor
from langchain.embeddings.huggingface import HuggingFaceEmbeddings
from llama_index import LangchainEmbedding, ServiceContext
from transformers import T5Tokenizer, T5ForConditionalGeneration, T5Config
from accelerate import init_empty_weights, infer_auto_device_map
import torch
import os
#The Hugging Face Hub token is read from the environment; never paste it into the notebook.
#NOTE(review): the token that used to be hardcoded on this line was visible in the notebook
#source and should be treated as exposed - revoke/rotate it.
def hub_token_is_set(environ=None):
    """Return True when HUGGINGFACEHUB_API_TOKEN is present and non-empty.

    environ: mapping to inspect; defaults to os.environ.
    """
    if environ is None:
        environ = os.environ
    return bool(environ.get('HUGGINGFACEHUB_API_TOKEN'))

if not hub_token_is_set():
    print("WARNING: HUGGINGFACEHUB_API_TOKEN is not set - set it in the environment before using HuggingFaceHub.")

# model_name = 'lmsys/fastchat-t5-3b-v1.0'

# config = T5Config.from_pretrained(model_name )
# with init_empty_weights():
#     model_layer = T5ForConditionalGeneration(config=config)

# device_map = infer_auto_device_map(model_layer,
#                                 max_memory={0: "12GiB",1: "12GiB", "cpu": "0GiB"},
#                                 no_split_module_classes=["T5Block"])

# # the value for device_map = {'': 0}, i.e. loading the entire Model on 1st GPU
# model = T5ForConditionalGeneration.from_pretrained(model_name,
#                                                    torch_dtype=torch.float16,
#                                                    device_map=device_map,
#                                                    offload_folder="offload",
#                                                    offload_state_dict=True)

# tokenizer = T5Tokenizer.from_pretrained(model_name)

# from transformers import pipeline

# pipe = pipeline(
#     "text2text-generation", 
#     model=model,
#     tokenizer=tokenizer,
#     device = 0, 
#     max_length=1536,
#     temperature=0,
#     top_p = 1,
#     num_beams=1,
#     early_stopping=True 
#     )

#llm = HuggingFacePipeline(pipeline=pipe)

#Prediction Model Selection---------------------------------------------------------------

#Local Host (GPU!)
from langchain.llms import HuggingFacePipeline
from langchain.llms import HuggingFaceHub



#HuggingFace Prototype AI!
#Hub-hosted model used for this experiment (alternative tried: 'google/flan-t5-large')
hub_repo_id = 'lmsys/fastchat-t5-3b-v1.0'
llm = HuggingFaceHub(repo_id=hub_repo_id, model_kwargs={'temperature':0.3})

#Prediction Model Selection---------------------------------------------------------------

#Embedding model: HuggingFaceEmbeddings with its defaults, wrapped for llama_index
embed_model = LangchainEmbedding(HuggingFaceEmbeddings())

#Prompt sizing, in order: maximum input size, number of output tokens,
#maximum chunk overlap, chunk size limit
max_input_size, num_outputs, max_chunk_overlap, chunk_size_limit = 2048, 512, 20, 300
prompt_helper = PromptHelper(max_input_size, num_outputs, max_chunk_overlap)

service_context = ServiceContext.from_defaults(
    embed_model=embed_model,
    llm_predictor=LLMPredictor(llm),
    prompt_helper=prompt_helper,
    chunk_size_limit=chunk_size_limit,
)

#Build the index from the documents in 'Docs'
documents = SimpleDirectoryReader('Docs').load_data()
new_index = GPTVectorStoreIndex.from_documents(documents, service_context=service_context)

#Query engine over the new index
query_engine = new_index.as_query_engine(verbose=True, similarity_top_k=2)
