In [1]:
import os
#os.environ["CUDA_VISIBLE_DEVICES"]='0,1'

import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
from langchain.llms import HuggingFacePipeline
from langchain import PromptTemplate, LLMChain
from langchain.chains import ConversationChain
from langchain.chains.conversation.memory import ConversationBufferWindowMemory

# Select the CUDA backend when PyTorch can see a GPU; otherwise fall back to CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print(device)

# Only one GPU is needed when the model is loaded in 8-bit.
print("Using {:d} GPUs".format(torch.cuda.device_count()))

import gradio as gr
import time
# Assistant display name; interpolated into the welcome heading of the Gradio page.
name = "APS AI Assistant"



cuda
Using 4 GPUs


In [2]:
# Hugging Face Hub id of the checkpoint; used for the tokenizer here and for
# the model weights in the next cell.
model_name = "eachadea/vicuna-13b-1.1"
# Local directory that caches the tokenizer files between runs.
tokenizer_path = "./tokenizer/"

# Create a local tokenizer copy the first time
if os.path.isdir(tokenizer_path):
    tokenizer = AutoTokenizer.from_pretrained(tokenizer_path)
else:
    # Pass the variable, not the literal string "model_name": the literal is
    # not a valid repo id, so the first run could never download the tokenizer.
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    # exist_ok avoids a crash if the directory appears between the isdir
    # check and this call.
    os.makedirs(tokenizer_path, exist_ok=True)
    tokenizer.save_pretrained(tokenizer_path)

In [3]:
# Load the causal LM; device_map="auto" lets the loader decide where to place
# the weights. (Optionally add load_in_8bit=True to reduce memory use.)
model = AutoModelForCausalLM.from_pretrained(model_name, device_map="auto")#, load_in_8bit=True)

# Text-generation pipeline shared by the LangChain wrapper below.
# NOTE(review): max_length counts prompt tokens as well as generated ones, so
# a long conversation history leaves little room for the reply -- consider
# max_new_tokens instead.
pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    max_length=1024,
    # temperature / top_p only take effect when sampling is enabled; without
    # do_sample=True generation is greedy and both settings are ignored.
    do_sample=True,
    temperature=0.6,
    top_p=0.95,
    repetition_penalty=1.2
)

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

In [4]:
# LangChain LLM wrapper around the Hugging Face generation pipeline.
local_llm = HuggingFacePipeline(pipeline=pipe)

# Windowed conversation memory: the chain looks back at most k=12 turns.
window_memory = ConversationBufferWindowMemory(k=12)


def setup_chain():
    """Build the ConversationChain used by the chat UI.

    The chain is wired to the module-level ``local_llm`` and
    ``window_memory`` objects and runs in verbose mode, so each formatted
    prompt is echoed to the cell output.

    Returns:
        The newly constructed ``ConversationChain``.
    """
    return ConversationChain(
        memory=window_memory,
        verbose=True,
        llm=local_llm,
    )


conversation = setup_chain()
# Print the chain's prompt template for inspection.
print(conversation.prompt.template)

The following is a friendly conversation between a human and an AI. The AI is talkative and provides lots of specific details from its context. If the AI does not know the answer to a question, it truthfully says it does not know.

Current conversation:
{history}
Human: {input}
AI:


In [5]:
# Chat UI: a chatbot pane, a textbox that submits on Enter, and a Clear button.
with gr.Blocks(css="footer {visibility: hidden}", title="APS ChatBot") as demo:
    gr.Markdown("""
    # Welcome to the %s!
    Start typing below to see the output.
    """ %name
    )
    chatbot = gr.Chatbot(show_label=False).style(height="500")
    msg = gr.Textbox(label="Send a message with Enter")
    clear = gr.Button("Clear")

    def respond(message, chat_history):
        """Send the user's message to the conversation chain and record the reply.

        Args:
            message: Text the user typed into the textbox.
            chat_history: List of (user, bot) message pairs currently shown
                in the chatbot; may be None/empty.

        Returns:
            A tuple ``("", chat_history)``: the empty string clears the
            textbox and the updated history refreshes the chatbot pane.
        """
        # Tolerate a missing history (e.g. right after the pane was reset).
        chat_history = chat_history or []
        # Do not spend a generation pass on an empty / whitespace-only message.
        if not message or not message.strip():
            return "", chat_history
        bot_message = conversation.predict(input=message)
        chat_history.append((message, bot_message))
        return "", chat_history

    def reset_conversation():
        """Forget the chain's memory and blank the chatbot pane."""
        window_memory.clear()
        # None is sent to the chatbot output component, which empties it.
        return None

    msg.submit(respond, [msg, chatbot], [msg, chatbot])
    clear.click(reset_conversation, None, chatbot, queue=False)

In [6]:
# Start the Gradio server, listening on all network interfaces at port 2023.
demo.launch(
    server_port=2023,
    server_name="0.0.0.0",
)

Running on local URL:  http://0.0.0.0:2023

To create a public link, set `share=True` in `launch()`.






[1m> Entering new ConversationChain chain...[0m
Prompt after formatting:
[32;1m[1;3mThe following is a friendly conversation between a human and an AI. The AI is talkative and provides lots of specific details from its context. If the AI does not know the answer to a question, it truthfully says it does not know.

Current conversation:

Human: Can you write a proof of infinitude of primes, with every line that rhymes?
AI:[0m

[1m> Finished chain.[0m


[1m> Entering new ConversationChain chain...[0m
Prompt after formatting:
[32;1m[1;3mThe following is a friendly conversation between a human and an AI. The AI is talkative and provides lots of specific details from its context. If the AI does not know the answer to a question, it truthfully says it does not know.

Current conversation:

Human: How can coherent imaging be used in materials science?
AI:[0m

[1m> Finished chain.[0m
