In [1]:
from langchain.chains.conversation.memory import ConversationBufferWindowMemory
from langchain.chains import ConversationChain
from langchain.llms import HuggingFacePipeline
from langchain.prompts.prompt import PromptTemplate
from langchain.schema import HumanMessage, AIMessage
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    pipeline,
)
import torch
import openai
from langchain.llms import OpenAI
from dotenv import load_dotenv, find_dotenv
from app.schema import HistoryItem, ChatItem
from typing import List

In [2]:
DEVICE_TYPE = "cuda"
# model_id = "atom92/medical-meta-llama-2-7b"
# model_id = "atom92/power-medical-meta-llama-2"
model_id = "atom92/medical_lama_ultra"
# model_id = "atom92/medical-token-llama-2-healthwa-3"

In [4]:
tokenizer = AutoTokenizer.from_pretrained(model_id, use_fast=True)
# logging.info("Tokenizer loaded")

In [5]:
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map={"": 0},
    torch_dtype=torch.bfloat16,
    low_cpu_mem_usage=True,
    trust_remote_code=True,
    max_memory={0: "17GiB", 1: "17GiB"},
    offload_folder="/tmp/offload"
)
model.tie_weights()

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

In [6]:
pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    max_new_tokens=254,
    do_sample=True,
    temperature=0.9,
    top_p=0.5,
    top_k=50,
    repetition_penalty=1.1,
)
local_llm = HuggingFacePipeline(pipeline=pipe)
window_memory = ConversationBufferWindowMemory(ai_prefix="AI Assistant", k=2, return_messages=True)

In [8]:
template = """You are a helpful virtual medical assistant that delivers precise, succinct information from established medical knowledge. You only respond in English clearly, precisely and briefly.

Current conversation:
{history}
Human: {input}
AI Assistant:"""
PROMPT = PromptTemplate(input_variables=["history", "input"], template=template)

In [9]:
conversation = ConversationChain(
    prompt=PROMPT,
    llm=local_llm,
    verbose=True,
    memory=window_memory
)

In [10]:
conversation.run("What are the signs and symptoms of haemolytic uraemic syndrome (HUS)?")

  warn_deprecated(




[1m> Entering new ConversationChain chain...[0m
Prompt after formatting:
[32;1m[1;3mYou are a helpful virtual medical assistant that delivers precise, succinct information from established medical knowledge. You only respond in English clearly, precisely and briefly.

Current conversation:
[]
Human: What are the signs and symptoms of haemolytic uraemic syndrome (HUS)?
AI Assistant:[0m

[1m> Finished chain.[0m


' The primary sign of HUS is microscopic hematuria. <END_A>'

In [None]:
conversation.predict(input="What should I do when I have flu?")



[1m> Entering new ConversationChain chain...[0m
Prompt after formatting:
[32;1m[1;3mAs a general medical knowledge assistant, I aim to provide accurate and general information based on existing knowledge. However, always consult with a healthcare professional for personalized advice. Based on general medical literature,

Current conversation:
[]
Human: What should I do when I have flu?
AI Assistant:[0m


In [None]:
conversation.run("What should I do when I have flu?")

  warn_deprecated(




[1m> Entering new ConversationChain chain...[0m
Prompt after formatting:
[32;1m[1;3mAs a general medical knowledge assistant, I aim to provide accurate and general information based on existing knowledge. However, always consult with a healthcare professional for personalized advice. Based on general medical literature,

Current conversation:
[]
Human: What should I do when I have flu?
AI Assistant:[0m


In [None]:
async def llm_answer(query: str):
        conversation = ConversationChain(
            prompt=self._prompt,
            llm=self._llm,
            verbose=True,
            memory=self.generate_memory(history)
        )
        # print(self.window_memory.load_memory_variables({}))
        return conversation.run(query)

    @staticmethod
    def generate_memory(history: List[HistoryItem]):
        memory = ConversationBufferWindowMemory(ai_prefix="AI Assistant", k=2, return_messages=True)
        for history_item in history:
            memory.save_context({"input": history_item.human_message}, {"output": history_item.ai_message})
        return memory

In [1]:
!ngrok authtoken 2Wj8zemKeISGyOt5IETUO93Y4M0_2ZxZGg6ZJQcyC81TfuT7B

Authtoken saved to configuration file: /root/.config/ngrok/ngrok.yml


In [None]:
!ngrok http 8000

7[?47h[?1h=[H[2J[m[38;5;6m[48;5;16m[1m[1;1Hngrok[m[38;5;16m[48;5;16m                                                           [m[38;5;7m[48;5;16m(Ctrl+C to quit)[m[38;5;16m[48;5;16m[2;1H                                                                                [m[38;5;6m[48;5;16m[3;1HSession Status                connecting[m[38;5;16m[48;5;16m                                        [m[38;5;7m[48;5;16m[4;1HVersion                       3.5.0[m[38;5;16m[48;5;16m                                             [m[38;5;7m[48;5;16m[5;1HLatency                       -[m[38;5;16m[48;5;16m                                                 [m[38;5;7m[48;5;16m[6;1HWeb Interface                 http://127.0.0.1:4040[m[38;5;16m[48;5;16m                             [7;1H                                                                                [m[38;5;7m[48;5;16m[8;1HConnections                   ttl     opn     rt1     rt5     p50     p9