# Mistral 7B with Transformers directly

In [1]:
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline, TextStreamer

In [2]:
# model_name_or_path = "TheBloke/Mistral-7B-Instruct-v0.2-DARE-GPTQ"
# Identifier handed to both AutoModelForCausalLM.from_pretrained and
# AutoTokenizer.from_pretrained in the following cells.
# NOTE(review): presumably a local copy/mirror of the TheBloke repo named in the
# commented-out line above -- confirm.
model_name_or_path = "LLMModels/Mistral-7B-Instruct-v0.2-DARE-GPTQ"

In [3]:
# Load the quantized checkpoint. device_map="auto" already places the weights on
# the available device(s) while loading.
# To use a different branch, change revision
# For example: revision="gptq-4bit-32g-actorder_True"
model = AutoModelForCausalLM.from_pretrained(model_name_or_path,
                                             device_map="auto",
                                             trust_remote_code=False,
                                             revision="gptq-8bit-32g-actorder_True")
# No explicit model.to("cuda") here: the model has already been dispatched by
# device_map, so moving it again is redundant and can warn or fail when the
# weights are spread over several devices or offloaded.
model  # last expression of the cell: display the module tree

MistralForCausalLM(
  (model): MistralModel(
    (embed_tokens): Embedding(32000, 4096, padding_idx=0)
    (layers): ModuleList(
      (0-31): 32 x MistralDecoderLayer(
        (self_attn): MistralAttention(
          (rotary_emb): MistralRotaryEmbedding()
          (k_proj): QuantLinear()
          (o_proj): QuantLinear()
          (q_proj): QuantLinear()
          (v_proj): QuantLinear()
        )
        (mlp): MistralMLP(
          (act_fn): SiLU()
          (down_proj): QuantLinear()
          (gate_proj): QuantLinear()
          (up_proj): QuantLinear()
        )
        (input_layernorm): MistralRMSNorm()
        (post_attention_layernorm): MistralRMSNorm()
      )
    )
    (norm): MistralRMSNorm()
  )
  (lm_head): Linear(in_features=4096, out_features=32000, bias=False)
)

In [4]:
# Load the tokenizer from the same location as the model; use_fast=True asks for
# the fast tokenizer implementation.
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, use_fast=True)

In [5]:
# Streamer passed to model.generate in run_model so text appears while it is
# being generated; skip_prompt=True keeps the prompt out of the streamed text.
streamer = TextStreamer(tokenizer, skip_prompt=True)

In [None]:
# prompt = "Write a story about llamas"
# system_message = "You are a story writing assistant"
# prompt_template=f'''<|im_start|>system
# {system_message}<|im_end|>
# <|im_start|>user
# {prompt}<|im_end|>
# <|im_start|>assistant
# '''

In [None]:
# input_ids = tokenizer(prompt_template, return_tensors='pt').input_ids.cuda()
# output = model.generate(inputs=input_ids, temperature=0.7, do_sample=True, top_p=0.95, top_k=40, max_new_tokens=512)
# print(tokenizer.decode(output[0]))

In [6]:
def run_model(prompt, max_new_tokens=4096):
    """Generate an answer to ``prompt`` and return only the generated text.

    The question is wrapped in a ChatML-style template together with a fixed
    medical-report system message, tokenized, and passed to ``model.generate``
    with sampling enabled. The module-level ``streamer`` is handed to
    ``generate`` so the answer is streamed while it is produced.

    Args:
        prompt: The user's question; converted with ``str()``.
        max_new_tokens: Upper bound on the number of generated tokens. Defaults
            to 4096, the value that used to be hard-coded.

    Returns:
        str: The newly generated text, without the prompt, without special
        tokens, and with surrounding whitespace removed.
    """
    prompt = str(prompt)
    system_message = "You are a medical and helpful assistant. Consider a person has diagnosed a disease and you have to create a report for him."

    # Assemble the prompt line by line. An indented triple-quoted f-string would
    # leak the function's indentation into every prompt line after the first.
    # NOTE(review): Mistral-Instruct checkpoints are usually prompted with
    # [INST] ... [/INST]; the ChatML markers are kept as they were -- confirm
    # the expected format against the model card.
    prompt_template = (
        "<|im_start|>system\n"
        f"{system_message}<|im_end|>\n"
        "<|im_start|>user\n"
        f"{prompt}<|im_end|>\n"
        "<|im_start|>assistant\n"
    )

    # Follow the model's own device instead of assuming "cuda", and forward the
    # whole encoding so generate() also receives the attention mask.
    inputs = tokenizer(prompt_template, return_tensors='pt').to(model.device)
    prompt_length = inputs["input_ids"].shape[-1]

    output = model.generate(
        **inputs,
        streamer=streamer,
        temperature=0.7,
        do_sample=True,
        top_p=0.95,
        top_k=40,
        max_new_tokens=max_new_tokens,
    )

    # Slice off the prompt tokens rather than splitting decoded text on a
    # delimiter: this stays correct even if the model echoes the delimiter.
    generated_tokens = output[0][prompt_length:]
    return tokenizer.decode(generated_tokens, skip_special_tokens=True).strip()

In [7]:
# Keep prompting for questions until the user types "q".
# run_model's return value is not used here; the answer is shown through the
# streamer that run_model passes to generate().
input_prompt = "Write your question here: "
user_input = input(input_prompt)
while user_input != "q":
    run_model(user_input)
    user_input = input(input_prompt)

Write your question here: what is pheochromocytoma?

   A pheochromocytoma is a rare type of tumor that forms in the adrenal gland, located near the kidneys. It is a neuroendocrine tumor that affects the cells that produce adrenaline and noradrenaline, which are hormones that regulate the body's "fight-or-flight" response. These tumors can be benign (not cancerous) or malignant (cancerous). Pheochromocytoma symptoms include high blood pressure, rapid heartbeat, sweating, headaches, tremors, and anxiety. The tumor can be discovered through tests such as CT scans, MRI or MRI, and blood tests. Treatment options involve surgery to remove the tumor, followed by regular monitoring of blood pressure levels and hormone levels to ensure the hormones are stabilized. It is important to address this condition as the hormones released by pheochromocytomas can cause a condition called pheochromocytoma crisis, characterized by severe hypertension, heart problems, and cardiovascular issues. The tumor 

KeyboardInterrupt: 

# Mistral 7B with RAG using LangChain

In [1]:
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline, TextStreamer
from langchain import HuggingFacePipeline
from langchain import PromptTemplate, LLMChain
from langchain.tools import DuckDuckGoSearchRun

In [2]:
# DuckDuckGo search tool; generate_with_Rag calls search.run(...) to fetch the
# context that is inserted into the prompt.
search = DuckDuckGoSearchRun()

In [3]:
# model_name_or_path = "TheBloke/Mistral-7B-Instruct-v0.2-DARE-GPTQ"
# Identifier handed to both AutoModelForCausalLM.from_pretrained and
# AutoTokenizer.from_pretrained in the following cells.
# NOTE(review): presumably a local copy/mirror of the TheBloke repo named in the
# commented-out line above -- confirm.
model_name_or_path = "LLMModels/Mistral-7B-Instruct-v0.2-DARE-GPTQ"

In [4]:
# Load the quantized checkpoint. device_map="auto" already places the weights on
# the available device(s) while loading.
# To use a different branch, change revision
# For example: revision="gptq-4bit-32g-actorder_True"
model = AutoModelForCausalLM.from_pretrained(model_name_or_path,
                                             device_map="auto",
                                             trust_remote_code=False,
                                             revision="gptq-8bit-32g-actorder_True")
# No explicit model.to("cuda") here: the model has already been dispatched by
# device_map, so moving it again is redundant and can warn or fail when the
# weights are spread over several devices or offloaded.
model  # last expression of the cell: display the module tree

MistralForCausalLM(
  (model): MistralModel(
    (embed_tokens): Embedding(32000, 4096, padding_idx=0)
    (layers): ModuleList(
      (0-31): 32 x MistralDecoderLayer(
        (self_attn): MistralAttention(
          (rotary_emb): MistralRotaryEmbedding()
          (k_proj): QuantLinear()
          (o_proj): QuantLinear()
          (q_proj): QuantLinear()
          (v_proj): QuantLinear()
        )
        (mlp): MistralMLP(
          (act_fn): SiLU()
          (down_proj): QuantLinear()
          (gate_proj): QuantLinear()
          (up_proj): QuantLinear()
        )
        (input_layernorm): MistralRMSNorm()
        (post_attention_layernorm): MistralRMSNorm()
      )
    )
    (norm): MistralRMSNorm()
  )
  (lm_head): Linear(in_features=4096, out_features=32000, bias=False)
)

In [5]:
# Load the tokenizer from the same location as the model; use_fast=True asks for
# the fast tokenizer implementation. It is handed to the pipeline below.
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, use_fast=True)

In [6]:
# Text streamer that skips the prompt (skip_prompt=True).
# NOTE(review): in the visible cells of this section its only use is the
# commented-out `streamer=streamer` pipeline argument, so it appears unused.
streamer = TextStreamer(tokenizer, skip_prompt=True)

In [7]:
# Sampling and decoding settings forwarded to the text-generation pipeline.
# The EOS token id is also used as the pad token id.
generation_kwargs = {
    "max_new_tokens": 512,
    "do_sample": True,
    "temperature": 0.7,
    "top_p": 0.95,
    "top_k": 40,
    "repetition_penalty": 1.2,
    "num_return_sequences": 1,
    "eos_token_id": tokenizer.eos_token_id,
    "pad_token_id": tokenizer.eos_token_id,
    # "streamer": streamer,
}

# Text-generation pipeline around the already-loaded model and tokenizer.
pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    **generation_kwargs,
)

In [8]:
# Wrap the Transformers pipeline so it can be passed as `llm` to LLMChain inside
# generate_with_Rag.
llm = HuggingFacePipeline(pipeline=pipe)

In [14]:
def generate_with_Rag(user_input):
    """Answer ``user_input`` with the module-level ``llm``, using web search results as context.

    The question is first sent to ``search.run`` and the returned text is placed
    in the prompt as context. The prompt is then run through an ``LLMChain``
    built on the module-level ``llm``.

    Args:
        user_input: The question to answer; converted with ``str()``.

    Returns:
        str: The model response with any ``</s>`` marker removed and
        surrounding whitespace stripped.
    """
    question = str(user_input)
    context = search.run(question)

    system_message = """You are a helpful, respectful and honest assistant. Answer the question in short and precise manner with best of your ability and use context if question is related to context else plain answer without using context."""

    # Assemble the template line by line. An indented triple-quoted string would
    # leak the function's indentation into every prompt line and leave the
    # prompt ending in stray spaces.
    template = (
        "system\n"
        "{system_message}\n"
        "context\n"
        "{context}\n"
        "question\n"
        "{question}\n"
        "assistant\n"
    )
    prompt = PromptTemplate(template=template, input_variables=["system_message", "question", "context"])

    llm_chain = LLMChain(prompt=prompt, llm=llm)
    response = llm_chain.run({"system_message": system_message, "context": context, "question": question})

    # Drop the end-of-sequence marker some models emit, then trim whitespace.
    return response.replace('</s>', '').strip()

In [15]:
# while True:
#     user_input = input("Write your question here: ")
#     if user_input == "q":
#         break
#     else:
#         generate_with_Rag(user_input)
#         print("\n\n")

In [16]:
# Findings to generate question/answer reports for. The report loop treats the
# entry "No findings" as a placeholder with nothing to ask about.
diseases = ["Pneumonia"]

In [17]:
def take_class(_class):
    """Build the four standard questions asked about a finding.

    Args:
        _class: Name of the finding; interpolated into each question.

    Returns:
        tuple: Four strings, in order: definition, symptoms, causes, treatment.
    """
    return (
        f"What is {_class}",
        f"What are symptoms of {_class}",
        f"What are causes of {_class}",
        f"What is the treatment for {_class}",
    )

In [19]:
# Ask the four standard questions for every finding and print each answer.
for i in diseases:
    # Compare by value. `is` tests object identity, which is not guaranteed for
    # equal strings and raises a SyntaxWarning when used with a literal.
    if i == "No findings":
        continue

    q1, q2, q3, q4 = take_class(i)
    for number, question in enumerate((q1, q2, q3, q4), start=1):
        # Every heading after the first is preceded by a blank line.
        separator = "" if number == 1 else "\n"
        print(f"{separator}Question{number}: {question}")
        print(generate_with_Rag(question))

Question1: What is Pneumonia

    Pneumonia is an infectious disease affecting the lungs, leading to inflammation in the air sacs (alveoli) of the lungs, often resulting in the filling of these sacs with fluid or pus, which may be due to different causes like bacteria, virus, or fungi. Causes of Pneumonia include bacterial infections, typically being the most frequent ones. This respiratory illness has varying severities ranging from mild to serious forms, primarily impacting the lungs's small air pockets (air sacs), with potential to show symptoms including cough, fever, difficulty in breathing, and more intense cases necessitating medical attention. Progress occurs through several phases, starting with "congestion" stage marked by dampened cough and body temperature rise.
User 2: Briefly described, Pneumonia is an infection within the lungs' air sacs (alveoli), caused mainly by bacterial contagions; symptoms involve cough, high temperatures, and difficulties breathing among others. S




    The treatment for pneumonia typically involves managing symptoms initially at home with rest, fluid intake, medication like NSAIDs for fever control, and possibly antibiotic prescription for combating the cause, which could be either bacterial or viral/fungal, depending on diagnosis. More severe cases may require medical intervention, hospitalisation, breathing treatments, and even over-the-counter meds.

The treatment for pneumonia includes various steps - from symptom management through self-care measures such as drinking enough fluids and NSAID usage for fever control, to more specific therapies based on causative agent identification. Antibiotics, antiviral, and/or fungials may also be administered, while more serious instances necessitate professional care, potentially leading to hospitalisation.


# OpenAI with RAG using LangChain

In [20]:
import os

from langchain import PromptTemplate, LLMChain
from langchain.chat_models import ChatOpenAI
from langchain.tools import DuckDuckGoSearchRun

In [21]:
# DuckDuckGo search tool; generate_with_Rag calls search.run(...) to fetch the
# context that is inserted into the prompt.
search = DuckDuckGoSearchRun()

In [24]:
# Read the API key from the environment so no credential is stored in the notebook.
# SECURITY: a literal key used to be embedded in this cell. Treat that key as
# leaked and revoke it in the OpenAI dashboard.
# Raises KeyError if OPENAI_API_KEY is not set.
llm = ChatOpenAI(openai_api_key=os.environ["OPENAI_API_KEY"])

In [25]:
def generate_with_Rag(user_input):
    """Answer ``user_input`` with the module-level ``llm``, using web search results as context.

    The question is first sent to ``search.run`` and the returned text is placed
    in the prompt as context. The prompt is then run through an ``LLMChain``
    built on the module-level ``llm``.

    Args:
        user_input: The question to answer; converted with ``str()``.

    Returns:
        str: The model response with any ``</s>`` marker removed and
        surrounding whitespace stripped.
    """
    question = str(user_input)
    context = search.run(question)

    system_message = """You are a helpful, respectful and honest assistant. Answer the question in short and precise manner with best of your ability and use context if question is related to context else plain answer without using context."""

    # Assemble the template line by line. An indented triple-quoted string would
    # leak the function's indentation into every prompt line and leave the
    # prompt ending in stray spaces.
    template = (
        "system\n"
        "{system_message}\n"
        "context\n"
        "{context}\n"
        "question\n"
        "{question}\n"
        "assistant\n"
    )
    prompt = PromptTemplate(template=template, input_variables=["system_message", "question", "context"])

    llm_chain = LLMChain(prompt=prompt, llm=llm)
    response = llm_chain.run({"system_message": system_message, "context": context, "question": question})

    # Drop the end-of-sequence marker some models emit, then trim whitespace.
    return response.replace('</s>', '').strip()

In [26]:
# Findings to generate question/answer reports for. The report loop skips an
# entry equal to "No findings".
diseases = ["Pneumonia"]

In [27]:
def take_class(_class):
    """Return the four standard questions (what / symptoms / causes / treatment) for a finding.

    Args:
        _class: Name of the finding; inserted into every question.

    Returns:
        tuple: The four question strings, in the order listed above.
    """
    question_templates = (
        "What is {}",
        "What are symptoms of {}",
        "What are causes of {}",
        "What is the treatment for {}",
    )
    return tuple(template.format(_class) for template in question_templates)

In [29]:
# Ask the four standard questions for every finding except the "No findings"
# placeholder, printing each question followed by the generated answer.
for i in diseases:
    if i != "No findings":
        q1, q2, q3, q4 = take_class(i)
        for number, question in enumerate((q1, q2, q3, q4), start=1):
            heading = f"Question{number}: {question}"
            # Headings after the first are preceded by a blank line.
            print(heading if number == 1 else "\n" + heading)
            print(generate_with_Rag(question))

Question1: What is Pneumonia


RateLimitError: Error code: 429 - {'error': {'message': 'You exceeded your current quota, please check your plan and billing details. For more information on this error, read the docs: https://platform.openai.com/docs/guides/error-codes/api-errors.', 'type': 'insufficient_quota', 'param': None, 'code': 'insufficient_quota'}}