### Mistral inference

In [None]:
from huggingface_hub.hf_api import HfFolder
HfFolder.save_token('hf-token')

In [2]:
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch
class Server:
    def __init__(self): 
        model_id = "mistralai/Mistral-7B-Instruct-v0.3"
        self.tokenizer = AutoTokenizer.from_pretrained(model_id)
        self.model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.bfloat16, device_map="cuda")
        self.chat_context = []
        
    def add_system_context(self, message):
        if len(self.chat_context) == 0:
            self.chat_context.insert(0,{"role": "system", "content": message})
        else:
            if self.chat_context[0]["role"] == "system":
                self.chat_context[0]["content"] = message
            else:
                self.chat_context.insert(0,{"role": "system", "content": message})

    def add_user_context(self, message):
        self.chat_context.append({"role": "user", "content": message})
        
    def add_model_context(self, message):
        self.chat_context.append({"role": "assistant", "content": message})

    def ask_question(self, message, add_context=False):
        inputs = self.tokenizer.apply_chat_template(
            self.chat_context + [{"role": "user","content":message}],
            add_generation_prompt=True,
            tokenize=False)

        inputs = self.tokenizer([inputs], return_tensors = "pt")
        inputs.to(self.model.device)
        generated_ids = self.model.generate(inputs.input_ids, max_new_tokens=1000)

        generated_ids = [
            output_ids[len(input_ids):] for input_ids, output_ids in zip(inputs.input_ids, generated_ids)
        ]

        response = self.tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]

        if add_context:
            self.add_user_context(message)
            self.add_model_context(response.text)
        return response

In [3]:
mistral = Server()
mistral.add_system_context("Answer as Megatron")
print(mistral.ask_question("What is Linux?"))
print(mistral.ask_question("What is asked in previous question?"))

tokenizer_config.json:   0%|          | 0.00/141k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/587k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.96M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/414 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/601 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/3 [00:00<?, ?it/s]

model-00001-of-00003.safetensors:   0%|          | 0.00/4.95G [00:00<?, ?B/s]

model-00002-of-00003.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00003-of-00003.safetensors:   0%|          | 0.00/4.55G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Starting from v4.46, the `logits` model output will have the same type as the model (except at train time, where it will always be FP32)
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


Greetings, human. Linux is an open-source operating system based on the Unix operating system. It was first created by Linus Torvalds in 1991. Unlike proprietary operating systems, Linux is free to use, modify, and distribute. It's known for its stability, security, and flexibility, making it popular for servers, embedded systems, and personal computers. It's also the foundation for many other operating systems, such as Android and Chrome OS.
The previous question asked for a response in the voice of Megatron, a character from the Transformers franchise. However, the question itself did not pose a question or statement for Megatron to respond to. If you have a specific question or statement you'd like Megatron to respond to, please provide it, and I'll do my best to respond in his character's voice. For example, you could ask, "Megatron, what is your ultimate goal?" or "Megatron, how do you feel about the Autobots?" and I'll provide a response in character.


In [None]:
"""
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch

model_id = "mistralai/Mistral-7B-Instruct-v0.3"
tokenizer = AutoTokenizer.from_pretrained(model_id)

conversation = [{"role": "user", "content": "What's the weather like in Paris?"}]
tools = [get_current_weather]


# format and tokenize the tool use prompt 
inputs = tokenizer.apply_chat_template(
            conversation,
            add_generation_prompt=True,
    tokenize=False
)

inputs = tokenizer([inputs], return_tensors = "pt")


model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.bfloat16, device_map="auto")

inputs.to(model.device)
generated_ids = model.generate(inputs.input_ids, max_new_tokens=1000)

generated_ids = [
    output_ids[len(input_ids):] for input_ids, output_ids in zip(inputs.input_ids, generated_ids)
]

response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]

print(response)
"""