In [1]:
!pip install transformers accelerate bitsandbytes
!pip install scipy  # needed for 4-bit quantization



In [2]:
!huggingface-cli login


    _|    _|  _|    _|    _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|_|_|_|    _|_|      _|_|_|  _|_|_|_|
    _|    _|  _|    _|  _|        _|          _|    _|_|    _|  _|            _|        _|    _|  _|        _|
    _|_|_|_|  _|    _|  _|  _|_|  _|  _|_|    _|    _|  _|  _|  _|  _|_|      _|_|_|    _|_|_|_|  _|        _|_|_|
    _|    _|  _|    _|  _|    _|  _|    _|    _|    _|    _|_|  _|    _|      _|        _|    _|  _|        _|
    _|    _|    _|_|      _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|        _|    _|    _|_|_|  _|_|_|_|

    A token is already saved on your machine. Run `huggingface-cli whoami` to get more information or `huggingface-cli logout` if you want to log out.
    Setting a new token will erase the existing one.
    To log in, `huggingface_hub` requires a token generated from https://huggingface.co/settings/tokens .
Enter your token (input will not be visible): 
Add token as git credential? (Y/n) n
Token is valid (permission: read)

In [12]:
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch

def load_mistral_efficiently():
    # 1. Load in 8-bit quantization
    model = AutoModelForCausalLM.from_pretrained(
        "mistralai/Mistral-7B-Instruct-v0.2",
        device_map="cuda",  # Automatically handle device placement
        load_in_8bit=True,  # Enable 8-bit quantization
        torch_dtype=torch.float16  # Use half precision
    )

    tokenizer = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-Instruct-v0.2")

    return model, tokenizer

# Function to generate text with the model
def generate_text(prompt, model, tokenizer, max_length=100):
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

    outputs = model.generate(
        **inputs,
        max_length=max_length,
        num_return_sequences=1,
        temperature=0.7,
        do_sample=True,
        pad_token_id=tokenizer.eos_token_id
    )

    return tokenizer.decode(outputs[0], skip_special_tokens=True)

In [14]:
model, tokenizer = load_mistral_efficiently()
response = generate_text("Please repeat this sentence: pol", model, tokenizer)
print(response)

The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

Please repeat this sentence: polly put the kettle on

Polly put the kettle on.


In [16]:
response = generate_text("Please EXACTLY REPEAT THIS: pod", model, tokenizer)
print(response)

Please EXACTLY REPEAT THIS: podman run -dt --name my-php-app -v /var/www/html:/var/www/html -p 80:80 php:8.0-fpm

This command does the following:

1. `podman run`: This command is used to run a container using Podman, which is an open-source container runtime similar to Docker.
2.


In [35]:
def get_mistral_response(instruction, model, tokenizer, max_length=100):
    # Format the prompt according to Mistral's instruction format
    prompt = f"<s>[INST] {instruction} [/INST]"

    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    print(inputs)
    embeddings = model.get_input_embeddings()(inputs['input_ids'])
    print(embeddings)
    print(embeddings.shape)

    outputs = model.generate(
        **inputs,
        max_new_tokens=max_length,  # Limit the response length
        num_return_sequences=1,
        temperature=0.01,  # Even lower temperature for more deterministic output
        do_sample=False,   # Disable sampling for deterministic output
        pad_token_id=tokenizer.eos_token_id,
        repetition_penalty=1.0  # Don't penalize repetition since we want exact copying
    )

    return tokenizer.decode(outputs[0], skip_special_tokens=True)

In [36]:
instruction = "Repeat exactly what is in between the <COPY> tags. <COPY>dsfsdfsdfsd</COPY>"
response = get_mistral_response(instruction, model, tokenizer)
print(response)

{'input_ids': tensor([[    1,     1, 28792, 16289, 28793,  1298, 15882,  4668,   767,   349,
           297,  1444,   272,   523,  1998, 11519, 28767, 12944, 28723,   523,
          1998, 11519, 28767,  3673,  2547, 28715,  2547, 28715,  2547, 28715,
           700,  1998, 11519, 28767,   733, 28748, 16289, 28793]],
       device='cuda:0'), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]], device='cuda:0')}
tensor([[[-4.3640e-03, -1.0633e-04, -5.6152e-03,  ..., -5.0545e-05,
          -1.1520e-03,  1.5926e-04],
         [-4.3640e-03, -1.0633e-04, -5.6152e-03,  ..., -5.0545e-05,
          -1.1520e-03,  1.5926e-04],
         [-6.7139e-04, -5.7983e-04, -3.1891e-03,  ..., -1.7071e-04,
           3.1281e-04,  8.5449e-04],
         ...,
         [-9.0790e-04,  1.2741e-03, -7.4005e-04,  ...,  3.1090e-04,
          -4.0283e-03,  2.5558e-04],
         [ 1.4496e-04,  5.0354e-04, -2.3499e-03,  ...



[INST] Repeat exactly what is in between the <COPY> tags. <COPY>dsfsdfsdfsd</COPY> [/INST] dsfsdfsdfsd
