<a href="https://colab.research.google.com/github/GTimothee/nlp-tools/blob/main/chat_with_transformers.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Setup
- change runtime to GPU

## Install dependencies

In [1]:
!sudo pip3 install --upgrade pip

Collecting pip
  Downloading pip-25.0.1-py3-none-any.whl.metadata (3.7 kB)
Downloading pip-25.0.1-py3-none-any.whl (1.8 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.8/1.8 MB[0m [31m18.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pip
  Attempting uninstall: pip
    Found existing installation: pip 24.1.2
    Uninstalling pip-24.1.2:
      Successfully uninstalled pip-24.1.2
Successfully installed pip-25.0.1


In [2]:
!pip install autoawq

Collecting autoawq
  Downloading autoawq-0.2.8.tar.gz (71 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting transformers<=4.47.1,>=4.45.0 (from autoawq)
  Downloading transformers-4.47.1-py3-none-any.whl.metadata (44 kB)
Collecting datasets>=2.20 (from autoawq)
  Downloading datasets-3.5.0-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets>=2.20->autoawq)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets>=2.20->autoawq)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets>=2.20->autoawq)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.12.0,>=2023.1.0 (from fsspec[http]<=2024.12.0,>=2023.1.0->datasets>=2.20->autoawq)
  Downloading fsspec-2024.12.0-py3-none-any.whl.metadata (11 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=2.6.0->autoawq)
  Dow

## Implementing chat template manually

In [3]:
from transformers import AutoModelForCausalLM, AutoTokenizer
model_name = "TheBloke/Mistral-7B-Instruct-v0.2-AWQ"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name, device_map="cuda")


prompt = """<s>[INST] Convert the text to English and return the result as a JSON object with an "answer" key.
Do not include any additional text outside the JSON object. Don't add any other message, just return the JSON.

Example:
- Input: "bonjour"
- Expected Output:
{
  "answer": "hello"
}

Now here is the input : Maison [/INST]
"""

from transformers import pipeline

pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
)
response = pipe(prompt, max_new_tokens=512)[0]['generated_text']
print(response)

tokenizer_config.json:   0%|          | 0.00/1.46k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/493k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.80M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/72.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/904 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/4.15G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/111 [00:00<?, ?B/s]

Device set to use cuda
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


<s>[INST] Convert the text to English and return the result as a JSON object with an "answer" key.  
Do not include any additional text outside the JSON object. Don't add any other message, just return the JSON.

Example:
- Input: "bonjour"
- Expected Output:
{
  "answer": "hello"
}

Now here is the input : Maison [/INST]
{
  "answer": "house"
}


## Using chat templates

In [4]:
print(tokenizer.chat_template)


{{ bos_token }}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if message['role'] == 'user' %}{{ '[INST] ' + message['content'] + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ message['content'] + eos_token}}{% else %}{{ raise_exception('Only user and assistant roles are supported!') }}{% endif %}{% endfor %}


In [7]:
from transformers import pipeline
from transformers import AutoModelForCausalLM, AutoTokenizer
model_name = "TheBloke/Mistral-7B-Instruct-v0.2-AWQ"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name, device_map="cuda")


messages = [
    {"role": "user", "content": "Convert the text 'Maison' to English and return it as a JSON object with an 'answer' key."}
]
formatted_prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)

pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
)
response = pipe(formatted_prompt, max_new_tokens=512)[0]['generated_text']
print(response)

ValueError: Cannot use chat template functions because tokenizer.chat_template is not set and no template argument was passed! For information about writing templates and setting the tokenizer.chat_template attribute, please see the documentation at https://huggingface.co/docs/transformers/main/en/chat_templating

# test if llama 3 can take SYSTEM prompts

In [1]:
from transformers import pipeline
from transformers import AutoModelForCausalLM, AutoTokenizer
model_name = "casperhansen/llama-3-8b-instruct-awq"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name, device_map="cuda")


messages = [
    {"role": "system", "content": "Translate user input to English as a JSON object."},
    {"role": "user", "content": "input: Maison"},
]
formatted_prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)

pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
)
response = pipe(formatted_prompt, max_new_tokens=512)[0]['generated_text']
print(response)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Device set to use cuda
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


<|begin_of_text|><|start_header_id|>system<|end_header_id|>

Translate user input to English as a JSON object.<|eot_id|><|start_header_id|>user<|end_header_id|>

input: Maison<|eot_id|><|start_header_id|>assistant<|end_header_id|>

Here is the translation of "Maison" to English as a JSON object:

```
{
    "translation": "House"
}
```


## Using langchain with it

In [3]:
%pip install langchain-huggingface

Collecting langchain-huggingface
  Downloading langchain_huggingface-0.1.2-py3-none-any.whl.metadata (1.3 kB)
Downloading langchain_huggingface-0.1.2-py3-none-any.whl (21 kB)
Installing collected packages: langchain-huggingface
Successfully installed langchain-huggingface-0.1.2


In [5]:

from transformers import AutoModelForCausalLM, AutoTokenizer
from transformers import pipeline

model_name = "casperhansen/llama-3-8b-instruct-awq"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name, device_map="cuda")
pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    do_sample=True,
    temperature=0.001,
)

# wrap llm into a langchain object
from langchain_huggingface import ChatHuggingFace
from langchain_huggingface import HuggingFacePipeline

langchain_pipe = HuggingFacePipeline(pipeline=pipe)
critique_llm = ChatHuggingFace(llm=langchain_pipe)

# define chain with langchain
from langchain.prompts import ChatPromptTemplate
from langchain_core.output_parsers import JsonOutputParser

critique_chain = ChatPromptTemplate.from_messages(
        [
            (
                "system",
                "You are an assistant that translates inputs into English and returns the result in a JSON format. Your response should only contain a valid JSON object.",
            ),
            ("human", "{user_message}"),
        ]
    ) | critique_llm | JsonOutputParser()


critique = critique_chain.invoke({
        "user_message": "input: Maison"
    }
)
print(critique)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Device set to use cuda
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


In [7]:
critique_chain = ChatPromptTemplate.from_messages(
        [
            (
                "system",
                "You are an assistant that translates inputs into English and returns the result in a JSON format. Your response should only contain a valid JSON object.",
            ),
            ("human", "{user_message}"),
        ]
    ) | critique_llm | JsonOutputParser()


critique = critique_chain.invoke({
        "user_message": "input: Maison"
    }
)

print(critique)

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


OutputParserException: Invalid json output: <|begin_of_text|><|start_header_id|>system<|end_header_id|>

You are an assistant that translates inputs into English and returns the result in a JSON format. Your response should only contain a valid JSON object.<|eot_id|><|start_header_id|>user<|end_header_id|>

input: Maison<|eot_id|><|start_header_id|>assistant<|end_header_id|>

{"translation": "House"}
For troubleshooting, visit: https://python.langchain.com/docs/troubleshooting/errors/OUTPUT_PARSING_FAILURE 