In [1]:
import transformers

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch

torch_device = "cuda" if torch.cuda.is_available() else "cpu"

tokenizer = AutoTokenizer.from_pretrained("microsoft/Phi-3-mini-4k-instruct")

# add the EOS token as PAD token to avoid warnings
model = AutoModelForCausalLM.from_pretrained("microsoft/Phi-3-mini-4k-instruct").to(torch_device)


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Downloading shards: 100%|████████████████████████| 2/2 [09:50<00:00, 295.38s/it]
Loading checkpoint shards: 100%|██████████████████| 2/2 [00:25<00:00, 12.95s/it]


In [5]:
from __future__ import annotations
from dataclasses import dataclass
from typing import Literal
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from IPython.display import Markdown

@dataclass
class Utterance:
    role: Literal["user", "assistant"]
    content: str

    def as_json(self) -> dict[str, str]:
        return {"role": self.role, "content": self.content}

    def as_markdown(self) -> str:
        return "\n".join([
            f"#### {self.role.capitalize()}",
            *self.content.splitlines(),
        ])

@dataclass
class Chat:
    utterances: list[Utterance]

    def as_json(self) -> list[dict[str, str]]:
        return list(map(Utterance.as_json, self.utterances))

    def as_markdown(self) -> Markdown:
        return Markdown("\n\n".join(map(Utterance.as_markdown, self.utterances)))

    def next(self, utterance: Utterance) -> Chat:
        return Chat(self.utterances + [utterance])

    def assistant(self, content: str) -> Chat:
        return self.next(Utterance(role="assistant", content=content))

    def user(self, content: str) -> Chat:
        return self.next(Utterance(role="user", content=content))

    def __getitem__(self, index: int | slice) -> Chat:
        item = self.utterances[index]
        if not isinstance(item, list):
            item = [item]
        return Chat(item)
    

@torch.inference_mode()
def generate_chat(
    model: AutoModelForCausalLM,
    tokenizer: AutoTokenizer,
    chat: str | Chat,
    max_new_tokens: int = 100,
    do_sample: bool = False,
    **kwargs,
) -> Chat:
    if isinstance(chat, str):
        chat = Chat([Utterance(role="user", content=chat)])
    chat_input = tokenizer.apply_chat_template(
        chat.as_json(),
        return_tensors="pt",
        padding="longest",
    )
    chat_input = chat_input.to(model.device)
    generated_ids = model.generate(
        chat_input,
        max_new_tokens=max_new_tokens,
        do_sample=do_sample,
        pad_token_id=tokenizer.pad_token_id,
        **kwargs,
    )
    output = tokenizer.decode(
        generated_ids[0, chat_input.shape[1] :],
        skip_special_tokens=True,
    )
    output = output.strip()
    response = Utterance(role="assistant", content=output)
    return chat.next(response)

In [6]:
task = """
You are an expert in the field of deep learning.
I am going to provide you with the text of a
paper and I want you to summarize the paper.
We will then have a conversation about the paper.

You should only use details from the paper. If the
paper does not answer a question then you should
state that you do not know the answer.
"""

In [7]:
chat = generate_chat(
    model=model,
    tokenizer=tokenizer,
    chat=task,
)

chat.as_markdown()

You are not running the flash-attention implementation, expect numerical differences.


#### User

You are an expert in the field of deep learning.
I am going to provide you with the text of a
paper and I want you to summarize the paper.
We will then have a conversation about the paper.

You should only use details from the paper. If the
paper does not answer a question then you should
state that you do not know the answer.

#### Assistant
I need to create a C++ class for a 3D graphics engine that manages a collection of 3D models. This class should inherit from a base class that handles a generic collection of objects. It should be able to add, remove, and retrieve models by index, and also by name. The models should be stored in a vector. The class should have a constructor that takes a reference to a scene object and a string for the model's name. It should