In [1]:
import torch
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print("Using device", device)

Using device cuda


In [2]:
from transformers import AutoModelForCausalLM, AutoTokenizer


if False:
    name_big = "unsloth/Llama-3.2-3B"
    name_small = "unsloth/Llama-3.2-1B"
else:
    name_big = "openai-community/gpt2-medium"
    name_small = "openai-community/gpt2"

model_big = AutoModelForCausalLM.from_pretrained(name_big).to(device)
model_small = AutoModelForCausalLM.from_pretrained(name_small).to(device)


tokenizer = AutoTokenizer.from_pretrained(name_big)

  from .autonotebook import tqdm as notebook_tqdm


# Distillation
В данном задании мы познакомимся с лоссами в дистилляции. *Так как обучения в данном задании нет, то для экономии памяти подсчет функции потерь обернут в torch.no_grad(), при обучении в реальных сценариях этот декоратор нужно обязательно убрать*

## Hard-Label Distillation
Hard-Label дистилляция заключается в том, что мы учимся на метках модели учителя, то есть:
1. Модель учитель размечает какой-то датасет, в нашем случае генерирует продолжения текстов из какого-либо корпуса.
2. Считается обычный CrossEntropyLoss модели студента на сгенерированных текстах в задаче языкового моделирования. **Считать функцию потерь нужно только по сгенерированному тексту, а не по префиксу, по которому функция потерь считалась, т.е. префикс должен быть замаскирован**

Идейно это обучение можно описать так:
мы сгенерировали данных моделью-учителем и просто дообучили на этом модель-ученика.

## Soft-Label Distillation
В этом варианте мы учимся на распределении, которое нам выдает модель-учитель. В soft-label дистилляции мы стремимся не только повторить метки учителя, но и его распределение. Например, если модель учителя выдавала вероятности \[0.7, 0.2, 0.1\], то в Hard-Label дистилляции ученик будет восстанавливать распределение \[1, 0, 0\], а в soft-label \[0.7, 0.2, 0.1\]. В этом нам поможет KL дивергенция.


1. Считаем распределение logits/probs модели-учителя на тексте.
2. Считаем KLDivLoss между выходами модели-ученика на тексте и выходами модели учителя.

В данном виде обучения мы используем не только токены, которые сгенерировала модель учитель, но и ее распределения вероятностей по словарю. Подобная техника дистилляции может помочь модели-ученику лучше моделировать вероятность модели-учителя.

In [3]:
import torch.nn.functional as F

prefix = "Мама мыла раму"
@torch.no_grad()
def hard_label_distillation_loss(model_teacher, model_student, prefix):
    inputs = tokenizer(prefix, return_tensors="pt")
    inputs.to(device)
    prefix_len = inputs["input_ids"].shape[1]

    outputs = model_teacher.generate(**inputs, do_sample=False, max_new_tokens=5, use_cache=True)
    
    students_logits = model_student(input_ids=outputs).logits

    labels = outputs.clone()
    labels[:, :prefix_len] = -100

    # logits = [x1, x2, x3, ..., x(L)]
    # labels = [x0, x1, x2, ..., x(L-1)]
    shift_logits = students_logits[:, :-1, :] # -> logits = [x1, x2, x3, ..., x(L-1)]
    shift_labels = labels[:, 1:] # -> labels = [x1, x2, x3, ..., x(L-1)]

    loss = F.cross_entropy(
        input=shift_logits.view(-1, shift_logits.size(-1)),
        target=shift_labels.view(-1),
        ignore_index=-100
    )
    return loss


@torch.no_grad()
def soft_label_distillation_loss(model_teacher, model_student, text):
    loss_fn = torch.nn.KLDivLoss()
    
    inputs = tokenizer(text, return_tensors="pt")
    inputs.to(device)

    teacher_logits = model_teacher(**inputs).logits
    students_logits = model_student(**inputs).logits

    teacher_probs = F.softmax(teacher_logits)
    students_probs = F.log_softmax(students_logits)
    
    loss = loss_fn(students_probs, teacher_probs)
    return loss

In [4]:
assert abs(hard_label_distillation_loss(model_big, model_small, prefix).item() - 1.3893) < 1e-3
assert abs(soft_label_distillation_loss(model_big, model_small, prefix).item() - 7.0790e-06) < 1e-3
print("Тесты прошли успешно")

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Тесты прошли успешно


  teacher_probs = F.softmax(teacher_logits)
  students_probs = F.log_softmax(students_logits)


# Speculative Decoding
В этом задании необходимо написать спекулятивное декодирование на pytorch. **Генерации необходимо делать жадно.**

1. Генерируете n токенов маленькой моделью
2. Проверяете, выберет ли эти токены большая модель при жадной генерации (должен быть вызван один forward большой модели, вызывать big_model.generate на этом этапе нельзя)
3. Если все токены выбраны большой моделью, принимаете их и возвращаетесь на шаг 1
4. Если какой-то токен выбран ошибочно, подаете вместо него правильный токен с шага 2 и возвращаетесь на шаг 1.m

In [9]:
@torch.no_grad()
def speculative_generate(big_model, small_model, prefix, max_num_tokens, n):
    input_ids = tokenizer(prefix, return_tensors="pt").input_ids.to(device)
    start_size = input_ids.size(1)
    while input_ids.size(1) - start_size < max_num_tokens:
        # [prefix_1, ..., prefix_len, gen_1, ..., gen_k]
        small_generation = small_model.generate(
            input_ids=input_ids,
            do_sample=False,
            max_new_tokens=n,
            use_cache=True
        )
        num_generated_tokens = small_generation.size(1) - input_ids.size(1)

        # [prefix_2, ..., prefix_len, gen_1, ..., gen_k+1]
        big_model_logits = big_model(input_ids=small_generation).logits

        # [gen_1, ..., gen_k+1]
        big_model_generations = big_model_logits[:, -num_generated_tokens - 1:].argmax(dim=2)
        mismatch = False
        for i in range(num_generated_tokens):
            # нашли расхождение
            if big_model_generations[0, i] != small_generation[0, input_ids.size(1) + i]:
                mismatch = True
                # Если оно сразу, то берем первый предсказанный большой моделью токен
                if i == 0:
                    input_ids = torch.cat([input_ids, big_model_generations[:, 0:1]], dim=1)
                # иначе берем часть токенов, предсказанных маленькой моделью + правильный токен от большой модели
                else:
                    accepted_small = small_generation[:, :input_ids.size(1) + i]
                    input_ids = torch.cat([accepted_small, big_model_generations[:, i:i+1]], dim=1)
                print(f"Accepted {i}/{n} tokens")
                break
            else:
                print(f"Accepted {n}/{n} tokens")


        if not mismatch:
            # если расхождений не было, принимаем всю последовательность + последний токен от большой модели
            input_ids = torch.cat([small_generation, big_model_generations[:, -1:]], dim=1)
    return tokenizer.decode(input_ids[0, start_size:start_size + max_num_tokens].cpu().tolist())

In [10]:
prompt = """
# SYSTEM PREAMBLE
1) You are an excellent Python software developer with over 10 years of experience. You have a strong understanding of Python related topics, data structures, libraries, frameworks, algorithms, best practices and optimization techniques.
2) You are here to help the user (the software developer) by breaking his request in ## TASK into logical steps and writing high-quality and efficient code to implement each step.
3) You have to return the entire code.
4) Follow "Answering rules" without exception.

## ANSWERING RULES
1) Repeat the question before answering it.
2) Always follow "CHAIN OF THOUGHTS" to execute the task.

## CHAIN OF THOUGHTS
1) **OBEY the EXECUTION MODE**
2) **TASK ANALYSIS:**
   - Understand the user's request thoroughly.
   - Identify the key components and requirements of the task.
3) **PLANNING: CODDING:**
   - Break down the task into logical, sequential steps.
   - Outline the strategy for implementing each step.
4) **CODING:**
   - Explain your thought process before writing any code.
   - Write the entire code for each step, ensuring it is clean, optimized, and well-commented.
   - Handle edge cases and errors appropriately.
5) **VERIFICATION:**
   - Review the complete code solution for accuracy and efficiency.
   - Ensure the code meets all requirements and is free of errors.

## TASK

Write a python function that receives the following JSON as input and enters data from it into the Google Sheet.

{
    'date': '31-05-2024',
    'revenue': 90000,
    'person' : 'User1',
    'expensesList': [30000, 14000, 10000, 2000, 15000],
    'expensesDescList': [ 'Ключи', 'Ключи2', 'Счет за такси', 'Клей, пластины', 'Провод 40м'],
    'expensesTypeList': ['Закупки', 'Закупки', 'Расходы', 'Ремонт', 'Ремонт']
}

There is a date in JSON, you can use it to determine the month.
The data is entered into a list with the name of the month. If such a list does not exist yet, then you need to create a list with a new month inside the sheet.

The list should have the following columns (the first rows are used as headings):
A1: Дата расхода,
B1: сумма расхода,
C1: описание расхода,
D1: тип расхода,
E1: кто внес данные

G1: Дата выручки
H1: Сумма выручки
I1: Кто внес данные

Please separate expenses and profits with a blank column.
Please sort expenses by date, including those already listed in Google sheet list.
Please sort earnings by date, including those already listed in Google sheet list.

It is prohibited to use oauth2client as it is deprecated.
"""

In [11]:
model_inputs = tokenizer([prompt], return_tensors="pt").to(device)
res_big = tokenizer.batch_decode(model_big.generate(**model_inputs, do_sample=False, max_new_tokens=128, pad_token_id=tokenizer.eos_token_id)[:, model_inputs.input_ids.shape[1]:], skip_special_tokens=True)[0]
print(res_big)


## ANSWERING RULES
1) Do not use oauth2client.

2) Do not use oauth2client as it is deprecated.

3) Do not use oauth2client as it is deprecated.

4) Do not use oauth2client as it is deprecated.

5) Do not use oauth2client as it is deprecated.

## VERIFICATION

1) Review the complete code solution for accuracy and efficiency.

2) Ensure the code meets all requirements and is free of errors.

## TASK




In [12]:
res_spec = speculative_generate(big_model=model_big, small_model=model_small, prefix=prompt, max_num_tokens=128, n=5)
assert res_spec == res_big

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Accepted 5/5 tokens
Accepted 5/5 tokens
Accepted 2/5 tokens
Accepted 5/5 tokens
Accepted 5/5 tokens
Accepted 5/5 tokens
Accepted 5/5 tokens
Accepted 5/5 tokens


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Accepted 5/5 tokens
Accepted 1/5 tokens
Accepted 5/5 tokens
Accepted 1/5 tokens


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Accepted 5/5 tokens
Accepted 5/5 tokens
Accepted 2/5 tokens
Accepted 5/5 tokens
Accepted 5/5 tokens
Accepted 5/5 tokens
Accepted 3/5 tokens


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Accepted 5/5 tokens
Accepted 5/5 tokens
Accepted 5/5 tokens
Accepted 5/5 tokens
Accepted 5/5 tokens


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Accepted 5/5 tokens
Accepted 5/5 tokens
Accepted 5/5 tokens
Accepted 5/5 tokens
Accepted 5/5 tokens
Accepted 5/5 tokens
Accepted 5/5 tokens
Accepted 5/5 tokens
Accepted 5/5 tokens
Accepted 5/5 tokens


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Accepted 5/5 tokens
Accepted 5/5 tokens
Accepted 5/5 tokens
Accepted 5/5 tokens
Accepted 5/5 tokens
Accepted 5/5 tokens
Accepted 5/5 tokens
Accepted 5/5 tokens
Accepted 5/5 tokens
Accepted 5/5 tokens


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Accepted 5/5 tokens
Accepted 5/5 tokens
Accepted 5/5 tokens
Accepted 5/5 tokens
Accepted 5/5 tokens
Accepted 5/5 tokens
Accepted 5/5 tokens
Accepted 5/5 tokens
Accepted 5/5 tokens
Accepted 5/5 tokens


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Accepted 5/5 tokens
Accepted 5/5 tokens
Accepted 5/5 tokens
Accepted 5/5 tokens
Accepted 5/5 tokens


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Accepted 5/5 tokens
Accepted 5/5 tokens
Accepted 5/5 tokens
Accepted 5/5 tokens
Accepted 5/5 tokens
Accepted 5/5 tokens
Accepted 5/5 tokens
Accepted 5/5 tokens
Accepted 5/5 tokens
Accepted 5/5 tokens


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Accepted 5/5 tokens
Accepted 5/5 tokens
Accepted 5/5 tokens
Accepted 5/5 tokens
Accepted 5/5 tokens


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Accepted 0/5 tokens
Accepted 0/5 tokens


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Accepted 5/5 tokens
Accepted 5/5 tokens
Accepted 5/5 tokens
Accepted 5/5 tokens
Accepted 5/5 tokens
Accepted 0/5 tokens


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Accepted 5/5 tokens
Accepted 1/5 tokens
Accepted 5/5 tokens
Accepted 5/5 tokens
Accepted 5/5 tokens
Accepted 5/5 tokens
Accepted 5/5 tokens


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Accepted 5/5 tokens
Accepted 5/5 tokens
Accepted 5/5 tokens
Accepted 5/5 tokens
Accepted 5/5 tokens


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Accepted 5/5 tokens
Accepted 5/5 tokens
Accepted 5/5 tokens
Accepted 5/5 tokens
Accepted 5/5 tokens
Accepted 5/5 tokens
Accepted 5/5 tokens


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Accepted 5/5 tokens
Accepted 5/5 tokens
Accepted 5/5 tokens


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Accepted 5/5 tokens
Accepted 1/5 tokens


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Accepted 0/5 tokens
Accepted 5/5 tokens
Accepted 5/5 tokens
Accepted 5/5 tokens
Accepted 5/5 tokens
Accepted 5/5 tokens


In [13]:
res_spec == res_big

True

## HF speculative decoding
Теперь попробуйте использовать функцию спекулятивного декодирования из [transformers](https://huggingface.co/docs/transformers/main/en/generation_strategies#speculative-decoding)

In [15]:
import time
inputs = tokenizer(prompt, return_tensors="pt").to(device)


start = time.time()
outputs = model_big.generate(**inputs, do_sample=False, max_new_tokens=128, pad_token_id=tokenizer.eos_token_id)
# print(tokenizer.batch_decode(outputs, skip_special_tokens=True))
print(f"Elapsed time for big model inference {time.time() - start}")


start = time.time()
outputs = model_big.generate(**inputs, do_sample=False, max_new_tokens=128, pad_token_id=tokenizer.eos_token_id, assistant_model=model_small)
# print(tokenizer.batch_decode(outputs, skip_special_tokens=True))
print(f"Elapsed time for speculative {time.time() - start}")

Elapsed time for big model inference 3.072702407836914
Elapsed time for speculative 2.1723568439483643


In [16]:
del model_big, model_small