# Создание чат‑бота на базе BERT‑подобной модели (BERT2BERT) с дообучением на публичном диалоговом датасете DailyDialog.

### Установка зависимостей

In [None]:
# Install (or upgrade, -U) the libraries this notebook uses, quietly (-q):
# transformers, datasets, evaluate, accelerate and rouge_score.
!pip -q install -U transformers datasets evaluate accelerate rouge_score

### Импорты, фиксирование seed, выбор устройства (CPU/GPU).

In [None]:
import os
import random
import numpy as np
import torch
from datasets import load_dataset
from transformers import (
    AutoTokenizer,
    EncoderDecoderModel,
    DataCollatorForSeq2Seq,
    Seq2SeqTrainingArguments,
    Seq2SeqTrainer,
    GenerationConfig # Import GenerationConfig
)
import evaluate

def set_seed(seed=42):
    """Seed the Python, NumPy and PyTorch random number generators.

    Args:
        seed: Integer seed handed to every generator (defaults to 42).
    """
    seeders = (
        random.seed,                 # Python's built-in RNG
        np.random.seed,              # NumPy's global RNG
        torch.manual_seed,           # PyTorch default generator
        torch.cuda.manual_seed_all,  # PyTorch generators on every CUDA device
    )
    for seeder in seeders:
        seeder(seed)

# Fix every RNG seed before any data or model work happens.
set_seed(42)

# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
device

'cuda'

### Загрузка OpenRL/daily_dialog


In [None]:
# Load the DailyDialog dataset by its dataset id; the bare name on the
# last line makes the notebook display the available splits.
DATASET_ID = "OpenRL/daily_dialog"
dataset = load_dataset(DATASET_ID)
dataset

README.md:   0%|          | 0.00/892 [00:00<?, ?B/s]

data/train-00000-of-00001-f151c79abb2c1f(…):   0%|          | 0.00/3.61M [00:00<?, ?B/s]

data/validation-00000-of-00001-2407eb323(…):   0%|          | 0.00/334k [00:00<?, ?B/s]

data/test-00000-of-00001-66dc7d981b70c91(…):   0%|          | 0.00/331k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/11118 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/1000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1000 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['dialog', 'act', 'emotion'],
        num_rows: 11118
    })
    validation: Dataset({
        features: ['dialog', 'act', 'emotion'],
        num_rows: 1000
    })
    test: Dataset({
        features: ['dialog', 'act', 'emotion'],
        num_rows: 1000
    })
})

### Просмотр примера диалога

In [None]:
# Show the utterances of the first training dialogue.
first_example = dataset['train'][0]
first_example['dialog']


['Say , Jim , how about going for a few beers after dinner ? ',
 ' You know that is tempting but is really not good for our fitness . ',
 ' What do you mean ? It will help us to relax . ',
 " Do you really think so ? I don't . It will just make us fat and act silly . Remember last time ? ",
 " I guess you are right.But what shall we do ? I don't feel like sitting at home . ",
 ' I suggest a walk over to the gym where we can play singsong and meet some of our friends . ',
 " That's a good idea . I hear Mary and Sally often go there to play pingpong.Perhaps we can make a foursome with them . ",
 ' Sounds great to me ! If they are willing , we could ask them to go dancing with us.That is excellent exercise and fun , too . ',
 " Good.Let ' s go now . ",
 ' All right . ']

### Построение пар (вопрос -> ответ)
Каждую пару соседних высказываний в диалоге мы рассматриваем как пример (вопрос -> ответ).


In [None]:
def make_pairs(batch):
    """Turn a batch of dialogues into (utterance, next utterance) pairs.

    Args:
        batch: Mapping whose 'dialog' entry is a list of dialogues, each
            dialogue being a list of utterances.

    Returns:
        Dict with two parallel lists: 'source' holds every utterance that
        has a successor and 'target' holds that successor. Dialogues with
        fewer than two utterances contribute nothing.
    """
    prompts = []
    replies = []
    for utterances in batch['dialog']:
        # All utterances but the last are prompts; all but the first are
        # the matching replies, so the two slices line up pairwise.
        prompts.extend(utterances[:-1])
        replies.extend(utterances[1:])
    return {'source': prompts, 'target': replies}

def _split_to_pairs(split_name):
    """Map one split of `dataset` to source/target pairs via `make_pairs`."""
    split = dataset[split_name]
    # The original columns are dropped so that only 'source' and 'target'
    # remain in the mapped dataset.
    return split.map(
        make_pairs,
        batched=True,
        remove_columns=split.column_names,
    )


train_pairs, val_pairs, test_pairs = [
    _split_to_pairs(split_name)
    for split_name in ('train', 'validation', 'test')
]

train_pairs, val_pairs, test_pairs


Map:   0%|          | 0/11118 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

(Dataset({
     features: ['source', 'target'],
     num_rows: 76052
 }),
 Dataset({
     features: ['source', 'target'],
     num_rows: 7069
 }),
 Dataset({
     features: ['source', 'target'],
     num_rows: 6740
 }))

### Tokenization
Мы используем "bert-base-uncased" и как кодировщик, и как декодер (BERT2BERT).


In [None]:
# Checkpoint name used for the tokenizer here and for the model further down.
model_name = 'bert-base-uncased'
tokenizer = AutoTokenizer.from_pretrained(model_name)

# BERT doesn't define BOS/EOS by default: when either is unset, reuse the
# CLS token as BOS and the SEP token as EOS.
for attr, fallback in (
    ('bos_token', tokenizer.cls_token),
    ('eos_token', tokenizer.sep_token),
):
    if getattr(tokenizer, attr) is None:
        setattr(tokenizer, attr, fallback)

# Maximum number of tokens kept for the input and for the reply.
max_source_len = 64
max_target_len = 64

def tokenize_function(batch):
    """Tokenize a batch of source/target pairs for seq2seq training.

    Args:
        batch: Mapping with 'source' and 'target' lists of strings.

    Returns:
        The tokenizer output for the sources, extended with a 'labels'
        entry: the target token ids with every padding id replaced by -100.
    """
    # Encoder side: the user utterance, truncated/padded to max_source_len.
    encoded = tokenizer(
        batch['source'],
        truncation=True,
        padding='max_length',
        max_length=max_source_len,
    )
    # Decoder side: the next utterance, truncated/padded to max_target_len.
    target_encoding = tokenizer(
        text_target=batch['target'],
        truncation=True,
        padding='max_length',
        max_length=max_target_len,
    )
    # Mask padding positions in the labels with -100.
    pad_id = tokenizer.pad_token_id
    encoded['labels'] = [
        [-100 if token_id == pad_id else token_id for token_id in ids]
        for ids in target_encoding['input_ids']
    ]
    return encoded

def _tokenize_split(pairs):
    """Tokenize one pairs dataset, keeping only the tokenizer's columns."""
    return pairs.map(
        tokenize_function,
        batched=True,
        remove_columns=pairs.column_names,
    )


train_tok = _tokenize_split(train_pairs)
val_tok = _tokenize_split(val_pairs)
test_tok = _tokenize_split(test_pairs)

train_tok




Map:   0%|          | 0/76052 [00:00<?, ? examples/s]

Map:   0%|          | 0/7069 [00:00<?, ? examples/s]

Map:   0%|          | 0/6740 [00:00<?, ? examples/s]

Dataset({
    features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
    num_rows: 76052
})

### BERT2BERT model
Создание модели, настройка спец‑токенов и generation_config.


In [None]:
# Build the encoder-decoder model with the same checkpoint on both sides.
model = EncoderDecoderModel.from_encoder_decoder_pretrained(model_name, model_name)

# Special token ids for generation: CLS starts decoding (and acts as BOS),
# SEP ends it, PAD pads.
special_token_ids = {
    'decoder_start_token_id': tokenizer.cls_token_id,
    'bos_token_id': tokenizer.cls_token_id,
    'eos_token_id': tokenizer.sep_token_id,
    'pad_token_id': tokenizer.pad_token_id,
}
for name, value in special_token_ids.items():
    setattr(model.config, name, value)
model.config.vocab_size = model.config.encoder.vocab_size

# Default decoding parameters.
decoding_defaults = {
    'max_length': 64,
    'num_beams': 4,
    'no_repeat_ngram_size': 2,
    'early_stopping': True,
}

gen_cfg = getattr(model, 'generation_config', None)
if gen_cfg is None:
    # Fallback for older Transformers versions: no generation_config on the
    # model, so the decoding defaults go on model.config instead.
    for name, value in decoding_defaults.items():
        setattr(model.config, name, value)
else:
    # generation_config receives both the special tokens and the defaults.
    for name, value in {**special_token_ids, **decoding_defaults}.items():
        setattr(gen_cfg, name, value)

model.to(device)


Loading weights:   0%|          | 0/199 [00:00<?, ?it/s]

BertModel LOAD REPORT from: bert-base-uncased
Key                                        | Status     |  | 
-------------------------------------------+------------+--+-
cls.predictions.transform.dense.weight     | UNEXPECTED |  | 
cls.predictions.transform.dense.bias       | UNEXPECTED |  | 
cls.predictions.transform.LayerNorm.weight | UNEXPECTED |  | 
cls.seq_relationship.bias                  | UNEXPECTED |  | 
cls.predictions.transform.LayerNorm.bias   | UNEXPECTED |  | 
cls.seq_relationship.weight                | UNEXPECTED |  | 
cls.predictions.bias                       | UNEXPECTED |  | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.


Loading weights:   0%|          | 0/202 [00:00<?, ?it/s]

BertLMHeadModel LOAD REPORT from: bert-base-uncased
Key                                                                | Status     | 
-------------------------------------------------------------------+------------+-
bert.pooler.dense.bias                                             | UNEXPECTED | 
cls.seq_relationship.bias                                          | UNEXPECTED | 
cls.seq_relationship.weight                                        | UNEXPECTED | 
bert.pooler.dense.weight                                           | UNEXPECTED | 
bert.encoder.layer.{0...11}.crossattention.output.LayerNorm.weight | MISSING    | 
bert.encoder.layer.{0...11}.crossattention.self.query.weight       | MISSING    | 
bert.encoder.layer.{0...11}.crossattention.output.LayerNorm.bias   | MISSING    | 
bert.encoder.layer.{0...11}.crossattention.self.value.bias         | MISSING    | 
bert.encoder.layer.{0...11}.crossattention.output.dense.weight     | MISSING    | 
bert.encoder.layer.{0...11}.crossat

EncoderDecoderModel(
  (encoder): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elemen

### Training


In [None]:
# Optional: to shorten a run, train and validate on a subset, e.g.
#   train_tok = train_tok.select(range(50000))
#   val_tok = val_tok.select(range(5000))

data_collator = DataCollatorForSeq2Seq(
    tokenizer,
    model=model,
    label_pad_token_id=-100,
)

import inspect

# Use whichever name the installed Seq2SeqTrainingArguments accepts for the
# evaluation schedule: 'eval_strategy' if present, else 'evaluation_strategy'.
_training_arg_names = inspect.signature(Seq2SeqTrainingArguments.__init__).parameters
if 'eval_strategy' in _training_arg_names:
    _eval_arg = 'eval_strategy'
else:
    _eval_arg = 'evaluation_strategy'

training_args_kwargs = {
    'output_dir': 'bert_dialogue_bot',
    'per_device_train_batch_size': 16,
    'per_device_eval_batch_size': 16,
    'learning_rate': 5e-5,
    'num_train_epochs': 2,
    # Evaluate and checkpoint on the same 2000-step schedule.
    'eval_steps': 2000,
    'save_steps': 2000,
    'save_total_limit': 2,
    'logging_steps': 200,
    # Score generated text rather than teacher-forced logits.
    'predict_with_generate': True,
    # Reload the checkpoint with the highest ROUGE-L once training ends.
    'load_best_model_at_end': True,
    'metric_for_best_model': 'rougeL',
    'greater_is_better': True,
    # Mixed precision only when a GPU is available.
    'fp16': torch.cuda.is_available(),
    'report_to': 'none',
    _eval_arg: 'steps',
}

training_args = Seq2SeqTrainingArguments(**training_args_kwargs)

rouge = evaluate.load('rouge')

def compute_metrics(eval_pred):
    """Compute ROUGE scores for a set of generated predictions.

    Args:
        eval_pred: Pair ``(predictions, labels)`` of token-id arrays. Labels
            use -100 at masked (padding) positions.

    Returns:
        Dict mapping each ROUGE metric name to its score rounded to four
        decimal places.
    """
    predictions, labels = eval_pred
    # Predictions may arrive as a tuple whose first item holds the generated
    # ids (the Hugging Face seq2seq examples unwrap it the same way).
    if isinstance(predictions, tuple):
        predictions = predictions[0]

    pad_id = tokenizer.pad_token_id
    # -100 is not a real token id and cannot be decoded. Replace it with the
    # pad id in the predictions as well as the labels; the pad token is then
    # dropped by skip_special_tokens during decoding.
    predictions = np.where(predictions != -100, predictions, pad_id)
    labels = np.where(labels != -100, labels, pad_id)

    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    result = rouge.compute(
        predictions=decoded_preds,
        references=decoded_labels,
        use_stemmer=True,
    )
    return {k: round(v, 4) for k, v in result.items()}

trainer_sig = inspect.signature(Seq2SeqTrainer.__init__).parameters

# Pick the keyword under which the installed Seq2SeqTrainer accepts the
# tokenizer: 'tokenizer' is preferred, then 'processing_class'; None means
# neither is accepted and the tokenizer is not passed at all.
_tokenizer_kwarg = next(
    (name for name in ('tokenizer', 'processing_class') if name in trainer_sig),
    None,
)

trainer_kwargs = {
    'model': model,
    'args': training_args,
    'train_dataset': train_tok,
    'eval_dataset': val_tok,
    'data_collator': data_collator,
    'compute_metrics': compute_metrics,
}
if _tokenizer_kwarg is not None:
    trainer_kwargs[_tokenizer_kwarg] = tokenizer

trainer = Seq2SeqTrainer(**trainer_kwargs)

# Run fine-tuning; the returned training summary is displayed by the notebook.
trainer.train()




Step,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum
2000,3.09484,2.987062,0.1201,0.0265,0.1141,0.114
4000,2.903097,2.816363,0.1402,0.0371,0.1331,0.1331
6000,2.550627,2.745201,0.1466,0.042,0.1394,0.1395
8000,2.480697,2.679991,0.1579,0.0461,0.1499,0.1498


Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]



Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]



Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]



Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]



Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

There were missing keys in the checkpoint model loaded: ['decoder.cls.predictions.decoder.weight', 'decoder.cls.predictions.decoder.bias'].


TrainOutput(global_step=9508, training_loss=2.8220353951010826, metrics={'train_runtime': 3746.9142, 'train_samples_per_second': 40.594, 'train_steps_per_second': 2.538, 'total_flos': 1.166360281242624e+16, 'train_loss': 2.8220353951010826, 'epoch': 2.0})

### Evaluation on the test set


In [None]:
# Score the fine-tuned model on the tokenized test pairs; max_length=64
# matches the generation limit configured for the model.
test_metrics = trainer.evaluate(eval_dataset=test_tok, max_length=64)
test_metrics




{'eval_loss': 2.71454119682312,
 'eval_rouge1': 0.1529,
 'eval_rouge2': 0.0443,
 'eval_rougeL': 0.1443,
 'eval_rougeLsum': 0.1445,
 'eval_runtime': 300.2245,
 'eval_samples_per_second': 22.45,
 'eval_steps_per_second': 1.406,
 'epoch': 2.0}

### Сохранение модели


In [None]:
# Write the model and its tokenizer into the same directory.
save_dir = 'bert_dialogue_bot_final'
trainer.save_model(output_dir=save_dir)
tokenizer.save_pretrained(save_directory=save_dir)
save_dir


Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

'bert_dialogue_bot_final'

### Генерация ответов


In [None]:
# Generate replies for a batch of prompts
def generate_batch(texts, max_len=64):
    """Generate one reply per input text using beam search.

    Args:
        texts: List of input utterances, tokenized and generated as one
            padded batch.
        max_len: Value passed to ``model.generate`` as ``max_length``.

    Returns:
        List of decoded replies (special tokens removed), in the same order
        as `texts`. An empty input yields an empty list.
    """
    # Nothing to tokenize or generate for an empty batch.
    if not texts:
        return []

    # Generation must not run with dropout active; the model can still be in
    # train mode if this is called right after training.
    model.eval()

    inputs = tokenizer(
        texts,
        return_tensors='pt',
        truncation=True,
        padding=True
    ).to(device)
    output_ids = model.generate(
        **inputs,
        max_length=max_len,
        num_beams=4,
        no_repeat_ngram_size=2,
        early_stopping=True,
    )
    return tokenizer.batch_decode(output_ids, skip_special_tokens=True)

# Sample user utterances to try the fine-tuned model on.
prompts = [
    "Hi! How are you today?",
    "What do you want for dinner?",
    "Can you help me with my homework?",
    "Do you like traveling?",
    "What's your favorite movie?",
    "I'm feeling a bit tired сегодня.",
    "What time is the meeting?",
    "Could you recommend a good book?",
    "Let's go for a walk this evening.",
    "I think it's going to rain.",
]

responses = generate_batch(prompts, max_len=64)

# Print numbered question/answer pairs, with a blank line after each answer.
for idx, (question, answer) in enumerate(zip(prompts, responses), start=1):
    print(f"{idx}. Q: {question}", f"   A: {answer}\n", sep="\n")


1. Q: Hi! How are you today?
   A: fine, thanks. how about you?

2. Q: What do you want for dinner?
   A: i ' d like to have a hamburger.

3. Q: Can you help me with my homework?
   A: sure. what do you want to learn?

4. Q: Do you like traveling?
   A: yes, i do.

5. Q: What's your favorite movie?
   A: it ' s a thriller.

6. Q: I'm feeling a bit tired сегодня.
   A: what ' s the matter?

7. Q: What time is the meeting?
   A: it starts at 8 o ' clock.

8. Q: Could you recommend a good book?
   A: of course. it ' s very good.

9. Q: Let's go for a walk this evening.
   A: ok. i ' ll be back in a minute.

10. Q: I think it's going to rain.
   A: i don ' t think so.



### Вывод по результатам проекта

Модель успешно обучена: лосс снижается, а ROUGE‑метрики постепенно растут (ROUGE‑L на валидации — примерно с 0.11 до 0.15; на тестовой выборке — около 0.14). Генерация даёт связные, но короткие и немного шаблонные ответы — это нормальный базовый уровень для DailyDialog. Для улучшения качества стоит: увеличить контекст (подавать на вход 2–3 предыдущие реплики, а не одну), обучать дольше, использовать более подходящий датасет, добавить семантические метрики (BERTScore/BLEURT) и настроить параметры генерации.