In [None]:
# Mount Google Drive at /content/drive; the corpus files read further down live
# under /content/drive/MyDrive.
# NOTE(review): `google.colab` is only importable inside a Colab runtime, so this
# cell ties the notebook to Colab.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Install the third-party packages used by this notebook.
# NOTE(review): no versions are pinned, so a fresh runtime may resolve different
# releases than the ones this notebook was written against — consider pinning.
!pip install transformers
!pip install datasets
!pip install sacrebleu

In [None]:
# Remove and reinstall transformers together with accelerate.
# NOTE(review): presumably a workaround for a transformers/accelerate version
# mismatch in the preinstalled Colab image; the runtime may need a restart for
# the reinstall to take effect — confirm.
!pip uninstall -y transformers accelerate
!pip install transformers accelerate

In [None]:
from datasets import load_dataset, load_metric, Dataset
from transformers import EncoderDecoderModel
from transformers import AutoTokenizer, AutoModel
from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments, DataCollatorForSeq2Seq

# Tokenizer applied to the "source" column (encoder inputs).
encoder_tokenizer = AutoTokenizer.from_pretrained('SIKU-BERT/sikuroberta')
# Tokenizer applied to the "target" column (decoder labels); it also supplies the
# CLS/SEP/PAD ids written into the model config further down.
# NOTE(review): the model's decoder is initialised from 'SIKU-BERT/sikuroberta'
# with tied encoder/decoder weights, while targets are tokenized with this other
# checkpoint's vocabulary — confirm the two vocabularies are meant to share one
# embedding matrix.
decoder_tokenizer= AutoTokenizer.from_pretrained('hfl/chinese-roberta-wwm-ext')


In [None]:
def _read_stripped_lines(path):
    """Return every line of the UTF-8 text file at `path`, stripped of surrounding whitespace."""
    with open(path, 'r', encoding='utf-8') as f:
        return [line.strip() for line in f]


# The two files are used as a line-aligned parallel corpus: line i of the first
# file becomes the "source" and line i of the second file the "target".
lines1 = _read_stripped_lines("/content/drive/MyDrive/Colab Notebooks/train_24-historoes_c_utf8.txt")
lines2 = _read_stripped_lines("/content/drive/MyDrive/Colab Notebooks/train_24_histories_m_utf8.txt")

# Fail early and clearly if the files are not aligned; otherwise the mismatch
# only shows up later as a column-length error while building the datasets.
if len(lines1) != len(lines2):
    raise ValueError(
        f"Parallel corpus files differ in length: {len(lines1)} source lines "
        f"vs {len(lines2)} target lines"
    )

# The first 99.5% of the pairs are used for training, the remainder for validation.
train_size = int(len(lines1) * 0.995)
train1, val1 = lines1[:train_size], lines1[train_size:]
train2, val2 = lines2[:train_size], lines2[train_size:]

train_data = Dataset.from_dict({"source": train1, "target": train2})
val_data = Dataset.from_dict({"source": val1, "target": val2})

In [None]:
# Fixed token lengths that source and target sequences are padded/truncated to.
encoder_max_length = 192
decoder_max_length = 192


def process_data_to_model_inputs(batch):
    """Tokenize one batch of parallel text into encoder inputs and decoder labels.

    Reads `batch["source"]` and `batch["target"]` and stores `input_ids`,
    `attention_mask`, `decoder_attention_mask` and `labels` on the same batch
    object, which is then returned. Sources go through `encoder_tokenizer`,
    targets through `decoder_tokenizer`; both are padded and truncated to the
    fixed lengths defined above.
    """
    source_enc = encoder_tokenizer(
        batch["source"],
        padding="max_length",
        truncation=True,
        max_length=encoder_max_length,
    )
    target_enc = decoder_tokenizer(
        batch["target"],
        padding="max_length",
        truncation=True,
        max_length=decoder_max_length,
    )

    batch["input_ids"] = source_enc.input_ids
    batch["attention_mask"] = source_enc.attention_mask
    batch["decoder_attention_mask"] = target_enc.attention_mask

    # The target ids serve as labels. Every PAD id is swapped for -100 so that
    # padding positions are ignored.
    pad_id = decoder_tokenizer.pad_token_id
    masked_labels = []
    for sequence in target_enc.input_ids:
        masked_labels.append([-100 if tok == pad_id else tok for tok in sequence])
    batch["labels"] = masked_labels

    return batch

In [None]:
# This value is used both as the chunk size for the tokenization pass below and
# as the per-device train/eval batch size in the training arguments.
# (Alternative value left by the author: 4.)
batch_size = 28

# Shared settings for tokenizing either split: process in batches and drop the
# raw text columns once they have been converted to model inputs.
_tokenize_kwargs = {
    "batched": True,
    "batch_size": batch_size,
    "remove_columns": ["source", "target"],
}

train_data = train_data.map(process_data_to_model_inputs, **_tokenize_kwargs)

val_data = val_data.map(process_data_to_model_inputs, **_tokenize_kwargs)

In [None]:
# Expose the four model-input columns of both splits as torch tensors.
_torch_columns = ["input_ids", "attention_mask", "decoder_attention_mask", "labels"]

for _split in (train_data, val_data):
    _split.set_format(type="torch", columns=_torch_columns)

In [None]:
"""
#from transformers import EncoderDecoderConfig, RobertaConfig

#encoder_config = RobertaConfig.from_pretrained('roberta-large', dropout=0.2)
#decoder_config = RobertaConfig.from_pretrained('roberta-large', dropout=0.2)

#config = EncoderDecoderConfig.from_encoder_decoder_configs(encoder_config, decoder_config, tie_encoder_decoder=True)
"""

In [None]:
# Warm-start both the encoder and the decoder from the same checkpoint, with
# their weights tied.
# (The from-config alternative, `EncoderDecoderModel(config=config)`, is disabled.)
_checkpoint = 'SIKU-BERT/sikuroberta'
model = EncoderDecoderModel.from_encoder_decoder_pretrained(
    _checkpoint,
    _checkpoint,
    tie_encoder_decoder=True,
)

# Special-token ids come from the decoder tokenizer; the vocabulary size is
# copied from the encoder's config.
_special_ids = {
    "decoder_start_token_id": decoder_tokenizer.cls_token_id,
    "eos_token_id": decoder_tokenizer.sep_token_id,
    "pad_token_id": decoder_tokenizer.pad_token_id,
}
for _option, _value in _special_ids.items():
    setattr(model.config, _option, _value)
model.config.vocab_size = model.config.encoder.vocab_size

# Generation settings stored on the model config.
# (A `repetition_penalty` of 1.2 was tried by the author and is disabled.)
_generation_settings = {
    "max_length": 192,
    "min_length": 6,
    "no_repeat_ngram_size": 3,
    "early_stopping": True,
    "length_penalty": 2.0,
    "num_beams": 4,
}
for _option, _value in _generation_settings.items():
    setattr(model.config, _option, _value)

# Training configuration, grouped by concern. `predict_with_generate` makes
# evaluation produce generated sequences for `compute_metrics`.
# (A `weight_decay` of 0.005 was tried by the author and is disabled.)
_training_options = {
    # where checkpoints go and how often evaluation / saving happens
    "output_dir": "./",
    "evaluation_strategy": "steps",
    "eval_steps": 2000,
    "save_strategy": 'steps',
    "save_steps": 4000,
    "predict_with_generate": True,
    # batch sizes and numeric precision
    "per_device_train_batch_size": batch_size,
    "per_device_eval_batch_size": batch_size,
    "fp16": True,
    # optimisation schedule
    "learning_rate": 2e-5,
    "warmup_steps": 2000,
    "num_train_epochs": 3,
}

training_args = Seq2SeqTrainingArguments(**_training_options)

In [None]:
# load bleu for validation
# NOTE(review): `load_metric` is the legacy metric loader of `datasets`; because
# the packages are installed unpinned, a newer `datasets` release may no longer
# provide it (the separate `evaluate` package is its documented successor) —
# confirm against the installed version.
bleu = load_metric("sacrebleu")

import numpy as np

def postprocess_text(preds, labels):
    """Strip whitespace from decoded texts and wrap each label in a one-item list.

    Returns a `(preds, labels)` pair: `preds` is a list of stripped strings and
    `labels` is a list of single-element lists, one reference per prediction.
    """
    cleaned_preds = []
    for text in preds:
        cleaned_preds.append(text.strip())

    wrapped_labels = []
    for text in labels:
        wrapped_labels.append([text.strip()])

    return cleaned_preds, wrapped_labels

def compute_metrics(eval_preds):
    """Compute the BLEU score for one evaluation pass.

    Args:
        eval_preds: A `(predictions, labels)` pair of token-id arrays. If
            `predictions` is a tuple, only its first element is used.

    Returns:
        A dict `{"bleu": score}` with the score rounded to 4 decimal places.
    """
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]

    # -100 is not a vocabulary id and cannot be decoded. Labels carry it at the
    # masked padding positions, and predictions can carry it as well when
    # batches of different lengths are padded together, so both are mapped
    # back to the PAD id before decoding.
    pad_id = decoder_tokenizer.pad_token_id
    preds = np.where(preds != -100, preds, pad_id)
    labels = np.where(labels != -100, labels, pad_id)

    decoded_preds = decoder_tokenizer.batch_decode(preds, skip_special_tokens=True)
    decoded_labels = decoder_tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Some simple post-processing
    decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)

    result = bleu.compute(predictions=decoded_preds, references=decoded_labels)
    result = {"bleu": result["score"]}

    return {k: round(v, 4) for k, v in result.items()}

In [None]:
# The collator is given the decoder tokenizer and the model.
data_collator = DataCollatorForSeq2Seq(tokenizer=decoder_tokenizer, model=model)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_data,
    eval_dataset=val_data,
    tokenizer=decoder_tokenizer,
    compute_metrics=compute_metrics,
)

# Run fine-tuning, then save the final model.
# NOTE(review): 'test_model' is a relative path, so it is written under the
# current working directory rather than the mounted Drive — confirm that is intended.
trainer.train()
trainer.save_model('test_model')