In [11]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, AutoModelForTokenClassification, TrainingArguments, Trainer, EarlyStoppingCallback
from datasets import load_dataset, DatasetDict
from peft import LoraConfig, get_peft_model

Load datasets

In [3]:
# PolEmo2 corpus; the class count of its 'target' feature sizes the
# sequence-classification head created further down.
polemo = load_dataset(path="clarin-pl/polemo2-official")
polemo_n_cls = polemo["train"].features["target"].num_classes

In [4]:
# KPWr-NER: the split named 'test' is halved with a fixed seed so that one
# half serves as validation data and the other half stays as the test set.
kpwr = load_dataset(path="clarin-pl/kpwr-ner")
val_test = kpwr["test"].train_test_split(test_size=0.5, seed=42)

kpwr = DatasetDict(
    train=kpwr["train"],
    validation=val_test["train"],
    test=val_test["test"],
)
# Number of NER tags; sizes the token-classification head.
kpwr_n_cls = kpwr["train"].features["ner"].feature.num_classes

### BERT

In [5]:
# One tokenizer is shared by both tasks; each task gets its own model with a
# classification head sized to that task's label set.
tokenizer = AutoTokenizer.from_pretrained(
    "allegro/herbert-base-cased",
)

model_seq = AutoModelForSequenceClassification.from_pretrained(
    "allegro/herbert-base-cased",
    num_labels=polemo_n_cls,
)

model_token = AutoModelForTokenClassification.from_pretrained(
    "allegro/herbert-base-cased",
    num_labels=kpwr_n_cls,
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at allegro/herbert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of BertForTokenClassification were not initialized from the model checkpoint at allegro/herbert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Attach LoRA adapters (PEFT)

In [8]:
# LoRA configurations: only the task type is set, every other
# hyper-parameter is left at the LoraConfig default.
config_seq = LoraConfig(task_type="SEQ_CLS")
config_token = LoraConfig(task_type="TOKEN_CLS")

# Wrap each base model with its adapter configuration.
model_seq_peft = get_peft_model(model=model_seq, peft_config=config_seq)
model_token_peft = get_peft_model(model=model_token, peft_config=config_token)

Tokenize data

In [15]:
def preprocess_text_classification(examples):
    """Tokenize a batch of raw texts for sequence classification.

    Args:
        examples: Batch mapping whose 'text' entry holds the input strings.

    Returns:
        Whatever the module-level ``tokenizer`` returns for the texts, with
        every sequence padded/truncated to exactly 128 tokens.
    """
    texts = examples['text']
    encoding_options = dict(padding="max_length", truncation=True, max_length=128)
    return tokenizer(texts, **encoding_options)

def preprocess_token_classification(examples):
    """Tokenize pre-split words and align the NER labels with the tokens.

    Args:
        examples: Batch mapping with 'tokens' (one list of words per example)
            and 'ner' (one list of per-word label ids per example).

    Returns:
        The output of the module-level ``tokenizer`` (padded/truncated to 128
        tokens) with an added 'labels' entry. Every token that originates
        from a word carries that word's label; tokens without a source word
        (special tokens, padding) are labelled -100.
    """
    tokenized_inputs = tokenizer(
        examples['tokens'],
        padding="max_length",
        truncation=True,
        max_length=128,
        is_split_into_words=True,
    )
    labels = []
    for i, label in enumerate(examples["ner"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        # word_ids() yields None for tokens that do not belong to any word.
        # Those positions must get -100 (the index PyTorch's cross-entropy
        # loss ignores) instead of being used to index into `label`.
        labels.append([-100 if word_id is None else label[word_id] for word_id in word_ids])
    tokenized_inputs["labels"] = labels
    return tokenized_inputs

In [10]:
# Sequence classification: tokenize, then expose the class column under the
# name 'labels'.
data_seq = (
    polemo
    .map(preprocess_text_classification, batched=True)
    .rename_column("target", "labels")
)
data_seq.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])

# Token classification: the preprocessing function already adds 'labels'.
data_token = kpwr.map(preprocess_token_classification, batched=True)
data_token.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])

Map:   0%|          | 0/6573 [00:00<?, ? examples/s]

Map:   0%|          | 0/823 [00:00<?, ? examples/s]

Map:   0%|          | 0/2161 [00:00<?, ? examples/s]

Map:   0%|          | 0/2162 [00:00<?, ? examples/s]

Fine-tune models

In [13]:
# Both tasks share every hyper-parameter; only the output and logging
# directories differ, so the two configurations are built from one template.
training_args_seq, training_args_token = (
    TrainingArguments(
        output_dir=f"./results/BERT/{task}",
        logging_dir=f"./logs/BERT/{task}",
        num_train_epochs=100,  # upper bound; the trainers below attach early stopping
        per_device_train_batch_size=64,
        per_device_eval_batch_size=64,
        weight_decay=0.01,
        # Evaluate, log and checkpoint once per epoch.
        eval_strategy="epoch",
        logging_strategy="epoch",
        save_strategy="epoch",
        save_total_limit=1,
        load_best_model_at_end=True,
    )
    for task in ("seq", "token")
)

Sequence classification

In [None]:
# Fine-tune the LoRA-wrapped sequence classifier; training stops early after
# two consecutive evaluations without improvement.
trainer_seq = Trainer(
    model_seq_peft,
    training_args_seq,
    train_dataset=data_seq["train"],
    eval_dataset=data_seq["validation"],
    callbacks=[EarlyStoppingCallback(early_stopping_patience=2)],
)

trainer_seq.train()

Token classification

In [None]:
# Fine-tune the LoRA-wrapped token classifier; training stops early after
# two consecutive evaluations without improvement.
trainer_token = Trainer(
    model_token_peft,
    training_args_token,
    train_dataset=data_token["train"],
    eval_dataset=data_token["validation"],
    callbacks=[EarlyStoppingCallback(early_stopping_patience=2)],
)

trainer_token.train()

### GPT2

In [17]:
# add_prefix_space=True is needed because the token-classification
# preprocessing feeds pre-split words (is_split_into_words=True).
tokenizer = AutoTokenizer.from_pretrained("sdadas/polish-gpt2-medium", add_prefix_space=True)
# Reuse the end-of-sequence token for padding so that padding="max_length"
# has a token to pad with.
tokenizer.pad_token = tokenizer.eos_token

model_seq = AutoModelForSequenceClassification.from_pretrained("sdadas/polish-gpt2-medium", num_labels=polemo_n_cls, pad_token_id=tokenizer.pad_token_id)

model_token = AutoModelForTokenClassification.from_pretrained("sdadas/polish-gpt2-medium", num_labels=kpwr_n_cls, pad_token_id=tokenizer.pad_token_id)

# The datasets tokenized earlier hold token ids produced by the previous
# tokenizer, which are meaningless to GPT-2. The preprocessing functions read
# the module-level `tokenizer`, so re-running them now encodes both corpora
# with the GPT-2 vocabulary and padding token.
data_seq = polemo.map(preprocess_text_classification, batched=True)
data_seq = data_seq.rename_column("target", "labels")
data_seq.set_format("torch", columns=["input_ids", "attention_mask", "labels"])

data_token = kpwr.map(preprocess_token_classification, batched=True)
data_token.set_format("torch", columns=["input_ids", "attention_mask", "labels"])

Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at sdadas/polish-gpt2-medium and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of GPT2ForTokenClassification were not initialized from the model checkpoint at sdadas/polish-gpt2-medium and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Attach LoRA adapters (PEFT)

In [20]:
# One default LoRA configuration per task type.
config_seq, config_token = (
    LoraConfig(task_type="SEQ_CLS"),
    LoraConfig(task_type="TOKEN_CLS"),
)

# Wrap the GPT-2 base models with their adapters.
model_seq_peft, model_token_peft = (
    get_peft_model(model_seq, config_seq),
    get_peft_model(model_token, config_token),
)



Fine-tune models

In [21]:
# Training configuration for the GPT-2 runs. It mirrors the BERT setup except
# for the smaller batch size and the output/logging directories.
# `eval_strategy` is used (instead of the older `evaluation_strategy`) so this
# cell accepts the same keyword as the BERT cell above.
training_args_seq = TrainingArguments(
    output_dir="./results/GPT2/seq",
    num_train_epochs=100,  # upper bound; the trainers below attach early stopping
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    weight_decay=0.01,
    logging_dir="./logs/GPT2/seq",
    save_total_limit=1,
    eval_strategy="epoch",
    load_best_model_at_end=True,
    logging_strategy="epoch",
    save_strategy="epoch"
)

training_args_token = TrainingArguments(
    output_dir="./results/GPT2/token",
    num_train_epochs=100,  # upper bound; the trainers below attach early stopping
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    weight_decay=0.01,
    logging_dir="./logs/GPT2/token",
    save_total_limit=1,
    eval_strategy="epoch",
    load_best_model_at_end=True,
    logging_strategy="epoch",
    save_strategy="epoch"
)



Sequence classification

In [None]:
# GPT-2 sequence classification run, stopped early after two evaluations
# without improvement.
trainer_seq = Trainer(
    args=training_args_seq,
    model=model_seq_peft,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=2)],
    train_dataset=data_seq["train"],
    eval_dataset=data_seq["validation"],
)

trainer_seq.train()

Token classification

In [None]:
# GPT-2 token classification run, stopped early after two evaluations
# without improvement.
trainer_token = Trainer(
    args=training_args_token,
    model=model_token_peft,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=2)],
    train_dataset=data_token["train"],
    eval_dataset=data_token["validation"],
)

trainer_token.train()