In [54]:
import datasets
import numpy as np
import plotly.offline as py
import plotly.express as px

from transformers import AutoTokenizer, AutoModelForSequenceClassification, DataCollatorWithPadding, Trainer, TrainingArguments, pipeline, AutoModelForTokenClassification, DataCollatorForTokenClassification
from peft import LoraConfig, TaskType, get_peft_model
from sklearn.manifold import TSNE
from sklearn.preprocessing import LabelEncoder

## Bert Model

### Dataset

In [2]:
text_path = "../data/text_annotations.json"
data_files = {
    'text': text_path
}
text_data = datasets.load_dataset('json', data_files=data_files)
text_data = text_data["text"].train_test_split(test_size=0.2)
text_data_val_test = text_data["test"].train_test_split(test_size=0.5)
text_data["test"] = text_data_val_test["train"]
text_data["val"] = text_data_val_test["test"]

word_path = "../data/token_annotations.json"
data_files = {
    'word': word_path
}
word_data = datasets.load_dataset('json', data_files=data_files)
word_data = word_data["word"].train_test_split(test_size=0.2)
word_data_val_test = word_data["test"].train_test_split(test_size=0.5)
word_data["test"] = word_data_val_test["train"]
word_data["val"] = word_data_val_test["test"]

text_data, word_data

(DatasetDict({
     train: Dataset({
         features: ['text', 'sentiment'],
         num_rows: 200
     })
     test: Dataset({
         features: ['text', 'sentiment'],
         num_rows: 25
     })
     val: Dataset({
         features: ['text', 'sentiment'],
         num_rows: 25
     })
 }),
 DatasetDict({
     train: Dataset({
         features: ['id', 'ner_tags', 'tokens'],
         num_rows: 200
     })
     test: Dataset({
         features: ['id', 'ner_tags', 'tokens'],
         num_rows: 25
     })
     val: Dataset({
         features: ['id', 'ner_tags', 'tokens'],
         num_rows: 26
     })
 }))

### Base Bert

In [3]:
text_labels = sorted(set(text_data['train']['sentiment']))

text_label2id = {label: i for i, label in enumerate(text_labels)}
text_id2label = {i: label for i, label in enumerate(text_labels)}

In [4]:
text_label2id

{'Amb': 0, 'Minus': 1, 'Plus': 2, 'Zero': 3}

In [5]:
word_id2label = {
    0: 'Brak etykiety',
    1: 'B-Obsluga',
    2: 'I-Obsluga',
    3: 'B-Widoki',
    4: 'I-Widoki',
    5: 'B-Jedzenie',
    6: 'I-Jedzenie',
    7: 'B-Sen',
    8: 'I-Sen',
}
word_label2id = {v: k for k, v in word_id2label.items()}

In [6]:
tokenizer = AutoTokenizer.from_pretrained('allegro/herbert-base-cased')

In [7]:
def tokenize_words(words):
    tokenized_words = tokenizer(words["tokens"], truncation=True, is_split_into_words=True)

    labels = []
    for i, label in enumerate(words["ner_tags"]):
        word_ids = tokenized_words.word_ids(batch_index=i) # map tokens to input word; get tokens if i-th word
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:
            if word_idx is None:
                label_ids.append(-100)
            elif word_idx != previous_word_idx:
                label_ids.append(label[word_idx])
            else:
                label_ids.append(-100)
            previous_word_idx = word_idx
        labels.append(label_ids)
    tokenized_words["labels"] = labels
    return tokenized_words


In [8]:
tokenized_words = word_data.map(tokenize_words, batched=True)

Map:   0%|          | 0/200 [00:00<?, ? examples/s]

Map:   0%|          | 0/25 [00:00<?, ? examples/s]

Map:   0%|          | 0/26 [00:00<?, ? examples/s]

In [9]:
tokenized_words

DatasetDict({
    train: Dataset({
        features: ['id', 'ner_tags', 'tokens', 'input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 200
    })
    test: Dataset({
        features: ['id', 'ner_tags', 'tokens', 'input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 25
    })
    val: Dataset({
        features: ['id', 'ner_tags', 'tokens', 'input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 26
    })
})

In [10]:
# tokenize
def tokenize_sequnce(sequence):
    tokens = tokenizer(sequence['text'], padding='max_length', truncation=True, max_length=512)
    if isinstance(sequence["sentiment"], str): # when not batched
        tokens['label'] = text_label2id[sequence["sentiment"]]
    else:
        tokens['label'] = [text_label2id[l] for l in sequence["sentiment"]]
    return tokens

text_data = text_data.map(tokenize_sequnce)

Map:   0%|          | 0/200 [00:00<?, ? examples/s]

Map:   0%|          | 0/25 [00:00<?, ? examples/s]

Map:   0%|          | 0/25 [00:00<?, ? examples/s]

In [11]:
text_data

DatasetDict({
    train: Dataset({
        features: ['text', 'sentiment', 'input_ids', 'token_type_ids', 'attention_mask', 'label'],
        num_rows: 200
    })
    test: Dataset({
        features: ['text', 'sentiment', 'input_ids', 'token_type_ids', 'attention_mask', 'label'],
        num_rows: 25
    })
    val: Dataset({
        features: ['text', 'sentiment', 'input_ids', 'token_type_ids', 'attention_mask', 'label'],
        num_rows: 25
    })
})

In [12]:
sequence_model = AutoModelForSequenceClassification.from_pretrained(
    'allegro/herbert-base-cased', 
    num_labels=len(text_labels), 
    id2label=text_id2label, 
    label2id=text_label2id
)
token_model = AutoModelForTokenClassification.from_pretrained(
    'allegro/herbert-base-cased', 
    num_labels=len(word_label2id), 
    id2label=word_id2label,
    label2id=word_label2id
)

  return self.fget.__get__(instance, owner)()
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at allegro/herbert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of BertForTokenClassification were not initialized from the model checkpoint at allegro/herbert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [13]:
sequence_pipeline = pipeline('text-classification', model=sequence_model, tokenizer=tokenizer, device=0)
token_pipeline = pipeline('ner', model=token_model, tokenizer=tokenizer, device=0)

predicted_sequence = sequence_pipeline(text_data['test']['text'])
predicted_token = token_pipeline(text_data['test']['text'])

In [14]:
# acc
sequence_test_array = np.asarray(text_data['test']['sentiment'])
predicted_sequence_array = np.asarray([p['label'] for p in predicted_sequence])
sequence_acc = round(sum(sequence_test_array == predicted_sequence_array)*100/len(sequence_test_array), 2)

token_test_array = np.asarray(word_data['test']['ner_tags'])
predicted_token_array = np.asarray([[word_label2id[p['entity']] for p in pred] for pred in predicted_token])
token_acc = round(sum(token_test_array == predicted_token_array)*100/len(sequence_test_array), 2)

print(f"Sequence accuracy: {sequence_acc}")
print(f"Token accuracy: {token_acc}")

Sequence accuracy: 8.0
Token accuracy: 0.0


  token_test_array = np.asarray(word_data['test']['ner_tags'])
  predicted_token_array = np.asarray([[word_label2id[p['entity']] for p in pred] for pred in predicted_token])


### Peft

In [15]:
lora_sequence_config = LoraConfig(
    task_type=TaskType.SEQ_CLS,
    r=64,
    lora_alpha=1,
    lora_dropout=0.1,
)

peft_sequence_model = get_peft_model(sequence_model, lora_sequence_config)

In [16]:
lora_token_config = LoraConfig(
    task_type=TaskType.TOKEN_CLS,
    r=64,
    lora_alpha=1,
    lora_dropout=0.1,
)

peft_token_model = get_peft_model(token_model, lora_token_config)

In [17]:
print("PEFT Sequence")
peft_sequence_model.print_trainable_parameters()

PEFT Sequence
trainable params: 2,362,372 || all params: 126,808,328 || trainable%: 1.8629


In [18]:
print("PEFT Token")
peft_token_model.print_trainable_parameters()

PEFT Token
trainable params: 2,366,217 || all params: 126,225,426 || trainable%: 1.8746


In [19]:
def compute_acc(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=1)
    return {"accuracy": (predictions == labels).mean()*100}

In [20]:
text_data["train"]

Dataset({
    features: ['text', 'sentiment', 'input_ids', 'token_type_ids', 'attention_mask', 'label'],
    num_rows: 200
})

In [21]:
seq_trainer = Trainer(
    model=peft_sequence_model,
    args=TrainingArguments(
        output_dir="bert-lora-seq",
        learning_rate=2e-3,
        per_device_train_batch_size=4,
        per_device_eval_batch_size=4,
        num_train_epochs=10,
        weight_decay=0.01,
        evaluation_strategy="epoch",
        save_strategy="epoch",
        load_best_model_at_end=True,
        report_to="none",
    ),
    train_dataset=text_data["train"],
    eval_dataset=text_data["val"],
    tokenizer=tokenizer,
    data_collator=DataCollatorWithPadding(tokenizer=tokenizer),
    compute_metrics=compute_acc,
)



In [22]:
seq_trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy
1,No log,1.328552,44.0
2,No log,1.194152,64.0
3,No log,0.853324,68.0
4,No log,1.121706,72.0
5,No log,1.017162,68.0
6,No log,1.277632,76.0
7,No log,1.402413,68.0
8,No log,1.401835,72.0
9,No log,1.418864,72.0
10,0.508300,1.41415,72.0


TrainOutput(global_step=500, training_loss=0.5083127746582031, metrics={'train_runtime': 106.8426, 'train_samples_per_second': 18.719, 'train_steps_per_second': 4.68, 'total_flos': 540745973760000.0, 'train_loss': 0.5083127746582031, 'epoch': 10.0})

In [23]:
def compute_token_acc(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=2)
    return {"accuracy": (predictions == labels).mean()*100}

In [24]:
token_trainer = Trainer(
    model=peft_token_model,
    args=TrainingArguments(
        output_dir="bert-lora-token",
        learning_rate=2e-5,
        per_device_train_batch_size=4,
        per_device_eval_batch_size=4,
        num_train_epochs=10,
        weight_decay=0.01,
        evaluation_strategy="epoch",
        save_strategy="epoch",
        load_best_model_at_end=True,
        report_to="none",
    ),
    train_dataset=tokenized_words["train"],
    eval_dataset=tokenized_words["val"],
    tokenizer=tokenizer,
    data_collator=DataCollatorForTokenClassification(tokenizer=tokenizer),
    compute_metrics=compute_token_acc,
)



In [25]:
token_trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy
1,No log,2.089645,2.614183
2,No log,1.953873,7.083834
3,No log,1.83579,12.176983
4,No log,1.735276,15.609976
5,No log,1.651002,17.457933
6,No log,1.582848,18.524639
7,No log,1.53075,19.042969
8,No log,1.49402,19.328425
9,No log,1.47241,19.501202
10,1.719000,1.465213,19.538762


TrainOutput(global_step=500, training_loss=1.7190421142578125, metrics={'train_runtime': 53.0279, 'train_samples_per_second': 37.716, 'train_steps_per_second': 9.429, 'total_flos': 272732747975136.0, 'train_loss': 1.7190421142578125, 'epoch': 10.0})

### Predict

In [26]:
seq_predictions = seq_trainer.predict(text_data["test"])

In [29]:
seq_predictions

PredictionOutput(predictions=array([[-0.08206324,  2.8633885 , -2.6955104 , -2.4244592 ],
       [ 0.09152935, -0.4189257 ,  1.0482912 , -2.5614355 ],
       [ 1.0190253 , -1.2913119 ,  0.57965034, -1.7660425 ],
       [ 0.50593704,  2.707549  , -3.444747  , -0.9763132 ],
       [ 0.4226987 ,  2.748864  , -3.8817022 , -0.14512075],
       [ 0.17324705,  2.6604574 , -3.5150394 , -0.55188316],
       [ 1.0507922 ,  1.4998915 , -3.2912328 ,  0.50560254],
       [-0.12397951,  2.8571951 , -3.2021635 , -1.2914268 ],
       [ 0.17587446,  1.8362578 , -2.7222476 , -1.3539094 ],
       [ 1.036243  , -2.830709  ,  2.6067288 , -1.69327   ],
       [ 0.56615525, -1.6708738 ,  2.010278  , -2.8154223 ],
       [ 0.07839691,  2.1204684 , -1.8324739 , -2.8574026 ],
       [ 0.5881994 ,  2.1904929 , -2.587671  , -1.6785966 ],
       [ 0.57423264, -0.85571784, -1.4680549 ,  1.6027832 ],
       [ 0.38120973,  0.9435732 , -0.9286149 , -2.3366377 ],
       [ 0.38439626,  0.96780354, -0.9803209 , -2.339350

In [27]:
predictions = np.argmax(seq_predictions.predictions, axis=1)
test_labels = np.array(text_data['test']['label'])
accuracy = np.mean(predictions == test_labels)
print(f"Fine-tuned SEQ CLS accuracy: {accuracy*100}%")

Fine-tuned SEQ CLS accuracy: 80.0%


In [66]:
tsne = TSNE(n_components=2, perplexity=5)
tsne_results = tsne.fit_transform(seq_predictions.predictions)

In [74]:
fig = px.scatter(
    x=tsne_results[:,0],
    y=tsne_results[:,1],
    color=text_data["test"]["label"],
)
fig.show()