In [14]:
!pip install transformers
!pip install beautifulsoup4

import torch
from collections import Counter
torch.cuda.empty_cache()

from google.colab import drive 
drive.mount("/content/drive")

'''
PARAMETERS
'''

weighted_loss = True
use_html_articles = False

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [15]:
import torch
from transformers.file_utils import is_tf_available, is_torch_available, is_torch_tpu_available
from transformers import BertTokenizerFast, BertForSequenceClassification
from transformers import Trainer, TrainingArguments
import numpy as np
import random
from sklearn.model_selection import train_test_split

from torch import nn
from transformers import Trainer
from torch.autograd import Variable

class MultilabelTrainer(Trainer):
  def __init__(self, *args, **kwargs):
    super().__init__(*args, **kwargs)    

  def compute_loss(self, model, inputs, return_outputs=False):
    class_weights = np.array([10, 1.28, 8.33]) # Weighted by dividing 200 / counts of true labels in one initial dataset 
    class_weights = torch.from_numpy(class_weights).float().to("cuda")
    labels = inputs.pop("labels")
    outputs = model(**inputs)
    logits = outputs[0]
    loss = nn.CrossEntropyLoss(weight=class_weights)
    return (loss(logits, labels.flatten()), outputs) if return_outputs else loss(logits, labels.flatten())

In [16]:
def set_seed(seed: int):
    """
    Helper function for reproducible behavior to set the seed in ``random``, ``numpy``, ``torch`` and/or ``tf`` (if
    installed). Taken from https://www.thepythoncode.com/article/finetuning-bert-using-huggingface-transformers-python 
 
    Args:
        seed (:obj:`int`): The seed to set.
    """
    random.seed(seed)
    np.random.seed(seed)
    if is_torch_available():
        torch.manual_seed(seed)
        torch.cuda.manual_seed_all(seed)
        # ^^ safe to call this function even if cuda is not available
    if is_tf_available():
        import tensorflow as tf
 
        tf.random.set_seed(seed)
 
set_seed(1)

In [17]:
model_name = "TurkuNLP/bert-base-finnish-cased-v1" 
max_length = 512

In [18]:
tokenizer = BertTokenizerFast.from_pretrained(model_name, do_lower_case=True)

loading file https://huggingface.co/TurkuNLP/bert-base-finnish-cased-v1/resolve/main/vocab.txt from cache at /root/.cache/huggingface/transformers/69c0c339871654aa7305fe47345f0b713e6973a476eb1cf5f200d557b6bad765.ee591817c6a7d736b63494878a337beccf9497af463ab8eb01d19bf5f7169026
loading file https://huggingface.co/TurkuNLP/bert-base-finnish-cased-v1/resolve/main/tokenizer.json from cache at /root/.cache/huggingface/transformers/3583dbf83678cb60c5faaf0a07aa0d452fc4ec09aac87b8680027bf79b1a6270.e49785bf2de92e06a4d89026870d6979723c8e64cfc9311596ca5b9a3b56289e
loading file https://huggingface.co/TurkuNLP/bert-base-finnish-cased-v1/resolve/main/added_tokens.json from cache at None
loading file https://huggingface.co/TurkuNLP/bert-base-finnish-cased-v1/resolve/main/special_tokens_map.json from cache at None
loading file https://huggingface.co/TurkuNLP/bert-base-finnish-cased-v1/resolve/main/tokenizer_config.json from cache at /root/.cache/huggingface/transformers/978f5aef1d382479fb5ee87b700be69b

In [19]:
import pandas as pd
from bs4 import BeautifulSoup

label_names = ["kritiikki", "kopiointi", "oma narratiivi"]

def read_data():
  dataset = pd.read_csv("/content/drive/My Drive/Data/BERT_data_mvframes.csv")
  if use_html_articles == False:
    dataset['content'] =  dataset['content'].apply(lambda x: BeautifulSoup(x).get_text())
  print(list(dataset['content'])[0])

  new_labels = []
  names = []
  for c in list(dataset['class']):
    if c == 1:
      new_labels.append(0)
      names.append("kritiikki")
    if c == 2:
      new_labels.append(1)
      names.append("kopiointi")
    if c == 4:
      new_labels.append(2)
      names.append("oma narratiivi")

  dataset['class_name'] = names
  dataset['class'] = new_labels
  documents = list(dataset['content']) 
  labels = new_labels
  labels_integer = []
  for l in labels:
    labels_integer.append(np.int64(l))
  labels = labels_integer
  return train_test_split(documents, labels, random_state=42), label_names, dataset
  
# call the function
(train_texts, valid_texts, train_labels, valid_labels), target_names, df = read_data()

Verkkouuutiset uutisoi tänään, että Perussuomalaisten Teuvo Hakkarainen puhui tiistaina eduskunnassa ulkomaalaislain käsittelyssä.
Hakkaraisen mielestä esityksen sisältämät asiat ovat positiivisia askelia, mutta eivät riittäviä.
Teuvo Hakkarainen.
Teuvo Hakkarainen sanoi:
”Valtaosa turvapaikkaturisteista on tullut ainakin kymmenen turvallisen maan läpi Suomeen, eikä heillä kotimaassakaan ole ollut konkreettista henkeen ja terveyteen kohdistuvaa uhkaa, vaikka kaikenlaisia tarinoita he ovatkin oppineet kertomaan.
Enimmäkseen he ovat ilmaisen sosiaaliturvan perässä reissaavia nuoria miehiä, joita ei kiinnosta rakentaa omaa isänmaataan. Heitä kiinnostaa siivestäminen.”
”Koska alun alkaenkaan he eivät täytä kansainvälistä suojelua koskevia kriteereitä, en tiedä, miksi heillä ylipäänsä pitäisi olla valitusoikeudet.
Valitusoikeus kuormittaa oikeuslaitostamme kohtuuttomasti. Se ensimmäinenkin hakemus pitäisi tehdä pikapäätöksenä rajalla.
Jos asiallisia henkilöpapereita ei ole, hakemusta ei pit

In [20]:

train_encodings = tokenizer(train_texts, truncation=True, padding=True, max_length=max_length)
valid_encodings = tokenizer(valid_texts, truncation=True, padding=True, max_length=max_length)

In [21]:
class Dataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {k: torch.tensor(v[idx]) for k, v in self.encodings.items()}
        item["labels"] = torch.tensor([self.labels[idx]])
        return item

    def __len__(self):
        return len(self.labels)

train_dataset = Dataset(train_encodings, train_labels)
valid_dataset = Dataset(valid_encodings, valid_labels)

In [22]:
if torch.cuda.is_available():
  model = BertForSequenceClassification.from_pretrained(model_name, num_labels=len(target_names)).to("cuda")
else:
  model = BertForSequenceClassification.from_pretrained(model_name, num_labels=len(target_names))

loading configuration file https://huggingface.co/TurkuNLP/bert-base-finnish-cased-v1/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/e27939251243299384d3c49756d6710f25a683fa4d5e00e6f42fe6cc59202f07.1b2c5b5f39fed7ac39db55c0d2566730a96257ac7215ad6c2a8a109e2ccf1ccd
Model config BertConfig {
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1,
    "LABEL_2": 2
  },
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.12.2",
  "type_vocab_size": 2,
  "use_cache"

In [23]:
from sklearn.metrics import accuracy_score

def compute_metrics(pred):
  labels = pred.label_ids
  preds = pred.predictions.argmax(-1)
  acc = accuracy_score(labels, preds)
  return {
      'accuracy': acc,
  }

In [24]:
training_args = TrainingArguments(
    output_dir='./results',          # output directory
    num_train_epochs=8,              # total number of training epochs
    per_device_train_batch_size=8,  # batch size per device during training
    per_device_eval_batch_size=8,   # batch size for evaluation
    warmup_steps=500,                # number of warmup steps for learning rate scheduler
    weight_decay=0.01,               # strength of weight decay
    logging_dir='./logs',            # directory for storing logs
    load_best_model_at_end=True,     # load the best model when finished training (default metric is loss)
    # but you can specify `metric_for_best_model` argument to change to accuracy or other metric
    logging_steps=250,               # log & save weights each logging_steps
    evaluation_strategy="steps",     # evaluate each `logging_steps`
)

using `logging_steps` to initialize `eval_steps` to 250
PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [25]:
import torch
torch.cuda.empty_cache()
from sklearn.utils import shuffle
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix

def train_model(texts, y_training, model):
  train_encodings = tokenizer(texts, truncation=True, padding=True, max_length=max_length)
  train_dataset = Dataset(train_encodings, y_training)

  if weighted_loss:
    trainer = MultilabelTrainer(
      model=model,                        
      args=training_args,                 
      train_dataset=train_dataset,        
      eval_dataset=valid_dataset,        
      compute_metrics=compute_metrics,    
    )
  else:
    trainer = Trainer(
      model=model,                         # the instantiated Transformers model to be trained
      args=training_args,                  # training arguments, defined above
      train_dataset=train_dataset,         # training dataset
      eval_dataset=valid_dataset,          # evaluation dataset
      compute_metrics=compute_metrics,     # the callback that computes metrics of interest
    )

  print("Train counter")
  print(Counter(y_training))

  trainer.train()
  evaluate = trainer.evaluate()
  acc = evaluate_model(df[900:], model)
  return model, acc

def make_predictions(df, model):
  for index, row in df.iterrows():
    text = row['text']
    articleID = row['articleID']
    probs, class_name = get_prediction_probs(text, model)
    article_dict[float(torch.max(probs).item())] = articleID 
  return article_dict

def get_prediction_probs(text, model):
    inputs = tokenizer(text, padding=True, truncation=True, max_length=max_length, return_tensors="pt").to("cuda")
    outputs = model(**inputs)
    probs = outputs[0].softmax(1)
    return probs, target_names[probs.argmax()]

def evaluate_model(test_df, model, eval_classwise=False):
  if eval_classwise:
    correct = 0
    predicted = []
    correct_perclass = [0, 0, 0]
    for index, row in test_df.iterrows():
        probs, label_name = get_prediction_probs(row['content'], model)
        predicted.append(label_name)
        if label_name == row['class_name']:
          correct += 1
          index = target_names.index(label_name)
          correct_perclass[index] += 1
    print(confusion_matrix(test_df['class_name'], predicted, labels=target_names))
    i = 0

    multiclass_accuracies = []
    for item in correct_perclass:
        print(target_names[i], " ", (item / list(test_df['class_name']).count(target_names[i])))
        if list(test_df['class_name']).count(target_names[i]) != 0:
            multiclass_accuracies.append((item / list(test_df['class_name']).count(target_names[i])))
        else:
            multiclass_accuracies.append(-1)
        i += 1
  else:
    correct = 0
    for index, row in test_df.iterrows():
        probs, label_name = get_prediction_probs(row['content'], model)
        if label_name == row['class_name']:
            correct += 1
    multiclass_accuracies = correct / len(test_df) 
  return multiclass_accuracies



def initial_train(df):
  if torch.cuda.is_available():
    model = BertForSequenceClassification.from_pretrained(model_name, num_labels=len(target_names)).to("cuda")
  else:
    model = BertForSequenceClassification.from_pretrained(model_name, num_labels=len(target_names))

  train_texts = list(df[0:200]['content'])
  valid_texts = list(df[900:]['content'])
  y = np.array(df['class'])
  y = np.array([int(x) for x in y])
  y_train , y_test = y[0:200] , y[900:]

  train_encodings = tokenizer(train_texts, truncation=True, padding=True, max_length=max_length)
  valid_encodings = tokenizer(valid_texts, truncation=True, padding=True, max_length=max_length)
  
  train_dataset = Dataset(train_encodings, y_train)
  valid_dataset = Dataset(valid_encodings, y_test)

  if weighted_loss:
    trainer = MultilabelTrainer(
      model=model,                         # the instantiated Transformers model to be trained
      args=training_args,                  # training arguments, defined above
      train_dataset=train_dataset,         # training dataset
      eval_dataset=valid_dataset,          # evaluation dataset
      compute_metrics=compute_metrics,     # the callback that computes metrics of interest
    )
  else:
    trainer = Trainer(
      model=model,                         # the instantiated Transformers model to be trained
      args=training_args,                  # training arguments, defined above
      train_dataset=train_dataset,         # training dataset
      eval_dataset=valid_dataset,          # evaluation dataset
      compute_metrics=compute_metrics,     # the callback that computes metrics of interest
    )


  trainer.train() 
  evaluation = trainer.evaluate()
  acc = evaluate_model(df[900:], model)
  return model, acc

def active_learning_step(df, model):
  article_dict = {}
  correct = 0
  for index, row in df[200:900].iterrows():
    text = row['content']
    articleID = row['id']
    probs, label_name = get_prediction_probs(text, model)
    if label_name == row['class_name']:
      correct += 1
    article_dict[float(torch.max(probs).item())] = articleID

  article_dict = dict(sorted(article_dict.items(), key = lambda k: k[0] ))  
  texts = []
  y_train = []
  for k in list(article_dict.keys())[0:50]:
    row = df.loc[df['id'] == article_dict[k]]
    text = row['content'].values[0]
    texts.append(text)
    label = int(row['class']) 
    y_train.append(label)

  model, acc = train_model(texts, y_train, model)
  acc = evaluate_model(df[900:], model)
  return model, acc 


acc_scores_normal = []
acc_scores_active = []
 
for i in range(0, 5):
  initial_trained = False
  print("Round ", str(i))
  
  df=df.iloc[np.random.permutation(df.index)].reset_index(drop=True)

  print("Train with the initial dataset")

  if initial_trained == False:
    model, acc = initial_train(df)
    print("Initial accuracy: ", acc)
    model_path = "bertmodel"
    model.save_pretrained(model_path) 
    initial_trained = True
  else:
    model = BertForSequenceClassification.from_pretrained(model_path, num_labels=len(target_names)).to("cuda")

  print("Simulate active learning")

  model, acc = active_learning_step(df, model)
  model, acc = active_learning_step(df, model)
  acc_scores_active.append(acc)
    
  print("Active learning accuracies")
  print(acc_scores_active)

  print("Train with a random sample")
  model = BertForSequenceClassification.from_pretrained(model_path, num_labels=len(target_names)).to("cuda")


  model, acc = train_model(list(df['content'][200:300]), list(df['class'][200:300]), model)
  acc_scores_normal.append(acc)
  print("Random sample training accuracies ")
  print(acc_scores_normal)

loading configuration file https://huggingface.co/TurkuNLP/bert-base-finnish-cased-v1/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/e27939251243299384d3c49756d6710f25a683fa4d5e00e6f42fe6cc59202f07.1b2c5b5f39fed7ac39db55c0d2566730a96257ac7215ad6c2a8a109e2ccf1ccd
Model config BertConfig {
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1,
    "LABEL_2": 2
  },
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.12.2",
  "type_vocab_size": 2,
  "use_cache"

Round  0
Train with the initial dataset


loading weights file https://huggingface.co/TurkuNLP/bert-base-finnish-cased-v1/resolve/main/pytorch_model.bin from cache at /root/.cache/huggingface/transformers/276bf5f0d95b31fc0ed72ef6e2e1b771f2265351a4d322667fd8c73d8473d3fc.3d524bdc756dfbb2ba6c3c3a18e4e2afcc84034db29556b337605e9f8c39c2c2
Some weights of the model checkpoint at TurkuNLP/bert-base-finnish-cased-v1 were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you

Step,Training Loss,Validation Loss




Training completed. Do not forget to share your model on huggingface.co/models =)


***** Running Evaluation *****
  Num examples = 100
  Batch size = 8


Configuration saved in bertmodel/config.json


Initial accuracy:  0.85


Model weights saved in bertmodel/pytorch_model.bin


Simulate active learning


***** Running training *****
  Num examples = 50
  Num Epochs = 8
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 56


Train counter
Counter({1: 19, 2: 17, 0: 14})


Step,Training Loss,Validation Loss




Training completed. Do not forget to share your model on huggingface.co/models =)


***** Running Evaluation *****
  Num examples = 250
  Batch size = 8


***** Running training *****
  Num examples = 50
  Num Epochs = 8
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 56


Train counter
Counter({1: 26, 2: 17, 0: 7})


Step,Training Loss,Validation Loss




Training completed. Do not forget to share your model on huggingface.co/models =)


***** Running Evaluation *****
  Num examples = 250
  Batch size = 8


loading configuration file bertmodel/config.json
Model config BertConfig {
  "_name_or_path": "TurkuNLP/bert-base-finnish-cased-v1",
  "architectures": [
    "BertForSequenceClassification"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1,
    "LABEL_2": 2
  },
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "torch_dtype": "float32",
  "transformers_version": "4.12.2",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 50105
}

loading weights file bertmodel/pytorch_model.bin


Active learning accuracies
[0.87]
Train with a random sample


All model checkpoint weights were used when initializing BertForSequenceClassification.

All the weights of BertForSequenceClassification were initialized from the model checkpoint at bertmodel.
If your task is similar to the task the model of the checkpoint was trained on, you can already use BertForSequenceClassification for predictions without further training.
***** Running training *****
  Num examples = 100
  Num Epochs = 8
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 104


Train counter
Counter({1: 76, 2: 16, 0: 8})


Step,Training Loss,Validation Loss




Training completed. Do not forget to share your model on huggingface.co/models =)


***** Running Evaluation *****
  Num examples = 250
  Batch size = 8


loading configuration file https://huggingface.co/TurkuNLP/bert-base-finnish-cased-v1/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/e27939251243299384d3c49756d6710f25a683fa4d5e00e6f42fe6cc59202f07.1b2c5b5f39fed7ac39db55c0d2566730a96257ac7215ad6c2a8a109e2ccf1ccd
Model config BertConfig {
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1,
    "LABEL_2": 2
  },
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.12.2",
  "type_vocab_size": 2,
  "use_cache"

Random sample training accuracies 
[0.83]
Round  1
Train with the initial dataset


loading weights file https://huggingface.co/TurkuNLP/bert-base-finnish-cased-v1/resolve/main/pytorch_model.bin from cache at /root/.cache/huggingface/transformers/276bf5f0d95b31fc0ed72ef6e2e1b771f2265351a4d322667fd8c73d8473d3fc.3d524bdc756dfbb2ba6c3c3a18e4e2afcc84034db29556b337605e9f8c39c2c2
Some weights of the model checkpoint at TurkuNLP/bert-base-finnish-cased-v1 were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you

Step,Training Loss,Validation Loss




Training completed. Do not forget to share your model on huggingface.co/models =)


***** Running Evaluation *****
  Num examples = 100
  Batch size = 8


Configuration saved in bertmodel/config.json


Initial accuracy:  0.79


Model weights saved in bertmodel/pytorch_model.bin


Simulate active learning


***** Running training *****
  Num examples = 50
  Num Epochs = 8
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 56


Train counter
Counter({1: 38, 0: 6, 2: 6})


Step,Training Loss,Validation Loss




Training completed. Do not forget to share your model on huggingface.co/models =)


***** Running Evaluation *****
  Num examples = 250
  Batch size = 8


***** Running training *****
  Num examples = 50
  Num Epochs = 8
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 56


Train counter
Counter({1: 21, 2: 17, 0: 12})


Step,Training Loss,Validation Loss




Training completed. Do not forget to share your model on huggingface.co/models =)


***** Running Evaluation *****
  Num examples = 250
  Batch size = 8


loading configuration file bertmodel/config.json
Model config BertConfig {
  "_name_or_path": "TurkuNLP/bert-base-finnish-cased-v1",
  "architectures": [
    "BertForSequenceClassification"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1,
    "LABEL_2": 2
  },
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "torch_dtype": "float32",
  "transformers_version": "4.12.2",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 50105
}

loading weights file bertmodel/pytorch_model.bin


Active learning accuracies
[0.87, 0.78]
Train with a random sample


All model checkpoint weights were used when initializing BertForSequenceClassification.

All the weights of BertForSequenceClassification were initialized from the model checkpoint at bertmodel.
If your task is similar to the task the model of the checkpoint was trained on, you can already use BertForSequenceClassification for predictions without further training.
***** Running training *****
  Num examples = 100
  Num Epochs = 8
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 104


Train counter
Counter({1: 74, 2: 17, 0: 9})


Step,Training Loss,Validation Loss




Training completed. Do not forget to share your model on huggingface.co/models =)


***** Running Evaluation *****
  Num examples = 250
  Batch size = 8


loading configuration file https://huggingface.co/TurkuNLP/bert-base-finnish-cased-v1/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/e27939251243299384d3c49756d6710f25a683fa4d5e00e6f42fe6cc59202f07.1b2c5b5f39fed7ac39db55c0d2566730a96257ac7215ad6c2a8a109e2ccf1ccd
Model config BertConfig {
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1,
    "LABEL_2": 2
  },
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.12.2",
  "type_vocab_size": 2,
  "use_cache"

Random sample training accuracies 
[0.83, 0.79]
Round  2
Train with the initial dataset


loading weights file https://huggingface.co/TurkuNLP/bert-base-finnish-cased-v1/resolve/main/pytorch_model.bin from cache at /root/.cache/huggingface/transformers/276bf5f0d95b31fc0ed72ef6e2e1b771f2265351a4d322667fd8c73d8473d3fc.3d524bdc756dfbb2ba6c3c3a18e4e2afcc84034db29556b337605e9f8c39c2c2
Some weights of the model checkpoint at TurkuNLP/bert-base-finnish-cased-v1 were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you

Step,Training Loss,Validation Loss




Training completed. Do not forget to share your model on huggingface.co/models =)


***** Running Evaluation *****
  Num examples = 100
  Batch size = 8


Configuration saved in bertmodel/config.json


Initial accuracy:  0.74


Model weights saved in bertmodel/pytorch_model.bin


Simulate active learning


***** Running training *****
  Num examples = 50
  Num Epochs = 8
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 56


Train counter
Counter({2: 28, 1: 11, 0: 11})


Step,Training Loss,Validation Loss




Training completed. Do not forget to share your model on huggingface.co/models =)


***** Running Evaluation *****
  Num examples = 250
  Batch size = 8


***** Running training *****
  Num examples = 50
  Num Epochs = 8
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 56


Train counter
Counter({1: 31, 2: 14, 0: 5})


Step,Training Loss,Validation Loss




Training completed. Do not forget to share your model on huggingface.co/models =)


***** Running Evaluation *****
  Num examples = 250
  Batch size = 8


loading configuration file bertmodel/config.json
Model config BertConfig {
  "_name_or_path": "TurkuNLP/bert-base-finnish-cased-v1",
  "architectures": [
    "BertForSequenceClassification"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1,
    "LABEL_2": 2
  },
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "torch_dtype": "float32",
  "transformers_version": "4.12.2",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 50105
}

loading weights file bertmodel/pytorch_model.bin


Active learning accuracies
[0.87, 0.78, 0.8]
Train with a random sample


All model checkpoint weights were used when initializing BertForSequenceClassification.

All the weights of BertForSequenceClassification were initialized from the model checkpoint at bertmodel.
If your task is similar to the task the model of the checkpoint was trained on, you can already use BertForSequenceClassification for predictions without further training.
***** Running training *****
  Num examples = 100
  Num Epochs = 8
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 104


Train counter
Counter({1: 74, 2: 20, 0: 6})


Step,Training Loss,Validation Loss




Training completed. Do not forget to share your model on huggingface.co/models =)


***** Running Evaluation *****
  Num examples = 250
  Batch size = 8


loading configuration file https://huggingface.co/TurkuNLP/bert-base-finnish-cased-v1/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/e27939251243299384d3c49756d6710f25a683fa4d5e00e6f42fe6cc59202f07.1b2c5b5f39fed7ac39db55c0d2566730a96257ac7215ad6c2a8a109e2ccf1ccd
Model config BertConfig {
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1,
    "LABEL_2": 2
  },
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.12.2",
  "type_vocab_size": 2,
  "use_cache"

Random sample training accuracies 
[0.83, 0.79, 0.76]
Round  3
Train with the initial dataset


loading weights file https://huggingface.co/TurkuNLP/bert-base-finnish-cased-v1/resolve/main/pytorch_model.bin from cache at /root/.cache/huggingface/transformers/276bf5f0d95b31fc0ed72ef6e2e1b771f2265351a4d322667fd8c73d8473d3fc.3d524bdc756dfbb2ba6c3c3a18e4e2afcc84034db29556b337605e9f8c39c2c2
Some weights of the model checkpoint at TurkuNLP/bert-base-finnish-cased-v1 were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you

Step,Training Loss,Validation Loss




Training completed. Do not forget to share your model on huggingface.co/models =)


***** Running Evaluation *****
  Num examples = 100
  Batch size = 8


Configuration saved in bertmodel/config.json


Initial accuracy:  0.8


Model weights saved in bertmodel/pytorch_model.bin


Simulate active learning


***** Running training *****
  Num examples = 50
  Num Epochs = 8
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 56


Train counter
Counter({1: 21, 0: 15, 2: 14})


Step,Training Loss,Validation Loss




Training completed. Do not forget to share your model on huggingface.co/models =)


***** Running Evaluation *****
  Num examples = 250
  Batch size = 8


***** Running training *****
  Num examples = 50
  Num Epochs = 8
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 56


Train counter
Counter({1: 32, 2: 16, 0: 2})


Step,Training Loss,Validation Loss




Training completed. Do not forget to share your model on huggingface.co/models =)


***** Running Evaluation *****
  Num examples = 250
  Batch size = 8


loading configuration file bertmodel/config.json
Model config BertConfig {
  "_name_or_path": "TurkuNLP/bert-base-finnish-cased-v1",
  "architectures": [
    "BertForSequenceClassification"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1,
    "LABEL_2": 2
  },
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "torch_dtype": "float32",
  "transformers_version": "4.12.2",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 50105
}

loading weights file bertmodel/pytorch_model.bin


Active learning accuracies
[0.87, 0.78, 0.8, 0.82]
Train with a random sample


All model checkpoint weights were used when initializing BertForSequenceClassification.

All the weights of BertForSequenceClassification were initialized from the model checkpoint at bertmodel.
If your task is similar to the task the model of the checkpoint was trained on, you can already use BertForSequenceClassification for predictions without further training.
***** Running training *****
  Num examples = 100
  Num Epochs = 8
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 104


Train counter
Counter({1: 79, 2: 12, 0: 9})


Step,Training Loss,Validation Loss




Training completed. Do not forget to share your model on huggingface.co/models =)


***** Running Evaluation *****
  Num examples = 250
  Batch size = 8


loading configuration file https://huggingface.co/TurkuNLP/bert-base-finnish-cased-v1/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/e27939251243299384d3c49756d6710f25a683fa4d5e00e6f42fe6cc59202f07.1b2c5b5f39fed7ac39db55c0d2566730a96257ac7215ad6c2a8a109e2ccf1ccd
Model config BertConfig {
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1,
    "LABEL_2": 2
  },
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.12.2",
  "type_vocab_size": 2,
  "use_cache"

Random sample training accuracies 
[0.83, 0.79, 0.76, 0.8]
Round  4
Train with the initial dataset


loading weights file https://huggingface.co/TurkuNLP/bert-base-finnish-cased-v1/resolve/main/pytorch_model.bin from cache at /root/.cache/huggingface/transformers/276bf5f0d95b31fc0ed72ef6e2e1b771f2265351a4d322667fd8c73d8473d3fc.3d524bdc756dfbb2ba6c3c3a18e4e2afcc84034db29556b337605e9f8c39c2c2
Some weights of the model checkpoint at TurkuNLP/bert-base-finnish-cased-v1 were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you

Step,Training Loss,Validation Loss




Training completed. Do not forget to share your model on huggingface.co/models =)


***** Running Evaluation *****
  Num examples = 100
  Batch size = 8


Configuration saved in bertmodel/config.json


Initial accuracy:  0.84


Model weights saved in bertmodel/pytorch_model.bin


Simulate active learning


***** Running training *****
  Num examples = 50
  Num Epochs = 8
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 56


Train counter
Counter({1: 21, 2: 17, 0: 12})


Step,Training Loss,Validation Loss




Training completed. Do not forget to share your model on huggingface.co/models =)


***** Running Evaluation *****
  Num examples = 250
  Batch size = 8


***** Running training *****
  Num examples = 50
  Num Epochs = 8
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 56


Train counter
Counter({1: 19, 0: 18, 2: 13})


Step,Training Loss,Validation Loss




Training completed. Do not forget to share your model on huggingface.co/models =)


***** Running Evaluation *****
  Num examples = 250
  Batch size = 8


loading configuration file bertmodel/config.json
Model config BertConfig {
  "_name_or_path": "TurkuNLP/bert-base-finnish-cased-v1",
  "architectures": [
    "BertForSequenceClassification"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1,
    "LABEL_2": 2
  },
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "torch_dtype": "float32",
  "transformers_version": "4.12.2",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 50105
}

loading weights file bertmodel/pytorch_model.bin


Active learning accuracies
[0.87, 0.78, 0.8, 0.82, 0.83]
Train with a random sample


All model checkpoint weights were used when initializing BertForSequenceClassification.

All the weights of BertForSequenceClassification were initialized from the model checkpoint at bertmodel.
If your task is similar to the task the model of the checkpoint was trained on, you can already use BertForSequenceClassification for predictions without further training.
***** Running training *****
  Num examples = 100
  Num Epochs = 8
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 104


Train counter
Counter({1: 74, 2: 18, 0: 8})


Step,Training Loss,Validation Loss




Training completed. Do not forget to share your model on huggingface.co/models =)


***** Running Evaluation *****
  Num examples = 250
  Batch size = 8


Random sample training accuracies 
[0.83, 0.79, 0.76, 0.8, 0.83]


In [26]:
print(sum(acc_scores_normal) / len(acc_scores_normal))
print(sum(acc_scores_active) / len(acc_scores_active))


0.8019999999999999
0.82
