In [1]:
import torch
import Models

nb_bert_pipe = Models.get_nb_bert()
mbert_pipe = Models.get_mbert()
nor_bert_pipe = Models.get_nor_bert()


Some weights of the model checkpoint at bert-base-multilingual-cased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model ch

In [8]:
import json

def number_sentiment(sentiment):
    if sentiment == 'Positive':
        return 1
    return 0

with open('./norbench/polarity.json') as polarity_data:
    polarity_array = json.load(polarity_data)[:100]
    train_texts = [datapoint['text'] for datapoint in polarity_array]
    labels = [number_sentiment(datapoint['label']) for datapoint in polarity_array]

In [32]:
from sklearn.model_selection import train_test_split
train_texts, val_test_texts, train_labels, val_test_labels = train_test_split(inputs, labels, test_size=.2)
val_texts, test_texts, val_labels, test_labels = train_test_split(val_test_texts, val_test_labels, test_size=.5)


from transformers import AutoTokenizer
nor_bert_tokenizer = AutoTokenizer.from_pretrained("ltgoslo/norbert")
nb_bert_tokenizer = AutoTokenizer.from_pretrained('NbAiLab/nb-bert-base')
mbert_tokenizer = AutoTokenizer.from_pretrained('bert-base-multilingual-cased')


nor_bert_train_encodings = nor_bert_tokenizer(train_texts, truncation=True, padding=True)
nb_bert_train_encodings = nb_bert_tokenizer(train_texts, truncation=True, padding=True)
mbert_train_encodings = mbert_tokenizer(train_texts, truncation=True, padding=True)

nor_bert_val_encodings = nor_bert_tokenizer(val_texts, truncation=True, padding=True)
nb_bert_val_encodings = nb_bert_tokenizer(val_texts, truncation=True, padding=True)
mbert_val_encodings = mbert_tokenizer(val_texts, truncation=True, padding=True)

nor_bert_test_encodings = nor_bert_tokenizer(test_texts, truncation=True, padding=True)
nb_bert_test_encodings = nb_bert_tokenizer(test_texts, truncation=True, padding=True)
mbert_test_encodings = mbert_tokenizer(test_texts, truncation=True, padding=True)


class SentinentPolarityDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)


nor_bert_train_dataset = SentinentPolarityDataset(nor_bert_train_encodings, train_labels)
nb_bert_train_dataset = SentinentPolarityDataset(nb_bert_train_encodings, train_labels)
mbert_train_dataset = SentinentPolarityDataset(mbert_train_encodings, train_labels)


ValueError: Found input variables with inconsistent numbers of samples: [100, 16]

In [10]:
from torch.utils.data import DataLoader
from transformers import AdamW

device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

model=nor_bert_pipe.model
model.to(device)
model.train()

train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)

optim = AdamW(model.parameters(), lr=5e-5)

for epoch in range(3):
    for batch in train_loader:
        optim.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs[0]
        loss.backward()
        optim.step()

model.eval()

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(32922, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, element

In [29]:
nor_bert_pipe('Dette er en positiv setning')

[{'label': 'LABEL_1', 'score': 0.615852415561676}]

In [30]:
nor_bert_pipe('Kjempefint vidunderlig bra')

[{'label': 'LABEL_1', 'score': 0.5620705485343933}]

In [31]:
nor_bert_pipe('Dårlig elendig setning')

[{'label': 'LABEL_1', 'score': 0.5992310047149658}]