In [1]:
import pandas as pd
import pyarrow.parquet as pq
import requests
import json

import torch

device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

In [2]:
import os
from google.cloud import translate_v2 as translate

os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = 'tdde09-a46780da2eb6.json'

translate_client = translate.Client()

## This code block is used for creating the fine tuning data.

In [3]:
df = pd.read_parquet('train-00000-of-00001.parquet')

sentences = df['text']
tags = df['labels']

iter = 0
def translate_text(text):
    try:
        result = translate_client.translate(text, source_language='en', target_language='sv')
        translated_text = result['translatedText']
        return translated_text
    except:
        return "Translation Error"

translated_sentences = sentences.apply(translate_text)

translated_df = pd.DataFrame({
    'sentence_column_name': translated_sentences,
    'tag_column_name': tags
})

translated_df.to_csv('translated_train-00000-of-00001.csv', index=False, sep="{")

## This code block is used for creating the test data

In [4]:
df = pd.read_parquet('test-00000-of-00001.parquet')

sentences = df['text']
tags = df['labels']

def translate_text(text):
    try:
        result = translate_client.translate(text, source_language='en', target_language='sv')
        translated_text = result['translatedText']
        return translated_text
    except:
        return "Translation Error"

translated_sentences = sentences.apply(translate_text)

translated_df = pd.DataFrame({
    'sentence_column_name': translated_sentences,
    'tag_column_name': tags
})

translated_df.to_csv('translated_test-00000-of-00001.csv', index=False, sep="{")

## Importing swe-BERT for initial training

In [1]:
# collab command to install transformers
!pip install transformers
!pip install tqdm

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.26.1-py3-none-any.whl (6.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.3/6.3 MB[0m [31m46.1 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.6/7.6 MB[0m [31m69.2 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.11.0
  Downloading huggingface_hub-0.12.1-py3-none-any.whl (190 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m190.3/190.3 KB[0m [31m18.8 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.12.1 tokenizers-0.13.2 transformers-4.26.1
Looking in indexes: https://pypi.org/simple, https://us

In [30]:
from torch.utils.data import Dataset

class SNLIDataset(Dataset):

    def __init__(self, filename, max_size=None):
        super().__init__()
        self.xs = []
        self.ys = []
        with open(filename, encoding='utf-8') as source:
            for i, line in enumerate(source):
                if i == 0:
                  continue
                # print(line)
                if max_size and i >= max_size:
                    break
                sentence, sentiment_value = line.rstrip().split('{') # Delimeter to be chosen
                # print(sentence)
                self.xs.append(sentence)
                self.ys.append(int(sentiment_value)) # make sure negative/neutral/positive is labelled correct

    def __getitem__(self, idx):
        return self.xs[idx], self.ys[idx]

    def __len__(self):
        return len(self.xs)

In [41]:
train_dataset = SNLIDataset('translated_train-00000-of-00001.csv', max_size=400)
test_dataset = SNLIDataset('translated_test-00000-of-00001.csv')
data = train_dataset[121]
print(data)

('`` De problem som identifierats av Stonesoft påverkar en rad innehållsinspektionstekniker.', 1)


In [13]:
from transformers import BertTokenizer, BertForSequenceClassification

tokenizer = BertTokenizer.from_pretrained('KB/bert-base-swedish-cased', do_lower_case=True)

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/399k [00:00<?, ?B/s]

Downloading (…)in/added_tokens.json:   0%|          | 0.00/2.00 [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/491 [00:00<?, ?B/s]

In [32]:
tokenized = tokenizer(text=data[0], padding='longest', return_tensors='pt')
print(data[0])
print(tokenized.input_ids)
print(tokenized)

`` De problem som identifierats av Stonesoft påverkar en rad innehållsinspektionstekniker.
tensor([[    2,     1,     1,   102,  1333,    67, 16402,   454,    65, 39125,
           140, 12331,  7502,    59,  1207,  3821, 17064,  2149, 24677,     7,
             3]])
{'input_ids': tensor([[    2,     1,     1,   102,  1333,    67, 16402,   454,    65, 39125,
           140, 12331,  7502,    59,  1207,  3821, 17064,  2149, 24677,     7,
             3]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}


### Collate function

To be adjusted: is to be done with state-of-the-art document classification for BERT, doc-BERT

In [37]:
def our_collate_fn(data):
    x = [a[0] for a in data]
    y = [a[1] for a in data]
    tokenized = tokenizer(text=x, padding='longest', return_tensors='pt')

    # done with the help of 
    # https://towardsdatascience.com/masked-language-modelling-with-bert-7d49793e5d2c
    # mask with prob 15%

    # rand = torch.rand(tokenized['input_ids'].shape)
    # where the random array is less than 0.15, we set true

    # mask_arr = (rand < 0.15) * (tokenized['input_ids'] != 101) * (tokenized['input_ids'] != 102)

    # create selection from mask_arr
    # selection = torch.flatten((mask_arr[0]).nonzero()).tolist()
    # correct_words = tokenized['input_ids'][0, selection]
    # random_words = 


    # apply selection index to inputs.input_ids, adding MASK tokens
    # tokenized['input_ids'][0, selection] = 4 # here masking should not only mask to [MASK]=103 but instead do

    # 80%: [MASK], 10% ["random"], 10% ["unchanged"]
    # https://arxiv.org/pdf/1810.04805.pdf courtesy of this article on doc-BERT

    return tokenized['input_ids'].to(device), torch.as_tensor(y).to(device), tokenized['attention_mask'].to(device)

### Fine tuning training

In [42]:
from torch.utils.data import DataLoader
from tqdm import tqdm

tokenized_train_data = DataLoader(train_dataset, batch_size=32, shuffle=True, collate_fn=our_collate_fn)
tokenized_valid_data = DataLoader(test_dataset, batch_size=32, shuffle=True, collate_fn=our_collate_fn)
# print(tokenized_train_data)
# for batch in tokenized_train_data:
#     for sent_pair in batch[0]:
#       print(sent_pair)
#     print(batch)
#     break

model = BertForSequenceClassification.from_pretrained('KB/bert-base-swedish-cased', num_labels=3)
model = model.to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-5)
# softmax = torch.nn.Softmax(dim=1)
epochs = 1

for _ in range(epochs):
  model.train()

  with tqdm(total=len(train_dataset)) as pbar:

    for bindex, batch in enumerate(tokenized_train_data):
      optimizer.zero_grad()
      # forward pass
      train_output = model(batch[0], labels=batch[1], attention_mask=batch[2])
      # backward pass
      train_output.loss.backward()
      optimizer.step()
      pbar.update(len(batch[0]))


    model.eval()
    valids = []
    for batch in tokenized_valid_data:
      with torch.no_grad():
        # forward pass
        eval_output = model(batch[0], attention_mask=batch[2])
        guess = torch.argmax(eval_output.logits, dim=1)

        
        valids.append(sum(guess == batch[1])/len(batch[1]))
    print('Accuracy: {}'.format(sum(valids)/len(valids)))


Some weights of the model checkpoint at KB/bert-base-swedish-cased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.decoder.bias', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at KB/bert-base-swedi

Accuracy: 0.5365084409713745



