In [2]:
import pandas as pd
import pyarrow.parquet as pq
import requests
import json

import torch

# Run on the GPU when CUDA is available; otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [3]:
import os
from google.cloud import translate_v2 as translate

# Point the Google client library at the service-account key file, but only
# when the environment has not already been configured: setdefault() keeps a
# value exported by the shell or the hosting platform instead of overwriting it.
os.environ.setdefault('GOOGLE_APPLICATION_CREDENTIALS', 'tdde09-a46780da2eb6.json')

# Cloud Translation (v2) client used by translate_text() in the cells below.
translate_client = translate.Client()

## Create the fine-tuning data (translate the training set from English to Swedish)

In [4]:
# Load the English training split. The 'text' column holds the sentences to
# translate and the 'labels' column the tags that are saved alongside them.
df = pd.read_parquet('train-00000-of-00001.parquet')

sentences = df['text']
tags = df['labels']
def translate_text(text):
    """Translate one English string to Swedish with the shared translate_client.

    Args:
        text: The English source text.

    Returns:
        The translated text, or the sentinel string "Translation Error" when
        the request fails for any reason (best effort: one bad row must not
        abort a long batch run).
    """
    try:
        # format_='text' makes the API treat the input as plain text; the
        # default HTML mode returns HTML-escaped output (e.g. "&#39;").
        result = translate_client.translate(
            text,
            source_language='en',
            target_language='sv',
            format_='text',
        )
        return result['translatedText']
    except Exception:
        # Deliberately not a bare `except:` so that KeyboardInterrupt and
        # SystemExit still stop a long-running `.apply(translate_text)`.
        return "Translation Error"

# One API request per sentence; failed requests come back as the sentinel
# string "Translation Error" (see translate_text).
translated_sentences = sentences.apply(translate_text)

translated_df = pd.DataFrame({
    'sentence_column_name': translated_sentences,
    'tag_column_name': tags
})

# Do not let failed translations end up as training examples whose text is
# literally "Translation Error".
failed = translated_df['sentence_column_name'] == "Translation Error"
if failed.any():
    print('Dropping {} sentences that could not be translated'.format(int(failed.sum())))
translated_df = translated_df[~failed]

# SNLIDataset reads this file line by line, splits each line on a tab and has
# no header handling, so write headerless tab-separated rows.
# NOTE(review): pandas still quotes fields that contain tabs, double quotes or
# newlines; the line-based reader does not undo that quoting.
translated_df.to_csv('translated_train-00000-of-00001.csv', sep='\t', header=False, index=False)

## Create the test data (translate the test set from English to Swedish)

In [5]:
# Load the English test split and pick out the sentence and label columns.
df = pd.read_parquet('test-00000-of-00001.parquet')
sentences, tags = df['text'], df['labels']

def translate_text(text):
    """Translate one English string to Swedish with the shared translate_client.

    Args:
        text: The English source text.

    Returns:
        The translated text, or the sentinel string "Translation Error" when
        the request fails for any reason (best effort: one bad row must not
        abort a long batch run).
    """
    try:
        # format_='text' makes the API treat the input as plain text; the
        # default HTML mode returns HTML-escaped output (e.g. "&#39;").
        result = translate_client.translate(
            text,
            source_language='en',
            target_language='sv',
            format_='text',
        )
        return result['translatedText']
    except Exception:
        # Deliberately not a bare `except:` so that KeyboardInterrupt and
        # SystemExit still stop a long-running `.apply(translate_text)`.
        return "Translation Error"

# One API request per sentence; failed requests come back as the sentinel
# string "Translation Error" (see translate_text).
translated_sentences = sentences.apply(translate_text)

translated_df = pd.DataFrame({
    'sentence_column_name': translated_sentences,
    'tag_column_name': tags
})

# Do not evaluate on rows whose text is literally "Translation Error".
failed = translated_df['sentence_column_name'] == "Translation Error"
if failed.any():
    print('Dropping {} sentences that could not be translated'.format(int(failed.sum())))
translated_df = translated_df[~failed]

# SNLIDataset reads this file line by line, splits each line on a tab and has
# no header handling, so write headerless tab-separated rows.
# NOTE(review): pandas still quotes fields that contain tabs, double quotes or
# newlines; the line-based reader does not undo that quoting.
translated_df.to_csv('translated_test-00000-of-00001.csv', sep='\t', header=False, index=False)

## Importing swe-BERT for initial training

In [None]:
# collab command to install transformers
!pip install transformers
!pip install tqdm

In [None]:
from torch.utils.data import Dataset

class SNLIDataset(Dataset):
    """Sentence/sentiment pairs read from a delimited text file.

    Each non-blank line holds a sentence, the delimiter and one of the label
    names in ``LABELS``. Examples are exposed as ``(sentence, label_id)``
    where ``label_id`` is the position of the label name in ``LABELS``.
    """

    # Index in this list is the integer class id of the label.
    LABELS = ['negative', 'neutral', 'positive']

    def __init__(self, filename, max_size=None, delimiter='\t'):
        """Load the examples from ``filename``.

        Args:
            filename: Path of a UTF-8 text file with one example per line.
            max_size: Maximum number of examples to load. ``None`` (or any
                other falsy value, e.g. 0) loads the whole file.
            delimiter: String separating the sentence from its label. The
                line is split on the LAST occurrence, so the sentence itself
                may contain the delimiter.

        Raises:
            ValueError: If a non-blank line has no delimiter or carries a
                label that is not in ``LABELS``.
        """
        super().__init__()
        self.xs = []
        self.ys = []
        with open(filename, encoding='utf-8') as source:
            for line_no, raw_line in enumerate(source, start=1):
                if max_size and len(self.xs) >= max_size:
                    break
                line = raw_line.rstrip()
                if not line:
                    # Blank lines (e.g. a trailing newline) carry no example.
                    continue
                sentence, found, sentiment_value = line.rpartition(delimiter)
                if not found:
                    raise ValueError(
                        '{}:{}: expected "<sentence>{!r}<label>", got {!r}'.format(
                            filename, line_no, delimiter, line))
                sentiment_value = sentiment_value.strip()
                if sentiment_value not in self.LABELS:
                    raise ValueError(
                        '{}:{}: unknown label {!r}, expected one of {}'.format(
                            filename, line_no, sentiment_value, self.LABELS))
                self.xs.append(sentence)
                self.ys.append(self.LABELS.index(sentiment_value))

    def __getitem__(self, idx):
        """Return ``(sentence, label_id)``; a slice yields a pair of lists."""
        return self.xs[idx], self.ys[idx]

    def __len__(self):
        """Return the number of loaded examples."""
        return len(self.xs)

In [None]:
# Training data is capped at 40000 examples; the test file is loaded in full.
train_dataset = SNLIDataset(filename='translated_train-00000-of-00001.csv', max_size=40000)
test_dataset = SNLIDataset(filename='translated_test-00000-of-00001.csv')
# Slicing the dataset gives a (sentences, labels) pair of lists.
data = train_dataset[slice(120, 130)]

In [None]:
from transformers import BertTokenizer, BertForSequenceClassification

# The checkpoint is named as a cased model, so keep the input casing.
# do_lower_case=True would lower-case every sentence and, unless
# strip_accents is set explicitly, also strip accents (å/ä/ö -> a/a/o).
tokenizer = BertTokenizer.from_pretrained('bert-base-swedish-cased', do_lower_case=False)

### Collate function

**TODO:** adjust this to follow the state-of-the-art approach to document classification with BERT (DocBERT).

In [None]:
def our_collate_fn(data, mask_prob=0.15, max_length=512):
    """Tokenise a batch of (sentence, label) pairs and randomly mask tokens.

    Uses the module-level ``tokenizer`` and moves the results to ``device``.

    Args:
        data: Sequence of ``(sentence, label_id)`` pairs.
        mask_prob: Probability with which an eligible token is replaced by
            the tokenizer's mask token. ``0.0`` disables masking.
        max_length: Longer token sequences are truncated to this length so
            that they fit the model's position embeddings.

    Returns:
        A tuple ``(input_ids, labels, attention_mask)`` of tensors on
        ``device``.
    """
    sentences = [example[0] for example in data]
    labels = [example[1] for example in data]
    tokenized = tokenizer(
        text=sentences,
        padding='longest',
        truncation=True,
        max_length=max_length,
        return_tensors='pt',
    )
    input_ids = tokenized['input_ids']

    # Masking done with the help of
    # https://towardsdatascience.com/masked-language-modelling-with-bert-7d49793e5d2c
    rand = torch.rand(input_ids.shape)

    # Never mask [CLS], [SEP] or padding. The ids are taken from the
    # tokenizer because they differ between vocabularies.
    maskable = (
        (input_ids != tokenizer.cls_token_id)
        & (input_ids != tokenizer.sep_token_id)
        & tokenized['attention_mask'].bool()
    )
    mask_arr = (rand < mask_prob) & maskable

    # NOTE(review): as in the original code, only the FIRST sequence of the
    # batch is masked — confirm whether the whole batch was intended.
    input_ids[0, mask_arr[0]] = tokenizer.mask_token_id

    # TODO: selected tokens should not always become [MASK]; BERT uses
    # 80% [MASK], 10% random token, 10% unchanged
    # (https://arxiv.org/pdf/1810.04805.pdf).

    return (
        input_ids.to(device),
        torch.as_tensor(labels).to(device),
        tokenized['attention_mask'].to(device),
    )

### Fine tuning training

In [None]:
from torch.utils.data import DataLoader
from tqdm import tqdm

# Batches are tokenised on the fly by our_collate_fn.
tokenized_train_data = DataLoader(train_dataset, batch_size=32, shuffle=True, collate_fn=our_collate_fn)
# Shuffling adds nothing for evaluation; a fixed order keeps the batches stable.
# NOTE(review): our_collate_fn also applies its random token masking to the
# validation batches — confirm that this is intended.
tokenized_valid_data = DataLoader(test_dataset, batch_size=32, shuffle=False, collate_fn=our_collate_fn)

# Three output classes: negative / neutral / positive.
model = BertForSequenceClassification.from_pretrained('bert-base-swedish-cased', num_labels=3)
model = model.to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-5)
epochs = 2

for _ in range(epochs):
  # ---- training pass ----
  model.train()
  with tqdm(total=len(train_dataset)) as pbar:
    for batch in tokenized_train_data:
      # batch = (input_ids, labels, attention_mask)
      optimizer.zero_grad()
      # forward pass; passing labels makes the model return the loss
      train_output = model(batch[0], labels=batch[1], attention_mask=batch[2])
      # backward pass
      train_output.loss.backward()
      optimizer.step()
      pbar.update(len(batch[0]))

  # ---- validation pass ----
  model.eval()
  # Count correct predictions over ALL samples: averaging per-batch
  # accuracies would over-weight a smaller final batch.
  correct = 0
  total = 0
  with torch.no_grad():
    for batch in tokenized_valid_data:
      eval_output = model(batch[0], attention_mask=batch[2])
      guess = torch.argmax(eval_output.logits, dim=1)
      correct += (guess == batch[1]).sum().item()
      total += len(batch[1])
  # An empty validation set gives nan instead of a ZeroDivisionError.
  accuracy = correct / total if total else float('nan')
  print('Accuracy: {}'.format(accuracy))
