In [None]:
# Environment setup: install the Hugging Face packages via the shell.
# `transformers` is imported by the cells below; `datasets` is installed here
# but is not imported anywhere in the visible cells.
# NOTE(review): versions are unpinned - consider pinning them so reruns use the same releases.
!pip install transformers
!pip install datasets

In [None]:
from transformers import BertTokenizer, BertForSequenceClassification

# One checkpoint id is shared by the tokenizer and the model so the two cannot drift apart.
checkpoint_name = "GroNLP/bert-base-dutch-cased"

tokenizer = BertTokenizer.from_pretrained(checkpoint_name)

# num_labels=1 gives the classification head a single output; the labels fed to it
# are floats (see the Dataset wrapper further down in this notebook).
model = BertForSequenceClassification.from_pretrained(checkpoint_name, num_labels=1)

In [None]:
import json
from sklearn.model_selection import train_test_split

# Load the annotated reviews. JSON text is UTF-8, so the encoding is stated
# explicitly instead of relying on the platform default (the reviews are Dutch
# and may contain non-ASCII characters).
with open('/content/drive/MyDrive/ResearchProject/083/reviews_083_training.json', encoding='utf-8') as f: #specify training data path
    data = json.load(f) #[{"text": "blabla", "pros": ["bla", "bla"], "cons": ["bla"]}, {...}]

# Flatten every review into (review text, aspect) pairs with a parallel label list:
# label 1 for each aspect listed under "pros", label 0 for each one under "cons".
# new_data[i] and new_labels[i] always describe the same pair.
new_data = []
new_labels = []
for review in data:
    for pro in review["pros"]:
        new_data.append((review["text"], pro))
        new_labels.append(1)
    for con in review["cons"]:
        new_data.append((review["text"], con))
        new_labels.append(0)

# Quick sanity check of the first few pairs and their labels.
print(new_data[:10])
print(new_labels[:10])

In [None]:
#adapted from https://huggingface.co/docs/transformers/v4.19.2/en/internal/tokenization_utils#transformers.PreTrainedTokenizerBase.encode_plus
#and https://towardsdatascience.com/fine-tuning-pretrained-nlp-models-with-huggingfaces-trainer-6326a4456e7b
#...and a few more resources

In [None]:
#split into train and test. The seed is still drawn at random and kept in
#`less_random`, but pairs and labels are now split in ONE call, so they are
#shuffled with the same indices and can never get out of sync.
import random
less_random = random.randint(0,1000)

train, test, train_labels, test_labels = train_test_split(
    new_data, new_labels, test_size=0.15, random_state=less_random
)


def encode_pairs(pairs, max_length=512):
    """Tokenize (review, aspect) pairs into fixed-length model inputs.

    Args:
        pairs: list of (review_text, aspect) tuples.
        max_length: length every sequence is padded/truncated to. 512 is the
            value the prediction cells of this notebook pass to the tokenizer.

    Returns:
        The tokenizer's batch encoding (input ids, attention mask, ...).
    """
    return tokenizer.batch_encode_plus(
        batch_text_or_text_pairs=pairs,  # the sentence + the aspect
        add_special_tokens=True,         # Add `[CLS]` and `[SEP]`
        padding="max_length",            # pad every example up to max_length
        truncation="only_first",         # if the review is too long, truncate the review, _not_ the aspect
        max_length=max_length,           # explicit, so it does not depend on the tokenizer's configured default
        return_attention_mask=True,
    )


train_batch = encode_pairs(train)
test_batch = encode_pairs(test)


print("Train dataset length: "+str(len(train)))
print("Test dataset length: "+ str(len(test)))

print(train[0])
print(train_batch['input_ids'][0])

In [None]:
import torch

class Dataset(torch.utils.data.Dataset):
    """Map-style torch dataset over tokenizer encodings with optional labels.

    Args:
        encodings: mapping from feature name (must include "input_ids") to a
            sequence holding one entry per example.
        labels: optional sequence with one label per example. When omitted
            (or empty) the items carry no "labels" entry, which is how the
            prediction cells of this notebook use the class.
    """

    def __init__(self, encodings, labels=None):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        """Return example `idx` as a dict of tensors (plus "labels" if available)."""
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        # Explicit None/length check instead of truthiness: `if labels:` raises for
        # numpy arrays and tensors with more than one element. An empty sequence
        # is still treated as "no labels", as before.
        if self.labels is not None and len(self.labels) > 0:
            # Labels are emitted as floats.
            item["labels"] = torch.tensor(self.labels[idx], dtype=torch.float)
        return item

    def __len__(self):
        """Number of examples, taken from the "input_ids" feature."""
        return len(self.encodings["input_ids"])

In [None]:
# Pair each tokenized split with its labels so the Trainer can consume them.
train_dataset = Dataset(encodings=train_batch, labels=train_labels)
test_dataset = Dataset(encodings=test_batch, labels=test_labels)

In [None]:
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer # source https://huggingface.co/docs/transformers/training
import numpy as np
from sklearn.metrics import accuracy_score

def compute_metrics(p):
    """Compute accuracy for the Trainer's evaluation loop.

    Args:
        p: pair `(predictions, labels)` as passed by the Trainer.

    Returns:
        dict with a single key "accuracy" (fraction of correct predictions).

    The model in this notebook is created with `num_labels=1`, so predictions
    have shape (N, 1). Taking `argmax` over that axis always yields 0, which
    would report the share of 0-labels instead of the accuracy. A single score
    is therefore thresholded at 0.5 (the same cut-off the prediction cells get
    from rounding); argmax is only used when there are several columns.
    """
    pred, labels = p
    pred = np.asarray(pred)
    labels = np.asarray(labels).reshape(-1)

    if pred.ndim > 1 and pred.shape[-1] > 1:
        # Several logits per example: pick the highest-scoring class.
        pred = np.argmax(pred, axis=-1)
    else:
        # One score per example: above 0.5 counts as label 1, otherwise 0.
        pred = (pred.reshape(-1) > 0.5).astype(int)

    accuracy = float(np.mean(pred == labels))
    return {"accuracy": accuracy}

# Training configuration. Evaluation and checkpointing share the same
# 1000-step interval, and the best checkpoint is restored when training ends.
training_args = TrainingArguments(
    output_dir="/content/Bertje",
    seed=0,
    num_train_epochs=3,
    # batch sizes
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    # evaluation / checkpoint schedule
    evaluation_strategy="steps",
    eval_steps=1000,
    save_steps=1000,
    load_best_model_at_end=True,
)

# The held-out split doubles as the evaluation set during training.
trainer = Trainer(
    args=training_args,
    model=model,
    compute_metrics=compute_metrics,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
)

In [None]:
# Fine-tune the model with the Trainer configured above, then save it.
# save_model() is called without a path; presumably it writes to the Trainer's
# output_dir ("/content/Bertje"), which is where the loading cell below reads from.
trainer.train()
trainer.save_model()
#uncomment to download model in one go
# NOTE(review): `files` is not imported anywhere in this notebook; presumably it
# needs `from google.colab import files` before the download line can run - confirm.
# !zip -r /content/bertje.zip /content/Bertje
# files.download('/content/bertje.zip')

# Load model

In [None]:
from transformers import BertForSequenceClassification, BertTokenizer

# Directory the fine-tuned model is reloaded from (the Trainer's output_dir above).
model_path = "/content/Bertje"

# The tokenizer is taken from the original checkpoint rather than from model_path.
tokenizer = BertTokenizer.from_pretrained("GroNLP/bert-base-dutch-cased")

# Load trained model
model = BertForSequenceClassification.from_pretrained(model_path)

# Evaluation


In [None]:
#run predictions for entire file
import json
from transformers import Trainer

# Wrap the loaded model in a Trainer created with default arguments; in this
# cell it is only used for its predict() method.
test_trainer = Trainer(model)

# JSON text is UTF-8, so the encoding is stated explicitly instead of relying on
# the platform default (the reviews are Dutch and may contain non-ASCII characters).
with open('/content/drive/MyDrive/ResearchProject/083/Bertje/newModel/result_wo_class.json', encoding='utf-8') as file: #specify file with generated aspects
  test_data = json.load(file)

#edit for specific data format
def predict_review(review):
  """Predict a 0/1 label for every generated aspect of one review, in place.

  Args:
    review: dict with a "text" string and a "generated_aspects" list whose
      entries are dicts holding the aspect string under "aspect".

  Side effects:
    Sets review["predictions"] to a dict mapping each aspect string to 1 (score
    above 0.5) or 0. The labels follow the training data, where aspects from
    "pros" were labelled 1 and aspects from "cons" 0. Returns None.
  """
  review["predictions"] = {}
  aspects = [entry["aspect"] for entry in review["generated_aspects"]]
  if not aspects:
    # Nothing to score; avoid calling the tokenizer on an empty batch.
    return

  # Score all aspects of the review in one batch instead of running one
  # Trainer.predict call per aspect.
  pairs = [(review["text"], aspect) for aspect in aspects]
  encoded = tokenizer(pairs, padding=True, truncation="only_first", max_length=512)
  pred, _, _ = test_trainer.predict(Dataset(encoded))
  scores = pred.flatten().tolist()

  for aspect, score in zip(aspects, scores):
    # The model emits one unbounded score per pair. round() could turn e.g. 1.7
    # into 2 or -0.8 into -1; thresholding keeps the label in {0, 1} and agrees
    # with round() wherever that returned 0 or 1.
    review["predictions"][aspect] = int(score > 0.5)

# Annotate every loaded review in place with its per-aspect predictions.
for review in test_data:
  predict_review(review)

# Serialize the annotated reviews and write them out as a single JSON document.
with open('/content/drive/MyDrive/ResearchProject/083/Bertje/results.json', 'w') as outfile:
  outfile.write(json.dumps(test_data))


In [None]:
#run a single prediction
test_trainer = Trainer(model)

# One hand-written (review text, aspect) pair, tokenized like the batch predictions above.
single_pair = [("beeld is echt verschrikkelijk goed", "beeld")]
single_encoding = tokenizer(single_pair, padding=True, truncation="only_first", max_length=512)
test_text = Dataset(single_encoding)

raw_pred_one, _, _ = test_trainer.predict(test_text)
# Last expression of the cell: show the raw model score(s) as a flat array.
raw_pred_one.flatten()