In [None]:
from datasets import load_dataset
import pandas as pd
import matplotlib.pyplot as plt
import torch
from tqdm import tqdm
from transformers import BertTokenizer, BertModel, AutoTokenizer, AutoModelForMaskedLM, set_seed
from sklearn import svm
from sklearn import metrics
import numpy as np

In [3]:
# Load the Dutch CoLA dataset through `datasets.load_dataset`.
DATASET_ID = "GroNLP/dutch-cola"
dataset = load_dataset(DATASET_ID)

In [4]:
# Materialise every split of the dataset as its own pandas DataFrame.
train_df, dev_df, test_df = (
    pd.DataFrame.from_dict(dataset[split_name])
    for split_name in ("train", "validation", "test")
)


In [None]:
# Display the training split to inspect its columns and contents.
train_df

In [None]:
# Load the pretrained BERT checkpoint together with its matching tokenizer.
# `model_name` can be pointed at another checkpoint if preferred.
model_name = 'GroNLP/bert-base-dutch-cased'
model = BertModel.from_pretrained(model_name)
tokenizer = BertTokenizer.from_pretrained(model_name)

In [7]:
# Shuffle the training rows with a fixed seed so the order is reproducible.
# NOTE: this rebinds `train_df` from itself, so re-running the cell permutes
# the already-shuffled frame again; re-run the embedding cell afterwards so
# embeddings and labels stay in the same order.
SHUFFLE_SEED = 42
train_df = train_df.sample(frac=1, random_state=SHUFFLE_SEED)

In [None]:
# Create embeddings of train data
train_cls_embeddings = {
  1: [],
  2: [],
  3: [],
  4: [],
  5: [],
  6: [],
  7: [],
  8: [],
  9: [],
  10: [],
  11: [],
  12: []
}
for line in tqdm(train_df["Sentence"]):
  tokenized_text = tokenizer(line, return_tensors="pt")
  with torch.no_grad():
    line_embedding = model(**tokenized_text, output_hidden_states=True) # extract embedding for sentence

  for i in range(1,13):
    train_cls_embeddings[i].append(line_embedding.hidden_states[i][:, -1, :]) # store embedding

In [None]:
# Create embeddings of test data
test_cls_embeddings = {
  1: [],
  2: [],
  3: [],
  4: [],
  5: [],
  6: [],
  7: [],
  8: [],
  9: [],
  10: [],
  11: [],
  12: []
}
for line in tqdm(test_df["Sentence"]):
  tokenized_text = tokenizer(line, return_tensors="pt")
  with torch.no_grad():
    line_embedding = model(**tokenized_text, output_hidden_states=True) # extract embedding for sentence

  for i in range(1,13):
    test_cls_embeddings[i].append(line_embedding.hidden_states[i][:, -1, :]) # store embedding

In [None]:
# Train and evaluate one linear SVM probe per hidden layer.

# Labels do not depend on the layer, so extract them from CoLA once.
y_train = train_df['Acceptability'].to_numpy()
y_test = test_df['Acceptability'].to_numpy()

for layer in range(1,13):
  print(layer)
  print("===============")

  # Stack the per-sentence vectors into an (n_sentences, hidden_size) matrix.
  # sklearn rejects the 3-D array that a plain list of (1, hidden_size) arrays
  # turns into; np.vstack handles both (1, hidden_size) and (hidden_size,) items.
  X_train = np.vstack([tensor.numpy() for tensor in train_cls_embeddings[layer]])
  # Swap in embeddings of the dev split here to evaluate on validation data instead.
  X_test = np.vstack([tensor.numpy() for tensor in test_cls_embeddings[layer]])

  # initiate SVC model
  clf = svm.SVC(kernel='linear')

  # train the model
  clf.fit(X_train, y_train)

  print("Model trained.")

  # predict labels
  y_pred = clf.predict(X_test)

  accuracy = metrics.accuracy_score(y_test, y_pred)
  recall = metrics.recall_score(y_test, y_pred)
  precision = metrics.precision_score(y_test, y_pred)
  f1 = metrics.f1_score(y_test, y_pred)

  print("---------------")
  print(f"Accuracy: {accuracy:.3f}\nRecall: {recall:.3f}\nPrecision: {precision:.3f}\nF1: {f1:.3f}")
  # Same numbers on one comma-separated line, convenient for copying into a table.
  print(f"{accuracy:.3f}, {recall:.3f}, {precision:.3f}, {f1:.3f}")
  print("===============")



    