# Imports and Env

In [1]:
# from google.colab import drive

In [2]:
# drive.mount('/content/drive/')

In [3]:
import os

In [4]:
os.chdir("..")

In [5]:
# os.chdir('drive/MyDrive/DLNLP-Project')

In [6]:
# !pip install transformers

In [7]:
from transformers import AutoTokenizer, AutoModel, AutoModelForSequenceClassification
import torch

  from .autonotebook import tqdm as notebook_tqdm


In [8]:
import pandas as pd

In [9]:
import numpy as np

In [10]:
import seaborn as sns

In [11]:
from torch.utils.data import DataLoader, TensorDataset

In [12]:
from torch.optim import AdamW

In [13]:
from transformers import get_scheduler

In [14]:
from tqdm.auto import tqdm

In [15]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, accuracy_score

# Load model

In [16]:
tokenizer = AutoTokenizer.from_pretrained("dumitrescustefan/bert-base-romanian-uncased-v1", do_lower_case=True)
model = AutoModelForSequenceClassification.from_pretrained("models/transformers_test")

In [17]:
# tokenize a sentence and run through the model
input_ids = torch.tensor(tokenizer.encode("Acesta este un test.", add_special_tokens=True)).unsqueeze(0)  # Batch size 1
outputs = model(input_ids)

# get encoding
last_hidden_states = outputs[0]  # The last hidden-state is the first element of the output tuple

In [18]:
last_hidden_states.shape

torch.Size([1, 1])

In [19]:
last_hidden_states

tensor([[0.3281]], grad_fn=<AddmmBackward0>)

# Load Data

In [20]:
TRAIN_DATASET_PATH = "data/train.csv"

In [21]:
TEST_DATASET_PATH = "data/test.csv"

In [22]:
train_df = pd.read_csv(TRAIN_DATASET_PATH)

In [23]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 29651 entries, 0 to 29650
Data columns (total 4 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   index    29651 non-null  int64  
 1   text     29651 non-null  object 
 2   label    29651 non-null  float64
 3   dataset  29651 non-null  object 
dtypes: float64(1), int64(1), object(2)
memory usage: 926.7+ KB


In [24]:
test_df = pd.read_csv(TEST_DATASET_PATH, index_col="Unnamed: 0")

In [25]:
test_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 23005 entries, 11262 to 4827
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   text     23005 non-null  object 
 1   label    23005 non-null  float64
 2   dataset  23005 non-null  object 
dtypes: float64(1), object(2)
memory usage: 718.9+ KB


# Prepare Testing

In [26]:
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

In [27]:
Xval = test_df["text"].tolist()

In [28]:
yval = test_df["label"].tolist()

In [29]:
Xval[0]

'Foarte slab. ca aspect este foarte frumoasa dar cine vrea sa se joace cu tastatura nu poate'

In [30]:
tokenizer.encode(Xval[0], add_special_tokens=True)

[2,
 757,
 7687,
 18,
 407,
 4596,
 439,
 757,
 8680,
 551,
 1005,
 1072,
 442,
 395,
 8726,
 386,
 29132,
 400,
 628,
 3]

In [31]:
yval[0]

0.0

In [32]:
Xt_tokens = tokenizer(Xval[:128], return_tensors="pt", padding=True, truncation=True, max_length=128)

In [33]:
yt_tokens = torch.Tensor(yval[:128])

In [34]:
Xt_tokens

{'input_ids': tensor([[    2,   757,  7687,  ...,     0,     0,     0],
        [    2,   757, 32457,  ...,     0,     0,     0],
        [    2,  4436,   623,  ...,     0,     0,     0],
        ...,
        [    2,   400,  3133,  ...,     0,     0,     0],
        [    2,   757,  7687,  ...,     0,     0,     0],
        [    2,   757, 32457,  ...,     0,     0,     0]]), 'token_type_ids': tensor([[0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        ...,
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]])}

In [35]:
# Xval_tokens = tokenizer(Xval,return_tensors="pt", padding=True)

In [36]:
test_dataset = TensorDataset(Xt_tokens["input_ids"], Xt_tokens["token_type_ids"], Xt_tokens["attention_mask"])

In [37]:
test_dataloader = DataLoader(test_dataset, shuffle=True, batch_size=4)

# Testing

In [38]:
model.to(device)

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(50000, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, element

In [39]:
model.eval()

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(50000, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, element

In [40]:
predictions = []

In [41]:
for batch in test_dataloader:
    batch = {
        "input_ids":batch[0].to(device),
        "token_type_ids":batch[1].to(device),
        "attention_mask":batch[2].to(device)
    }
    with torch.no_grad():
        outputs = model(**batch)

    logits = outputs.logits
    predictions.append(logits)

In [42]:
yt_tokens

tensor([0.0000, 1.0000, 0.0000, 0.0000, 1.0000, 0.7500, 0.0000, 1.0000, 1.0000,
        1.0000, 1.0000, 0.0000, 0.7500, 0.0000, 1.0000, 0.0000, 1.0000, 1.0000,
        1.0000, 1.0000, 0.0000, 0.2500, 0.0000, 0.0000, 1.0000, 0.0000, 1.0000,
        0.7500, 0.0000, 0.0000, 0.2500, 1.0000, 1.0000, 0.0000, 0.0000, 0.2500,
        1.0000, 0.0000, 0.0000, 1.0000, 1.0000, 0.0000, 0.2500, 0.0000, 0.0000,
        0.7500, 1.0000, 1.0000, 0.7500, 1.0000, 0.0000, 1.0000, 1.0000, 1.0000,
        0.0000, 1.0000, 0.2500, 0.0000, 0.0000, 0.0000, 1.0000, 1.0000, 1.0000,
        0.7500, 0.0000, 0.7500, 0.2500, 0.7500, 0.0000, 1.0000, 1.0000, 0.0000,
        0.0000, 0.0000, 1.0000, 0.0000, 1.0000, 1.0000, 1.0000, 0.0000, 1.0000,
        1.0000, 1.0000, 0.2500, 1.0000, 0.0000, 0.0000, 1.0000, 0.2500, 1.0000,
        1.0000, 1.0000, 1.0000, 0.0000, 0.2500, 0.0000, 0.7500, 0.2500, 1.0000,
        1.0000, 1.0000, 1.0000, 0.0000, 0.0000, 0.0000, 1.0000, 0.0000, 1.0000,
        1.0000, 1.0000, 1.0000, 1.0000, 

In [43]:
true_labels = yt_tokens.tolist()

In [44]:
predictions = [x.tolist() for x in predictions]

In [45]:
predicted_labels = []

In [46]:
for x in predictions:
  predicted_labels.extend(x)

In [47]:
mean_squared_error(predicted_labels, true_labels)

0.3432507971946457

In [48]:
mean_absolute_error(predicted_labels, true_labels)

0.5013432936157187

In [49]:
def twofive_round(x):
    return round(x*4)/4

In [50]:
rounded_pred_labels = [twofive_round(x[0]) for x in predicted_labels]

In [51]:
rounded_pred_labels[:10]

[0.5, 1.0, 0.5, 0.5, 0.0, 0.25, 0.75, 0.0, 0.75, 0.0]

In [52]:
true_labels[:10]

[0.0, 1.0, 0.0, 0.0, 1.0, 0.75, 0.0, 1.0, 1.0, 1.0]

In [53]:
def make_class(x_list):
  return [int(4*x) for x in x_list]

In [54]:
accuracy_score(make_class(true_labels), make_class(rounded_pred_labels))

0.109375