# Imports and Env

In [1]:
from google.colab import drive

In [2]:
drive.mount('/content/drive/')

Mounted at /content/drive/


In [3]:
import os

In [4]:
os.chdir('drive/MyDrive/DLNLP-Project')

In [5]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.25.1-py3-none-any.whl (5.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.8/5.8 MB[0m [31m85.4 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.10.0
  Downloading huggingface_hub-0.11.1-py3-none-any.whl (182 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m182.4/182.4 KB[0m [31m23.8 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.6/7.6 MB[0m [31m99.3 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.11.1 tokenizers-0.13.2 transformers-4.25.1


In [6]:
from transformers import AutoTokenizer, AutoModel, AutoModelForSequenceClassification
import torch

In [7]:
import pandas as pd

In [8]:
import numpy as np

In [9]:
import seaborn as sns

In [10]:
from torch.utils.data import DataLoader, TensorDataset

In [11]:
from torch.optim import AdamW

In [12]:
from transformers import get_scheduler

In [13]:
from tqdm.auto import tqdm

In [14]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, accuracy_score

# Load model

In [15]:
tokenizer = AutoTokenizer.from_pretrained("dumitrescustefan/bert-base-romanian-uncased-v1", do_lower_case=True)
model = AutoModelForSequenceClassification.from_pretrained("models/transformers_test")

Downloading:   0%|          | 0.00/385 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/411k [00:00<?, ?B/s]

In [16]:
# tokenize a sentence and run through the model
input_ids = torch.tensor(tokenizer.encode("Acesta este un test.", add_special_tokens=True)).unsqueeze(0)  # Batch size 1
outputs = model(input_ids)

# get encoding
last_hidden_states = outputs[0]  # The last hidden-state is the first element of the output tuple

In [17]:
last_hidden_states.shape

torch.Size([1, 1])

In [18]:
last_hidden_states

tensor([[0.3124]], grad_fn=<AddmmBackward0>)

# Load Data

In [19]:
TRAIN_DATASET_PATH = "data/train.csv"

In [20]:
TEST_DATASET_PATH = "data/test.csv"

In [21]:
train_df = pd.read_csv(TRAIN_DATASET_PATH)

In [22]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 29651 entries, 0 to 29650
Data columns (total 4 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   index    29651 non-null  int64  
 1   text     29651 non-null  object 
 2   label    29651 non-null  float64
 3   dataset  29651 non-null  object 
dtypes: float64(1), int64(1), object(2)
memory usage: 926.7+ KB


In [23]:
test_df = pd.read_csv(TEST_DATASET_PATH, index_col="Unnamed: 0")

In [24]:
test_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 23005 entries, 11262 to 4827
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   text     23005 non-null  object 
 1   label    23005 non-null  float64
 2   dataset  23005 non-null  object 
dtypes: float64(1), object(2)
memory usage: 718.9+ KB


# Prepare Testing

In [25]:
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

In [26]:
Xval = test_df["text"].tolist()

In [27]:
yval = test_df["label"].tolist()

In [28]:
Xval[0]

'Foarte slab. ca aspect este foarte frumoasa dar cine vrea sa se joace cu tastatura nu poate'

In [29]:
tokenizer.encode(Xval[0], add_special_tokens=True)

[2,
 757,
 7687,
 18,
 407,
 4596,
 439,
 757,
 8680,
 551,
 1005,
 1072,
 442,
 395,
 8726,
 386,
 29132,
 400,
 628,
 3]

In [30]:
yval[0]

0.0

In [31]:
Xt_tokens = tokenizer(Xval, return_tensors="pt", padding=True, truncation=True, max_length=128)

In [32]:
yt_tokens = torch.Tensor(yval)

In [33]:
Xt_tokens

{'input_ids': tensor([[    2,   757,  7687,  ...,     0,     0,     0],
        [    2,   757, 32457,  ...,     0,     0,     0],
        [    2,  4436,   623,  ...,     0,     0,     0],
        ...,
        [    2,   400,   553,  ...,     0,     0,     0],
        [    2,  3602,   874,  ...,  2034,     3,     0],
        [    2,   361,   956,  ...,   983,  1304,     3]]), 'token_type_ids': tensor([[0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        ...,
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 1, 1, 0],
        [1, 1, 1,  ..., 1, 1, 1]])}

In [34]:
# Xval_tokens = tokenizer(Xval,return_tensors="pt", padding=True)

In [35]:
test_dataset = TensorDataset(Xt_tokens["input_ids"], Xt_tokens["token_type_ids"], Xt_tokens["attention_mask"])

In [36]:
test_dataloader = DataLoader(test_dataset, shuffle=True, batch_size=4)

# Testing

In [37]:
model.to(device)

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(50000, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, element

In [38]:
model.eval()

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(50000, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, element

In [39]:
predictions = []

In [40]:
for batch in test_dataloader:
    batch = {
        "input_ids":batch[0].to(device),
        "token_type_ids":batch[1].to(device),
        "attention_mask":batch[2].to(device)
    }
    with torch.no_grad():
        outputs = model(**batch)

    logits = outputs.logits
    predictions.append(logits)

In [41]:
yt_tokens

tensor([0., 1., 0.,  ..., 0., 0., 0.])

In [42]:
true_labels = yt_tokens.tolist()

In [43]:
predictions = [x.tolist() for x in predictions]

In [44]:
predicted_labels = []

In [45]:
for x in predictions:
  predicted_labels.extend(x)

In [46]:
mean_squared_error(predicted_labels, true_labels)

0.338012877004491

In [47]:
mean_absolute_error(predicted_labels, true_labels)

0.49387728054093144

In [48]:
def twofive_round(x):
    return round(x*4)/4

In [49]:
rounded_pred_labels = [twofive_round(x[0]) for x in predicted_labels]

In [50]:
rounded_pred_labels[:10]

[1.0, 0.5, 0.0, 0.5, 0.0, 0.25, 0.0, 0.25, 0.5, 0.25]

In [51]:
true_labels[:10]

[0.0, 1.0, 0.0, 0.0, 1.0, 0.75, 0.0, 1.0, 1.0, 1.0]

In [52]:
def make_class(x_list):
  return [int(4*x) for x in x_list]

In [53]:
accuracy_score(make_class(true_labels), make_class(rounded_pred_labels))

0.14244729406650727