# Imports and Env

In [1]:
from google.colab import drive

In [2]:
drive.mount('/content/drive/')

Mounted at /content/drive/


In [3]:
import os

In [4]:
os.chdir('drive/MyDrive/DLNLP-Project')

In [5]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.25.1-py3-none-any.whl (5.8 MB)
[K     |████████████████████████████████| 5.8 MB 5.0 MB/s 
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.6 MB)
[K     |████████████████████████████████| 7.6 MB 55.0 MB/s 
Collecting huggingface-hub<1.0,>=0.10.0
  Downloading huggingface_hub-0.11.1-py3-none-any.whl (182 kB)
[K     |████████████████████████████████| 182 kB 60.0 MB/s 
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.11.1 tokenizers-0.13.2 transformers-4.25.1


In [6]:
from transformers import AutoTokenizer, AutoModel, AutoModelForSequenceClassification
import torch

In [7]:
import pandas as pd

In [8]:
import numpy as np

In [9]:
import seaborn as sns

In [10]:
from torch.utils.data import DataLoader, TensorDataset

In [11]:
from torch.optim import AdamW

In [12]:
from transformers import get_scheduler

In [13]:
from tqdm.auto import tqdm

# Load model

In [14]:
tokenizer = AutoTokenizer.from_pretrained("dumitrescustefan/bert-base-romanian-uncased-v1", do_lower_case=True)
model = AutoModelForSequenceClassification.from_pretrained("dumitrescustefan/bert-base-romanian-uncased-v1", num_labels=1)

Downloading:   0%|          | 0.00/385 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/411k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/500M [00:00<?, ?B/s]

Some weights of the model checkpoint at dumitrescustefan/bert-base-romanian-uncased-v1 were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.bias', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceCla

In [15]:
# tokenize a sentence and run through the model
input_ids = torch.tensor(tokenizer.encode("Acesta este un test.", add_special_tokens=True)).unsqueeze(0)  # Batch size 1
outputs = model(input_ids)

# get encoding
last_hidden_states = outputs[0]  # The last hidden-state is the first element of the output tuple

In [16]:
last_hidden_states.shape

torch.Size([1, 1])

In [17]:
last_hidden_states

tensor([[-0.4116]], grad_fn=<AddmmBackward0>)

# Load Data

In [18]:
TRAIN_DATASET_PATH = "data/train.csv"

In [19]:
TEST_DATASET_PATH = "data/test.csv"

In [20]:
train_df = pd.read_csv(TRAIN_DATASET_PATH)

In [21]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 29651 entries, 0 to 29650
Data columns (total 4 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   index    29651 non-null  int64  
 1   text     29651 non-null  object 
 2   label    29651 non-null  float64
 3   dataset  29651 non-null  object 
dtypes: float64(1), int64(1), object(2)
memory usage: 926.7+ KB


In [22]:
test_df = pd.read_csv(TEST_DATASET_PATH, index_col="Unnamed: 0")

In [23]:
test_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 23005 entries, 11262 to 4827
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   text     23005 non-null  object 
 1   label    23005 non-null  float64
 2   dataset  23005 non-null  object 
dtypes: float64(1), object(2)
memory usage: 718.9+ KB


# Prepare Training

In [24]:
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

In [25]:
Xt = train_df["text"].tolist()

In [26]:
yt = train_df["label"].tolist()

In [27]:
Xval = test_df["text"].tolist()

In [28]:
yval = test_df["label"].tolist()

In [29]:
Xt[0]

'Foarte slab. ca aspect este foarte frumoasa dar cine vrea sa se joace cu tastatura nu poate'

In [30]:
tokenizer.encode(Xt[0], add_special_tokens=True)

[2,
 757,
 7687,
 18,
 407,
 4596,
 439,
 757,
 8680,
 551,
 1005,
 1072,
 442,
 395,
 8726,
 386,
 29132,
 400,
 628,
 3]

In [31]:
yt[0]

0.0

In [32]:
Xt_tokens = tokenizer(Xt[:128], return_tensors="pt", padding=True, truncation=True, max_length=128)

In [33]:
yt_tokens = torch.Tensor(yt[:128])

In [34]:
Xt_tokens

{'input_ids': tensor([[    2,   757,  7687,  ...,     0,     0,     0],
        [    2,   757, 32457,  ...,     0,     0,     0],
        [    2,  4436,   623,  ...,     0,     0,     0],
        ...,
        [    2,   400,  3133,  ...,     0,     0,     0],
        [    2,   757,  7687,  ...,     0,     0,     0],
        [    2,   757, 32457,  ...,     0,     0,     0]]), 'token_type_ids': tensor([[0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        ...,
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]])}

In [35]:
# Xval_tokens = tokenizer(Xval,return_tensors="pt", padding=True)

In [36]:
train_dataset = TensorDataset(Xt_tokens["input_ids"], Xt_tokens["token_type_ids"], Xt_tokens["attention_mask"], yt_tokens)

In [37]:
train_dataloader = DataLoader(train_dataset, shuffle=True, batch_size=4)

# Fine tuning

In [38]:
optimizer = AdamW(model.parameters(), lr=5e-5)

In [39]:
num_epochs = 3

In [40]:
num_training_steps = num_epochs * len(train_dataloader)

In [41]:
lr_scheduler = get_scheduler(
    name="linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps
)

In [42]:
model.to(device)

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(50000, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, element

In [43]:
model.train()

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(50000, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, element

In [44]:
progress_bar = tqdm(range(num_training_steps))
for epoch in range(num_epochs):
    for batch in train_dataloader:
        batch = {
            "input_ids":batch[0].to(device),
            "token_type_ids":batch[1].to(device),
            "attention_mask":batch[2].to(device),
            "labels":batch[3].to(device),
        }
        outputs = model(**batch)
        loss = outputs.loss
        loss.backward()

        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        progress_bar.update(1)

  0%|          | 0/96 [00:00<?, ?it/s]

# Save Model

In [47]:
model.save_pretrained("models/transformers_test")

In [48]:
AutoModelForSequenceClassification.from_pretrained("models/transformers_test")

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(50000, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, element