In [13]:
import transformers
from transformers import BertModel, BertTokenizer, AdamW, get_linear_schedule_with_warmup
import torch
from torch import nn
from torch.utils.data import Dataset
import pandas as pd
from torch.utils.data import DataLoader
from torch import optim
import torch.nn.functional as F
from sklearn.model_selection import train_test_split

## Dataset

In [14]:
PRE_TRAINED_MODEL_NAME = 'bert-base-cased'
MAX_LEN = 512
BATCH_SIZE = 10
RANDOM_SEED = 42

In [15]:
tokenizer = BertTokenizer.from_pretrained(PRE_TRAINED_MODEL_NAME)

In [16]:
stances_headlines =  pd.read_csv('train_stances.csv')
bodies = pd.read_csv('train_bodies.csv')
stances_bodies = stances_headlines.merge(bodies,on='Body ID')

In [17]:
class StancesDataset(Dataset):
    def __init__(self, headlines, bodies, stances, tokenizer, max_len):
        self.headlines = headlines
        self.bodies = bodies
        self.stances = stances
        self.tokenizer = tokenizer
        self.max_len = max_len
        self.categories = {"unrelated": 0, "agree": 1, "discuss": 2, "disagree": 3}


    def __len__(self):
        return len(self.headlines)

    def __getitem__(self, idx):
        headline = self.headlines[idx]
        body = self.bodies[idx]
        stance = self.stances[idx]
        stance_label = self.categories[stance]
        print(stance_label)

        inputs = self.tokenizer.encode_plus(
            headline,
            body,
            add_special_tokens=True,
            max_length=self.max_len,
            pad_to_max_length=True,
            return_token_type_ids=True
        )
        
        ids = inputs['input_ids']
        mask = inputs['attention_mask']
        token_type_ids = inputs["token_type_ids"]
        return {
            'input_ids': torch.tensor(ids, dtype=torch.long),
            'attention_mask': torch.tensor(mask, dtype=torch.long),
            'token_type_ids': torch.tensor(token_type_ids, dtype=torch.long),
            "labels": torch.tensor([stance_label], dtype=torch.long) 
        }

## Model

In [18]:
class StanceDetectionModel(nn.Module):
    def __init__(self, n_classes):
        super().__init__()
        self.bert = BertModel.from_pretrained(PRE_TRAINED_MODEL_NAME)
        self.l1 = torch.nn.Linear(self.bert.config.hidden_size, n_classes)

    def forward(self, input_ids, attention_mask, token_type_ids):
        _,pooled_output = self.bert(input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids,
                                 return_dict=False)
        print('pooled output', pooled_output)
        output = self.l1(pooled_output)
        return F.softmax(output,dim=1)

## Training

In [40]:
def train(batch_size, data, model, learning_rate=0.0001, device='cpu'):
    loss_fn = nn.CrossEntropyLoss().to(device)
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)
    model.train().to(device)
    num_correct_predictions = 0
    num_samples = len(data)
    training_loss = []

    for i, input_data in enumerate(data):
        print('Batch #', i)
        input_ids = input_data['input_ids'].to(device)
        attention_mask = input_data['attention_mask'].to(device)
        token_type_ids = input_data['token_type_ids'].to(device)
        labels = input_data['labels'].to(device).squeeze()
        output = model(input_ids, attention_mask, token_type_ids)
        preds = torch.argmax(output, dim=1)
        num_correct_predictions += torch.sum(preds == labels)
        loss = loss_fn(output, labels)
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()
        training_loss.append(loss.item())
    return num_correct_predictions.item()/len(data), np.mean(training_loss)


In [41]:
def create_data_loader(df, tokenizer, max_len, batch_size):
    ds = StancesDataset(df['Headline'].to_numpy(),
                        df['articleBody'].to_numpy(),
                        df['Stance'].to_numpy(),
                        tokenizer,
                        max_len)


    return DataLoader(
        ds,
        batch_size=batch_size
  )

df_train, df_test = train_test_split(stances_bodies, test_size=0.1, random_state=RANDOM_SEED)
df_val, df_test = train_test_split(df_test, test_size=0.5, random_state=RANDOM_SEED)
train_dataloader = create_data_loader(df_train, tokenizer, MAX_LEN, BATCH_SIZE)
val_dataloader = create_data_loader(df_val, tokenizer, MAX_LEN, BATCH_SIZE)
test_dataloader = create_data_loader(df_test, tokenizer, MAX_LEN, BATCH_SIZE)

In [42]:
epochs = 1
batch_size=10
model = StanceDetectionModel(n_classes=4)
for i  in range(epochs):
    accuracy, mean_loss = train(batch_size,train_dataloader, model)
    print(f'Training accuracy at epoch {i} is {accuracy}')
    print(f'Mean loss at epoch {i} is {mean_loss}')

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the retur

0
0
2
2
0
0
0
2
0
3
Batch # 0
pooled output tensor([[ 0.1037,  0.0398,  0.8267,  ...,  0.9604, -0.8614,  0.2715],
        [-0.0373,  0.0370,  0.9294,  ...,  0.9765, -0.8365,  0.5369],
        [-0.9572,  0.8552,  1.0000,  ...,  1.0000,  0.9836,  0.9970],
        ...,
        [-0.9621,  0.8896,  1.0000,  ...,  1.0000,  0.9946,  0.9977],
        [-0.9254,  0.8219,  1.0000,  ...,  1.0000,  0.9360,  0.9959],
        [-0.9064,  0.8377,  1.0000,  ...,  1.0000,  0.9603,  0.9926]],
       grad_fn=<TanhBackward0>)


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


0
0
2
0
0
0
0
0
0
0
Batch # 1
pooled output tensor([[-0.9261,  0.7486,  1.0000,  ...,  1.0000,  0.9276,  0.9961],
        [-0.8827,  0.6432,  1.0000,  ...,  1.0000,  0.7557,  0.9920],
        [-0.9618,  0.9024,  1.0000,  ...,  1.0000,  0.9940,  0.9985],
        ...,
        [-0.9710,  0.8754,  1.0000,  ...,  1.0000,  0.9966,  0.9989],
        [-0.9741,  0.9003,  1.0000,  ...,  1.0000,  0.9943,  0.9991],
        [-0.9867,  0.9194,  1.0000,  ...,  1.0000,  0.9992,  0.9992]],
       grad_fn=<TanhBackward0>)


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


0
0
0
0
0
1
0
0
0
2
Batch # 2
pooled output tensor([[-0.9419,  0.7415,  1.0000,  ...,  1.0000,  0.9501,  0.9977],
        [-0.9587,  0.7249,  1.0000,  ...,  1.0000,  0.9513,  0.9951],
        [-0.9623,  0.7934,  1.0000,  ...,  1.0000,  0.9919,  0.9996],
        ...,
        [-0.9501,  0.7590,  1.0000,  ...,  1.0000,  0.9774,  0.9990],
        [-0.9564,  0.8615,  1.0000,  ...,  1.0000,  0.9878,  0.9988],
        [-0.9691,  0.8730,  1.0000,  ...,  1.0000,  0.9854,  0.9981]],
       grad_fn=<TanhBackward0>)


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


1
0
0
0
2
0
0
0
0
2
Batch # 3
pooled output tensor([[-0.9589,  0.7614,  1.0000,  ...,  1.0000,  0.9887,  0.9981],
        [-0.7548,  0.3109,  0.9998,  ...,  0.9999,  0.1014,  0.9896],
        [-0.8522,  0.3247,  0.9999,  ...,  1.0000,  0.1461,  0.9934],
        ...,
        [-0.7301,  0.3319,  0.9999,  ...,  1.0000,  0.3705,  0.9959],
        [-0.7710,  0.2339,  0.9998,  ...,  1.0000, -0.2915,  0.9932],
        [-0.9198,  0.4120,  1.0000,  ...,  1.0000,  0.8833,  0.9969]],
       grad_fn=<TanhBackward0>)


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pai

0
0
0
0
0
0
0
2
0
0
Batch # 4
pooled output tensor([[-0.8792,  0.3361,  0.9999,  ...,  1.0000,  0.6291,  0.9981],
        [-0.8911,  0.1503,  0.9999,  ...,  1.0000,  0.9042,  0.9978],
        [-0.8912,  0.2872,  0.9999,  ...,  1.0000,  0.8233,  0.9974],
        ...,
        [-0.9260,  0.3662,  1.0000,  ...,  1.0000,  0.9329,  0.9983],
        [-0.8908,  0.2173,  0.9999,  ...,  1.0000,  0.8463,  0.9992],
        [-0.8204,  0.0610,  0.9997,  ...,  0.9999,  0.7661,  0.9940]],
       grad_fn=<TanhBackward0>)


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


3
0
0
0
0
0
0
0
0
0
Batch # 5
pooled output tensor([[-0.8616, -0.1285,  0.9986,  ...,  0.9996,  0.9808,  0.9948],
        [-0.8837, -0.0681,  0.9996,  ...,  0.9999,  0.9741,  0.9979],
        [-0.8340,  0.0742,  0.9999,  ...,  1.0000,  0.9808,  0.9984],
        ...,
        [-0.8735, -0.1130,  0.9996,  ...,  0.9999,  0.9723,  0.9974],
        [-0.8863,  0.2427,  1.0000,  ...,  1.0000,  0.9935,  0.9993],
        [-0.7920, -0.1607,  0.9994,  ...,  0.9998,  0.9795,  0.9960]],
       grad_fn=<TanhBackward0>)


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


0
0
1
0
0
0
0
2


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


2
2
Batch # 6
pooled output tensor([[-0.8480, -0.1578,  0.9992,  ...,  0.9998,  0.9480,  0.9986],
        [-0.6945, -0.1061,  0.9994,  ...,  0.9998,  0.8811,  0.9988],
        [-0.8050, -0.1161,  0.9995,  ...,  0.9999,  0.9584,  0.9990],
        ...,
        [-0.8904,  0.1069,  0.9996,  ...,  0.9999,  0.9557,  0.9980],
        [-0.8450,  0.1204,  0.9998,  ...,  1.0000,  0.8657,  0.9995],
        [-0.8293, -0.0658,  0.9997,  ...,  0.9999,  0.8939,  0.9991]],
       grad_fn=<TanhBackward0>)


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


0
0
2
1
1
2
2
0
2
1
Batch # 7
pooled output tensor([[-7.8954e-01, -7.1935e-02,  9.9929e-01,  ...,  9.9988e-01,
          6.0853e-01,  9.9946e-01],
        [-7.0135e-01, -3.7250e-02,  9.9830e-01,  ...,  9.9977e-01,
          5.3845e-01,  9.9833e-01],
        [-7.4518e-01, -7.4518e-02,  9.9954e-01,  ...,  9.9992e-01,
          8.0114e-01,  9.9923e-01],
        ...,
        [-6.5608e-01, -9.1109e-02,  9.9452e-01,  ...,  9.9886e-01,
          6.8177e-01,  9.9758e-01],
        [-8.0949e-01,  6.3166e-02,  9.9992e-01,  ...,  9.9999e-01,
          9.1782e-01,  9.9967e-01],
        [-8.1969e-01,  4.6320e-04,  9.9967e-01,  ...,  9.9992e-01,
          9.0325e-01,  9.9949e-01]], grad_fn=<TanhBackward0>)


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


0
0
0
0
0
1
0
1
0
0
Batch # 8
pooled output tensor([[-0.8426,  0.0031,  0.9997,  ...,  0.9999,  0.4009,  0.9995],
        [-0.8445, -0.0290,  0.9992,  ...,  0.9998,  0.6790,  0.9992],
        [-0.7967, -0.0739,  0.9982,  ...,  0.9997,  0.5313,  0.9989],
        ...,
        [-0.9009,  0.2539,  0.9998,  ...,  0.9999,  0.6752,  0.9997],
        [-0.8372,  0.1102,  0.9997,  ...,  0.9999,  0.7119,  0.9996],
        [-0.7752,  0.1151,  0.9989,  ...,  0.9998,  0.2693,  0.9995]],
       grad_fn=<TanhBackward0>)


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pai

0
0
0
0
0
0
1
0
0
2
Batch # 9
pooled output tensor([[-0.7824,  0.1822,  0.9990,  ...,  0.9998,  0.6178,  0.9983],
        [-0.7550,  0.1738,  0.9990,  ...,  0.9999,  0.3547,  0.9995],
        [-0.7831, -0.0273,  0.9990,  ...,  0.9998,  0.6928,  0.9992],
        ...,
        [-0.8453,  0.1784,  0.9996,  ...,  0.9999,  0.3646,  0.9996],
        [-0.8004,  0.0811,  0.9997,  ...,  0.9999,  0.5020,  0.9994],
        [-0.8894,  0.2987,  0.9998,  ...,  1.0000,  0.6794,  0.9996]],
       grad_fn=<TanhBackward0>)


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pai

0
2
2
0
0
2
0
0
0
0
Batch # 10
pooled output tensor([[-0.8911,  0.1204,  0.9998,  ...,  1.0000,  0.7748,  0.9996],
        [-0.8977,  0.3019,  0.9998,  ...,  0.9999,  0.6752,  0.9992],
        [-0.8778,  0.2068,  0.9998,  ...,  1.0000,  0.8532,  0.9997],
        ...,
        [-0.7653,  0.2389,  0.9989,  ...,  0.9999,  0.5381,  0.9992],
        [-0.8501,  0.1349,  0.9998,  ...,  1.0000,  0.7135,  0.9996],
        [-0.8700,  0.1236,  0.9998,  ...,  0.9999,  0.8288,  0.9995]],
       grad_fn=<TanhBackward0>)


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pai

0
1
1
0
0
0
0
1
0
0
Batch # 11
pooled output tensor([[-0.8716,  0.2865,  0.9998,  ...,  1.0000,  0.7128,  0.9995],
        [-0.9304,  0.4074,  0.9999,  ...,  1.0000,  0.7721,  0.9997],
        [-0.9292,  0.4133,  1.0000,  ...,  1.0000,  0.8715,  0.9997],
        ...,
        [-0.8986,  0.2149,  0.9997,  ...,  0.9999,  0.8858,  0.9981],
        [-0.8675,  0.1984,  0.9998,  ...,  0.9999,  0.3797,  0.9992],
        [-0.8787,  0.1824,  0.9998,  ...,  0.9999,  0.6140,  0.9997]],
       grad_fn=<TanhBackward0>)


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


0
2
1
0
0
0
0
0
0
0
Batch # 12
pooled output tensor([[-0.9073,  0.3027,  0.9999,  ...,  1.0000,  0.6604,  0.9997],
        [-0.9163,  0.3473,  0.9999,  ...,  1.0000,  0.8487,  0.9996],
        [-0.8979,  0.3310,  1.0000,  ...,  1.0000,  0.9431,  0.9997],
        ...,
        [-0.9107,  0.4306,  0.9999,  ...,  1.0000,  0.7529,  0.9993],
        [-0.9149,  0.2578,  0.9999,  ...,  1.0000,  0.7848,  0.9996],
        [-0.8831,  0.2863,  0.9999,  ...,  1.0000,  0.5975,  0.9996]],
       grad_fn=<TanhBackward0>)


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


0
0
0
0
0
0
0
0
2
0
Batch # 13
pooled output tensor([[-0.8688,  0.3962,  0.9998,  ...,  0.9999,  0.4051,  0.9994],
        [-0.8819,  0.3758,  0.9997,  ...,  0.9999,  0.4925,  0.9994],
        [-0.8834,  0.3145,  0.9999,  ...,  1.0000,  0.6090,  0.9997],
        ...,
        [-0.8892,  0.4213,  0.9998,  ...,  1.0000,  0.3948,  0.9997],
        [-0.9120,  0.4251,  0.9999,  ...,  1.0000,  0.7716,  0.9998],
        [-0.9153,  0.4886,  1.0000,  ...,  1.0000,  0.6212,  0.9998]],
       grad_fn=<TanhBackward0>)


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pai

0
2
0
0
0
2
3
3
0
2
Batch # 14
pooled output tensor([[-0.9060,  0.2048,  0.9998,  ...,  1.0000,  0.5192,  0.9996],
        [-0.9374,  0.4360,  1.0000,  ...,  1.0000,  0.7055,  0.9996],
        [-0.8868,  0.3347,  0.9999,  ...,  1.0000,  0.1463,  0.9997],
        ...,
        [-0.9068,  0.4089,  0.9999,  ...,  1.0000,  0.6905,  0.9997],
        [-0.9137,  0.3917,  0.9999,  ...,  1.0000,  0.6101,  0.9995],
        [-0.9276,  0.3491,  0.9999,  ...,  1.0000,  0.6802,  0.9998]],
       grad_fn=<TanhBackward0>)


KeyboardInterrupt: 