In [1]:
import pandas as pd
import numpy as np
import re
import torch
import torch.nn as nn

from transformers import BertTokenizer, BertModel, AdamW, get_linear_schedule_with_warmup
from sklearn.model_selection import train_test_split
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler

from tqdm import tqdm
from sklearn.metrics import classification_report

In [2]:
# Merge the provided train and test CSVs into one frame (it is re-split further down).
# ignore_index=True gives the combined frame a fresh 0..n-1 index; without it the
# row labels of the second file repeat those of the first, so label-based lookups
# on `df` would be ambiguous.
df = pd.concat(
    [pd.read_csv('./binary_train.csv'), pd.read_csv('./binary_test.csv')],
    ignore_index=True,
)

In [3]:
# Preprocessing
def preprocess_text(text):
    """Normalize a raw string for tokenization.

    The text is lowercased, then every character that is neither an ASCII
    letter nor whitespace (digits, punctuation, symbols, ...) is deleted.
    Whitespace itself is kept unchanged.

    Args:
        text: The string to clean.

    Returns:
        The cleaned, lowercased string.
    """
    lowered = text.lower()
    return re.sub(r'[^a-zA-Z\s]', '', lowered)

In [4]:
# Run the cleaning function over every entry of the raw `text` column and
# store the result next to it as `clean_text`.
df['clean_text'] = df['text'].map(preprocess_text)

In [5]:
# Hold out 20% of the rows for evaluation; the fixed random_state keeps the
# split identical across runs.
texts_all = df['clean_text']
targets_all = df['is_persuasion']
X_train, X_test, y_train, y_test = train_test_split(
    texts_all,
    targets_all,
    test_size=0.2,
    random_state=42,
)

In [6]:
# Load the BERT tokenizer.
# No standalone BertModel is created here: BERTClassifier loads its own encoder,
# and the bare model this cell used to bind to `model` was never used before
# `model` was rebound to the classifier, so it only cost an extra weight load.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [7]:
# Tokenize and encode the training data
# padding='max_length' replaces the deprecated pad_to_max_length flag, and
# truncation=True makes the cut-off at max_length explicit instead of relying
# on the tokenizer's warned-about implicit fallback.
encoded_data_train = tokenizer.batch_encode_plus(
    X_train.tolist(),
    add_special_tokens=True,
    return_attention_mask=True,
    padding='max_length',
    truncation=True,
    max_length=256,
    return_tensors='pt'
)

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


In [8]:
# Tokenize and encode the test data
# padding='max_length' replaces the deprecated pad_to_max_length flag, and
# truncation=True makes the cut-off at max_length explicit instead of relying
# on the tokenizer's implicit fallback.
encoded_data_test = tokenizer.batch_encode_plus(
    X_test.tolist(),
    add_special_tokens=True,
    return_attention_mask=True,
    padding='max_length',
    truncation=True,
    max_length=256,
    return_tensors='pt'
)

In [9]:
# Gather the per-split tensors (token ids, attention masks, labels) and bundle
# each split into a TensorDataset so a DataLoader can batch them together.

# --- training split ---
labels_train = torch.tensor(y_train.tolist())
attention_masks_train = encoded_data_train['attention_mask']
input_ids_train = encoded_data_train['input_ids']
dataset_train = TensorDataset(input_ids_train, attention_masks_train, labels_train)

# --- test split ---
labels_test = torch.tensor(y_test.tolist())
attention_masks_test = encoded_data_test['attention_mask']
input_ids_test = encoded_data_test['input_ids']
dataset_test = TensorDataset(input_ids_test, attention_masks_test, labels_test)


In [10]:
# Mini-batch size shared by the training and evaluation loaders
batch_size = 16

# Training batches are drawn in random order
train_sampler = RandomSampler(dataset_train)
dataloader_train = DataLoader(dataset_train, sampler=train_sampler, batch_size=batch_size)

# Evaluation batches follow the dataset order
test_sampler = SequentialSampler(dataset_test)
dataloader_test = DataLoader(dataset_test, sampler=test_sampler, batch_size=batch_size)


In [11]:
# b = BertModel.from_pretrained('bert-base-uncased')

In [12]:
# output = b(input_ids, attention_mask)

In [13]:
# Define BERT-based classifier model
class BERTClassifier(nn.Module):
    """Binary classifier: a pretrained BERT encoder plus a small feed-forward head.

    The head is ``Dropout -> Linear(768, 256) -> ReLU -> Linear(256, 1) -> Sigmoid``.
    The ReLU between the two linear layers is what makes the 256-unit hidden
    layer useful: two linear layers applied back to back with nothing in
    between are equivalent to a single linear map.
    """

    def __init__(self):
        super().__init__()
        self.bert = BertModel.from_pretrained('bert-base-uncased')
        self.dropout = nn.Dropout(0.1)
        self.fc1 = nn.Linear(768, 256)
        self.fc2 = nn.Linear(256, 1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, input_ids, attention_mask):
        """Return one sigmoid-activated score per input sequence.

        Args:
            input_ids: Token ids passed through to the BERT encoder.
            attention_mask: Attention mask passed through to the BERT encoder.

        Returns:
            Tensor whose last dimension has size 1, holding values in (0, 1)
            (presumably shape ``(batch, 1)`` - confirm against the caller).
        """
        output = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        # Element 1 of the BertModel output is the pooled sequence representation
        x = self.dropout(output[1])
        # Non-linearity between the two linear layers (functional form, so the
        # module's state_dict keys stay exactly as before)
        x = torch.relu(self.fc1(x))
        x = self.fc2(x)
        return self.sigmoid(x)

In [14]:
# Select the compute device first: the GPU when CUDA is usable, else the CPU
cuda_ok = torch.cuda.is_available()
device = torch.device('cuda' if cuda_ok else 'cpu')

# Build the classifier and move its parameters onto that device
model = BERTClassifier()
model.to(device)


BERTClassifier(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_af

In [15]:
# Define optimizer and learning rate scheduler
# torch.optim.AdamW is used in place of the deprecated transformers.AdamW.
# weight_decay is pinned to 0.0 because torch's default (0.01) differs from the
# 0.0 default of the transformers implementation this replaces.
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5, eps=1e-8, weight_decay=0.0)
epochs = 3
# The scheduler is stepped once per batch, so it spans every batch of every epoch
total_training_steps = len(dataloader_train) * epochs
scheduler = get_linear_schedule_with_warmup(optimizer,
                                            num_warmup_steps=0,
                                            num_training_steps=total_training_steps)




In [16]:
# Binary cross-entropy on probabilities (the classifier already applies a
# sigmoid), averaged over the elements of the batch.
loss_fn = nn.BCELoss(reduction='mean')

In [None]:

# Training loop
for epoch in tqdm(range(epochs), total=epochs):
    model.train()
    for batch in tqdm(dataloader_train, desc="Training"):
        # Each batch is (input_ids, attention_mask, labels) as packed by the TensorDataset
        input_ids = batch[0].to(device)
        attention_mask = batch[1].to(device)
        labels = batch[2].to(device)

        model.zero_grad()
        outputs = model(input_ids, attention_mask)
        # Squeeze only the trailing size-1 dimension. A bare squeeze() would also
        # collapse the batch dimension when the last batch holds a single sample,
        # leaving a 0-d tensor that no longer matches the (1,) label tensor and
        # making BCELoss raise.
        loss = loss_fn(outputs.squeeze(-1), labels.float())
        loss.backward()
        optimizer.step()
        scheduler.step()

  0%|          | 0/3 [00:00<?, ?it/s]
Training:   0%|          | 0/433 [00:00<?, ?it/s][A
Training:   0%|          | 1/433 [00:48<5:50:04, 48.62s/it][A
Training:   0%|          | 2/433 [01:30<5:22:39, 44.92s/it][A
Training:   1%|          | 3/433 [02:11<5:08:06, 42.99s/it][A
Training:   1%|          | 4/433 [02:51<4:57:50, 41.66s/it][A
Training:   1%|          | 5/433 [03:29<4:48:41, 40.47s/it][A
Training:   1%|▏         | 6/433 [04:08<4:43:13, 39.80s/it][A