In [1]:
!pip install transformers
!pip install emoji
# !pip install cloud-tpu-client==0.10 torch==1.10.0 https://storage.googleapis.com/tpu-pytorch/wheels/colab/torch_xla-1.11-cp37-cp37m-linux_x86_64.whl



In [2]:
import re
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModel, BertConfig
import pandas as pd
from tqdm import tqdm
from sklearn.metrics import f1_score

import copy

# import torch_xla
# import torch_xla.core.xla_model as xm
# import torch_xla.distributed.parallel_loader as pl
# import torch_xla.distributed.xla_multiprocessing as xmp
# import torch_xla.utils.utils as xu

In [3]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# torch.cuda.empty_cache()

In [4]:
!nvidia-smi

Sun May  8 08:18:25 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.32.03    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla P100-PCIE...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   51C    P0    29W / 250W |      2MiB / 16280MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [5]:
print(torch.cuda.memory_allocated())
print(torch.cuda.memory_reserved())

0
0


In [6]:
bert_model = "vinai/bertweet-base"
# bert_model = 'bert-base-uncased'
# bert_model = 'bert-large-uncased'
# bert_model = 'google/electra-small-discriminator'
# bert_model = "roberta-base"

tokenizer = AutoTokenizer.from_pretrained(bert_model)
bert = AutoModel.from_pretrained(bert_model)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Some weights of the model checkpoint at vinai/bertweet-base were not used when initializing RobertaModel: ['lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.bias', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.decoder.bias', 'lm_head.decoder.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [7]:
class TweetDataset(Dataset):

    def __init__(self, path, tokenizer=tokenizer, is_test=False):

        self.df = pd.read_csv(path, delimiter = '\t')
        self.tokenizer = tokenizer
        self.is_test = is_test

    def __len__(self):
        return len(self.df)

    def __getitem__(self, index):
        
        tweets = self.df.loc[index, 'text']
        
        tweets = self.preprocess(tweets)
        inputs = self.tokenizer(tweets, padding='max_length', truncation=True, return_tensors="pt")
        
        input_ids = inputs['input_ids'][0]
        attention_mask = inputs['attention_mask'][0]

        if not self.is_test:
            return input_ids, attention_mask, self.df.loc[index, 'label']
        else:
            return input_ids, attention_mask
    
    def preprocess(self, text):
        text = text.replace('\n', '')
        # text = text.replace('\n', '</s>')
        # text = re.sub(r'https?://t.co/[a-zA-Z0-9]+', '', text)

        return text

In [8]:
class RumourDetector(nn.Module):
    def __init__(self, bert=bert):
        super(RumourDetector, self).__init__()
        self.bert_block = bert
        self.hidden_size = BertConfig.from_pretrained(bert_model).hidden_size

        # 0.918
        # self.clf_block = nn.Sequential(
        #     nn.Dropout(0.7),
        #     nn.Linear(self.hidden_size, 1),
        #     nn.Sigmoid(),
        # )

        # 0.93
        # self.clf_block = nn.Sequential(
        #     nn.Linear(self.hidden_size, self.hidden_size),
        #     nn.Dropout(0.5),
        #     nn.Linear(self.hidden_size, 256),
        #     nn.Linear(256, 128),
        #     nn.Linear(128, 1),
        #     nn.Sigmoid(),
        # )

        self.clf_block = nn.Sequential(
            nn.Linear(self.hidden_size, self.hidden_size),
            nn.Dropout(0.7),
            nn.Linear(self.hidden_size, 256),
            nn.Linear(256, 128),
            nn.Linear(128, 1),
            nn.Sigmoid(),
        )

    def forward(self, tweets_seqs, attn_masks):
        cls_reps = self.bert_block(tweets_seqs, attention_mask=attn_masks).last_hidden_state[:, 0, :]

        # means = cls_reps.mean(dim=1, keepdim=True)
        # stds = cls_reps.std(dim=1, keepdim=True)
        # cls_reps = (cls_reps - means) / stds

        probs = self.clf_block(cls_reps)

        preds = (probs > 0.5).int()

        del tweets_seqs, cls_reps
        torch.cuda.empty_cache()

        return probs.flatten(), preds.flatten()


In [9]:
def train(train_status, model, optim, epoch_size, train_loader, valid_loader):
    loss_fn = nn.BCELoss()
    
    # max_valid_f1 = 0
    for epoch in range(epoch_size):
        model.train()
        epoch_loss = 0
        epoch_acc = 0
        train_loop = tqdm(enumerate(train_loader), total=len(train_loader))
        train_loop.set_description(f"Epoch [{epoch+1}/{epoch_size}]")

        for batch, (tweets_seqs, attention_masks, labels) in train_loop:
            tweets_seqs = tweets_seqs.to(device)
            attention_masks = attention_masks.to(device)
            labels = labels.float().to(device)
            probs, preds = model(tweets_seqs, attention_masks)
            loss = F.binary_cross_entropy(probs, labels)

            optim.zero_grad()
            loss.backward()
            optim.step() 

            epoch_loss += loss.item()
            epoch_acc += (preds == labels).float().mean().item()
            train_loop.set_postfix_str(
                'train_loss={:.5f}, train_acc={:.5f}'.format(
                    epoch_loss/(batch+1), epoch_acc/(batch+1)
                )
            )

            del tweets_seqs, attention_masks, labels
            torch.cuda.empty_cache()
        
            if batch == len(train_loader)-1:
                valid_acc, valid_f1 = validate(model, valid_loader)
                # if valid_f1 > max_valid_f1:
                    # max_valid_f1 = valid_f1
                train_status['checkpoint'][epoch] = copy.deepcopy(model.state_dict())
                train_status['valid_acc'].append(valid_acc)
                train_status['valid_f1'].append(valid_f1)
                train_status['train_loss'].append(epoch_loss/(batch+1))
                train_status['train_acc'].append(epoch_acc/(batch+1))
                train_loop.set_postfix_str(
                    'train_loss={:.5f}, train_acc={:.5f}, valid_acc={:.5f}, valid_f1={:.5f}'.format(
                        train_status['train_loss'][-1],
                        train_status['train_acc'][-1],
                        train_status['valid_acc'][-1],
                        train_status['valid_f1'][-1]
                    )
                )

    train_status['checkpoint']['train_status'] = train_status

def validate(model, valid_loader):
    model.eval()
    acc = 0
    tp, fp, fn = 0, 0, 0
    with torch.no_grad():
        for batch, (inputs, attention_masks, labels) in enumerate(valid_loader):
            inputs = inputs.to(device)
            attention_masks = attention_masks.to(device)
            labels = labels.int().to(device)
            _, preds = model(inputs, attention_masks)
            

            confusion_vector = preds / labels
            tp += torch.sum(confusion_vector == 1).item()
            fp += torch.sum(confusion_vector == float('inf')).item()
            fn += torch.sum(confusion_vector == 0).item()

            acc += (preds == labels).float().mean()
            del inputs, attention_masks, labels, preds
            torch.cuda.empty_cache()

        if (tp + fp == 0):
            precision = 0
        else:
            precision = tp / (tp + fp)
        
        if (tp + fn == 0):
            recall = 0
        else:
            recall = tp / (tp + fn)
        
        if (precision + recall == 0):
            f1 = 0
        else:
            f1 = (2 * precision * recall) / (precision + recall)

    return acc / len(valid_loader), f1

In [10]:
from google.colab import drive
drive.mount('/content/gdrive/') 

Drive already mounted at /content/gdrive/; to attempt to forcibly remount, call drive.mount("/content/gdrive/", force_remount=True).


In [11]:
epoch_size = 20
batch_size = 4
lr = 2e-5

train_set = TweetDataset('/content/gdrive/MyDrive/data/train.csv')
train_loader = DataLoader(train_set, batch_size=batch_size, shuffle=True, num_workers=0)

valid_set = TweetDataset('/content/gdrive/MyDrive/data/dev.csv')
valid_loader = DataLoader(valid_set, batch_size=1, shuffle=True, num_workers=0)

In [12]:
model = RumourDetector().to(device)
optim = torch.optim.AdamW(model.parameters(), lr=lr)
train_status = {'train_loss': [], 'train_acc': [], 'valid_acc': [], 'valid_f1': [], 
                    'checkpoint': {}}
train(train_status, model, optim, epoch_size, train_loader, valid_loader)

You are using a model of type roberta to instantiate a model of type bert. This is not supported for all configurations of models and can yield errors.
Epoch [1/20]: 100%|██████████| 452/452 [01:02<00:00,  7.25it/s, train_loss=0.51240, train_acc=0.76825, valid_acc=0.77250, valid_f1=0.00000]
Epoch [2/20]: 100%|██████████| 452/452 [01:01<00:00,  7.33it/s, train_loss=0.42994, train_acc=0.79701, valid_acc=0.83701, valid_f1=0.52941]
Epoch [3/20]: 100%|██████████| 452/452 [00:58<00:00,  7.79it/s, train_loss=0.28473, train_acc=0.88993, valid_acc=0.88285, valid_f1=0.67606]
Epoch [4/20]: 100%|██████████| 452/452 [00:58<00:00,  7.75it/s, train_loss=0.16148, train_acc=0.94414, valid_acc=0.92699, valid_f1=0.84249]
Epoch [5/20]: 100%|██████████| 452/452 [00:58<00:00,  7.71it/s, train_loss=0.10222, train_acc=0.96958, valid_acc=0.90662, valid_f1=0.76190]
Epoch [6/20]: 100%|██████████| 452/452 [00:58<00:00,  7.74it/s, train_loss=0.08710, train_acc=0.97345, valid_acc=0.92020, valid_f1=0.82528]
Epoch [7

In [13]:
torch.save(train_status['checkpoint'][12], '/content/gdrive/MyDrive/model/tweet_bert_mlp_clf.pt')
# torch.save(train_status['checkpoint'], '/content/gdrive/MyDrive/model/train_status.pt')

In [14]:
model = RumourDetector()
model.load_state_dict(torch.load('/content/gdrive/MyDrive/model/tweet_bert_mlp_clf.pt'))
model.to(device)

You are using a model of type roberta to instantiate a model of type bert. This is not supported for all configurations of models and can yield errors.


RumourDetector(
  (bert_block): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(64001, 768, padding_idx=1)
      (position_embeddings): Embedding(130, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0): RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm

In [15]:
def test(model, test_loader):
    model.eval()
    labels = []
    with torch.no_grad():
        for batch, (inputs, attention_masks) in enumerate(test_loader):
            inputs = inputs.to(device)
            attention_masks = attention_masks.to(device)
            _, preds = model(inputs, attention_masks)
            preds = preds.tolist()
            labels.extend(preds)
            del inputs, attention_masks, preds
            torch.cuda.empty_cache()
    df = pd.DataFrame({'Id': list(range(0, len(test_loader))), 'Predicted': labels})
    df.to_csv('/content/gdrive/MyDrive/data/test.pred.csv', sep=',', index=False, encoding='utf-8')

In [16]:
test_set = TweetDataset('/content/gdrive/MyDrive/data/test.csv', is_test=True)
test_loader = DataLoader(test_set, batch_size=1, shuffle=False, num_workers=0)
test(model, test_loader=test_loader)