In [1]:
!pip install transformers



In [2]:
import re
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModel, BertConfig
import pandas as pd
from tqdm import tqdm
from sklearn.metrics import f1_score

import copy
import string

In [3]:
# Run on the GPU when CUDA is available; otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
!nvidia-smi

Sun May  8 01:57:55 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.32.03    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla P100-PCIE...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   42C    P0    26W / 250W |      2MiB / 16280MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [4]:
# Checkpoint name of the pretrained encoder; the tokenizer and the weights
# below are both loaded from this same name.
bert_model = "vinai/bertweet-base"
# Alternative checkpoints, kept commented out:
# bert_model = 'bert-base-uncased'
# bert_model = 'bert-large-uncased'
# bert_model = 'google/electra-small-discriminator'
# bert_model = "roberta-base"

# `tokenizer` and `bert` are module-level names used as default arguments by
# the dataset and model classes defined in later cells.
tokenizer = AutoTokenizer.from_pretrained(bert_model)
bert = AutoModel.from_pretrained(bert_model)

emoji is not installed, thus not converting emoticons or emojis into text. Please install emoji: pip3 install emoji
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Some weights of the model checkpoint at vinai/bertweet-base were not used when initializing RobertaModel: ['lm_head.layer_norm.weight', 'lm_head.bias', 'lm_head.layer_norm.bias', 'lm_head.dense.bias', 'lm_head.decoder.bias', 'lm_head.decoder.weight', 'lm_head.dense.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [10]:
class CovidTweetDataset(Dataset):
    """Tab-separated tweet file exposed as a PyTorch ``Dataset``.

    Each item is ``(tweet, input_ids, attention_mask)``: the tweet as stored
    in the ``text`` column, plus the token ids and attention mask of its
    preprocessed form.

    Args:
        path: Location of the tab-delimited file; it must contain a ``text``
            column.
        tokenizer: Callable that turns a string into a mapping with
            ``input_ids`` and ``attention_mask`` entries. Defaults to the
            module-level ``tokenizer``.
    """

    # Built once at class creation instead of once per item.
    _STRIP_CHARS = str.maketrans(dict.fromkeys(string.punctuation + string.digits, ''))
    _URL_RE = re.compile(r'http\S+')
    _NON_ASCII_RE = re.compile(r'[^\x00-\x7F]')

    def __init__(self, path, tokenizer=tokenizer):
        self.df = pd.read_csv(path, delimiter='\t')
        self.tokenizer = tokenizer

    def __len__(self):
        """Number of rows (tweets) in the file."""
        return len(self.df)

    def __getitem__(self, index):
        """Return ``(tweet, input_ids, attention_mask)`` for row ``index``.

        ``index`` is positional (0 .. len-1), which is what a ``DataLoader``
        sampler produces.
        """
        # .iloc is positional, so this stays correct even if the frame's index
        # labels are not 0..n-1 (for the default index it equals .loc).
        tweets = self._as_text(self.df['text'].iloc[index])

        # Pad to the tokenizer's maximum length so every item has the same
        # size and can be stacked into a batch.
        inputs = self.tokenizer(
            self.preprocess(tweets),
            padding='max_length',
            truncation=True,
            return_tensors="pt",
        )

        # A single string was encoded; [0] takes that one sequence.
        input_ids = inputs['input_ids'][0]
        attention_mask = inputs['attention_mask'][0]

        return tweets, input_ids, attention_mask

    @staticmethod
    def _as_text(value):
        """Coerce a cell value to ``str``; a missing cell (NaN/None) becomes ``''``."""
        if isinstance(value, str):
            return value
        return '' if pd.isna(value) else str(value)

    def preprocess(self, text):
        """Normalise a tweet before tokenisation.

        Steps, in order: newlines become spaces, text is lower-cased, ``@``
        signs are dropped (the handle itself is kept), every URL is collapsed
        to the placeholder ``http``, ASCII punctuation and digits are removed,
        and non-ASCII characters are removed. Runs of spaces are left as-is.

        Args:
            text: Raw tweet. Non-string values (e.g. NaN from an empty cell)
                are treated as described in ``_as_text``.

        Returns:
            The cleaned string.
        """
        text = self._as_text(text)
        text = text.replace('\n', ' ').lower()

        # Strip only the '@' marker; the mentioned handle stays in the text.
        text = text.replace('@', '')

        # Collapse each URL to the literal token 'http' (not removed entirely).
        text = self._URL_RE.sub('http', text)

        # Drop ASCII punctuation and digits.
        text = text.translate(self._STRIP_CHARS)

        # Drop anything outside 7-bit ASCII (emoji, curly quotes, ...).
        text = self._NON_ASCII_RE.sub('', text)

        return text

In [6]:
class RumourDetector(nn.Module):
    """Binary classifier: a pretrained encoder followed by an MLP head.

    Args:
        bert: Encoder module. Its output must expose ``last_hidden_state`` and
            it must carry a ``config`` with ``hidden_size``. Defaults to the
            module-level ``bert``.
    """

    def __init__(self, bert=bert):
        super().__init__()
        self.bert_block = bert
        # Take the width from the encoder actually passed in, rather than
        # re-resolving the global checkpoint name through BertConfig; the two
        # can disagree when a different encoder is supplied.
        self.hidden_size = bert.config.hidden_size

        # 0.918
        # self.clf_block = nn.Sequential(
        #     nn.Dropout(0.7),
        #     nn.Linear(self.hidden_size, 1),
        #     nn.Sigmoid(),
        # )

        # 0.93
        # NOTE: the position of each layer inside the Sequential is part of the
        # state_dict key names, so this layout must not be reordered if saved
        # checkpoints are to keep loading.
        self.clf_block = nn.Sequential(
            nn.Linear(self.hidden_size, self.hidden_size),
            nn.Dropout(0.5),
            nn.Linear(self.hidden_size, 256),
            nn.Linear(256, 128),
            nn.Linear(128, 1),
            nn.Sigmoid(),
        )

    def forward(self, tweets_seqs, attn_masks):
        """Score a batch of tokenised tweets.

        Args:
            tweets_seqs: Token-id tensor passed to the encoder as its first
                argument.
            attn_masks: Attention mask passed to the encoder as
                ``attention_mask``.

        Returns:
            ``(probs, preds)``: the flattened sigmoid outputs, and the same
            values thresholded at 0.5 as an int tensor of 0/1.
        """
        encoded = self.bert_block(tweets_seqs, attention_mask=attn_masks)
        # Representation of the first token of every sequence.
        cls_reps = encoded.last_hidden_state[:, 0, :]

        probs = self.clf_block(cls_reps)
        preds = (probs > 0.5).int()

        # No del / torch.cuda.empty_cache() here: locals are released when the
        # call returns, and flushing the allocator cache on every forward pass
        # only slows the loop down.
        return probs.flatten(), preds.flatten()


In [7]:
# Colab-only: mount Google Drive at /content/gdrive/ so the checkpoint and
# data files referenced by later cells are reachable.
from google.colab import drive
drive.mount('/content/gdrive/') 

Drive already mounted at /content/gdrive/; to attempt to forcibly remount, call drive.mount("/content/gdrive/", force_remount=True).


In [None]:
# Rebuild the architecture, then restore the fine-tuned weights from Drive.
model = RumourDetector()
model.load_state_dict(
    torch.load(
        '/content/gdrive/MyDrive/model/tweet_bert_mlp_clf_testf1_0.93.pt',
        # Remap the stored tensors onto `device`, so a checkpoint saved from a
        # GPU also loads when the runtime has fallen back to CPU.
        map_location=device,
    )
)
model.to(device)

In [11]:
# Wrap the COVID tweet file (tab-separated, read from Drive) in the dataset.
covid_set = CovidTweetDataset('/content/gdrive/MyDrive/data/covid.csv')
# Batches of 128; no shuffle argument is given, and num_workers=0 keeps
# loading in the main process.
covid_loader = DataLoader(covid_set, batch_size=128, num_workers=0)

In [13]:
# Run the model over every batch and pair each tweet with its predicted label.
model.eval()  # inference mode for layers such as Dropout
tweets = []
labels = []
with torch.no_grad():  # no autograd graph is needed for prediction
    for texts, inputs, attention_masks in covid_loader:
        _, preds = model(inputs.to(device), attention_masks.to(device))
        labels.extend(preds.tolist())
        tweets.extend(texts)
        # No per-batch del / torch.cuda.empty_cache(): the loop variables are
        # rebound on the next iteration, and flushing the allocator cache each
        # step only costs time.
df = pd.DataFrame({'tweet': tweets, 'prediction': labels})
# df.to_csv('/content/gdrive/MyDrive/data/test.pred.csv', sep=',', index=False, encoding='utf-8')

In [18]:
# Show only the rows whose predicted label is 1 (presumably the "rumour"
# class; confirm against the labels used in training).
df[df.prediction == 1]

Unnamed: 0,tweet,prediction
4,"“If Trump felt comfortable having it here, the...",1
7,KHive has become the most toxic thing on Twitt...,1
16,America has made incredible strides despite wh...,1
18,Walls work! https://t.co/4QMrHfODPM\n@WhiteHou...,1
21,Trump has royally fucked up this country.\n@St...,1
...,...,...
17008,Revealed: leader of group peddling bleach as c...,1
17015,😂😂😂\n\nOnly 632 People Watch Sleepy Joe Biden'...,1
17017,I wonder how many lives could’ve been saved if...,1
17019,Trump just completed the racism trifecta in a ...,1
