In [1]:
import os
# Restrict this process to GPU index 3 (for hosts with several GPUs).
# Set here, in the first cell, before torch is imported in the next cell.
os.environ.update({"CUDA_VISIBLE_DEVICES": "3"})

In [2]:
import numpy as np
import pandas as pd
from collections import deque

import torch
from sklearn.model_selection import train_test_split
import torch.nn as nn
from torch.utils.data import TensorDataset, DataLoader, SequentialSampler
from transformers import BertModel, BertForPreTraining, AutoTokenizer
import json
from tqdm import tqdm, trange
import emoji

# Seed PyTorch's global RNG; the Generator returned by manual_seed is the
# last expression of the cell and is therefore shown as the cell output.
random_seed = 0
torch.manual_seed(seed=random_seed)


<torch._C.Generator at 0x7f43917eff30>

In [3]:
def extract_emojis(tw, emoji_table=None):
    """Return the emojis contained in a tweet, in order of appearance.

    The tweet is scanned from its end towards its start. At each position the
    longest candidate ending there (at most ``max_l`` code points) is tested
    first, so an emoji made of several code points is reported as one emoji
    and not as its individual components. Once a candidate matches, the scan
    resumes just before it.

    Unlike the previous sliding-window version, tweets shorter than ``max_l``
    characters are scanned too instead of always yielding an empty list.

    Args:
        tw: Tweet text.
        emoji_table: Optional container of known emoji strings (anything that
            supports ``in``). Defaults to ``emoji.UNICODE_EMOJI``.

    Returns:
        List of emoji strings (duplicates kept) ordered as they occur in ``tw``.
    """
    if emoji_table is None:
        # NOTE(review): relies on emoji.UNICODE_EMOJI being a flat
        # {emoji: name} mapping -- confirm the installed emoji package
        # version still provides it in that form.
        emoji_table = emoji.UNICODE_EMOJI

    max_l = 7  # longest emoji sequence (in code points) that is tested

    emojis = []
    end = len(tw)  # exclusive end index of the candidate being tested
    while end > 0:
        for width in range(min(max_l, end), 0, -1):
            candidate = tw[end - width:end]
            if candidate in emoji_table:
                emojis.append(candidate)
                end -= width  # jump over the whole matched sequence
                break
        else:
            end -= 1  # nothing ends here; move one character to the left

    # Matches were collected back-to-front; restore reading order.
    emojis.reverse()
    return emojis

def remove_hashtags(tweets):
    """Return a new list in which every '#' character has been deleted from each tweet."""
    stripped = []
    for tweet in tweets:
        stripped.append(tweet.replace('#', ''))
    return stripped

def replace_emojis(tweets):
    """Replace every emoji in each tweet with its textual description.

    The description is the ``emoji.UNICODE_EMOJI`` entry of the emoji with
    underscores turned into spaces and colons removed.

    Args:
        tweets: Iterable of tweet strings.

    Returns:
        New list of tweets in which the emojis found by ``extract_emojis``
        have been substituted by their description.
    """
    tweets_no_emojis = []
    for tw in tweets:
        # Replace longer sequences first: substituting a short emoji that is
        # also the prefix of a longer one (e.g. a base emoji followed by a
        # modifier) would otherwise break the longer sequence and leave its
        # remaining code points in the text. dict.fromkeys drops duplicates
        # while keeping a deterministic order for the stable sort.
        found = sorted(dict.fromkeys(extract_emojis(tw)), key=len, reverse=True)
        for e in found:
            e_text = emoji.UNICODE_EMOJI[e].replace('_', ' ').replace(':', '')
            tw = tw.replace(e, e_text)
        tweets_no_emojis.append(tw)

    return tweets_no_emojis
    
        

In [30]:
# Pre-processing switches applied to the input texts before tokenisation.
remove_hashtags_flag = True
replace_emojis_flag = True
# NOTE(review): bw_flag and model_task are not read anywhere in the visible
# notebook -- confirm whether they are still needed.
bw_flag = False
model_task = 3

# Run on the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
n_gpu = torch.cuda.device_count()
# Cell output: name of the GPU in use. Guarded so that the cell does not raise
# on a machine without CUDA, where `device` already falls back to the CPU.
torch.cuda.get_device_name(0) if torch.cuda.is_available() else "cpu"

'Tesla K80'

In [8]:
# Per-class weights of the task-1 cross-entropy loss (uniform here).
weights_task1 = torch.tensor([0.1, 0.1, 0.1])
class CovidTwitterBertClassifier_task1(nn.Module):
    """COVID-Twitter-BERT v2 with its ``seq_relationship`` head replaced by a
    ``Linear(1024, n_classes)`` layer, used as a single-label classifier.

    The loss is a weighted cross-entropy (``weights_task1``) when
    ``n_classes > 1`` and a mean-squared error otherwise.
    """

    def __init__(self, n_classes):
        """Load the pretrained backbone and install the new output layer.

        Args:
            n_classes: Size of the output layer. ``weights_task1`` holds three
                weights, so the cross-entropy branch expects three classes.
        """
        super().__init__()
        self.n_classes = n_classes
        self.bert = BertForPreTraining.from_pretrained('digitalepidemiologylab/covid-twitter-bert-v2')
        self.bert.cls.seq_relationship = nn.Linear(1024, n_classes)

        if n_classes > 1:
            self.criterion = nn.CrossEntropyLoss(weight=weights_task1)
        else:
            self.criterion = nn.MSELoss()

    def forward(self, input_ids, token_type_ids, input_mask, labels=None):
        """Run the model and, when labels are given, compute the loss.

        Args:
            input_ids: Token ids passed to the backbone.
            token_type_ids: Segment ids passed to the backbone.
            input_mask: Attention mask passed to the backbone.
            labels: Optional targets. When omitted (pure inference) no loss is
                computed, so callers do not need to build dummy labels.

        Returns:
            Tuple ``(loss, logits)``; ``loss`` is ``None`` when ``labels`` is
            ``None``.
        """
        outputs = self.bert(input_ids=input_ids, token_type_ids=token_type_ids, attention_mask=input_mask)

        # Second element of the backbone output -- presumably the output of
        # the replaced seq_relationship head (see BertForPreTraining docs).
        logits = outputs[1]

        if labels is None:
            return None, logits

        if self.n_classes == 1:
            # MSELoss needs floating-point targets; same cast as the task-3
            # classifier applies in its regression branch.
            labels = labels.float()

        loss = self.criterion(logits, labels)
        return loss, logits

# Per-label weights of the task-2 loss (uniform here). Kept device-agnostic:
# forward() moves them to wherever the loss tensor lives.
weights_task2 = torch.tensor([0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1])
class CovidTwitterBertClassifier_task2(nn.Module):
    """COVID-Twitter-BERT v2 with its ``seq_relationship`` head replaced by a
    ``Linear(1024, n_classes)`` layer followed by a sigmoid, used as a
    multi-label classifier.

    The loss is an element-wise binary cross-entropy when ``n_classes > 1``
    (mean-squared error otherwise), multiplied by ``weights_task2`` and
    averaged.
    """

    def __init__(self, n_classes):
        """Load the pretrained backbone and install the new output layer.

        Args:
            n_classes: Size of the output layer. ``weights_task2`` holds nine
                weights, so the weighted loss expects nine labels.
        """
        super().__init__()
        self.n_classes = n_classes
        self.bert = BertForPreTraining.from_pretrained('digitalepidemiologylab/covid-twitter-bert-v2')
        self.sigmoid = nn.Sigmoid()
        self.bert.cls.seq_relationship = nn.Linear(1024, n_classes)

        if n_classes > 1:
            self.criterion = nn.BCELoss(reduction='none')
        else:
            self.criterion = nn.MSELoss()

    def forward(self, input_ids, token_type_ids, input_mask, labels=None):
        """Run the model and, when labels are given, compute the weighted loss.

        Args:
            input_ids: Token ids passed to the backbone.
            token_type_ids: Segment ids passed to the backbone.
            input_mask: Attention mask passed to the backbone.
            labels: Optional targets. When omitted (pure inference) no loss is
                computed, so callers do not need to build dummy labels.

        Returns:
            Tuple ``(loss, logits)``. Despite the name, ``logits`` are the
            sigmoid outputs (values in [0, 1]); ``loss`` is ``None`` when
            ``labels`` is ``None``.
        """
        outputs = self.bert(input_ids=input_ids, token_type_ids=token_type_ids, attention_mask=input_mask)
        # Second element of the backbone output -- presumably the output of
        # the replaced seq_relationship head (see BertForPreTraining docs).
        logits = self.sigmoid(outputs[1])

        if labels is None:
            return None, logits

        loss = self.criterion(logits, labels)

        # Weight each label's loss, then average. The weights follow the loss
        # tensor's device so the model works on whichever device it was moved to.
        loss = (loss * weights_task2.to(loss.device)).mean()

        return loss, logits
    
# NOTE(review): weights_inter_conspiracies is not read anywhere in the visible
# notebook -- confirm whether it is still needed.
weights_inter_conspiracies = torch.tensor([0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1]).to(device)
# One row of three class weights per conspiracy head (uniform here). Moved to
# `device` here because the losses built from these rows are kept in a plain
# Python list (see __init__), which the model's own .cuda()/.to() does not reach.
weights_intra_conspiracy = torch.tensor([[0.1, 0.1, 0.1],
                                         [0.1, 0.1, 0.1],
                                         [0.1, 0.1, 0.1],
                                         [0.1, 0.1, 0.1],
                                         [0.1, 0.1, 0.1],
                                         [0.1, 0.1, 0.1],
                                         [0.1, 0.1, 0.1],
                                         [0.1, 0.1, 0.1],
                                         [0.1, 0.1, 0.1]]).to(device)

class CovidTwitterBertClassifier_task3(nn.Module):
    """COVID-Twitter-BERT v2 with its ``seq_relationship`` head replaced by a
    ``Linear(1024, n_classes)`` layer whose output is read as nine consecutive
    groups of three logits, one group per conspiracy.

    The training loss is the mean of nine weighted cross-entropy losses, one
    per group.
    """

    def __init__(self, n_classes):
        """Load the pretrained backbone and install the new output layer.

        Args:
            n_classes: Size of the output layer; ``forward`` slices nine
                groups of three, i.e. expects 27.
        """
        super().__init__()
        self.n_classes = n_classes
        self.bert = BertForPreTraining.from_pretrained('digitalepidemiologylab/covid-twitter-bert-v2')
        self.bert.cls.seq_relationship = nn.Linear(1024, n_classes)

        # Deliberately a plain list, not nn.ModuleList: a ModuleList would add
        # the loss weights to state_dict.
        # NOTE(review): presumably the saved checkpoint has no such keys, so
        # changing this could break load_state_dict -- confirm before touching.
        self.criterions = []
        if n_classes > 1:
            for i in range(0, 9):
                self.criterions.append(nn.CrossEntropyLoss(weight=weights_intra_conspiracy[i]))

            self.criterion = nn.CrossEntropyLoss()
        else:
            self.criterion = nn.MSELoss()

    def forward(self, input_ids, token_type_ids, input_mask, labels=None):
        """Run the model and, when labels are given, compute the mean head loss.

        Args:
            input_ids: Token ids passed to the backbone.
            token_type_ids: Segment ids passed to the backbone.
            input_mask: Attention mask passed to the backbone.
            labels: Optional targets indexed as ``labels[:, i]`` for head
                ``i``. When omitted (pure inference) no loss is computed, so
                callers do not need to build dummy labels.

        Returns:
            Tuple ``(loss, logits)``; ``loss`` is ``None`` when ``labels`` is
            ``None``.
        """
        outputs = self.bert(input_ids=input_ids, token_type_ids=token_type_ids, attention_mask=input_mask)

        # Second element of the backbone output -- presumably the output of
        # the replaced seq_relationship head (see BertForPreTraining docs).
        logits = outputs[1]

        if labels is None:
            return None, logits

        if self.n_classes == 1:
            # NOTE(review): with n_classes == 1 `self.criterions` is empty, so
            # the loop below raises IndexError -- this branch looks unfinished.
            labels = labels.float()

        loss = 0
        for i in range(0, 9):
            logits_i = logits[:, 3 * i:3 * i + 3]
            label_i = labels[:, i].long()
            # Each head's loss is evaluated exactly once (it used to be
            # computed twice per head, the first result being discarded).
            loss += self.criterions[i](logits_i, label_i)
        loss = loss / 9

        return loss, logits
    

In [6]:
# Task-1 model: 3 output classes.
model_task1 = CovidTwitterBertClassifier_task1(3)

# Follow the `device` chosen in the configuration cell instead of calling
# .cuda() unconditionally, so the cell also runs when CUDA is unavailable.
model_task1.to(device)
# map_location lets a checkpoint saved on a GPU be loaded on the current device.
# NOTE(review): torch.load unpickles the file -- only load trusted checkpoints.
model_task1.load_state_dict(torch.load('../models/task1.pth', map_location=device))
model_task1.eval()

CovidTwitterBertClassifier_task1(
  (bert): BertForPreTraining(
    (bert): BertModel(
      (embeddings): BertEmbeddings(
        (word_embeddings): Embedding(30522, 1024, padding_idx=0)
        (position_embeddings): Embedding(512, 1024)
        (token_type_embeddings): Embedding(2, 1024)
        (LayerNorm): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (encoder): BertEncoder(
        (layer): ModuleList(
          (0): BertLayer(
            (attention): BertAttention(
              (self): BertSelfAttention(
                (query): Linear(in_features=1024, out_features=1024, bias=True)
                (key): Linear(in_features=1024, out_features=1024, bias=True)
                (value): Linear(in_features=1024, out_features=1024, bias=True)
                (dropout): Dropout(p=0.1, inplace=False)
              )
              (output): BertSelfOutput(
                (dense): Linear(in_features=1024, out_feat

In [7]:
# Task-2 model: 9 output labels.
model_task2 = CovidTwitterBertClassifier_task2(9)

# Follow the `device` chosen in the configuration cell instead of calling
# .cuda() unconditionally, so the cell also runs when CUDA is unavailable.
model_task2.to(device)
# map_location lets a checkpoint saved on a GPU be loaded on the current device.
# NOTE(review): torch.load unpickles the file -- only load trusted checkpoints.
model_task2.load_state_dict(torch.load('../models/task2.pth', map_location=device))
model_task2.eval()

CovidTwitterBertClassifier_task2(
  (bert): BertForPreTraining(
    (bert): BertModel(
      (embeddings): BertEmbeddings(
        (word_embeddings): Embedding(30522, 1024, padding_idx=0)
        (position_embeddings): Embedding(512, 1024)
        (token_type_embeddings): Embedding(2, 1024)
        (LayerNorm): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (encoder): BertEncoder(
        (layer): ModuleList(
          (0): BertLayer(
            (attention): BertAttention(
              (self): BertSelfAttention(
                (query): Linear(in_features=1024, out_features=1024, bias=True)
                (key): Linear(in_features=1024, out_features=1024, bias=True)
                (value): Linear(in_features=1024, out_features=1024, bias=True)
                (dropout): Dropout(p=0.1, inplace=False)
              )
              (output): BertSelfOutput(
                (dense): Linear(in_features=1024, out_feat

In [11]:
# Task-3 model: 9 groups of 3 outputs.
model_task3 = CovidTwitterBertClassifier_task3(9*3)

# Follow the `device` chosen in the configuration cell instead of calling
# .cuda() unconditionally, so the cell also runs when CUDA is unavailable.
model_task3.to(device)
# map_location lets a checkpoint saved on a GPU be loaded on the current device.
# NOTE(review): torch.load unpickles the file -- only load trusted checkpoints.
model_task3.load_state_dict(torch.load('../models/task3.pth', map_location=device))
model_task3.eval()

CovidTwitterBertClassifier_task3(
  (bert): BertForPreTraining(
    (bert): BertModel(
      (embeddings): BertEmbeddings(
        (word_embeddings): Embedding(30522, 1024, padding_idx=0)
        (position_embeddings): Embedding(512, 1024)
        (token_type_embeddings): Embedding(2, 1024)
        (LayerNorm): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (encoder): BertEncoder(
        (layer): ModuleList(
          (0): BertLayer(
            (attention): BertAttention(
              (self): BertSelfAttention(
                (query): Linear(in_features=1024, out_features=1024, bias=True)
                (key): Linear(in_features=1024, out_features=1024, bias=True)
                (value): Linear(in_features=1024, out_features=1024, bias=True)
                (dropout): Dropout(p=0.1, inplace=False)
              )
              (output): BertSelfOutput(
                (dense): Linear(in_features=1024, out_feat

In [36]:
# Example inputs that the three models classify below.
text = [
    "This is outrageous! This politician lied about the vaccine, as it contains 5G and has been made to control population!",
    "My colleague think Covid is a hoax and has been staged, how do I prove him wrong?",
    "The vaccine contains the mark of the beast! It is the devil's work! Also the deep state is holding all together, the new world order is upon us",
    "What about climate change? Ice is melting and temperatures keep getting warmer. I hope we find a durable solution to all this",
]

In [37]:
# Apply each optional pre-processing step only when its switch is enabled;
# otherwise `text` is left as it is.
text = remove_hashtags(text) if remove_hashtags_flag else text
text = replace_emojis(text) if replace_emojis_flag else text

In [38]:
# NOTE(review): the tokenizer comes from 'covid-twitter-bert' while the models
# load 'covid-twitter-bert-v2' -- confirm both share the same vocabulary.
tokenizer = AutoTokenizer.from_pretrained('digitalepidemiologylab/covid-twitter-bert')

# Every text is padded / truncated to exactly this many tokens.
MAX_LEN = 128

tokenized_input = tokenizer(text, max_length=MAX_LEN, padding='max_length', truncation=True)

# Model inputs as tensors.
input_ids = torch.tensor(tokenized_input['input_ids'])
token_type_ids = torch.tensor(tokenized_input['token_type_ids'])
attention_mask = torch.tensor(tokenized_input['attention_mask'])

# Placeholder labels (all ones): the models' forward() takes a labels argument,
# but only the logits are read in the inference loop.
labels_task1 = torch.tensor([1] * len(text))
labels_task2 = torch.tensor([[1.0] * 9 for _ in range(len(text))])


In [39]:
batch_size = 64

dataset = TensorDataset(input_ids, token_type_ids, attention_mask, labels_task1, labels_task2)
sampler = SequentialSampler(dataset)
# Use the batch_size variable (it was defined above but a literal 64 was passed).
dataloader = DataLoader(dataset, sampler=sampler, batch_size=batch_size)

# One entry per input text, in input order (SequentialSampler):
output_task1 = []  # predicted class index
output_task2 = []  # list of 9 booleans (sigmoid output > 0.5)
output_task3 = []  # list of 9 class indices, one per group of 3 logits

for b in tqdm(dataloader):

    batch = tuple(t.to(device) for t in b)

    b_input_ids, b_token_type_ids, b_attention_mask, b_labels_task1, b_labels_task2 = batch

    with torch.no_grad():
        logits_task1 = model_task1(b_input_ids, b_token_type_ids, b_attention_mask, b_labels_task1)[1]
        output_task1 += logits_task1.argmax(dim=1).cpu().numpy().tolist()

        logits_task2 = model_task2(b_input_ids, b_token_type_ids, b_attention_mask, b_labels_task2)[1]
        # No .squeeze(): on a batch holding a single text it would drop the
        # batch dimension and append 9 loose booleans instead of one row.
        output_task2 += (logits_task2 > 0.5).cpu().numpy().tolist()

        logits_task3 = model_task3(b_input_ids, b_token_type_ids, b_attention_mask, b_labels_task2)[1]
        # View the 27 logits as 9 groups of 3 and take the arg-max of each
        # group in a single call instead of one .item() per group.
        output_task3 += logits_task3.reshape(-1, 9, 3).argmax(dim=2).cpu().tolist()


100%|█████████████████████████████████████████████████| 1/1 [00:00<00:00,  1.16it/s]


In [40]:
output_task3

[[0, 2, 2, 0, 0, 0, 2, 0, 0],
 [0, 0, 0, 0, 2, 0, 0, 0, 0],
 [0, 0, 0, 0, 0, 0, 0, 2, 2],
 [0, 0, 0, 0, 0, 0, 0, 0, 0]]

In [45]:
# Conspiracy categories, in the order of the models' 9 outputs.
conspiracies = ['Suppressed Cures',
     'Behaviour and Mind Control',
     'Antivax',
     'Fake virus',
     'Intentional Pandemic',
     'Harmful Radiation/ Influence',
     'Population reduction',
     'New World Order',
     'Satanism']

# Names of the class indices 0, 1 and 2.
conspiracy_levels = ["No Conspiracy", "Discussing", "Supporting"]


def _describe_levels(levels):
    """Join '<level> <conspiracy>' for every conspiracy whose level is non-zero."""
    return ', '.join(
        conspiracy_levels[levels[k]] + ' ' + conspiracies[k]
        for k in range(len(conspiracies))
        if levels[k]
    )


def print_results_model3(output, texts=None):
    """Print each text followed by the task-3 prediction.

    Args:
        output: One list of 9 class indices (into ``conspiracy_levels``) per
            text, in the order of ``conspiracies``.
        texts: Texts the predictions belong to. Defaults to the notebook-level
            ``text`` list.
    """
    if texts is None:
        texts = text

    for idx, tweet in enumerate(texts):
        levels = output[idx]

        print(tweet)

        if sum(levels) == 0:
            result = 'No conspiracy'
        else:
            result = _describe_levels(levels)

        print("---> " + result)
        print('-----'*5)


def print_results_model12(output_task1, output_task2, texts=None):
    """Print each text followed by the combined task-1 / task-2 prediction.

    Args:
        output_task1: One class index (into ``conspiracy_levels``) per text.
        output_task2: One list of 9 truthy/falsy flags per text, in the order
            of ``conspiracies``.
        texts: Texts the predictions belong to. Defaults to the notebook-level
            ``text`` list.
    """
    if texts is None:
        texts = text

    for idx, tweet in enumerate(texts):
        task1 = output_task1[idx]
        task2 = output_task2[idx]

        print(tweet)

        if task1 == 0:
            result = 'No conspiracy'
        else:
            # Every flagged conspiracy gets the single level predicted by task 1.
            result = _describe_levels(
                [task1 if task2[k] else 0 for k in range(len(conspiracies))]
            )
            if not result:
                # Task 1 saw a conspiracy but task 2 flagged none: say so
                # rather than printing an empty result.
                result = conspiracy_levels[task1] + ' (no specific conspiracy identified)'

        print("---> " + result)
        print('-----'*5)

In [46]:
# Print every input text together with the task-3 model's prediction.
print_results_model3(output_task3)

This is outrageous! This politician lied about the vaccine, as it contains 5G and has been made to control population!
---> Supporting Behaviour and Mind Control, Supporting Antivax, Supporting Population reduction
-------------------------
My colleague think Covid is a hoax and has been staged, how do I prove him wrong?
---> Supporting Intentional Pandemic
-------------------------
The vaccine contains the mark of the beast! It is the devil's work! Also the deep state is holding all together, the new world order is upon us
---> Supporting New World Order, Supporting Satanism
-------------------------
What about climate change? Ice is melting and temperatures keep getting warmer. I hope we find a durable solution to all this
---> No conspiracy
-------------------------


In [48]:
# Print every input text together with the combined task-1 / task-2 prediction.
print_results_model12(output_task1, output_task2)

This is outrageous! This politician lied about the vaccine, as it contains 5G and has been made to control population!
---> Discussing Suppressed Cures, Discussing Harmful Radiation/ Influence, Discussing Satanism
-------------------------
My colleague think Covid is a hoax and has been staged, how do I prove him wrong?
---> Discussing Suppressed Cures, Discussing Fake virus, Discussing Population reduction
-------------------------
The vaccine contains the mark of the beast! It is the devil's work! Also the deep state is holding all together, the new world order is upon us
---> Supporting Suppressed Cures, Supporting Fake virus, Supporting Population reduction, Supporting Satanism
-------------------------
What about climate change? Ice is melting and temperatures keep getting warmer. I hope we find a durable solution to all this
---> No conspiracy
-------------------------
