In [2]:
import os
import numpy as np
import pandas as pd
from collections import deque
import copy

import torch
import sklearn.metrics as metrics
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
import torch.nn as nn
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from transformers import BertForPreTraining
from transformers import BertModel, AdamW, AutoTokenizer, BertForSequenceClassification, RobertaForSequenceClassification
from torch.optim.lr_scheduler import ReduceLROnPlateau

from tqdm import tqdm, trange

import emoji
from nltk.corpus import stopwords

# Seed PyTorch's global RNG so that runs are repeatable.
random_seed = 0
torch.manual_seed(random_seed)

# Names of the nine conspiracy categories.
# NOTE(review): presumably listed in the same order as the nine label columns
# read from the CSV files further down — confirm against the data.
conspiracies = [
    'Suppressed Cures',
    'Behaviour and Mind Control',
    'Antivax',
    'Fake virus',
    'Intentional Pandemic',
    'Harmful Radiation/ Influence',
    'Population reduction',
    'New World Order',
    'Satanism',
]

In [3]:
# --- Model selection: keep exactly one assignment active -------------------
#model_name = 'bert-base-cased'
# model_name = 'roberta-base'
model_name = 'twitter'
# model_name = 'roberta-large'

# --- Text preprocessing switches (each one gates a helper defined below) ---
clean_tweets_flag = False        # clean_tweets
replace_lowercase_flag = False   # to_lowercase
remove_stopwords_flag = False    # remove_stopwords
remove_hashtags_flag = True      # remove_hashtags
replace_emojis_flag = True       # replace_emojis

# --- Data / training options -----------------------------------------------
# When True, the training tweets and labels are read from the full dev CSV
# instead of the concatenated training folds.
all_data = False
class_weights_flag = True # always true for now

# When False, the training loop reshapes the labels to a single column.
classification = True

# fold: index of the file (within the data directory listing) held out as test set
k=0

In [4]:
def clean_tweets(tweets):
    """Normalise raw tweets and collapse coronavirus synonyms into one term.

    For every tweet this:
      1. deletes newline and non-breaking-space characters,
      2. decodes the HTML entity '&amp;' into '&',
      3. replaces each (case-sensitive) synonym in `corona_synonyms` with
         'wuhan virus'.

    Args:
        tweets: iterable of tweet strings.

    Returns:
        A new list with the cleaned tweets, in the same order.
    """
    char_to_remove = ['\n', '\xa0']
    # Order matters: longer variants come before their prefixes
    # (e.g. 'covid-19' before 'covid') so that the whole variant is replaced.
    # Every entry must be followed by a comma; adjacent string literals are
    # silently concatenated by Python ('sarscov2' 'sars' -> 'sarscov2sars').
    corona_synonyms = ['coronavirus',
                      'covid-19',
                      'covid19',
                      'covid 19',
                      'covid',
                      'corona',
                      'sarscov2',
                      'sars',
                      'Coronaviruses',
                      'Coronavirus',
                      'Corona',
                      'Covid19',
                      'COVID19',
                      'Covid-19',
                      'COVID-19',
                      'COVID 19',
                      'Covid',
                      'COVID',
                      'SARSCOV2',
                      'SARS']

    tweets_clean = []
    for tw in tweets:
        for c in char_to_remove:
            tw = tw.replace(c, '')
        tw = tw.replace('&amp;', '&')

        # str.replace is a no-op when the synonym is absent, so no
        # membership pre-check is needed.
        for syn in corona_synonyms:
            tw = tw.replace(syn, 'wuhan virus')
        tweets_clean.append(tw)
    return tweets_clean

def extract_hashtags(tweet):
    """Return the hashtags of a tweet as a list.

    The tweet is split on single spaces only; every non-empty piece whose
    first character is '#' is kept, in order of appearance.
    """
    return [token for token in tweet.split(' ') if token != '' and token[0] == '#']

def extract_emojis (tw):
    """Return the emojis found in a tweet, in order of appearance.

    The tweet is scanned from its last character towards its first with a
    sliding window of `max_l` characters (emojis can be combined together to
    form other emojis, so several characters may make up a single emoji).
    At each position the longest candidate ending there is tested first
    against `emoji.UNICODE_EMOJI['en']`; when one matches, the characters it
    covers are skipped so that its components are not reported again.

    Tweets shorter than the window (including the empty string) are handled
    by padding the window with empty strings instead of raising IndexError.

    Args:
        tw: the tweet text.

    Returns:
        List of emoji strings, left to right as they occur in `tw`.
    """
    emojis = []
    max_l = 7

    def char_from_end(offset):
        # offset 0 is the last character; positions before the start of the
        # tweet yield '' so they contribute nothing to a candidate string.
        return tw[-1 - offset] if offset < len(tw) else ''

    # window[j] holds the character j positions before the current one.
    window = deque((char_from_end(i) for i in range(max_l)), maxlen=max_l)
    skip = 0

    for i in range(len(tw)):
        if skip == 0:
            # Try the longest candidate first so combined emojis win over
            # their individual components.
            for j in range(max_l - 1, -1, -1):
                str_to_test = ''.join(window[j - k] for k in range(j + 1))
                if str_to_test in emoji.UNICODE_EMOJI['en']:
                    emojis.append(str_to_test)
                    skip = j
                    break
        else:
            skip -= 1
        # Slide the window one character towards the start of the tweet.
        window.append(char_from_end(i + max_l))

    # The scan ran right-to-left; restore reading order.
    emojis.reverse()
    return emojis

In [5]:
def to_lowercase(tweets):
    """Return a new list containing the lower-cased version of every tweet."""
    return [tweet.lower() for tweet in tweets]

def remove_stopwords(tweets):
    """Drop NLTK English stop words from every tweet.

    Each tweet is split on single spaces, pieces that exactly match a stop
    word (case-sensitive) are discarded, and the rest is re-joined with
    single spaces.
    """
    english_stopwords = set(stopwords.words('english'))

    def strip_stopwords(tweet):
        kept = filter(lambda word: word not in english_stopwords, tweet.split(' '))
        return ' '.join(kept)

    return [strip_stopwords(tweet) for tweet in tweets]

def remove_hashtags(tweets):
    """Delete every '#' character from each tweet (the tag text itself is kept)."""
    stripped = []
    for tweet in tweets:
        stripped.append(tweet.replace('#', ''))
    return stripped

def replace_emojis(tweets):
    """Replace each emoji in every tweet by its textual name.

    The name is looked up in `emoji.UNICODE_EMOJI['en']`; underscores become
    spaces and colons are removed before it is substituted into the tweet.
    Returns a new list of tweets.
    """
    converted = []
    for tweet in tweets:
        for symbol in extract_emojis(tweet):
            description = emoji.UNICODE_EMOJI['en'][symbol].replace('_',' ').replace(':', '')
            tweet = tweet.replace(symbol, description)
        converted.append(tweet)

    return converted
    
        

In [6]:
# ---------------------------------------------------------------------------
# Load the cross-validation folds, preprocess the tweets and compute the
# per-class loss weights.
# ---------------------------------------------------------------------------
data_path = './data/task2/'
# os.listdir returns entries in arbitrary order; sort them so that fold
# index `k` refers to the same file on every run and every machine.
filelist = sorted(os.listdir(data_path))

# One DataFrame per file in the data directory.
df_list = [pd.read_csv(data_path+file) for file in filelist]

# Fold k is held out for evaluation, all remaining folds form the training set.
test_df = df_list[k]
train_df = pd.concat(df_list[:k]+df_list[k+1:])

tw_train = train_df['tweet'].tolist()
tw_test = test_df['tweet'].tolist()

if all_data:
    # Train on the full dev set instead of the concatenated folds.
    df = pd.read_csv('./data/dev-full-task-2-clean.csv')
    # .tolist() keeps tw_train a plain list, as in the fold-based branch.
    tw_train = df['tweet'].tolist()
    labels_train = df.iloc[:,1:10].values.tolist()

# Optional preprocessing steps, applied identically to train and test tweets.
if clean_tweets_flag:
    tw_train = clean_tweets(tw_train)
    tw_test = clean_tweets(tw_test)

if replace_lowercase_flag:
    tw_train = to_lowercase(tw_train)
    tw_test = to_lowercase(tw_test)

if remove_stopwords_flag:
    tw_train = remove_stopwords(tw_train)
    tw_test = remove_stopwords(tw_test)

if remove_hashtags_flag:
    tw_train = remove_hashtags(tw_train)
    tw_test = remove_hashtags(tw_test)

if replace_emojis_flag:
    tw_train = replace_emojis(tw_train)
    tw_test = replace_emojis(tw_test)

# Columns 1..9 of each frame are read as the nine label values.
if not all_data:
    labels_train = train_df.iloc[:,1:10].values.tolist()
labels_test = test_df.iloc[:,1:10].values.tolist()
ids_test = test_df['ids'].tolist()

# Per-class sum of the training label values.
weights_tmp = [sum(row[i] for row in labels_train) for i in range(0, 9)]

if class_weights_flag:
    # Inverse-frequency weighting: classes with smaller label sums get larger weights.
    # NOTE(review): raises ZeroDivisionError if a class sums to zero in the
    # training split — confirm this cannot happen for the folds in use.
    weights = [len(labels_train)/w for w in weights_tmp]
else:
    weights = [1,1,1,1,1,1,1,1,1]

weights = torch.FloatTensor(weights).cuda()
weights

tensor([44.3929, 10.9035,  8.1242,  6.8297,  6.7189, 27.0217, 11.3000, 12.4300,
        19.4219], device='cuda:0')

In [7]:
# Pick the tokenizer checkpoint that goes with the selected model.
# NOTE(review): the tokenizer comes from 'covid-twitter-bert' while the model
# class below loads 'covid-twitter-bert-v2' — presumably the vocabularies
# match; confirm.
tokenizer_name = 'digitalepidemiologylab/covid-twitter-bert' if 'twitter' in model_name else model_name
tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)
tokenized_input = tokenizer(tw_train)

# Length (in tokens) of the longest tokenized training tweet; 0 if there are none.
m = max((len(tokens) for tokens in tokenized_input['input_ids']), default=0)
m

111

In [8]:
# Batch size used for each supported model.
models_b_size = {
    'roberta-base': 32,
    'bert-base-cased': 32,
    'twitter': 12,
    'roberta-large': 10,
}
batch_size = models_b_size[model_name]

# NOTE(review): train_*/test_* tensors (input ids, attention masks, labels,
# token type ids, ids) are not defined in any cell visible here — presumably
# they are built from the tokenizer output in a cell that is missing from
# this notebook; confirm, otherwise a fresh "Run All" stops at this cell.
uses_token_type_ids = 'roberta' not in model_name
if uses_token_type_ids:
    # BERT-style batches carry token type ids as an extra tensor.
    train_data = TensorDataset(train_input_ids, train_attention_mask, train_labels, train_token_type_ids)
    test_data = TensorDataset(test_input_ids, test_attention_mask, test_labels, test_token_type_ids, test_ids)
else:
    train_data = TensorDataset(train_input_ids, train_attention_mask, train_labels)
    test_data = TensorDataset(test_input_ids, test_attention_mask, test_labels, test_ids)

# Training batches are drawn in random order, test batches in dataset order.
train_sampler = RandomSampler(train_data)
test_sampler = SequentialSampler(test_data)

train_dataloader = DataLoader(train_data, batch_size=batch_size, sampler=train_sampler)
test_dataloader = DataLoader(test_data, batch_size=batch_size, sampler=test_sampler)

In [9]:
# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
n_gpu = torch.cuda.device_count()
# Only ask for the GPU name when a GPU is actually in use:
# torch.cuda.get_device_name raises on CPU-only machines, which would defeat
# the CPU fallback chosen above.
torch.cuda.get_device_name(0) if device.type == "cuda" else "cpu"

In [11]:
class BertClassifier(nn.Module):
    """Multi-label classifier built on `BertForSequenceClassification`.

    The checkpoint named by the module-level `model_name` is loaded with
    `n_classes` output labels. The forward pass applies a sigmoid to the
    model's first output and returns a weighted loss together with the
    sigmoid outputs.

    Relies on the module-level names `model_name` and `weights`.
    """

    def __init__(self, n_classes):
        """Load the pretrained model and select the loss for `n_classes` outputs."""
        super().__init__()
        self.n_classes = n_classes
        self.bert = BertForSequenceClassification.from_pretrained(model_name, num_labels=n_classes)
        self.sigmoid = nn.Sigmoid()

        if n_classes >1:
            # reduction='none' keeps one loss value per (sample, class) so the
            # per-class `weights` in forward() really weight each class. With
            # the default mean reduction the loss is already a scalar and the
            # weights would merely rescale it. Matches the sibling classifiers.
            self.criterion = nn.BCELoss(reduction='none')
        else:
            self.criterion = nn.MSELoss()

    def forward(self, input_ids, token_type_ids, input_mask, labels):
        """Run the model and compute the weighted loss.

        Args:
            input_ids, token_type_ids, input_mask: passed to the underlying
                model as `input_ids`, `token_type_ids` and `attention_mask`.
            labels: targets handed to the criterion together with the
                sigmoid outputs.

        Returns:
            (loss, logits): the mean of the loss multiplied by the global
            `weights`, and the sigmoid-activated outputs. Note that despite
            the name, `logits` holds values after the sigmoid.
        """
        outputs = self.bert(input_ids = input_ids, token_type_ids = token_type_ids, attention_mask = input_mask)

        logits = self.sigmoid(outputs[0])

        loss = self.criterion(logits, labels)
        loss = (loss * weights).mean()

        return loss, logits

class CovidTwitterBertClassifier(nn.Module):
    """Multi-label classifier built on the COVID-Twitter-BERT v2 checkpoint.

    The checkpoint is loaded as `BertForPreTraining` and its
    `cls.seq_relationship` layer is replaced by a freshly initialised linear
    layer with `n_classes` outputs. The forward pass applies a sigmoid to the
    second element of the model output and returns a weighted loss together
    with the sigmoid outputs.

    Relies on the module-level name `weights`.
    """

    def __init__(self, n_classes):
        """Load the pretrained model and install the classification layer."""
        super().__init__()
        self.n_classes = n_classes
        self.bert = BertForPreTraining.from_pretrained('digitalepidemiologylab/covid-twitter-bert-v2')
        self.sigmoid = nn.Sigmoid()
        # Size the new layer from the loaded configuration instead of a
        # hard-coded 1024, so it always matches the checkpoint's hidden size.
        self.bert.cls.seq_relationship = nn.Linear(self.bert.config.hidden_size, n_classes)

        if n_classes >1:
            # One loss value per (sample, class), weighted per class in forward().
            self.criterion = nn.BCELoss(reduction='none')
        else:
            self.criterion = nn.MSELoss()

    def forward(self, input_ids, token_type_ids, input_mask, labels):
        """Run the model and compute the weighted loss.

        Args:
            input_ids, token_type_ids, input_mask: passed to the underlying
                model as `input_ids`, `token_type_ids` and `attention_mask`.
            labels: targets handed to the criterion together with the
                sigmoid outputs.

        Returns:
            (loss, logits): the mean of the loss multiplied by the global
            `weights`, and the sigmoid-activated outputs. Note that despite
            the name, `logits` holds values after the sigmoid.
        """
        outputs = self.bert(input_ids = input_ids, token_type_ids = token_type_ids, attention_mask = input_mask)
        # NOTE(review): index 1 is presumably the output of the replaced
        # seq_relationship layer (index 0 being the token prediction scores)
        # — confirm against the installed transformers version.
        logits = outputs[1]
        logits = self.sigmoid(logits)

        loss = self.criterion(logits, labels)
        loss = (loss * weights).mean()

        return loss, logits
    
    
class RobertaClassifier(nn.Module):
    """Multi-label classifier built on `RobertaForSequenceClassification`.

    Loads the checkpoint named by the module-level `model_name` with
    `n_classes` output labels. Relies on the module-level `weights` tensor
    when computing the loss.
    """

    def __init__(self, n_classes):
        """Load the pretrained model and select the loss for `n_classes` outputs."""
        super().__init__()
        self.n_classes = n_classes
        self.bert = RobertaForSequenceClassification.from_pretrained(model_name, num_labels=n_classes)
        self.sigmoid = nn.Sigmoid()
        # Element-wise BCE for several outputs, plain MSE for a single output.
        self.criterion = nn.BCELoss(reduction='none') if n_classes >1 else nn.MSELoss()

    def forward(self, input_ids, input_mask, labels):
        """Return `(loss, logits)` for one batch.

        `logits` are the sigmoid-activated first output of the model; the
        loss is the mean of the criterion output multiplied by `weights`.
        """
        probabilities = self.sigmoid(self.bert(input_ids, input_mask)[0])

        # With a single output the labels are cast to float before the loss.
        targets = labels.float() if self.n_classes == 1 else labels
        weighted_loss = self.criterion(probabilities, targets) * weights

        return weighted_loss.mean(), probabilities

In [12]:
# Choose the wrapper class that matches the configured model name.
if 'roberta' in model_name:
    classifier_cls = RobertaClassifier
elif 'twitter' in model_name:
    classifier_cls = CovidTwitterBertClassifier
else:
    classifier_cls = BertClassifier

# Nine outputs, one per label column.
model = classifier_cls(9)

# Move the model to the GPU (the returned module is displayed as cell output).
model.cuda()

Some weights of the model checkpoint at roberta-large were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'lm_head.dense.weight', 'roberta.pooler.dense.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.decoder.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-large and are newly initialized: ['classifier.out_proj.weight', 'clas

RobertaClassifier(
  (bert): RobertaForSequenceClassification(
    (roberta): RobertaModel(
      (embeddings): RobertaEmbeddings(
        (word_embeddings): Embedding(50265, 1024, padding_idx=1)
        (position_embeddings): Embedding(514, 1024, padding_idx=1)
        (token_type_embeddings): Embedding(1, 1024)
        (LayerNorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (encoder): RobertaEncoder(
        (layer): ModuleList(
          (0): RobertaLayer(
            (attention): RobertaAttention(
              (self): RobertaSelfAttention(
                (query): Linear(in_features=1024, out_features=1024, bias=True)
                (key): Linear(in_features=1024, out_features=1024, bias=True)
                (value): Linear(in_features=1024, out_features=1024, bias=True)
                (dropout): Dropout(p=0.1, inplace=False)
              )
              (output): RobertaSelfOutput(
                (den

In [13]:
# Learning-rate guidance from the original author:
#   lr 5e-5 for base models
#   lr 7e-6 for larger models
optimizer = AdamW(model.parameters(), lr=7e-6, weight_decay=0.001)

# Multiply the learning rate by 0.3 once the monitored value has stopped
# improving for more than 4 consecutive scheduler steps.
scheduler = ReduceLROnPlateau(optimizer, factor=0.3, patience=4)

In [14]:
def round_regression(val):
    """Map a regression output to one of the classes 0, 1 or 2.

    Values below 0.5 give 0, values below 1.5 give 1, everything else gives 2.
    """
    for label, upper_bound in enumerate((0.5, 1.5)):
        if val < upper_bound:
            return label
    return 2

In [None]:
# ---------------------------------------------------------------------------
# Fine-tuning loop: one training pass and one evaluation pass per epoch.
# The epoch with the highest mean per-class MCC (MCCA) is remembered as best.
# NOTE(review): the "best" epoch is selected on the same held-out fold that is
# reported as the evaluation result, so the reported best scores are
# optimistic — consider a separate validation split.
# ---------------------------------------------------------------------------
epochs = 30

# Decision threshold applied to every sigmoid output (and to the 0/1 labels).
threshold = 0.5

best_MCCF = 0
best_MCCA = 0
best_F1 = 0
best_MCCs = []
best_MCCNC = 0
best_loss = 999
best_acc = 0
# Deep copy: state_dict() returns references to the live parameters, so
# without the copy this "initial" snapshot would silently follow training
# whenever no epoch ever improves on best_MCCA.
best_state_dict = copy.deepcopy(model.state_dict())
best_epoch = 0

for e in trange(epochs, desc="Epoch"):

    # ---- Training --------------------------------------------------------

    model.train()

    tr_loss = 0
    nb_tr_examples, nb_tr_steps = 0, 0

    for step, batch in enumerate(train_dataloader):

        batch = tuple(t.to(device) for t in batch)

        # RoBERTa batches carry no token type ids.
        if 'roberta' in model_name:
            b_input_ids, b_input_mask, b_labels = batch
        else:
            b_input_ids, b_input_mask, b_labels, b_token_type_ids = batch

        if not classification:
            b_labels = b_labels.view(-1, 1)

        b_labels = b_labels.float()
        optimizer.zero_grad()

        if 'roberta' in model_name:
            outputs = model(b_input_ids, b_input_mask, b_labels)
        else:
            outputs = model(b_input_ids, b_token_type_ids, b_input_mask, b_labels)
        loss = outputs[0]
        logits = outputs[1]

        loss.backward()
        optimizer.step()

        tr_loss += loss.item()
        nb_tr_examples += b_input_ids.size(0)
        nb_tr_steps += 1

    print("Train loss: {}".format(tr_loss/nb_tr_steps))

    # ---- Evaluation on the held-out fold ---------------------------------

    model.eval()

    tweets_test = []

    # Thresholded predictions / labels, per sample and per class.
    predictions = []
    predictions_sep = [[] for _ in range(0, 9)]

    labels = []
    labels_sep = [[] for _ in range(0, 9)]

    eval_loss = 0
    steps=0
    for step, batch in enumerate(test_dataloader):

        # Add batch to GPU
        batch = tuple(t.to(device) for t in batch)

        if 'roberta' in model_name:
            b_input_ids, b_input_mask, b_labels, ids = batch
        else:
            b_input_ids, b_input_mask, b_labels, b_token_type_ids, ids = batch

        if not classification:
            b_labels = b_labels.view(-1, 1)

        b_labels = b_labels.float()

        with torch.no_grad():

            if 'roberta' in model_name:
                outputs = model(b_input_ids, b_input_mask, b_labels)
            else:
                outputs = model(b_input_ids, b_token_type_ids, b_input_mask, b_labels)
            logits = outputs[1]
            loss = outputs[0]

        logits = logits.detach().cpu().numpy()
        ground_truth = b_labels.detach().cpu().numpy()

        steps+=1
        eval_loss+=loss.detach().item()

        # Keep the evaluated input ids on the CPU so they do not pin GPU
        # memory for the rest of the epoch.
        tweets_test.append(b_input_ids.detach().cpu())

        for p in logits:
            predictions.append(p>threshold)
            for i in range(0, 9):
                predictions_sep[i].append(p[i]>threshold)

        for gt in ground_truth:
            labels.append(gt>threshold)
            for i in range(0, 9):
                labels_sep[i].append(gt[i]>threshold)

    # One Matthews correlation coefficient per class.
    MCCs = []
    for i in range(0, 9):
        MCCs.append(metrics.matthews_corrcoef(labels_sep[i], predictions_sep[i]))

    # Binary "at least one class set" vs "no class set" view of labels/predictions.
    labels_one = [1 if np.any(l) else 0 for l in labels]
    predictions_one = [1 if np.any(p) else 0 for p in predictions]

    LOSS = eval_loss/steps
    # The scheduler monitors the mean evaluation loss.
    scheduler.step(LOSS)
    MCCF = metrics.matthews_corrcoef(np.array(labels).flatten(), np.array(predictions).flatten())
    ACC = metrics.accuracy_score(labels, predictions)
    MCCNC = metrics.matthews_corrcoef(labels_one, predictions_one)
    F1 = metrics.f1_score(labels, predictions, average='weighted')
    MCCA = np.array(MCCs).mean()
    if MCCA> best_MCCA:
        best_MCCF = MCCF
        best_MCCA = MCCA
        best_loss = LOSS
        best_acc = ACC
        best_F1 = F1
        best_MCCs = MCCs
        best_MCCNC = MCCNC
        best_state_dict = copy.deepcopy(model.state_dict())
        best_epoch = e

    print("\t Eval loss: {}".format(LOSS))
    print("\t Eval ACC: {}".format(ACC))
    print("\t Eval MCCA: {}".format(MCCA))
    print("\t Eval MCCF: {}".format(MCCF))
    print("\t Eval MCCs: {}".format(MCCs))
    print("\t Eval MCC 1 vs other: {}".format(MCCNC))
    print("\t Eval F1 weighted: {}".format(F1))
    


Epoch:   0%|                                                 | 0/30 [00:00<?, ?it/s]

Train loss: 4.15667368221283


Epoch:   3%|█▎                                    | 1/30 [03:34<1:43:28, 214.08s/it]

	 Eval loss: 3.38576590269804
	 Eval ACC: 0.4855305466237942
	 Eval MCCA: 0.0
	 Eval MCCF: 0.0
	 Eval MCCs: [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
	 Eval MCC 1 vs other: 0.0
	 Eval F1 weighted: 0.0
Train loss: 3.2818857460021973


Epoch:   7%|██▌                                   | 2/30 [07:08<1:39:54, 214.11s/it]

	 Eval loss: 2.694883955642581
	 Eval ACC: 0.49517684887459806
	 Eval MCCA: 0.16344438884748153
	 Eval MCCF: 0.25266826996359093
	 Eval MCCs: [0.0, 0.25577021009073103, 0.40414003022006, 0.0, 0.0, 0.8110892593165426, 0.0, 0.0, 0.0]
	 Eval MCC 1 vs other: 0.23360346365431425
	 Eval F1 weighted: 0.10810495286460613
Train loss: 2.500204922199249


Epoch:  10%|███▊                                  | 3/30 [10:41<1:36:13, 213.84s/it]

	 Eval loss: 2.13862815964967
	 Eval ACC: 0.5530546623794212
	 Eval MCCA: 0.4078677103884172
	 Eval MCCF: 0.5048323020601615
	 Eval MCCs: [0.0, 0.7333542672347239, 0.5197391466671093, 0.0, 0.0, 0.9098330807202929, 0.7856140711454555, 0.7222688277281736, 0.0]
	 Eval MCC 1 vs other: 0.4412783365849191
	 Eval F1 weighted: 0.3724528034800572
Train loss: 1.882095663547516


Epoch:  13%|█████                                 | 4/30 [14:15<1:32:37, 213.76s/it]

	 Eval loss: 1.8714201068505645
	 Eval ACC: 0.5852090032154341
	 Eval MCCA: 0.5162407828255708
	 Eval MCCF: 0.5446122067370258
	 Eval MCCs: [0.5301802308720894, 0.7659710913812849, 0.5581446400094104, 0.0, 0.0, 0.8151982686453727, 0.8970672676222476, 0.72222082374386, 0.3573847231558724]
	 Eval MCC 1 vs other: 0.4977314829734596
	 Eval F1 weighted: 0.4139253603086656


In [None]:
# Summary of the metrics recorded at the best epoch (highest MCCA).
print('Best epoch ', best_epoch)
best_report = [
    ("\t Eval loss: {}", best_loss),
    ("\t Eval ACC: {}", best_acc),
    ("\t Eval MCCA: {}", best_MCCA),
    ("\t Eval MCCF: {}", best_MCCF),
    ("\t Eval MCCs: {}", best_MCCs),
    ("\t Eval MCC 1 vs other: {}", best_MCCNC),
    ("\t Eval F1 weighted: {}", best_F1),
]
for template, value in best_report:
    print(template.format(value))

In [None]:
#torch.save(best_state_dict, '../Models/task2/'+model_name+'_cv'+str(k)+'.pth')