In [None]:
# NOTE(review): presumably required by the BERTweet tokenizer's normalization mode (emoji handling) — confirm.
!pip install emoji

In [None]:
import argparse
import datetime
import json
import os
import random
import time

import numpy as np
import pandas as pd
import torch
from sklearn.metrics import (f1_score, recall_score, accuracy_score,
                                precision_score)
from torch.nn import functional as F
from torch.utils.data import (TensorDataset,DataLoader,
                             RandomSampler, SequentialSampler, Dataset)
from transformers import (get_linear_schedule_with_warmup,AdamW,AutoModel, AutoTokenizer,
                            AutoModelForSequenceClassification)





In [None]:
def flat_accuracy(preds, labels):
    """Return the fraction of rows whose arg-max class matches the label.

    `preds` is reduced with an arg-max over axis 1 and `labels` is flattened
    before the element-wise comparison.
    """
    predicted_classes = np.argmax(preds, axis=1).flatten()
    expected_classes = labels.flatten()
    n_correct = np.sum(predicted_classes == expected_classes)
    return n_correct / len(expected_classes)

def format_time(elapsed):
    """Render a duration in seconds as an ``h:mm:ss`` string.

    The value is rounded to the nearest whole second before formatting.
    """
    whole_seconds = int(round(elapsed))
    duration = datetime.timedelta(seconds=whole_seconds)
    return str(duration)

## Preprocessing

For BERTweet, we only load the data and do not perform any preprocessing at all (even links and usernames are left in the text).

In [1]:
def convert_label(label):
    """Map a textual class label to its integer class id.

    Args:
        label: either "rumour" or "non-rumour".

    Returns:
        1 for "rumour", 0 for "non-rumour".

    Raises:
        ValueError: if `label` is anything else. ValueError is a subclass of
            Exception, so callers that catch Exception keep working.
    """
    if label == "rumour":
        return 1
    if label == "non-rumour":
        return 0
    # Include the offending value so a bad label file entry can be tracked down.
    raise ValueError(
        "label classes must be 'rumour' or 'non-rumour', got {!r}".format(label)
    )
        
        
def convert_prediction(pred):
    """Map an integer class id back to its textual label.

    Args:
        pred: class id; anything comparing equal to 1 or 0 is accepted.

    Returns:
        "rumour" for 1, "non-rumour" for 0.

    Raises:
        ValueError: if `pred` is neither 0 nor 1. ValueError is a subclass of
            Exception, so callers that catch Exception keep working.
    """
    if pred == 1:
        return "rumour"
    if pred == 0:
        return "non-rumour"
    # Include the offending value so an unexpected class id is easy to diagnose.
    raise ValueError(
        "prediction classes must be 0 or 1, got {!r}".format(pred)
    )

In [None]:
def load_data(data_file, label_file):
    """Load tweet threads from a JSON-lines file into a DataFrame.

    Each non-blank line of `data_file` is parsed as a JSON array of tweet
    objects. The first element supplies the row's ``id``; the ``text`` of every
    tweet in the array (first tweet, then its follow-ups) is joined with single
    spaces and stripped.

    Args:
        data_file: path to the ``.jsonl`` file, one thread per line.
        label_file: path to a JSON object keyed by the string form of the first
            tweet's id with values "rumour"/"non-rumour", or None for
            unlabelled data.

    Returns:
        pandas.DataFrame with columns ``id`` and ``text``, plus ``label``
        (converted with `convert_label`) when `label_file` is given.
    """
    y_true = None
    if label_file is not None:
        # Use a context manager so the label file handle is closed promptly.
        with open(label_file, 'r', encoding='utf-8') as labels_in:
            y_true = json.load(labels_in)

    data_list = []

    # Tweets routinely contain emoji / non-ASCII characters, so do not depend
    # on the platform's default text encoding.
    with open(data_file, 'r', encoding='utf-8') as data_in:
        for event in data_in:
            if not event.strip():
                # Tolerate blank lines (e.g. a trailing newline at end of file).
                continue

            tweets_in_event = json.loads(event)

            tweet = {
                'id': tweets_in_event[0]['id'],
                # Source tweet text followed by the text of each follow-up tweet.
                'text': " ".join(t['text'] for t in tweets_in_event).strip(),
            }

            if y_true is not None:
                tweet['label'] = convert_label(y_true[str(tweet['id'])])

            data_list.append(tweet)

    return pd.DataFrame(data_list)

In [None]:
# Load the labelled train/dev splits and the unlabelled test split (label_file=None -> no 'label' column).
train_df = load_data(data_file = '../data/train.data.jsonl', label_file = '../data/train.label.json')
dev_df = load_data(data_file = '../data/dev.data.jsonl', label_file = '../data/dev.label.json')
test_df = load_data(data_file = '../data/test.data.jsonl', label_file = None)

## BERTweet

In [None]:
def bert_encode(df, tokenizer, max_length=128):
    """Tokenize the ``text`` column of `df` into fixed-length tensors.

    Args:
        df: DataFrame with a ``text`` column.
        tokenizer: object exposing ``encode_plus`` (e.g. a Hugging Face
            tokenizer) that returns 'input_ids' and 'attention_mask' tensors.
        max_length: length each sequence is padded / truncated to. Defaults to
            128, the value that used to be hard-coded.

    Returns:
        dict with keys 'input_word_ids' and 'input_mask' (in that order); each
        value is the per-row tensors concatenated along dim 0, in row order.
    """
    input_ids = []
    attention_masks = []
    for sent in df["text"]:
        encoded_dict = tokenizer.encode_plus(
            sent,
            add_special_tokens=True,
            max_length=max_length,
            # `pad_to_max_length=True` is deprecated in transformers;
            # `padding='max_length'` is its documented replacement.
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )

        input_ids.append(encoded_dict['input_ids'])
        attention_masks.append(encoded_dict['attention_mask'])

    input_ids = torch.cat(input_ids, dim=0)
    attention_masks = torch.cat(attention_masks, dim=0)

    inputs = {
        'input_word_ids': input_ids,
        'input_mask': attention_masks,
    }

    return inputs

In [None]:
def prepare_dataloaders(train_df,test_df,dev_df,batch_size=8,tokenizer_class="vinai/bertweet-base"):
    """Tokenize the three splits and wrap them in PyTorch dataloaders.

    Args:
        train_df: labelled training frame (``text`` and ``label`` columns).
        test_df: unlabelled test frame (``text`` column).
        dev_df: labelled validation frame (``text`` and ``label`` columns).
        batch_size: batch size used for all three loaders.
        tokenizer_class: name/path handed to ``AutoTokenizer.from_pretrained``.
            Defaults to the checkpoint that used to be hard-coded.

    Returns:
        (train_dataloader, validation_dataloader, test_dataloader). The train
        loader samples randomly; validation and test loaders iterate in order.
        Train/validation batches are (input_ids, attention_mask, labels); test
        batches are (input_ids, attention_mask).
    """
    # Load the AutoTokenizer with a normalization mode if the input Tweet is raw
    tokenizer = AutoTokenizer.from_pretrained(tokenizer_class, use_fast=False, normalization=True)

    def _to_dataset(df, with_labels):
        # Encode one split; read the tensors by key rather than relying on dict order.
        encoded = bert_encode(df, tokenizer)
        tensors = [encoded['input_word_ids'], encoded['input_mask']]
        if with_labels:
            tensors.append(torch.tensor(df.label.astype(int).values))
        return TensorDataset(*tensors)

    val_dataset = _to_dataset(dev_df, with_labels=True)
    train_dataset = _to_dataset(train_df, with_labels=True)
    test_dataset = _to_dataset(test_df, with_labels=False)

    train_dataloader = DataLoader(
        train_dataset,
        sampler=RandomSampler(train_dataset),
        batch_size=batch_size,
    )

    validation_dataloader = DataLoader(
        val_dataset,
        sampler=SequentialSampler(val_dataset),
        batch_size=batch_size,
    )

    # Sequential order keeps test predictions aligned with the rows of test_df.
    test_dataloader = DataLoader(
        test_dataset,
        sampler=SequentialSampler(test_dataset),
        batch_size=batch_size,
    )
    return train_dataloader,validation_dataloader,test_dataloader



In [None]:
# Build the train / validation / test loaders with the default batch size.
train_dataloader,validation_dataloader,test_dataloader = prepare_dataloaders(train_df, test_df, dev_df)

## Experiments

In [None]:
def test_encode(sentence):
    """Tokenize a single sentence with the BERTweet tokenizer.

    Helper for inspecting tokenization; it loads the tokenizer on every call.

    Args:
        sentence: raw text to encode.

    Returns:
        The 'input_ids' entry of the encoding, requested as a PyTorch tensor
        padded / truncated to 128 tokens.
    """
    tokenizer = AutoTokenizer.from_pretrained("vinai/bertweet-base", use_fast=False, normalization=True)

    encoded_dict = tokenizer.encode_plus(
        sentence,
        add_special_tokens=True,
        max_length=128,
        # `pad_to_max_length=True` is deprecated in transformers;
        # `padding='max_length'` is its documented replacement.
        padding='max_length',
        truncation=True,
        return_attention_mask=True,
        return_tensors='pt',
    )

    return encoded_dict['input_ids']

In [3]:
def test_decode(tokens):
    """Convert token ids back to token strings with the BERTweet tokenizer.

    The tokenizer is loaded on every call.
    """
    bertweet_tokenizer = AutoTokenizer.from_pretrained(
        "vinai/bertweet-base", use_fast=False, normalization=True
    )
    token_strings = bertweet_tokenizer.convert_ids_to_tokens(tokens)
    return token_strings



In [None]:
# Show the concatenated thread text stored under index label 0 of the training frame.
train_df.text[0]

In [None]:
# Encode one training example and look at the resulting id tensor.
text_test = train_df.text[0]
text_preprocessed = test_encode(text_test)


print(f'Shape      : {text_preprocessed.shape}')
print(f'Word Ids   : {text_preprocessed[0, :128]}')

In [None]:
# Same inspection as the previous cell, plus the ids decoded back to token strings.
text_test = train_df.text[0]
text_preprocessed = test_encode(text_test)


print(f'Shape      : {text_preprocessed.shape}')
print(f'Word Ids   : {text_preprocessed[0, :128]}')
print(test_decode(text_preprocessed[0, :128]))

## Training

In [None]:
def prepare_model(model_class="vinai/bertweet-base",num_classes=2,model_to_load=None,total_steps=-1):
    """Create the classifier, its optimizer and a linear learning-rate schedule.

    Args:
        model_class: name/path passed to
            ``AutoModelForSequenceClassification.from_pretrained``.
        num_classes: number of output labels.
        model_to_load: optional path to a state dict for ``model.roberta``;
            loading is best-effort (see below).
        total_steps: number of training steps handed to the scheduler.

    Returns:
        (model, optimizer, scheduler).
    """
    model = AutoModelForSequenceClassification.from_pretrained(
        model_class,
        num_labels=num_classes,
        output_attentions=False,
        output_hidden_states=False,
    )

    optimizer = AdamW(model.parameters(),
                      lr=5e-5,
                      eps=1e-8)
    scheduler = get_linear_schedule_with_warmup(optimizer,
                                                num_warmup_steps=0,
                                                num_training_steps=total_steps)

    if model_to_load is not None:
        try:
            model.roberta.load_state_dict(torch.load(model_to_load))
            print("LOADED MODEL")
        except Exception as err:
            # Loading stays best-effort (the pretrained weights are kept), but the
            # failure is reported instead of being silently swallowed, and
            # KeyboardInterrupt / SystemExit are no longer caught.
            print("WARNING: could not load weights from {!r}: {}".format(model_to_load, err))
    return model, optimizer, scheduler

In [None]:
# The scheduler is stepped once per batch during training, so it spans batches-per-epoch * epochs steps.
epochs = 5
total_steps = len(train_dataloader) * epochs

# Pretrained BERTweet classifier (no saved weights loaded) with its optimizer and LR scheduler.
model, optimizer, scheduler = prepare_model("vinai/bertweet-base" ,num_classes=2, model_to_load=None, total_steps = total_steps)

In [None]:
def validate(model,test_dataloader):
    """Evaluate `model` on a labelled dataloader and print summary metrics.

    Args:
        model: model whose call returns an object with ``.loss`` and ``.logits``.
        test_dataloader: iterable of (input_ids, attention_mask, labels) batches
            that also supports ``len()``.

    Returns:
        (preds, avg_val_accuracy, avg_val_loss, validation_time) where `preds`
        is a list holding one logits array per batch, `avg_val_accuracy` is the
        fraction of correctly classified examples, `avg_val_loss` is the mean
        of the per-batch losses and `validation_time` is a formatted duration.
    """
    model.eval()
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    model.to(device)
    preds = []
    correct_examples = 0
    seen_examples = 0
    total_eval_loss = 0
    t0 = time.time()
    for batch in test_dataloader:

        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_labels = batch[2].to(device)
        with torch.no_grad():
            outputs = model(b_input_ids,
                            token_type_ids=None,
                            attention_mask=b_input_mask,
                            labels=b_labels)
            loss = outputs.loss
            logits = outputs.logits

        total_eval_loss += loss.item()

        logits = logits.detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()

        preds.append(logits)
        # Count hits per example rather than averaging per-batch accuracies:
        # a smaller final batch would otherwise be over-weighted.
        batch_predictions = np.argmax(logits, axis=1).flatten()
        batch_labels = label_ids.flatten()
        correct_examples += int(np.sum(batch_predictions == batch_labels))
        seen_examples += len(batch_labels)

    # max(..., 1) avoids a ZeroDivisionError when the dataloader is empty.
    avg_val_accuracy = correct_examples / max(seen_examples, 1)
    print("  Accuracy: {0:.2f} %".format(avg_val_accuracy*100))
    avg_val_loss = total_eval_loss / max(len(test_dataloader), 1)
    validation_time = format_time(time.time() - t0)
    print("  Test Loss: {0:.2f}".format(avg_val_loss))
    print("  Test took: {:}".format(validation_time))
    return preds, avg_val_accuracy, avg_val_loss, validation_time

In [None]:
def train(model,optimizer,scheduler,train_dataloader,validation_dataloader,epochs):
    """Fine-tune `model` for `epochs` epochs, validating after every epoch.

    Args:
        model: model whose call returns an object with ``.loss``.
        optimizer: optimizer stepped once per batch.
        scheduler: learning-rate scheduler stepped once per batch.
        train_dataloader: iterable of (input_ids, attention_mask, labels) batches.
        validation_dataloader: labelled loader passed to `validate`.
        epochs: number of passes over `train_dataloader`.

    Returns:
        list with one dict per epoch holding 'epoch', 'Training Loss',
        'Valid. Loss', 'Valid. Accur.', 'Training Time' and 'Validation Time'.
        (These statistics used to be collected and then discarded.)
    """
    seed_val = 42
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    model.to(device)
    # Seed the Python, NumPy and torch (CPU + CUDA) random number generators.
    random.seed(seed_val)
    np.random.seed(seed_val)
    torch.manual_seed(seed_val)
    torch.cuda.manual_seed_all(seed_val)
    training_stats = []
    total_t0 = time.time()

    for epoch_i in range(0, epochs):

        print("")
        print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))
        print('Training...')

        t0 = time.time()
        total_train_loss = 0
        model.train()
        for step, batch in enumerate(train_dataloader):
            # Progress report every 40 batches.
            if step % 40 == 0 and not step == 0:
                elapsed = format_time(time.time() - t0)
                print('  Batch {:>5,}  of  {:>5,}.    Elapsed: {:}.'.format(step, len(train_dataloader), elapsed))
            b_input_ids = batch[0].to(device)
            b_input_mask = batch[1].to(device)
            b_labels = batch[2].to(device)
            model.zero_grad()
            outputs = model(b_input_ids,
                            token_type_ids=None,
                            attention_mask=b_input_mask,
                            labels=b_labels)
            loss = outputs.loss
            total_train_loss += loss.item()
            loss.backward()
            # Clip the gradient norm to 1.0 before the parameter update.
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            optimizer.step()
            scheduler.step()
        avg_train_loss = total_train_loss / len(train_dataloader)
        training_time = format_time(time.time() - t0)

        print("")
        print("  Average training loss: {0:.2f}".format(avg_train_loss))
        print("  Training epcoh took: {:}".format(training_time))

        _, avg_val_accuracy, avg_val_loss, validation_time = validate(model,validation_dataloader)
        training_stats.append(
            {
                'epoch': epoch_i + 1,
                'Training Loss': avg_train_loss,
                'Valid. Loss': avg_val_loss,
                'Valid. Accur.': avg_val_accuracy,
                'Training Time': training_time,
                'Validation Time': validation_time
            }
        )

    print("")
    print("Training complete!")

    print("Total training took {:} (h:mm:ss)".format(format_time(time.time()-total_t0)))

    return training_stats




In [None]:
# Fine-tune on the training split, validating on the dev split after each epoch.
train(model,optimizer,scheduler,train_dataloader,validation_dataloader, epochs)

In [None]:
# Move the model to CPU and save only the state dict of its `roberta` submodule
# (the same submodule prepare_model reloads via model_to_load).
# NOTE(review): presumably the ./bertweet directory must already exist — confirm.
torch.save(model.cpu().roberta.state_dict(),"./bertweet/bertweet_v21")

## Inference

In [None]:
def predict(model,test_dataloader):
    """Run `model` over unlabelled batches and collect the logits.

    Each batch supplies input ids at index 0 and the attention mask at index 1.
    Returns a flat list with one NumPy logits row per example, in loader order.
    """
    model.eval()
    if torch.cuda.is_available():
        device = 'cuda'
    else:
        device = 'cpu'
    model.to(device)

    collected_rows = []
    for batch in test_dataloader:
        ids_on_device = batch[0].to(device)
        mask_on_device = batch[1].to(device)

        # Inference only: no autograd graph is needed.
        with torch.no_grad():
            model_output = model(ids_on_device,
                                 token_type_ids=None,
                                 attention_mask=mask_on_device)
            batch_logits = model_output.logits

        # Iterating a 2-D array yields its rows, so this appends one row per example.
        collected_rows.extend(batch_logits.detach().cpu().numpy())

    return collected_rows

In [None]:
# Logits for every test example, in the (sequential) order of the test loader.
result = predict(model,test_dataloader)

In [None]:
from scipy.special import softmax

# Predicted class id per test example: arg-max over the logits.
pred_labels = np.argmax(result, axis = 1)

# Softmax score of class 1 ("rumour"); not used further in this cell.
pred_scores = softmax(result, axis=1)[:, 1]

# Map class ids back to their string labels.
predicted_labels = [convert_prediction(pred) for pred in pred_labels]

# Pair predictions with tweet ids; relies on `result` following the row order of test_df.
output = pd.DataFrame({'id':test_df.id,'target':predicted_labels})
output

In [None]:
# Write the predictions as one JSON object mapping tweet id -> predicted label.
submission = pd.Series(output.target.values,index=output.id).to_dict()
with open('test-output.json', 'w') as f:
    json.dump(submission, f)

## BERTweet with merged dataset (i.e. train + dev data has been merged)

In [None]:
# Stack train and dev rows into one training frame with a fresh 0..n-1 index.
# DataFrame.append was removed in pandas 2.0; pd.concat is the supported equivalent.
combined_df = pd.concat([train_df, dev_df], ignore_index=True)
combined_df

In [None]:
# Show the thread text stored under index label 0 of the merged frame.
combined_df.text[0]

In [None]:
def prepare_dataloaders(combined_df,test_df,tokenizer_class="vinai/bertweet-base",batch_size=8):
    """Build the training and test dataloaders for the merged (train + dev) run.

    This redefinition replaces the earlier three-split `prepare_dataloaders`.
    Returns (train_dataloader, test_dataloader): the train loader samples
    randomly and yields (input_ids, attention_mask, labels); the test loader
    iterates in order and yields (input_ids, attention_mask).
    """
    # Load the AutoTokenizer with a normalization mode if the input Tweet is raw
    bertweet_tokenizer = AutoTokenizer.from_pretrained(tokenizer_class, use_fast=False, normalization=True)

    train_encoded = bert_encode(combined_df, bertweet_tokenizer)
    train_label_series = combined_df.label.astype(int)
    test_encoded = bert_encode(test_df, bertweet_tokenizer)

    train_ids, train_masks = train_encoded.values()
    train_labels = torch.tensor(train_label_series.values)
    train_dataset = TensorDataset(train_ids, train_masks, train_labels)

    test_ids, test_masks = test_encoded.values()
    test_dataset = TensorDataset(test_ids, test_masks)

    shuffled_train_loader = DataLoader(
        train_dataset,
        sampler=RandomSampler(train_dataset),
        batch_size=batch_size,
    )
    ordered_test_loader = DataLoader(
        test_dataset,
        sampler=SequentialSampler(test_dataset),
        batch_size=batch_size,
    )
    return shuffled_train_loader, ordered_test_loader

In [None]:
# Rebuild the loaders from the merged frame; this overwrites the earlier train/test loaders.
train_dataloader,test_dataloader = prepare_dataloaders(combined_df, test_df)

In [None]:
def prepare_model(model_class="vinai/bertweet-base",num_classes=2,model_to_load=None,total_steps=-1):
    """Create the classifier, its optimizer and a linear learning-rate schedule.

    Args:
        model_class: name/path passed to
            ``AutoModelForSequenceClassification.from_pretrained``.
        num_classes: number of output labels.
        model_to_load: optional path to a state dict for ``model.roberta``;
            loading is best-effort (see below).
        total_steps: number of training steps handed to the scheduler.

    Returns:
        (model, optimizer, scheduler).
    """
    model = AutoModelForSequenceClassification.from_pretrained(
        model_class,
        num_labels=num_classes,
        output_attentions=False,
        output_hidden_states=False,
    )

    optimizer = AdamW(model.parameters(),
                      lr=5e-5,
                      eps=1e-8)
    scheduler = get_linear_schedule_with_warmup(optimizer,
                                                num_warmup_steps=0,
                                                num_training_steps=total_steps)

    if model_to_load is not None:
        try:
            model.roberta.load_state_dict(torch.load(model_to_load))
            print("LOADED MODEL")
        except Exception as err:
            # Loading stays best-effort (the pretrained weights are kept), but the
            # failure is reported instead of being silently swallowed, and
            # KeyboardInterrupt / SystemExit are no longer caught.
            print("WARNING: could not load weights from {!r}: {}".format(model_to_load, err))
    return model, optimizer, scheduler

In [None]:
# Recompute the schedule length for the merged training loader (one scheduler step per batch).
epochs = 5
total_steps = len(train_dataloader) * epochs

# Start again from the pretrained checkpoint; this replaces the previously trained `model`.
model, optimizer, scheduler = prepare_model("vinai/bertweet-base" ,num_classes=2, model_to_load=None, total_steps = total_steps)

In [None]:
def train(model,optimizer,scheduler,train_dataloader,epochs):
    """Fine-tune `model` for `epochs` epochs without a validation pass.

    Args:
        model: model whose call returns an object with ``.loss``.
        optimizer: optimizer stepped once per batch.
        scheduler: learning-rate scheduler stepped once per batch.
        train_dataloader: iterable of (input_ids, attention_mask, labels) batches.
        epochs: number of passes over `train_dataloader`.

    Returns:
        list with one dict per epoch holding 'epoch', 'Training Loss' and
        'Training Time'. (The list used to be created but never filled or
        returned.)
    """
    seed_val = 42
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    model.to(device)
    # Seed the Python, NumPy and torch (CPU + CUDA) random number generators.
    random.seed(seed_val)
    np.random.seed(seed_val)
    torch.manual_seed(seed_val)
    torch.cuda.manual_seed_all(seed_val)
    training_stats = []
    total_t0 = time.time()

    for epoch_i in range(0, epochs):

        print("")
        print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))
        print('Training...')

        t0 = time.time()
        total_train_loss = 0
        model.train()
        for step, batch in enumerate(train_dataloader):
            # Progress report every 40 batches.
            if step % 40 == 0 and not step == 0:
                elapsed = format_time(time.time() - t0)
                print('  Batch {:>5,}  of  {:>5,}.    Elapsed: {:}.'.format(step, len(train_dataloader), elapsed))
            b_input_ids = batch[0].to(device)
            b_input_mask = batch[1].to(device)
            b_labels = batch[2].to(device)
            model.zero_grad()
            outputs = model(b_input_ids,
                            token_type_ids=None,
                            attention_mask=b_input_mask,
                            labels=b_labels)
            loss = outputs.loss
            total_train_loss += loss.item()
            loss.backward()
            # Clip the gradient norm to 1.0 before the parameter update.
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            optimizer.step()
            scheduler.step()
        avg_train_loss = total_train_loss / len(train_dataloader)
        training_time = format_time(time.time() - t0)

        print("")
        print("  Average training loss: {0:.2f}".format(avg_train_loss))
        print("  Training epcoh took: {:}".format(training_time))

        training_stats.append(
            {
                'epoch': epoch_i + 1,
                'Training Loss': avg_train_loss,
                'Training Time': training_time
            }
        )

    print("")
    print("Training complete!")

    print("Total training took {:} (h:mm:ss)".format(format_time(time.time()-total_t0)))

    return training_stats

In [None]:
# Fine-tune on the merged train + dev data; no validation split is left for this run.
train(model,optimizer,scheduler,train_dataloader, epochs)

In [None]:
# Move the model to CPU and save only the state dict of its `roberta` submodule.
# NOTE(review): presumably the ./bertweet directory must already exist — confirm.
torch.save(model.cpu().roberta.state_dict(),"./bertweet/bertweet_v28")

## Inference

In [None]:
def predict(model,test_dataloader):
    """Collect per-example logits for every batch of an unlabelled dataloader.

    Batches are indexed as [0] input ids and [1] attention mask. The result is
    a flat list of NumPy logits rows, one per example, in loader order.
    """
    model.eval()
    device = 'cpu'
    if torch.cuda.is_available():
        device = 'cuda'
    model.to(device)

    logit_rows = []
    for batch in test_dataloader:
        batch_ids = batch[0].to(device)
        batch_mask = batch[1].to(device)

        # Gradients are not needed for inference.
        with torch.no_grad():
            result = model(batch_ids,
                           token_type_ids=None,
                           attention_mask=batch_mask)
            raw_logits = result.logits

        as_numpy = raw_logits.detach().cpu().numpy()
        # One list entry per example (row of the batch array).
        logit_rows += [row for row in as_numpy]

    return logit_rows

In [None]:
# Logits for every test example from the model trained on the merged data.
result = predict(model,test_dataloader)

In [None]:
from scipy.special import softmax

# Predicted class id per test example: arg-max over the logits.
pred_labels = np.argmax(result, axis = 1)

# Softmax score of class 1 ("rumour") for each example.
pred_scores = softmax(result, axis=1)[:, 1]

In [None]:
# Map class ids back to their string labels.
predicted_labels = [convert_prediction(pred) for pred in pred_labels]

# Pair predictions with tweet ids; relies on `result` following the row order of test_df.
output = pd.DataFrame({'id':test_df.id,'target':predicted_labels})
output

In [None]:
# Write the predictions as one JSON object mapping tweet id -> predicted label.
# NOTE(review): the checkpoint for this run is saved as bertweet_v28 while this file is tagged v27 — confirm the suffix.
submission = pd.Series(output.target.values,index=output.id).to_dict()
with open('test-output_v27.json', 'w') as f:
    json.dump(submission, f)