In [None]:
!pip install emoji

To run BERTweet, you need to install the latest version of transformers from source:
* git clone https://github.com/huggingface/transformers.git
* cd transformers
* pip3 install --upgrade .

In [7]:
import os
import json
import random
import time
import datetime
import torch
import argparse
import numpy as np
import pandas as pd
from torch.nn import functional as F
from sklearn.metrics import (f1_score, recall_score, accuracy_score,
                                precision_score)
from transformers import (get_linear_schedule_with_warmup,AdamW,AutoModel, AutoTokenizer,
                            AutoModelForSequenceClassification)
from torch.utils.data import (TensorDataset,DataLoader,
                             RandomSampler, SequentialSampler, Dataset)


Set up some helper functions

In [8]:
def flat_accuracy(preds, labels):
    """Return the fraction of rows of `preds` whose arg-max column equals the label.

    preds: 2-D array of per-class scores (arg-max is taken along axis 1).
    labels: array of class ids; it is flattened before the comparison.
    """
    predicted_classes = np.argmax(preds, axis=1).flatten()
    true_classes = labels.flatten()
    n_correct = np.sum(predicted_classes == true_classes)
    return n_correct / len(true_classes)

def format_time(elapsed):
    """Render a duration in seconds as the string form of a `timedelta` (e.g. `0:01:05`).

    The duration is rounded to whole seconds first.
    """
    whole_seconds = int(round(elapsed))
    duration = datetime.timedelta(seconds=whole_seconds)
    return str(duration)

## Preprocessing

For BERTweet we only load the data and do not perform any manual preprocessing: links and usernames are not removed (the tokenizer is loaded with `normalization=True` below, which maps them to `HTTPURL` and `@USER` tokens).

In [9]:
def convert_label(label):
    """Map a textual class label to its integer id ("rumour" -> 1, "non-rumour" -> 0).

    Raises:
        Exception: if `label` is neither of the two known class names.
    """
    if label == "rumour":
        return 1
    if label == "non-rumour":
        return 0
    raise Exception("label classes must be 'rumour' or 'non-rumour'")
        
        
def convert_prediction(pred):
    """Map a predicted class id back to its textual label (1 -> "rumour", 0 -> "non-rumour").

    Raises:
        Exception: if `pred` compares equal to neither 1 nor 0.
    """
    if pred == 1:
        return "rumour"
    if pred == 0:
        return "non-rumour"
    raise Exception("prediction classes must be '0' or '1'")

In [10]:
def load_data(data_file, label_file):
    """Load tweet threads from a JSON-lines file into a DataFrame.

    Each non-blank line of `data_file` is parsed as a JSON list of tweet
    objects (a thread). The thread becomes one row whose `id` is the id of the
    first tweet and whose `text` is the text of every tweet in the thread,
    joined with single spaces and stripped of surrounding whitespace.

    Args:
        data_file: path to the JSON-lines file with one thread per line.
        label_file: path to a JSON object mapping the stringified id of a
            thread's first tweet to "rumour"/"non-rumour", or None for
            unlabelled data.

    Returns:
        pandas.DataFrame with columns `id`, `text` and, when `label_file` is
        given, `label` (the integer produced by `convert_label`).
    """
    y_true = None
    if label_file is not None:
        # Context manager so the label file handle is closed deterministically.
        with open(label_file, 'r', encoding='utf-8') as labels_in:
            y_true = json.load(labels_in)

    data_list = []
    with open(data_file, 'r', encoding='utf-8') as data_in:
        for line in data_in:
            # Tolerate blank lines (e.g. a trailing newline at the end of the file).
            if not line.strip():
                continue

            tweets_in_event = json.loads(line)
            source_tweet = tweets_in_event[0]

            # Source tweet first, then all follow-up tweets of the thread.
            thread_text = " ".join(t['text'] for t in tweets_in_event).strip()

            tweet = {'id': source_tweet['id'], 'text': thread_text}
            if y_true is not None:
                tweet['label'] = convert_label(y_true[str(tweet['id'])])

            data_list.append(tweet)

    return pd.DataFrame(data_list)

In [11]:
# Labelled training and development splits, followed by the unlabelled test split.
train_df = load_data(data_file='../data/train.data.jsonl',
                     label_file='../data/train.label.json')
dev_df = load_data(data_file='../data/dev.data.jsonl',
                   label_file='../data/dev.label.json')
test_df = load_data(data_file='../data/test.data.jsonl',
                    label_file=None)

# BERTweet

In [12]:
def bert_encode(df, tokenizer, max_length=128):
    """Tokenize the `text` column of `df` into fixed-length id/mask tensors.

    Args:
        df: DataFrame with a `text` column of strings.
        tokenizer: callable Hugging Face tokenizer.
        max_length: every sequence is truncated or padded to exactly this
            many tokens (special tokens included).

    Returns:
        dict with, in this order, `input_word_ids` and `input_mask`, the
        tokenizer's `input_ids` and `attention_mask` tensors for all rows.
    """
    texts = df["text"].tolist()

    # One batched call instead of one call per row; `padding='max_length'`
    # replaces the deprecated `pad_to_max_length=True`.
    encoded = tokenizer(
        texts,
        add_special_tokens=True,
        max_length=max_length,
        padding='max_length',
        truncation=True,
        return_attention_mask=True,
        return_tensors='pt',
    )

    # Callers unpack `.values()` positionally, so key order matters.
    return {
        'input_word_ids': encoded['input_ids'],
        'input_mask': encoded['attention_mask'],
    }

In [13]:
def prepare_dataloaders(train_df,test_df,dev_df,batch_size=8,tokenizer_class="vinai/bertweet-base"):
    """Tokenize the three splits and wrap them in DataLoaders.

    Args:
        train_df: labelled training frame (`text` and `label` columns).
        test_df: unlabelled test frame (`text` column).
        dev_df: labelled validation frame (`text` and `label` columns).
        batch_size: batch size used for all three loaders.
        tokenizer_class: name or path of the tokenizer checkpoint to load.

    Returns:
        (train_dataloader, validation_dataloader, test_dataloader). The
        training loader samples randomly; the other two iterate in order.
        Labelled batches are (input_ids, attention_mask, labels); test batches
        are (input_ids, attention_mask).
    """
    # Load the AutoTokenizer with a normalization mode if the input Tweet is raw
    tokenizer = AutoTokenizer.from_pretrained(tokenizer_class, use_fast=False, normalization=True)

    def build_dataset(df, labelled):
        # Encode one split; attach integer labels only when the split has them.
        input_ids, attention_masks = bert_encode(df, tokenizer).values()
        if not labelled:
            return TensorDataset(input_ids, attention_masks)
        labels = torch.tensor(df.label.astype(int).values)
        return TensorDataset(input_ids, attention_masks, labels)

    train_dataset = build_dataset(train_df, labelled=True)
    val_dataset = build_dataset(dev_df, labelled=True)
    test_dataset = build_dataset(test_df, labelled=False)

    train_dataloader = DataLoader(
                train_dataset,
                sampler = RandomSampler(train_dataset),
                batch_size = batch_size
            )

    # Sequential order keeps predictions aligned with the rows of the input frames.
    validation_dataloader = DataLoader(
                val_dataset,
                sampler = SequentialSampler(val_dataset),
                batch_size = batch_size
            )

    test_dataloader = DataLoader(
                test_dataset,
                sampler = SequentialSampler(test_dataset),
                batch_size = batch_size
            )
    return train_dataloader,validation_dataloader,test_dataloader



In [14]:
train_dataloader,validation_dataloader,test_dataloader = prepare_dataloaders(train_df, test_df, dev_df)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


## Experiments

In [73]:
def test_encode(sentence):
    """Encode one sentence with the BERTweet tokenizer and return its `input_ids` tensor.

    No explicit `max_length` is passed, so padding and truncation use the
    tokenizer's own maximum length (the recorded output below shows a
    1 x 128 tensor).
    """
    tokenizer = AutoTokenizer.from_pretrained("vinai/bertweet-base", use_fast=False, normalization=True)

    # `padding='max_length'` replaces the deprecated `pad_to_max_length=True`.
    encoded_dict = tokenizer(
                        sentence,
                        add_special_tokens = True,
                        padding = 'max_length',
                        truncation = True,
                        return_attention_mask = True,
                        return_tensors = 'pt',
                )

    return encoded_dict['input_ids']

In [74]:
def test_decode(tokens):
    """Convert token ids back to their BERTweet vocabulary tokens.

    The tokenizer is loaded from the "vinai/bertweet-base" checkpoint on every call.
    """
    bertweet_tokenizer = AutoTokenizer.from_pretrained(
        "vinai/bertweet-base", use_fast=False, normalization=True
    )
    decoded_tokens = bertweet_tokenizer.convert_ids_to_tokens(tokens)
    return decoded_tokens



In [75]:
train_df.text[0]

"How to respond to the murderous attack on Charlie Hebdo? Every newspaper in the free world should print this. http://t.co/sC2ot63F6j @Heresy_Corner @KrustyAllslopp \nJews label anyone they don't like as Anti-Semite and campaign until that person/company is finished. @Heresy_Corner @KrustyAllslopp \nNo one does. @Heresy_Corner #ImCharlieHebdo @KrustyAllslopp Ditto @Grizzly_Stats @tom_wein What innocent Muslims ought to find insulting is an atrocity committed in their name, not a sodding cartoon. @Heresy_Corner @KrustyAllslopp \nYes, until it becomes yours. @Heresy_Corner @KrustyAllslopp \nWhy insult people who have nothing to do with this? People are genuinely offended by such drawings. @KrustyAllslopp @Heresy_Corner \nAnd neither am I! I think this has little to do with actual Muslims. @berg_han Ah, you don't like Jews. Bye bye. @KrustyAllslopp @Heresy_Corner Also they kid you along with benign stuff then ... WHAM it's like a river of shite! @berg_han @Heresy_Corner It's a good point 

In [79]:
text_test = train_df.text[0]
text_preprocessed = test_encode(text_test)


print(f'Shape      : {text_preprocessed.shape}')
print(f'Word Ids   : {text_preprocessed}')
print(test_decode(text_preprocessed[0, :128]))

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Shape      : torch.Size([1, 128])
Word Ids   : tensor([[    0,   203,     9,  3924,     9,     6, 44766,  1545,    24,  5580,
         44911,    32,    21,  1077,  8414,    16,     6,   336,   220,   151,
          4197,    33,     4,    10,     5,     5,  7630,  6651,   400,    59,
            32,    29,    43,    52,  7756,  9529,  2646,    13,  2184,   350,
            25,   282,    75,  1151,    17,  1368,     4,     5,     5,   218,
            63,   158,     4,     5,  8170,  5145, 24251, 44911,    32,     5,
         47813,     5,     5,   165,  4520,  5636,  8789,     9,   259, 15386,
            17,    74, 36802,   862,  6099,    16,   130,   330,     7,    46,
            11,  2266, 32342,  9523,     4,     5,     5,   699,     7,   350,
            18,  3655,  1103,     4,     5,     5,   250,  7967,    83,    87,
            36,   349,     9,    32,    30,    33,    21,   603,    41,  5990,
          7256,    61,   367, 17654,     4,     5,     5,   159,  4285,   155,
     

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


['<s>', 'How', 'to', 'respond', 'to', 'the', 'murderous', 'attack', 'on', 'Charlie', 'Heb@@', 'do', '?', 'Every', 'newspaper', 'in', 'the', 'free', 'world', 'should', 'print', 'this', '.', 'HTTPURL', '@USER', '@USER', 'Jews', 'label', 'anyone', 'they', 'do', "n't", 'like', 'as', 'Anti-@@', 'Sem@@', 'ite', 'and', 'campaign', 'until', 'that', 'person', '/', 'company', 'is', 'finished', '.', '@USER', '@USER', 'No', 'one', 'does', '.', '@USER', '#Im@@', 'Char@@', 'lie@@', 'Heb@@', 'do', '@USER', 'Ditto', '@USER', '@USER', 'What', 'innocent', 'Muslims', 'ought', 'to', 'find', 'insulting', 'is', 'an', 'atro@@', 'city', 'committed', 'in', 'their', 'name', ',', 'not', 'a', 'so@@', 'dding', 'cartoon', '.', '@USER', '@USER', 'Yes', ',', 'until', 'it', 'becomes', 'yours', '.', '@USER', '@USER', 'Why', 'insult', 'people', 'who', 'have', 'nothing', 'to', 'do', 'with', 'this', '?', 'People', 'are', 'genuinely', 'offended', 'by', 'such', 'drawings', '.', '@USER', '@USER', 'And', 'neither', 'am', 'I',

## Training

In [20]:
def prepare_model(model_class="vinai/bertweet-base",num_classes=2,model_to_load=None,total_steps=-1):
    """Build the sequence classifier together with its optimizer and LR scheduler.

    Args:
        model_class: name or path of the pretrained checkpoint.
        num_classes: number of output labels of the classification head.
        model_to_load: optional path to a state dict for `model.roberta`
            (encoder weights only). Loading is best-effort: on failure a
            warning is printed and the pretrained weights are kept.
        total_steps: total number of optimizer steps, passed to the linear
            schedule as `num_training_steps`.
            NOTE(review): with the default of -1 the linear schedule looks
            like it would clamp the learning rate to 0 -- confirm, and always
            pass the real step count.

    Returns:
        (model, optimizer, scheduler)
    """
    model = AutoModelForSequenceClassification.from_pretrained(
        model_class,
        num_labels = num_classes,
        output_attentions = False,
        output_hidden_states = False,
    )

    # torch.optim.AdamW replaces the deprecated transformers AdamW; weight_decay=0.0
    # is passed explicitly because the torch default is not zero.
    optimizer = torch.optim.AdamW(model.parameters(),
                    lr = 5e-5,
                    eps = 1e-8,
                    weight_decay = 0.0
                    )
    scheduler = get_linear_schedule_with_warmup(optimizer,
                                                num_warmup_steps = 0,
                                                num_training_steps = total_steps)

    if model_to_load is not None:
        try:
            # map_location='cpu' lets a checkpoint saved from a GPU load on a CPU-only machine.
            state_dict = torch.load(model_to_load, map_location='cpu')
            model.roberta.load_state_dict(state_dict)
            print("LOADED MODEL")
        except Exception as err:
            # Still best-effort, but report the failure instead of silently
            # continuing without the requested weights.
            print("WARNING: could not load weights from {!r}: {}".format(model_to_load, err))
    return model, optimizer, scheduler

In [None]:
# Number of passes over the training set; the scheduler is given the total
# number of optimizer steps (batches per epoch times epochs).
epochs = 5
total_steps = len(train_dataloader) * epochs

# No saved weights are loaded here (model_to_load=None).
model, optimizer, scheduler = prepare_model("vinai/bertweet-base" ,num_classes=2, model_to_load=None, total_steps = total_steps)

## Training

In [None]:
def validate(model,test_dataloader):
    """Evaluate `model` on a labelled dataloader and print accuracy, loss and duration.

    Accuracy and loss are averaged over examples rather than over batches, so
    a smaller final batch does not skew the result.

    Args:
        model: classifier whose output exposes `.loss` and `.logits`.
        test_dataloader: yields (input_ids, attention_mask, labels) batches.

    Returns:
        (preds, avg_val_accuracy, avg_val_loss, validation_time), where `preds`
        is a list with one logits array per batch and `validation_time` is a
        formatted duration string.
    """
    model.eval()
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    model.to(device)
    preds = []
    n_correct = 0
    n_examples = 0
    total_eval_loss = 0
    t0 = time.time()
    for batch in test_dataloader:

        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_labels = batch[2].to(device)
        with torch.no_grad():
            outputs = model(b_input_ids,
                                   token_type_ids=None,
                                   attention_mask=b_input_mask,
                                   labels=b_labels)
            loss = outputs.loss
            logits = outputs.logits

        logits = logits.detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy().flatten()
        batch_n = label_ids.shape[0]

        preds.append(logits)
        # Weight the batch loss by the batch size so it can be averaged per example.
        total_eval_loss += loss.item() * batch_n
        n_correct += int(np.sum(np.argmax(logits, axis=1).flatten() == label_ids))
        n_examples += batch_n

    # Guard against an empty dataloader instead of dividing by zero.
    avg_val_accuracy = n_correct / n_examples if n_examples else 0.0
    print("  Accuracy: {0:.2f} %".format(avg_val_accuracy*100))
    avg_val_loss = total_eval_loss / n_examples if n_examples else 0.0
    validation_time = format_time(time.time() - t0)
    print("  Test Loss: {0:.2f}".format(avg_val_loss))
    print("  Test took: {:}".format(validation_time))
    return preds, avg_val_accuracy, avg_val_loss, validation_time

In [None]:
def train(model,optimizer,scheduler,train_dataloader,validation_dataloader,epochs):
    """Fine-tune `model` for `epochs` epochs, validating after each one.

    Args:
        model: classifier whose output exposes `.loss`.
        optimizer: optimizer over the model's parameters.
        scheduler: LR scheduler, stepped once per batch.
        train_dataloader: yields (input_ids, attention_mask, labels) batches.
        validation_dataloader: labelled loader passed to `validate`.
        epochs: number of passes over `train_dataloader`.

    Returns:
        list of per-epoch dicts with the keys 'epoch', 'Training Loss',
        'Valid. Loss', 'Valid. Accur.', 'Training Time', 'Validation Time'.
    """
    seed_val = 42
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    model.to(device)
    # Seeds are set here, i.e. after the model passed in has been constructed.
    random.seed(seed_val)
    np.random.seed(seed_val)
    torch.manual_seed(seed_val)
    torch.cuda.manual_seed_all(seed_val)
    training_stats = []
    total_t0 = time.time()

    for epoch_i in range(0, epochs):

        print("")
        print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))
        print('Training...')

        t0 = time.time()
        total_train_loss = 0
        model.train()
        for step, batch in enumerate(train_dataloader):
            # Progress line every 40 batches.
            if step % 40 == 0 and not step == 0:
                elapsed = format_time(time.time() - t0)
                print('  Batch {:>5,}  of  {:>5,}.    Elapsed: {:}.'.format(step, len(train_dataloader), elapsed))
            b_input_ids = batch[0].to(device)
            b_input_mask = batch[1].to(device)
            b_labels = batch[2].to(device)
            model.zero_grad()
            outputs = model(b_input_ids,
                                token_type_ids=None,
                                attention_mask=b_input_mask,
                                labels=b_labels)
            loss = outputs.loss
            total_train_loss += loss.item()
            loss.backward()
            # Clip the gradient norm to 1.0 before the optimizer step.
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            optimizer.step()
            scheduler.step()
        avg_train_loss = total_train_loss / len(train_dataloader)
        training_time = format_time(time.time() - t0)

        print("")
        print("  Average training loss: {0:.2f}".format(avg_train_loss))
        print("  Training epcoh took: {:}".format(training_time))

        _, avg_val_accuracy, avg_val_loss, validation_time = validate(model,validation_dataloader)
        training_stats.append(
            {
                'epoch': epoch_i + 1,
                'Training Loss': avg_train_loss,
                'Valid. Loss': avg_val_loss,
                'Valid. Accur.': avg_val_accuracy,
                'Training Time': training_time,
                'Validation Time': validation_time
            }
        )

    print("")
    print("Training complete!")

    print("Total training took {:} (h:mm:ss)".format(format_time(time.time()-total_t0)))

    # Hand the collected statistics back to the caller instead of discarding them.
    return training_stats




In [None]:
train(model,optimizer,scheduler,train_dataloader,validation_dataloader, epochs)

In [None]:
# Only the state dict of `model.roberta` is saved (not the whole classifier).
# Note that `model.cpu()` also moves the model itself to the CPU.
os.makedirs("./bertweet", exist_ok=True)  # torch.save does not create missing directories
torch.save(model.cpu().roberta.state_dict(),"./bertweet/bertweet_v21")

## Inference

In [None]:
def predict(model,test_dataloader):
    """Run `model` over every batch of `test_dataloader` and collect the logits.

    Only the first two elements of each batch (input ids, attention mask) are
    used. Returns a flat list with one logits array per example, in loader order.
    """
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    model.eval()
    model.to(device)

    all_logits = []
    for batch in test_dataloader:
        ids = batch[0].to(device)
        mask = batch[1].to(device)
        with torch.no_grad():
            batch_logits = model(ids, token_type_ids=None, attention_mask=mask).logits
        # Extending with a 2-D array appends its rows one by one.
        all_logits.extend(batch_logits.detach().cpu().numpy())

    return all_logits

In [None]:
result = predict(model,test_dataloader)

In [None]:
from scipy.special import softmax

# Predicted class id per example: index of the largest logit.
pred_labels = np.argmax(result, axis = 1)

# Softmax probability of class 1; it is computed but not used in this cell.
pred_scores = softmax(result, axis=1)[:, 1]

# Map class ids back to their string labels.
predicted_labels = [convert_prediction(pred) for pred in pred_labels]

# Pair every test id with its predicted label and display the frame.
output = pd.DataFrame({'id':test_df.id,'target':predicted_labels})
output

In [None]:
# Build an {id: predicted label} mapping and write it out as JSON.
submission = pd.Series(output.target.values,index=output.id).to_dict()
with open('test-output.json', 'w') as f:
    json.dump(submission, f)

# BERTweet with merged dataset (i.e. train + dev data has been merged)

In [41]:
# DataFrame.append was removed in pandas 2.0; pd.concat gives the same stacked
# frame with a fresh 0..n-1 index.
combined_df = pd.concat([train_df, dev_df], ignore_index = True)
combined_df

Unnamed: 0,id,text,label
0,552800070199148544,How to respond to the murderous attack on Char...,0
1,544388259359387648,"You can't condemn an entire race, nation or re...",0
2,552805970536333314,Attempts to extend blame for this to all Musli...,0
3,525071376084791297,"Rest in Peace, Cpl. Nathan Cirillo. Killed tod...",1
4,498355319979143168,People DEBATING whether #MikeBrown shoplifted ...,0
...,...,...,...
5216,525025279803424768,The soldier shot dead in Wednesday's Ottawa at...,1
5217,552784600502915072,Charlie Hebdo became well known for publishing...,0
5218,499696525808001024,We got through. That's a sniper on top of a ta...,0
5219,580320612155060224,Last position of Germanwings flight #4U9525 at...,1


In [42]:
combined_df.text[0]

"How to respond to the murderous attack on Charlie Hebdo? Every newspaper in the free world should print this. http://t.co/sC2ot63F6j @Heresy_Corner @KrustyAllslopp \nJews label anyone they don't like as Anti-Semite and campaign until that person/company is finished. @Heresy_Corner @KrustyAllslopp \nNo one does. @Heresy_Corner #ImCharlieHebdo @KrustyAllslopp Ditto @Grizzly_Stats @tom_wein What innocent Muslims ought to find insulting is an atrocity committed in their name, not a sodding cartoon. @Heresy_Corner @KrustyAllslopp \nYes, until it becomes yours. @Heresy_Corner @KrustyAllslopp \nWhy insult people who have nothing to do with this? People are genuinely offended by such drawings. @KrustyAllslopp @Heresy_Corner \nAnd neither am I! I think this has little to do with actual Muslims. @berg_han Ah, you don't like Jews. Bye bye. @KrustyAllslopp @Heresy_Corner Also they kid you along with benign stuff then ... WHAM it's like a river of shite! @berg_han @Heresy_Corner It's a good point 

In [43]:
def prepare_dataloaders(combined_df,test_df,tokenizer_class="vinai/bertweet-base",batch_size=8):
    """Tokenize a labelled training frame and an unlabelled frame and wrap both in DataLoaders.

    Note: this rebinds the name `prepare_dataloaders` that an earlier cell of
    this notebook defines with a different signature.

    Args:
        combined_df: labelled frame with `text` and `label` columns.
        test_df: unlabelled frame with a `text` column.
        tokenizer_class: name or path of the tokenizer checkpoint.
        batch_size: batch size used for both loaders.

    Returns:
        (train_dataloader, test_dataloader). The training loader samples
        randomly; the test loader iterates in order.
    """
    # Load the AutoTokenizer with a normalization mode if the input Tweet is raw
    tokenizer = AutoTokenizer.from_pretrained(tokenizer_class, use_fast=False, normalization=True)

    encoded_train = bert_encode(combined_df, tokenizer)
    train_labels = combined_df.label.astype(int)
    encoded_test = bert_encode(test_df, tokenizer)

    # bert_encode returns its two tensors in (ids, mask) order.
    train_ids, train_masks = encoded_train.values()
    train_dataset = TensorDataset(train_ids, train_masks, torch.tensor(train_labels.values))

    test_ids, test_masks = encoded_test.values()
    test_dataset = TensorDataset(test_ids, test_masks)

    train_dataloader = DataLoader(train_dataset,
                                  sampler=RandomSampler(train_dataset),
                                  batch_size=batch_size)
    test_dataloader = DataLoader(test_dataset,
                                 sampler=SequentialSampler(test_dataset),
                                 batch_size=batch_size)
    return train_dataloader,test_dataloader

In [44]:
train_dataloader,test_dataloader = prepare_dataloaders(combined_df, test_df)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [45]:
def prepare_model(model_class="vinai/bertweet-base",num_classes=2,model_to_load=None,total_steps=-1):
    """Build the sequence classifier together with its optimizer and LR scheduler.

    Args:
        model_class: name or path of the pretrained checkpoint.
        num_classes: number of output labels of the classification head.
        model_to_load: optional path to a state dict for `model.roberta`
            (encoder weights only). Loading is best-effort: on failure a
            warning is printed and the pretrained weights are kept.
        total_steps: total number of optimizer steps, passed to the linear
            schedule as `num_training_steps`.
            NOTE(review): with the default of -1 the linear schedule looks
            like it would clamp the learning rate to 0 -- confirm, and always
            pass the real step count.

    Returns:
        (model, optimizer, scheduler)
    """
    model = AutoModelForSequenceClassification.from_pretrained(
        model_class,
        num_labels = num_classes,
        output_attentions = False,
        output_hidden_states = False,
    )

    # torch.optim.AdamW replaces the deprecated transformers AdamW; weight_decay=0.0
    # is passed explicitly because the torch default is not zero.
    optimizer = torch.optim.AdamW(model.parameters(),
                    lr = 5e-5,
                    eps = 1e-8,
                    weight_decay = 0.0
                    )
    scheduler = get_linear_schedule_with_warmup(optimizer,
                                                num_warmup_steps = 0,
                                                num_training_steps = total_steps)

    if model_to_load is not None:
        try:
            # map_location='cpu' lets a checkpoint saved from a GPU load on a CPU-only machine.
            state_dict = torch.load(model_to_load, map_location='cpu')
            model.roberta.load_state_dict(state_dict)
            print("LOADED MODEL")
        except Exception as err:
            # Still best-effort, but report the failure instead of silently
            # continuing without the requested weights.
            print("WARNING: could not load weights from {!r}: {}".format(model_to_load, err))
    return model, optimizer, scheduler

In [57]:
# One epoch on the merged train+dev data; the scheduler is given the total
# number of optimizer steps (batches per epoch times epochs).
epochs = 1
total_steps = len(train_dataloader) * epochs

# NOTE(review): "./bertweet/bertweet_v18" is not written by any cell visible in
# this notebook (the visible saves are bertweet_v21 and bertweet_v35) -- confirm
# that this is the intended checkpoint.
model, optimizer, scheduler = prepare_model("vinai/bertweet-base" ,num_classes=2, model_to_load="./bertweet/bertweet_v18", total_steps = total_steps)

Some weights of the model checkpoint at vinai/bertweet-base were not used when initializing RobertaForSequenceClassification: ['lm_head.bias', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight', 'lm_head.decoder.bias', 'roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at vinai/bertweet-base and are newly initialized: 

LOADED MODEL


In [58]:
def train(model,optimizer,scheduler,train_dataloader,epochs):
    """Fine-tune `model` for `epochs` epochs without a validation pass.

    Note: this rebinds the name `train` that an earlier cell of this notebook
    defines with an additional `validation_dataloader` parameter.

    Args:
        model: classifier whose output exposes `.loss`.
        optimizer: optimizer over the model's parameters.
        scheduler: LR scheduler, stepped once per batch.
        train_dataloader: yields (input_ids, attention_mask, labels) batches.
        epochs: number of passes over `train_dataloader`.

    Returns:
        list of per-epoch dicts with the keys 'epoch', 'Training Loss' and
        'Training Time'.
    """
    seed_val = 42
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    model.to(device)
    # Seeds are set here, i.e. after the model passed in has been constructed.
    random.seed(seed_val)
    np.random.seed(seed_val)
    torch.manual_seed(seed_val)
    torch.cuda.manual_seed_all(seed_val)
    training_stats = []
    total_t0 = time.time()

    for epoch_i in range(0, epochs):

        print("")
        print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))
        print('Training...')

        t0 = time.time()
        total_train_loss = 0
        model.train()
        for step, batch in enumerate(train_dataloader):
            # Progress line every 40 batches.
            if step % 40 == 0 and not step == 0:
                elapsed = format_time(time.time() - t0)
                print('  Batch {:>5,}  of  {:>5,}.    Elapsed: {:}.'.format(step, len(train_dataloader), elapsed))
            b_input_ids = batch[0].to(device)
            b_input_mask = batch[1].to(device)
            b_labels = batch[2].to(device)
            model.zero_grad()
            outputs = model(b_input_ids,
                                token_type_ids=None,
                                attention_mask=b_input_mask,
                                labels=b_labels)
            loss = outputs.loss
            total_train_loss += loss.item()
            loss.backward()
            # Clip the gradient norm to 1.0 before the optimizer step.
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            optimizer.step()
            scheduler.step()
        avg_train_loss = total_train_loss / len(train_dataloader)
        training_time = format_time(time.time() - t0)

        print("")
        print("  Average training loss: {0:.2f}".format(avg_train_loss))
        print("  Training epcoh took: {:}".format(training_time))

        # Record the epoch summary so it can be returned.
        training_stats.append(
            {
                'epoch': epoch_i + 1,
                'Training Loss': avg_train_loss,
                'Training Time': training_time
            }
        )

    print("")
    print("Training complete!")

    print("Total training took {:} (h:mm:ss)".format(format_time(time.time()-total_t0)))

    return training_stats

In [59]:
train(model,optimizer,scheduler,train_dataloader, 1)


Training...
  Batch    40  of    653.    Elapsed: 0:00:16.
  Batch    80  of    653.    Elapsed: 0:00:33.
  Batch   120  of    653.    Elapsed: 0:00:49.
  Batch   160  of    653.    Elapsed: 0:01:05.
  Batch   200  of    653.    Elapsed: 0:01:21.
  Batch   240  of    653.    Elapsed: 0:01:38.
  Batch   280  of    653.    Elapsed: 0:01:54.
  Batch   320  of    653.    Elapsed: 0:02:10.
  Batch   360  of    653.    Elapsed: 0:02:27.
  Batch   400  of    653.    Elapsed: 0:02:43.
  Batch   440  of    653.    Elapsed: 0:02:59.
  Batch   480  of    653.    Elapsed: 0:03:16.
  Batch   520  of    653.    Elapsed: 0:03:32.
  Batch   560  of    653.    Elapsed: 0:03:48.
  Batch   600  of    653.    Elapsed: 0:04:05.
  Batch   640  of    653.    Elapsed: 0:04:21.

  Average training loss: 0.16
  Training epcoh took: 0:04:26

Training complete!
Total training took 0:04:26 (h:mm:ss)


In [60]:
# Only the state dict of `model.roberta` is saved (not the whole classifier).
# Note that `model.cpu()` also moves the model itself to the CPU.
os.makedirs("./bertweet", exist_ok=True)  # torch.save does not create missing directories
torch.save(model.cpu().roberta.state_dict(),"./bertweet/bertweet_v35")

## Inference

In [51]:
def predict(model,test_dataloader):
    """Collect per-example logits of `model` for every batch of `test_dataloader`.

    Each batch must provide input ids at index 0 and the attention mask at
    index 1. The result is a flat list of logits arrays in loader order.
    """
    target_device = 'cuda' if torch.cuda.is_available() else 'cpu'
    model.eval()
    model.to(target_device)

    collected = []
    for batch in test_dataloader:
        input_ids = batch[0].to(target_device)
        input_mask = batch[1].to(target_device)

        with torch.no_grad():
            outputs = model(input_ids,
                            token_type_ids=None,
                            attention_mask=input_mask)

        rows = outputs.logits.detach().cpu().numpy()
        # One entry per example rather than one per batch.
        collected += [row for row in rows]

    return collected

In [61]:
result = predict(model,test_dataloader)

In [62]:
from scipy.special import softmax

pred_labels = np.argmax(result, axis = 1)

pred_scores = softmax(result, axis=1)[:, 1]

In [63]:
predicted_labels = [convert_prediction(pred) for pred in pred_labels]

output = pd.DataFrame({'id':test_df.id,'target':predicted_labels})
output

Unnamed: 0,id,target
0,544382249178001408,rumour
1,525027317551079424,rumour
2,544273220128739329,rumour
3,499571799764770816,non-rumour
4,552844104418091008,non-rumour
...,...,...
576,553581227165642752,rumour
577,552816302780579840,non-rumour
578,580350000074457088,rumour
579,498584409055174656,non-rumour


In [64]:
submission = pd.Series(output.target.values,index=output.id).to_dict()
with open('test-output.json', 'w') as f:
    json.dump(submission, f)

## Inference on COVID dataset

In [65]:
covid_df = load_data(data_file = '../data/covid.data.jsonl', label_file = None)



In [66]:
covid_df

Unnamed: 0,id,text
0,1272262651100434433,"According to the New York Times, Warner Bros. ..."
1,1287153210990395392,Hurricane Hanna has made landfall in Texas.\n\...
2,1266555444283179008,Monkeys on the loose in India with stolen coro...
3,1257715199655755779,Eastleigh and Swahili Arabs in Mombasa where c...
4,1274338812173393920,"“If Trump felt comfortable having it here, the..."
...,...,...
17453,1249502859185590272,I wonder how many lives could’ve been saved if...
17454,1284050414619459586,The @thetimes front page on 17th March. The fi...
17455,1274505289614725122,Trump just completed the racism trifecta in a ...
17456,1267884642637676545,Here are a few of my photographs from today’s ...


In [67]:
train_dataloader,covid_dataloader = prepare_dataloaders(combined_df, covid_df)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [68]:
result = predict(model,covid_dataloader)

In [69]:
from scipy.special import softmax

pred_labels = np.argmax(result, axis = 1)

pred_scores = softmax(result, axis=1)[:, 1]

In [70]:
pred_labels

array([1, 0, 0, ..., 0, 0, 0])

In [71]:
predicted_labels = [convert_prediction(pred) for pred in pred_labels]

output = pd.DataFrame({'id':covid_df.id,'target':predicted_labels})
output

Unnamed: 0,id,target
0,1272262651100434433,rumour
1,1287153210990395392,non-rumour
2,1266555444283179008,non-rumour
3,1257715199655755779,non-rumour
4,1274338812173393920,non-rumour
...,...,...
17453,1249502859185590272,non-rumour
17454,1284050414619459586,non-rumour
17455,1274505289614725122,non-rumour
17456,1267884642637676545,non-rumour


In [72]:
submission = pd.Series(output.target.values,index=output.id).to_dict()
with open('covid-output.json', 'w') as f:
    json.dump(submission, f)