In [1]:
import re
import IPython
import torch
import torch.nn.functional as F
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from torch.optim import AdamW
from sklearn.metrics import accuracy_score
from transformers import BertTokenizer, BertForSequenceClassification
# from transformers import AdamW, get_linear_schedule_with_warmup
from datetime import datetime

**1. Preprocess datasets**

- Import BERT Tokenizer + add tokens to mask URLs and usernames
- Basic data preprocessing: mask links and usernames with the `[LINK]` / `[USER]` tokens and strip the `#` from hashtags
- BERT preprocessing: tokenize, create input IDs and attention masks
- Form train and test datasets (in the correct format)

In [2]:
# Pretrained, lower-casing BERT word-piece tokenizer.
bert_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Register the two placeholder tokens that stand in for URLs and user
# mentions, so the tokenizer keeps each of them as a single token.
bert_tokenizer.add_special_tokens(
    dict(additional_special_tokens=['[LINK]', '[USER]'])
)
bert_tokenizer

PreTrainedTokenizer(name_or_path='bert-base-uncased', vocab_size=30522, model_max_len=512, is_fast=False, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]', 'additional_special_tokens': ['[LINK]', '[USER]']})

In [3]:
def bert_tokenize(df, tokenizer=bert_tokenizer, max_seq_len = 100):
    """Lightly clean the tweets in ``df['text']`` and encode them for BERT.

    Cleaning is deliberately minimal so the text stays close to the original:
    newlines become spaces, every space-separated word containing ``http`` is
    replaced by ``[LINK]``, every word containing ``@`` by ``[USER]``, and
    ``#`` characters are dropped.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``text`` column of strings. A ``bert_text`` column with
        the cleaned text is added to this frame IN PLACE.
    tokenizer : tokenizer exposing ``encode_plus``
        Defaults to the notebook-level ``bert_tokenizer``.
    max_seq_len : int
        Every sequence is truncated / padded to exactly this many token ids.

    Returns
    -------
    (input_sequences, attention_masks, df)
        ``input_sequences`` and ``attention_masks`` are lists (one entry per
        row) of equal-length lists; the mask is 1 on real tokens and 0 on
        padding so the model does not attend to padded positions. ``df`` is
        the same frame that was passed in, now with ``bert_text``.
    """
    input_sequences = []
    attention_masks = []
    bert_text = []

    for text in df['text']:
        words = text.replace("\n", " ").split(" ")
        words = ["[LINK]" if "http" in word else word for word in words]
        words = ["[USER]" if "@" in word else word for word in words]
        text = " ".join(words).replace("#", "")
        bert_text.append(text)

        # `pad_to_max_length=True` is deprecated and truncation used to be
        # implicit (hence the warning the tokenizer emitted); request both
        # explicitly. The resulting ids and masks are the same.
        sequence_dict = tokenizer.encode_plus(
            text,
            max_length=max_seq_len,
            padding='max_length',
            truncation=True,
        )
        input_sequences.append(sequence_dict['input_ids'])
        attention_masks.append(sequence_dict['attention_mask'])

    df['bert_text'] = bert_text
    return input_sequences, attention_masks, df

In [4]:
# Training split: read, clean + tokenize, and pull out the labels.
train_df = pd.read_csv("data/train.csv")
train_X, train_att, train_df = bert_tokenize(train_df)
train_y = train_df['target'].values

# Test split: same pipeline; no label column is read for it.
test_df = pd.read_csv("data/test.csv")
test_X, test_att, test_df = bert_tokenize(test_df)

# Checking the tokenized format (ids then mask, first row of each split)
print(train_X[0], train_att[0], test_X[0], test_att[0], sep="\n")

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


[101, 2256, 15616, 2024, 1996, 3114, 1997, 2023, 8372, 2089, 16455, 9641, 2149, 2035, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
[101, 2074, 3047, 1037, 6659, 2482, 5823, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
[1, 1, 1, 1, 1, 1, 1, 1, 0, 0,

In [5]:
## Forming the datasets
train_X = torch.tensor(train_X)
train_y = torch.tensor(train_y)
train_att = torch.tensor(train_att)
test_X = torch.tensor(test_X)
test_att = torch.tensor(test_att)

In [6]:
batch_size = 32

# Training items are (input_ids, attention_mask, label);
# test items are (input_ids, attention_mask) only.
train_data = torch.utils.data.TensorDataset(train_X, train_att, train_y)
test_data = torch.utils.data.TensorDataset(test_X, test_att)

# Training batches are drawn in random order, test batches sequentially.
train_sampler = torch.utils.data.RandomSampler(train_data)
test_sampler = torch.utils.data.SequentialSampler(test_data)

train_dataloader = torch.utils.data.DataLoader(
    dataset=train_data, batch_size=batch_size, sampler=train_sampler
)
test_dataloader = torch.utils.data.DataLoader(
    dataset=test_data, batch_size=batch_size, sampler=test_sampler
)

**2. Adapt and train the BERT model**

- Resize token embeddings (since we have added two special ones)
- Define device on which the training will take place (cuda or cpu)
- Define train and test functions

In [7]:
# Pretrained BERT encoder with a 2-label sequence-classification head on top.
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)
# Special tokens were added to the tokenizer, so resize the embedding matrix
# to the tokenizer's new length; the cell displays the resized embedding layer.
model.resize_token_embeddings(len(bert_tokenizer))

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

Embedding(30524, 768)

In [8]:
# Use the first CUDA device when one is available, otherwise the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
model.to(device)
IPython.display.clear_output()
# Print AFTER clearing the cell output: printing first meant the device name
# was wiped immediately and never shown.
print(device)

In [9]:
# AdamW over all model parameters, with a fixed learning rate of 2e-5.
optimizer = AdamW(model.parameters(), lr=2e-5, eps=1e-8)
# NLLLoss expects log-probabilities: train() and test() apply F.log_softmax
# to the model logits before calling it.
loss_fct = torch.nn.NLLLoss()

In [10]:
# from torch.utils.tensorboard import SummaryWriter

# # Writer will output to ./runs/ directory by default
# writer = SummaryWriter(logdir)

# training loop
# train_loss += loss_train.item()
# writer.add_scalar('Loss/train', training_loss, global_step)

# testing
# val_loss += loss_val.item()
# writer.add_scalar('Loss/val', val_loss, global_step)

# writer.close()

In [14]:
from statistics import mean

def train(epoch):
    """Run one pass over ``train_dataloader`` and update the model.

    ``epoch`` is only used to label the progress lines printed every 20
    batches. Returns ``(accuracy, loss)`` where accuracy is the mean of the
    per-batch accuracies expressed in percent and loss is the mean of the
    per-batch losses.
    """
    batch_accuracies, batch_losses = [], []
    started = datetime.now()
    n_batches = len(train_dataloader)
    n_samples = len(train_dataloader.dataset)

    model.train()
    for step, batch in enumerate(train_dataloader, start=1):
        inputs, att_masks, labels = (t.to(device) for t in batch)
        model.zero_grad()

        # First element of the model output holds the logits.
        log_probs = F.log_softmax(model(inputs, attention_mask=att_masks)[0], dim=1)
        loss = loss_fct(log_probs.view(-1, 2), labels.view(-1))

        # Backpropagate, clip the gradient norm to 1.0, then update.
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()

        # Book-keeping for the epoch-level averages.
        batch_loss = loss.item()
        batch_losses.append(batch_loss)
        predicted = np.argmax(log_probs.detach().cpu().numpy(), axis=1)
        batch_accuracies.append(accuracy_score(predicted, labels.cpu().numpy()))

        if step % 20 == 0:
            print('Train Epoch: {} [{}/{} ({:.0%})] - Elapsed: {}  |  Loss: {:.4f}'.format(
                epoch, step * len(inputs), n_samples,
                step / n_batches, datetime.now() - started, batch_loss
            ))
    return 100*mean(batch_accuracies), mean(batch_losses)

In [15]:
def test(dataloader=None):
    """Evaluate the model on labelled data and print mean loss and accuracy.

    Parameters
    ----------
    dataloader : iterable of batches, optional
        Must yield ``(input_ids, attention_mask, labels)`` batches and support
        ``len()``. Defaults to the notebook-level ``test_dataloader``. Note
        that ``test_dataloader`` as built in this notebook carries no labels,
        so a labelled (e.g. held-out validation) loader has to be passed for
        the evaluation to run.

    Raises
    ------
    ValueError
        If the dataloader is empty, or a batch does not contain exactly the
        three tensors listed above.

    The printed loss and accuracy are means of the per-batch values, the same
    convention ``train`` uses.
    """
    if dataloader is None:
        dataloader = test_dataloader

    n_batches = len(dataloader)
    if n_batches == 0:
        raise ValueError("test() received an empty dataloader")

    t0 = datetime.now()
    model.eval()
    test_loss, test_acc = 0, 0
    for batch in dataloader:
        # Loss and accuracy need ground-truth labels; fail with an explicit
        # message instead of an opaque tuple-unpacking error.
        if len(batch) != 3:
            raise ValueError(
                "test() needs batches of (input_ids, attention_mask, labels) "
                "but got {} tensor(s); pass a labelled dataloader".format(len(batch))
            )
        inputs, att_masks, labels = (t.to(device) for t in batch)
        with torch.no_grad():
            logits = model(inputs, attention_mask=att_masks)
            outputs = F.log_softmax(logits[0], dim=1)

            loss = loss_fct(outputs.view(-1, 2), labels.view(-1))

        test_loss += loss.item()
        outputs = outputs.detach().cpu().numpy()

        pred = np.argmax(outputs, axis=1)
        labels = labels.cpu().numpy()

        test_acc += accuracy_score(pred, labels)

    test_loss /= n_batches
    test_acc /= n_batches
    print('\nTest set: Loss: {:.4f}, Accuracy: {:.1%} - Elapsed: {}\n'.format(
        test_loss, test_acc, datetime.now() - t0
    ))

In [16]:
from torch.utils.tensorboard import SummaryWriter

# Each run logs to its own timestamped directory under ./logs.
model_name = 'network'
log_name = '{}_{}'.format(model_name, datetime.now().strftime('%Y%m%d_%H%M%S'))
writer = SummaryWriter('logs/{}'.format(log_name))

nb_epoch = 1

time_list = []
for epoch in range(1, nb_epoch+1):
    t0 = datetime.now()
    accuracy, avg_loss = train(epoch)
    # `timedelta.seconds` is only the seconds component (it wraps every 24 h);
    # `total_seconds()` is the real elapsed time. Truncated to whole seconds
    # so the printed list keeps its integer format.
    time_list.append(int((datetime.now()-t0).total_seconds()))
    writer.add_scalar('loss/train', avg_loss, epoch)
    writer.add_scalar('acc/train', accuracy, epoch)
# Make sure the logged scalars are written to disk before TensorBoard reads them.
writer.flush()
print("Training time per epoch :")
print(list(zip(range(1,nb_epoch+1),time_list)))
total_time = round(sum(time_list)/60,2)
print(f"Total training time for {nb_epoch} epochs : {total_time} minutes")
#     test()

Training time per epoch :
[(1, 71378)]
Total training time for 1 epochs : 1189.63 minutes


In [None]:
# Load the TensorBoard notebook extension and open the dashboard on ./logs,
# the directory the SummaryWriter above writes its runs into.
%load_ext tensorboard
%tensorboard --logdir logs

In [None]:
# epochs=range(len(acc_list))
# plt.plot(epochs, acc_list, 'r', 'Training F1')
# plt.plot(epochs, val_f1, 'b', 'Validation F1')
# plt.title('Training and validation F1')
# plt.figure()
# plt.plot(epochs, loss_list, 'r', 'Training Loss')
# plt.plot(epochs, val_loss, 'b', 'Validation Loss')
# plt.title('Training and validation loss')
# plt.figure()

**3. Predict on Test data**

- Define predict function
- Generate submission file

In [None]:
def predict(text):
    """Return the model's class probabilities for a single raw tweet.

    Parameters
    ----------
    text : str
        Raw tweet text.

    Returns
    -------
    torch.Tensor
        1-D tensor with one probability per class (softmax over the logits),
        located on ``device``.
    """
    # Apply the same cleaning as `bert_tokenize` so inference sees the same
    # kind of input the model was trained on: URLs -> [LINK],
    # mentions -> [USER], '#' removed.
    words = text.replace("\n", " ").split(" ")
    words = ["[LINK]" if "http" in word else word for word in words]
    words = ["[USER]" if "@" in word else word for word in words]
    cleaned = " ".join(words).replace("#", "")

    # Truncate to the sequence length used for training (bert_tokenize's
    # default of 100); a single un-batched sequence needs no padding.
    token_ids = bert_tokenizer.encode(cleaned, max_length=100, truncation=True)
    input_ = torch.tensor(token_ids).unsqueeze(0).to(device)

    model.eval()
    # Inference only: skip building the autograd graph.
    with torch.no_grad():
        logits = model(input_ids=input_)[0]
    pred = F.softmax(logits, dim=1)[0]
    return pred

In [None]:
# Most probable class index for every test tweet, in test_df row order.
predictions = [
    np.argmax(predict(text).cpu().detach().numpy())
    for text in test_df.text
]

In [None]:
# Fill the sample submission's `target` column with the BERT predictions.
sample_submission = pd.read_csv("data/sample_submission.csv")
# bert
sample_submission["target"] = predictions
sample_submission.to_csv("submission.csv", index=False)
# Preview 10 random predictions next to their tweets. This has to be the last
# expression of the cell, otherwise the notebook discards it without display.
pd.merge(sample_submission, test_df, on=['id']).sample(frac=1).head(10)