In [1]:
import re
import IPython
import torch
import torch.nn.functional as F
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from torch.optim import AdamW
from torch.utils.data import Dataset, DataLoader,TensorDataset,random_split,SubsetRandomSampler, ConcatDataset
from sklearn.metrics import accuracy_score
from transformers import BertTokenizer, BertForSequenceClassification
# from transformers import AdamW, get_linear_schedule_with_warmup
from datetime import datetime

**1. Preprocess datasets**

- Import BERT Tokenizer + add tokens to mask URLs and usernames
- Basic data preprocessing : get rid of tags, links and usernames
- Bert preprocessing : tokenize, create inputs and attention masks
- Form train and test datasets (in the correct format)

In [2]:
def define_tokenizer():
    bert_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    ## Adding additional tokens for masking URLs and usernames in tweets
    bert_tokenizer.add_special_tokens({'additional_special_tokens': ['[LINK]', '[USER]']})
    return bert_tokenizer

In [3]:
def bert_tokenize(df, tokenizer, max_seq_len = 100):
    input_sequences = []
    # The attention mask is an optional argument used when batching sequences together.
    # The attention mask is a binary tensor indicating the position of the padded indices so that the model does not attend to them.
    attention_masks = []
    bert_text = []
    
    # some very minor text processing (try to keep the text as close as original)
    for i, text in enumerate(df['text']):
#         print(i, text)
        text = text.replace("\n", " ").split(" ")
        text = [word if "http" not in word else "[LINK]" for word in text]
        text = [word if "@" not in word else "[USER]" for word in text]
        text = " ".join(text)
        text = re.sub(r'#', '', text)
        bert_text.append(text)
        
#         print(i, text)
        sequence_dict = tokenizer.encode_plus(text, max_length=max_seq_len, pad_to_max_length=True)
        input_ids = sequence_dict['input_ids']
        att_mask = sequence_dict['attention_mask']
#         print(i, tokenizer.tokenize(text))
        input_sequences.append(input_ids)
        attention_masks.append(att_mask)
    
    df['bert_text'] = bert_text
    return input_sequences, attention_masks, df

In [4]:
def preprocess_data(train_df):

    bert_tokenizer = define_tokenizer()
    train_X, train_att, train_df = bert_tokenize(train_df,bert_tokenizer)
    train_y = train_df['target'].values

    # Checking the tokenized format
    # print(train_X[0])
    # print(train_att[0])

    return train_X, train_att,train_y

def create_train_dataloader(train_X, train_att,train_y,batch_size = 64):

    ## Forming the datasets
    train_X = torch.tensor(train_X)
    train_y = torch.tensor(train_y)
    train_att = torch.tensor(train_att)

    train_data = torch.utils.data.TensorDataset(train_X, train_att, train_y)
    train_sampler = torch.utils.data.RandomSampler(train_data)
    train_dataloader = torch.utils.data.DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

    return train_dataloader


def create_dataloaders(train_idx, val_idx, train_X, train_att,train_y,batch_size = 64):

    ## Forming the datasets
    train_X = torch.tensor(train_X)
    train_y = torch.tensor(train_y)
    train_att = torch.tensor(train_att)

    train_data = torch.utils.data.TensorDataset(train_X, train_att, train_y)

    train_sampler = SubsetRandomSampler(train_idx)
    valid_sampler = SubsetRandomSampler(val_idx)
    train_dataloader = DataLoader(train_data, batch_size=batch_size, sampler=train_sampler) #train_X ??
    valid_dataloader = DataLoader(train_data, batch_size=batch_size, sampler=valid_sampler)

    # train_sampler = torch.utils.data.RandomSampler(train_data)
    # train_dataloader = torch.utils.data.DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

    return train_dataloader, valid_dataloader

**2. Adapt and Train Bert model**

- Import pre-trained BERT model for classification
- Resize token embeddings (since we have added two special ones)
- Define train and test functions
- Train model and show metrics with Tensorboard

In [5]:
def define_model(bert_tokenizer, show = False):

    model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)
    model.resize_token_embeddings(len(bert_tokenizer))

    if show :
      # Print model's state_dict
      print("Model's state_dict:")
      for param_tensor in model.state_dict():
          print(param_tensor, "\t", model.state_dict()[param_tensor].size())

    # select device    
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    model.to(device)
    IPython.display.clear_output()
    
    return model, device



In [6]:
from torch.utils.tensorboard import SummaryWriter
model_name = 'network'
log_name = '{}_{}'.format(model_name, datetime.now().strftime('%Y%m%d_%H%M%S'))
writer = SummaryWriter('logs/{}'.format(log_name))

In [7]:
from statistics import mean
from sklearn.metrics import f1_score



def train(BERT_model, epoch, device, train_dataloader):

    # running_accuracy = 0.0
    running_loss = 0.0
    running_f1 = 0.0

    # define model optimizer and loss function

    optimizer = AdamW(BERT_model.parameters(), lr=2e-5, eps=1e-8)
    loss_fct = torch.nn.NLLLoss()

    t0 = datetime.now()
    BERT_model.train()

    for i, batch in enumerate(train_dataloader, start=1):

        # get the inputs : batch is a list of [inputs, att_masks, labels]
        batch = tuple(t.to(device) for t in batch)
        inputs, att_masks, labels = batch

        # zero the parameter gradients
        BERT_model.zero_grad()  
        
        # forward propagation
        logits = BERT_model(inputs, attention_mask=att_masks)
        outputs = F.log_softmax(logits[0], dim=1)
        
        # compute loss function + backward propagation
        loss = loss_fct(outputs.view(-1, 2), labels.view(-1))
        running_loss += loss.item()
        loss.backward()
        
        # updating current accuracy and f1 scores (after batch i)
        pred_outputs = outputs.detach().cpu().numpy()
        pred = np.argmax(pred_outputs, axis=1)
        labels = labels.cpu().numpy()
        # current_acc = accuracy_score(pred, labels)
        # running_accuracy += current_acc
        current_f1 = f1_score(pred,labels)
        running_f1 += current_f1
        
        # optimize parameters
        torch.nn.utils.clip_grad_norm_(BERT_model.parameters(), 1.0)
        optimizer.step()
        
        if i % 20 == 0: # every 20 batches

            nb_batches = len(train_dataloader)
            nb_samples = len(train_dataloader.dataset)
            batch_size = len(inputs)
            current_batch = epoch*nb_batches + i

            # add the current metrics to the Tensorboard
            writer.add_scalar('training loss', running_loss/20, current_batch)
            # writer.add_scalar('training accuracy', running_accuracy/20, current_batch)
            writer.add_scalar('training f1', running_f1/20, current_batch)

            print('Train Epoch: {} [{}/{} ({:.0%})] - Elapsed: {}  |  Loss: {:.4f}  | F1: {:.4f}  |  Accuracy: {:.4f}'.format(
                epoch, i*batch_size, nb_samples,
                    i / nb_batches, datetime.now() - t0, loss.item(), 
                    current_f1
            ))

            # running_accuracy = 0.0
            running_loss = 0.0
            running_f1 = 0.0


    return BERT_model

In [8]:
def valid(BERT_model, epoch, device, valid_dataloader):

    t0 = datetime.now()
    loss_fct = torch.nn.NLLLoss()
    running_val_loss = 0.0
    running_val_f1 = 0.0
    BERT_model.eval()

    for i, batch in enumerate(valid_dataloader, start=1):
        batch = tuple(t.to(device) for t in batch)
        inputs, att_masks, labels = batch
        with torch.no_grad():
            logits = BERT_model(inputs, attention_mask=att_masks)
            outputs = F.log_softmax(logits[0], dim=1)
            
            loss = loss_fct(outputs.view(-1, 2), labels.view(-1))

        running_val_loss += loss.item()
        pref_outputs = outputs.detach().cpu().numpy()
        pred = np.argmax(pref_outputs, axis=1)
        labels = labels.cpu().numpy()
        current_val_f1 = f1_score(pred,labels)
        running_val_f1 += current_val_f1

        if i % 20 == 0: # every 20 batches

            nb_batches = len(valid_dataloader)
            nb_samples = len(valid_dataloader.dataset)
            batch_size = len(inputs)
            current_batch = epoch*nb_batches + i

            # add the current metrics to the Tensorboard
            writer.add_scalar('validation loss', running_val_loss/20, current_batch)
            # writer.add_scalar('training accuracy', running_accuracy/20, current_batch)
            writer.add_scalar('validation f1', running_val_f1/20, current_batch)

            print('Valid set: Loss: {:.4f}  | F1: {:.4f}'.format(
                loss.item(), current_val_f1
            ))

            # running_accuracy = 0.0
            running_val_loss = 0.0
            running_val_f1 = 0.0

    # test_loss /= len(test_dataloader)
    # test_acc /= len(test_dataloader)
    # print('\nTest set: Loss: {:.4f}, Accuracy: {:.1%} - Elapsed: {}\n'.format(
    #     test_loss, test_acc, datetime.now() - t0
    # ))

In [9]:
# from torchsummary import summary

from sklearn.model_selection import KFold

def train_model(BERT_model, device, train_data, k = 10, nb_epoch = 1, save = True, show_summary = True):

    t0 = datetime.now()
    train_X, train_att,train_y = train_data
    splits=KFold(n_splits=k,shuffle=True,random_state=42)


    for fold, (train_idx,val_idx) in enumerate(splits.split(np.arange(len(train_X)))): #train_X.shape[0]
        print(f"FOLD NUMBER {fold}")
        train_dataloader, valid_dataloader = create_dataloaders(train_idx, val_idx, train_X, train_att,train_y)

        for epoch in range(1, nb_epoch+1): # loop over the dataset multiple times
            print(f"##### Training Epoch {epoch} #####")
            trained_model = train(BERT_model, epoch, device, train_dataloader)
            valid(BERT_model, epoch, device, valid_dataloader)

        total_time = round(((datetime.now()-t0).seconds)/60,2) #total training time in minutes
        print(f"Total training time for {nb_epoch} epochs : {total_time} minutes")

    if save:
        torch.save(trained_model.state_dict(), "trained_model.pt")

    return trained_model

    # if show_summary:
    #     summary(model, (3, 224, 224))

In [10]:
def main(train_df):
    print("Defining Tokenizer")
    bert_tokenizer = define_tokenizer()
    print("Defining Model")
    base_model, device = define_model(bert_tokenizer)
    print("Preprocessing dataset")
    train_data = preprocess_data(train_df)
    print("Training model")
    train_model(base_model,device, train_data) #, train_dataloader)


In [11]:
%load_ext tensorboard
%tensorboard --logdir logs

In [12]:
train_df = pd.read_csv("data/train.csv")
main(train_df)

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


FOLD NUMBER 0
##### Training Epoch 1 #####


**3. Predict on Test data**

- Define predict function
- Generate submission file

In [None]:
def predict(corpus):

    # initialize the BERT model
    bert_tokenizer = define_tokenizer()
    the_model, device = define_model(bert_tokenizer)
    # the_model = define_model()

    #try to load a previsouly trained model or train a new one 
    try:
      print("Trying to load a previously trained model")
      the_model.load_state_dict(torch.load("trained_model"))

    except :
      print("Training a new model")
      the_model = train_model(the_model)
    
    print("Making predictions !")
    predictions = []
    for text in corpus:
        # pre-process text
        input_ = torch.tensor(bert_tokenizer.encode(text)).unsqueeze(0).to(device)
        logits = the_model.eval()(input_ids=input_)[0]
        prob = F.softmax(logits, dim=1)[0]
        pred = np.argmax(prob.cpu().detach().numpy())
        predictions.append(pred)

    return predictions

In [None]:
def make_predictions(test_df):
  """Generate submission file to upload on Kaggle"""
  predictions = predict(test_df.text)
  sample_submission = pd.read_csv("data/sample_submission.csv")
  sample_submission["target"] = predictions
  pd.merge(sample_submission, test_df, on=['id']).sample(frac=1).head(10)
  sample_submission.to_csv("submission.csv", index=False)

In [None]:
test_df = pd.read_csv("data/test.csv")
make_predictions(test_df)