In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# for dirname, _, filenames in os.walk('/kaggle/input'):
#     for filename in filenames:
#         print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

## **Training Notebook**
1. Run this notebook - to create a trained model.
2. Later used this trained model for inference and create a submission.csv file

In [2]:
label_df_new=pd.read_csv("../input/feedback-nb1/train_file_labels2.csv")
label_df_new.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15594 entries, 0 to 15593
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   id         15594 non-null  object
 1   text       15594 non-null  object
 2   label_str  15594 non-null  object
dtypes: object(3)
memory usage: 365.6+ KB


In [3]:
# let's find out the number of unique tags and their count
from collections import Counter

labels_lst = []
for label_str in label_df_new["label_str"]:
    labels_lst.extend(label_str.strip().split(","))
labels_count = Counter(labels_lst)
labels_count = {k: v for k, v in sorted(labels_count.items(), key=lambda item: item[1], reverse=True)}
print(len(labels_count))
labels_count

15


{'I-Evidence': 3489544,
 'I-Claim': 824107,
 'I-Concluding Statement': 814366,
 'I-Lead': 474070,
 'O': 305458,
 'I-Position': 265856,
 'I-Counterclaim': 133965,
 'I-Rebuttal': 117445,
 'B-Claim': 50202,
 'B-Evidence': 45702,
 'B-Position': 15419,
 'B-Concluding Statement': 13505,
 'B-Lead': 9305,
 'B-Counterclaim': 5817,
 'B-Rebuttal': 4337}

In [4]:
!pip install transformers seqeval[gpu]

Collecting seqeval[gpu]
  Downloading seqeval-1.2.2.tar.gz (43 kB)
     |████████████████████████████████| 43 kB 247 kB/s            
[?25h  Preparing metadata (setup.py) ... [?25l- done
Building wheels for collected packages: seqeval
  Building wheel for seqeval (setup.py) ... [?25l- \ done
[?25h  Created wheel for seqeval: filename=seqeval-1.2.2-py3-none-any.whl size=16181 sha256=490034c403fccfef3a9badae33ad5c57cd659daf1875fd80f84bb84146e92345
  Stored in directory: /root/.cache/pip/wheels/05/96/ee/7cac4e74f3b19e3158dce26a20a1c86b3533c43ec72a549fd7
Successfully built seqeval
Installing collected packages: seqeval
Successfully installed seqeval-1.2.2


## **Create the dataset for the transformer model**

In [5]:
from sklearn.metrics import accuracy_score
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizerFast, BertConfig, BertForTokenClassification

In [6]:
from torch import cuda
device = 'cuda' if cuda.is_available() else 'cpu'
print(device)

cuda


In [7]:
labels_count

{'I-Evidence': 3489544,
 'I-Claim': 824107,
 'I-Concluding Statement': 814366,
 'I-Lead': 474070,
 'O': 305458,
 'I-Position': 265856,
 'I-Counterclaim': 133965,
 'I-Rebuttal': 117445,
 'B-Claim': 50202,
 'B-Evidence': 45702,
 'B-Position': 15419,
 'B-Concluding Statement': 13505,
 'B-Lead': 9305,
 'B-Counterclaim': 5817,
 'B-Rebuttal': 4337}

In [8]:
import pprint as pprint
label_to_idx = {'B-Claim': 0, 'I-Claim': 1,
                'B-Evidence': 2, 'I-Evidence': 3,
                'B-Position': 4, 'I-Position': 5,
                'B-Concluding Statement': 6, 'I-Concluding Statement': 7,
                'B-Lead': 8, 'I-Lead': 9,
                'B-Counterclaim': 10, 'I-Counterclaim': 11,
                'B-Rebuttal': 12, 'I-Rebuttal': 13,
                'O': 14}
idx_to_label = {v:k for k,v in label_to_idx.items()}
idx_to_label

{0: 'B-Claim',
 1: 'I-Claim',
 2: 'B-Evidence',
 3: 'I-Evidence',
 4: 'B-Position',
 5: 'I-Position',
 6: 'B-Concluding Statement',
 7: 'I-Concluding Statement',
 8: 'B-Lead',
 9: 'I-Lead',
 10: 'B-Counterclaim',
 11: 'I-Counterclaim',
 12: 'B-Rebuttal',
 13: 'I-Rebuttal',
 14: 'O'}

In [9]:
MAX_LEN = 512
TRAIN_BATCH_SIZE = 2
VALID_BATCH_SIZE = 2
EPOCHS = 40
LEARNING_RATE = 1e-05
MAX_GRAD_NORM = 10
# tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')
tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')

Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/455k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [10]:
tokenizer.save_pretrained("./bert-base-uncased-tokenizer")

('./bert-base-uncased-tokenizer/tokenizer_config.json',
 './bert-base-uncased-tokenizer/special_tokens_map.json',
 './bert-base-uncased-tokenizer/vocab.txt',
 './bert-base-uncased-tokenizer/added_tokens.json',
 './bert-base-uncased-tokenizer/tokenizer.json')

In [11]:
class dataset(Dataset):
  def __init__(self, dataframe, tokenizer, max_len):
        self.len = len(dataframe)
        self.data = dataframe
        self.tokenizer = tokenizer
        self.max_len = max_len

  def __getitem__(self, index):
        # step 1: get the sentence and word labels 
        sentence = self.data.text[index].strip().split()  
        word_labels = self.data.label_str[index].split(",") 

        # step 2: use tokenizer to encode sentence (includes padding/truncation up to max length)
        # BertTokenizerFast provides a handy "return_offsets_mapping" functionality for individual tokens
        encoding = self.tokenizer(sentence,
                            is_split_into_words=True,
#                              is_pretokenized=True, 
                             return_offsets_mapping=True, 
                             padding='max_length', 
                             truncation=True, 
                             max_length=self.max_len)
        
        # step 3: create token labels only for first word pieces of each tokenized word
        labels = [label_to_idx[label] for label in word_labels] 
        # code based on https://huggingface.co/transformers/custom_datasets.html#tok-ner
        # create an empty array of -100 of length max_length
        encoded_labels = np.ones(len(encoding["offset_mapping"]), dtype=int) * -100
        
        # set only labels whose first offset position is 0 and the second is not 0
        i = 0
        for idx, mapping in enumerate(encoding["offset_mapping"]):
          if mapping[0] == 0 and mapping[1] != 0:
            # overwrite label
            encoded_labels[idx] = labels[i]
            i += 1

        # step 4: turn everything into PyTorch tensors
        item = {key: torch.as_tensor(val) for key, val in encoding.items()}
        item['labels'] = torch.as_tensor(encoded_labels)
        
        return item

  def __len__(self):
        return self.len

In [12]:
train_size = 0.8
train_dataset = label_df_new.sample(frac=train_size,random_state=200)
test_dataset = label_df_new.drop(train_dataset.index).reset_index(drop=True)
train_dataset = train_dataset.reset_index(drop=True)

print("FULL Dataset: {}".format(label_df_new.shape))
print("TRAIN Dataset: {}".format(train_dataset.shape))
print("TEST Dataset: {}".format(test_dataset.shape))

FULL Dataset: (15594, 3)
TRAIN Dataset: (12475, 3)
TEST Dataset: (3119, 3)


In [13]:
training_set = dataset(train_dataset, tokenizer, MAX_LEN)
testing_set = dataset(test_dataset, tokenizer, MAX_LEN)

In [14]:
train_params = {'batch_size': TRAIN_BATCH_SIZE,
                'shuffle': True,
                'num_workers': 0
                }

test_params = {'batch_size': VALID_BATCH_SIZE,
                'shuffle': True,
                'num_workers': 0
                }

training_loader = DataLoader(training_set, **train_params)
testing_loader = DataLoader(testing_set, **test_params)

In [15]:
model = BertForTokenClassification.from_pretrained('bert-base-uncased', num_labels=len(label_to_idx))
model.to(device)

Downloading:   0%|          | 0.00/420M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForTokenClassification: ['cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-u

BertForTokenClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwis

## **Training**

In [16]:
## sanity check
inputs = training_set[2]
input_ids = inputs["input_ids"].unsqueeze(0)
attention_mask = inputs["attention_mask"].unsqueeze(0)
labels = inputs["labels"].unsqueeze(0)

input_ids = input_ids.to(device)
attention_mask = attention_mask.to(device)
labels = labels.to(device)

outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
initial_loss = outputs[0]
initial_loss 

tensor(2.6993, device='cuda:0', grad_fn=<NllLossBackward>)

In [17]:
optimizer = torch.optim.Adam(params=model.parameters(), lr=LEARNING_RATE)

In [18]:
# Defining the training function on the 80% of the dataset for tuning the bert model
def train(epoch):
    tr_loss, tr_accuracy = 0, 0
    nb_tr_examples, nb_tr_steps = 0, 0
    tr_preds, tr_labels = [], []
    # put model in training mode
    model.train()
    
    for idx, batch in enumerate(training_loader):
        
        ids = batch['input_ids'].to(device, dtype = torch.long)
        mask = batch['attention_mask'].to(device, dtype = torch.long)
        labels = batch['labels'].to(device, dtype = torch.long)

        output = model(input_ids=ids, attention_mask=mask, labels=labels)
        loss = output[0]
        tr_logits = output[1]
        tr_loss += loss.item()

        nb_tr_steps += 1
        nb_tr_examples += labels.size(0)
        
        if idx % 100==0:
            loss_step = tr_loss/nb_tr_steps
            print(f"Training loss per 100 training steps: {loss_step}")
           
        # compute training accuracy
        flattened_targets = labels.view(-1) # shape (batch_size * seq_len,)
        active_logits = tr_logits.view(-1, model.num_labels) # shape (batch_size * seq_len, num_labels)
        flattened_predictions = torch.argmax(active_logits, axis=1) # shape (batch_size * seq_len,)
        
        # only compute accuracy at active labels
        active_accuracy = labels.view(-1) != -100 # shape (batch_size, seq_len)
        #active_labels = torch.where(active_accuracy, labels.view(-1), torch.tensor(-100).type_as(labels))
        
        # if flattened_target is [1024,] i.e [2*512,] where 2 is batch_size and 512 is seq_length
        # and out of 1024 i.e 50 tokens have -100 i.e they don't have a label assigned then
        # labels is [974,] and predictions is [974,]
        labels = torch.masked_select(flattened_targets, active_accuracy)
        predictions = torch.masked_select(flattened_predictions, active_accuracy)
        
        tr_labels.extend(labels)
        tr_preds.extend(predictions)

        tmp_tr_accuracy = accuracy_score(labels.cpu().numpy(), predictions.cpu().numpy())
        tr_accuracy += tmp_tr_accuracy
    
        # gradient clipping
        torch.nn.utils.clip_grad_norm_(
            parameters=model.parameters(), max_norm=MAX_GRAD_NORM
        )
        
        # backward pass
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    epoch_loss = tr_loss / nb_tr_steps
    tr_accuracy = tr_accuracy / nb_tr_steps
    print(f"Training loss epoch: {epoch_loss}")
    print(f"Training accuracy epoch: {tr_accuracy}")

In [19]:
for epoch in range(EPOCHS):
    print(f"Training epoch: {epoch + 1}")
    train(epoch)

Training epoch: 1
Training loss per 100 training steps: 2.753129005432129
Training loss per 100 training steps: 1.578703756969754
Training loss per 100 training steps: 1.4377106828476065
Training loss per 100 training steps: 1.369864355488077
Training loss per 100 training steps: 1.3116734482700985
Training loss per 100 training steps: 1.2644969548531872
Training loss per 100 training steps: 1.2401785113648844
Training loss per 100 training steps: 1.2196291163032302
Training loss per 100 training steps: 1.2030350683482547
Training loss per 100 training steps: 1.1826867650404622
Training loss per 100 training steps: 1.166116451258426
Training loss per 100 training steps: 1.147037460547377
Training loss per 100 training steps: 1.1289966313120328
Training loss per 100 training steps: 1.1108486611370303
Training loss per 100 training steps: 1.0966670158758578
Training loss per 100 training steps: 1.0878434659916905
Training loss per 100 training steps: 1.0786816136193678
Training loss per 

In [20]:
def valid(model, testing_loader):
    # put model in evaluation mode
    model.eval()
    
    eval_loss, eval_accuracy = 0, 0
    nb_eval_examples, nb_eval_steps = 0, 0
    eval_preds, eval_labels = [], []
    
    with torch.no_grad():
        for idx, batch in enumerate(testing_loader):
            
            ids = batch['input_ids'].to(device, dtype = torch.long)
            mask = batch['attention_mask'].to(device, dtype = torch.long)
            labels = batch['labels'].to(device, dtype = torch.long)
            
            output = model(input_ids=ids, attention_mask=mask, labels=labels)
            loss = output[0]
            eval_logits = output[1]
            eval_loss += loss.item()

            nb_eval_steps += 1
            nb_eval_examples += labels.size(0)
        
            if idx % 100==0:
                loss_step = eval_loss/nb_eval_steps
                print(f"Validation loss per 100 evaluation steps: {loss_step}")
              
            # compute evaluation accuracy
            flattened_targets = labels.view(-1) # shape (batch_size * seq_len,)
            active_logits = eval_logits.view(-1, model.num_labels) # shape (batch_size * seq_len, num_labels)
            flattened_predictions = torch.argmax(active_logits, axis=1) # shape (batch_size * seq_len,)
            
            # only compute accuracy at active labels
            active_accuracy = labels.view(-1) != -100 # shape (batch_size, seq_len)
        
            labels = torch.masked_select(flattened_targets, active_accuracy)
            predictions = torch.masked_select(flattened_predictions, active_accuracy)
            
#             eval_labels.append(labels)
#             eval_preds.append(predictions)
            
            eval_labels.append([idx_to_label[id.item()] for id in labels])
            eval_preds.append([idx_to_label[id.item()] for id in predictions])
            
            tmp_eval_accuracy = accuracy_score(labels.cpu().numpy(), predictions.cpu().numpy())
            eval_accuracy += tmp_eval_accuracy

#     labels = [idx_to_label[id.item()] for id in eval_labels]
#     predictions = [idx_to_label[id.item()] for id in eval_preds]
    
    eval_loss = eval_loss / nb_eval_steps
    eval_accuracy = eval_accuracy / nb_eval_steps
    print(f"Validation Loss: {eval_loss}")
    print(f"Validation Accuracy: {eval_accuracy}")

#     return labels, predictions
    return eval_labels, eval_preds

In [21]:
labels, predictions = valid(model, testing_loader)

Validation loss per 100 evaluation steps: 2.691830635070801
Validation loss per 100 evaluation steps: 1.6960436483420949
Validation loss per 100 evaluation steps: 1.7317183865836603
Validation loss per 100 evaluation steps: 1.733861021722274
Validation loss per 100 evaluation steps: 1.7127212099823867
Validation loss per 100 evaluation steps: 1.7073517568751486
Validation loss per 100 evaluation steps: 1.721878132350532
Validation loss per 100 evaluation steps: 1.7190289160664514
Validation loss per 100 evaluation steps: 1.709740483768275
Validation loss per 100 evaluation steps: 1.7086721779768264
Validation loss per 100 evaluation steps: 1.706198286841978
Validation loss per 100 evaluation steps: 1.6989965771513567
Validation loss per 100 evaluation steps: 1.6963462932368774
Validation loss per 100 evaluation steps: 1.6976503893277743
Validation loss per 100 evaluation steps: 1.6957979453614596
Validation loss per 100 evaluation steps: 1.6899496261018025
Validation Loss: 1.6772418710

In [22]:
from seqeval.metrics import classification_report

print(classification_report(labels, predictions))

                      precision    recall  f1-score   support

               Claim       0.28      0.43      0.34      9571
Concluding Statement       0.30      0.49      0.37      1905
        Counterclaim       0.17      0.25      0.21       984
            Evidence       0.16      0.25      0.20      8532
                Lead       0.41      0.56      0.47      1836
            Position       0.35      0.50      0.41      3028
            Rebuttal       0.13      0.19      0.15       677

           micro avg       0.25      0.38      0.30     26533
           macro avg       0.26      0.38      0.31     26533
        weighted avg       0.25      0.38      0.30     26533



In [23]:
import os
dirname = "/kaggle/working/feedback-bert-uncased-model1/"
if not os.path.isdir(dirname):
    os.makedirs(dirname)
# os.rename(dirname, "/kaggle/working/bert-base-uncased-model1/")

model.save_pretrained(dirname)