## BERT Model for token classification

In [1]:
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [2]:
!pip install transformers -q

[K     |████████████████████████████████| 4.2 MB 4.9 MB/s 
[K     |████████████████████████████████| 596 kB 66.2 MB/s 
[K     |████████████████████████████████| 84 kB 3.9 MB/s 
[K     |████████████████████████████████| 6.6 MB 54.9 MB/s 
[?25h

In [3]:
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizerFast, BertConfig, BertForTokenClassification

In [4]:
from torch import cuda
device = 'cuda' if cuda.is_available() else 'cpu'
print(device)

cuda


In [5]:
data = pd.read_csv("/content/gdrive/MyDrive/titans_milestone3/codebase/Task-SI/processed_data/train_SI_labels_bioe_PNP.csv", encoding='unicode_escape')
data.head()

Unnamed: 0,Id,Text,Labels,tok_idx,lentext,lenLabels,BIOE_LABELS,BIOE_LABELS_LEN
0,111111111,Next plague outbreak Madagascar could stronger,NP NP NP NP NP NP,0 5 12 24 35 45,6,6,O O O O O O,6
1,111111111,Geneva World Health Organisation chief Wednesd...,NP NP NP NP NP NP NP NP NP NP P NP NP NP NP NP...,61 74 80 87 100 109 119 126 133 140 149 171 18...,20,20,O O O O O O O O O O B O O O O O O O O O,20
2,111111111,next transmission could pronounced stronger Di...,P P P P P NP NP NP NP NP NP NP NP NP NP NP,269 274 287 301 315 330 339 347 354 362 374 37...,16,16,B I I I E O O O O O O O O O O O,16
3,111111111,outbreak bubonic plague spread infected rats v...,NP NP NP NP NP NP NP NP NP NP NP NP NP NP NP N...,443 460 468 485 495 504 509 513 518 529 539 54...,22,22,O O O O O O O O O O O O O O O O O O O O O O,22
4,111111111,Madagascar suffered bubonic plague outbreaks a...,NP NP NP NP NP NP NP NP NP NP NP NP NP NP NP,653 668 677 685 692 702 709 715 720 732 738 74...,15,15,O O O O O O O O O O O O O O O,15


In [6]:
labels_to_ids={'O':0,'B':1,'I':2,'E':3}
ids_to_labels ={0:'O',1:'B',2:'I',3:'E'}

In [7]:
data = data[['Text','BIOE_LABELS']]
data.rename(columns = {'Text':'sentence', 'BIOE_LABELS':'word_labels'}, inplace = True)
data.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


Unnamed: 0,sentence,word_labels
0,Next plague outbreak Madagascar could stronger,O O O O O O
1,Geneva World Health Organisation chief Wednesd...,O O O O O O O O O O B O O O O O O O O O
2,next transmission could pronounced stronger Di...,B I I I E O O O O O O O O O O O
3,outbreak bubonic plague spread infected rats v...,O O O O O O O O O O O O O O O O O O O O O O
4,Madagascar suffered bubonic plague outbreaks a...,O O O O O O O O O O O O O O O


In [8]:
class dataset(Dataset):
      
    def __init__(self, dataframe, tokenizer, max_len):
            self.len = len(dataframe)
            self.data = dataframe
            self.tokenizer = tokenizer
            self.max_len = max_len

    def __getitem__(self, index):

        sentence = self.data.sentence[index].strip().split()
        word_labels = self.data.word_labels[index].split()

        encoding = self.tokenizer(sentence,
                             return_offsets_mapping=True, 
                             is_split_into_words=True, 
                             padding='max_length', 
                             truncation=True, 
                             max_length=self.max_len)

        labels = [labels_to_ids[label] for label in word_labels] 
        encoded_labels = np.ones(len(encoding["offset_mapping"]), dtype=int) * -100
        i = 0
        for idx, mapping in enumerate(encoding["offset_mapping"]):
            if mapping[0] == 0 and mapping[1] != 0:
                # overwrite label
                encoded_labels[idx] = labels[i]
                i += 1

        item = {key: torch.as_tensor(val) for key, val in encoding.items()}
        item['labels'] = torch.as_tensor(encoded_labels)
        
        return item

    def __len__(self):
        return self.len

In [9]:
tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/455k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [10]:
MAX_LEN=100
train_size = 0.8
train_dataset = data.sample(frac=train_size,random_state=200)
test_dataset = data.drop(train_dataset.index).reset_index(drop=True)
train_dataset = train_dataset.reset_index(drop=True)

print("FULL Dataset: {}".format(data.shape))
print("TRAIN Dataset: {}".format(train_dataset.shape))
print("TEST Dataset: {}".format(test_dataset.shape))

training_set = dataset(train_dataset, tokenizer, MAX_LEN)
testing_set = dataset(test_dataset, tokenizer, MAX_LEN)

FULL Dataset: (18567, 2)
TRAIN Dataset: (14854, 2)
TEST Dataset: (3713, 2)


In [11]:
for token, label in zip(tokenizer.convert_ids_to_tokens(training_set[0]["input_ids"]), training_set[0]["labels"]):
    print('{0:10}  {1}'.format(token, label))

[CLS]       -100
weeks       0
ford        0
went        0
public      0
ka          0
##vana      -100
##ugh       -100
said        0
dai         0
##ries      -100
summer      0
detailing   0
whereabouts  0
new         0
yorker      0
published   0
ramirez     0
[SEP]       -100
[PAD]       -100
[PAD]       -100
[PAD]       -100
[PAD]       -100
[PAD]       -100
[PAD]       -100
[PAD]       -100
[PAD]       -100
[PAD]       -100
[PAD]       -100
[PAD]       -100
[PAD]       -100
[PAD]       -100
[PAD]       -100
[PAD]       -100
[PAD]       -100
[PAD]       -100
[PAD]       -100
[PAD]       -100
[PAD]       -100
[PAD]       -100
[PAD]       -100
[PAD]       -100
[PAD]       -100
[PAD]       -100
[PAD]       -100
[PAD]       -100
[PAD]       -100
[PAD]       -100
[PAD]       -100
[PAD]       -100
[PAD]       -100
[PAD]       -100
[PAD]       -100
[PAD]       -100
[PAD]       -100
[PAD]       -100
[PAD]       -100
[PAD]       -100
[PAD]       -100
[PAD]       -100
[PAD]       -100
[PAD

In [12]:
TRAIN_BATCH_SIZE = 10
VALID_BATCH_SIZE = 2
train_params = {'batch_size': TRAIN_BATCH_SIZE,
                'shuffle': True,
                'num_workers': 2
                }

test_params = {'batch_size': VALID_BATCH_SIZE,
                'shuffle': True,
                'num_workers': 2
                }

training_loader = DataLoader(training_set, **train_params)
testing_loader = DataLoader(testing_set, **test_params)

In [13]:
EPOCHS = 4
LEARNING_RATE = 3e-05
MAX_GRAD_NORM = 1

In [14]:
model = BertForTokenClassification.from_pretrained('bert-base-uncased', num_labels=len(labels_to_ids))
model.to(device)

Downloading:   0%|          | 0.00/420M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForTokenClassification: ['cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-u

BertForTokenClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwis

In [15]:
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

In [16]:
count_parameters(model)

108894724

In [17]:
optimizer = torch.optim.Adam(params=model.parameters(), lr=LEARNING_RATE)

In [18]:
# Defining the training function on the 80% of the dataset for tuning the bert model
def train(model,training_loader):
    tr_loss, tr_accuracy = 0, 0
    nb_tr_examples, nb_tr_steps = 0, 0
    tr_preds, tr_labels = [], []
    # put model in training mode
    model.train()
    
    for idx, batch in enumerate(training_loader):
        
        ids = batch['input_ids'].to(device, dtype = torch.long)
        mask = batch['attention_mask'].to(device, dtype = torch.long)
        labels = batch['labels'].to(device, dtype = torch.long)

        outputs = model(input_ids=ids, attention_mask=mask, labels=labels)
        loss, tr_logits = outputs[:2]
        tr_loss += loss.item()

        nb_tr_steps += 1
        nb_tr_examples += labels.size(0)
        
        if idx % 100==0:
            loss_step = tr_loss/nb_tr_steps
            print(f"Training loss per 100 training steps: {loss_step}")
           
       
        flattened_targets = labels.view(-1) # shape (batch_size * seq_len,)
        active_logits = tr_logits.view(-1, model.num_labels) # shape (batch_size * seq_len, num_labels)
        flattened_predictions = torch.argmax(active_logits, axis=1) # shape (batch_size * seq_len,)
        
  
        active_accuracy = labels.view(-1) != -100 
        
        
        labels = torch.masked_select(flattened_targets, active_accuracy)
        predictions = torch.masked_select(flattened_predictions, active_accuracy)
        
        tr_labels.extend(labels)
        tr_preds.extend(predictions)

        tmp_tr_accuracy = accuracy_score(labels.cpu().numpy(), predictions.cpu().numpy())
        tr_accuracy += tmp_tr_accuracy
    
        # gradient clipping
        torch.nn.utils.clip_grad_norm_(
            parameters=model.parameters(), max_norm=MAX_GRAD_NORM
        )
        
        # backward pass
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    epoch_loss = tr_loss / nb_tr_steps
    tr_accuracy = tr_accuracy / nb_tr_steps
    print(f"Training loss epoch: {epoch_loss}")
    print(f"Training accuracy epoch: {tr_accuracy}")
    return epoch_loss,tr_accuracy

def valid(model, testing_loader):
    # put model in evaluation mode
    model.eval()
    
    eval_loss, eval_accuracy = 0, 0
    nb_eval_examples, nb_eval_steps = 0, 0
    eval_preds, eval_labels = [], []
    
    with torch.no_grad():
        for idx, batch in enumerate(testing_loader):
            
            ids = batch['input_ids'].to(device, dtype = torch.long)
            mask = batch['attention_mask'].to(device, dtype = torch.long)
            labels = batch['labels'].to(device, dtype = torch.long)
            
            outputs = model(input_ids=ids, attention_mask=mask, labels=labels)
            loss, eval_logits = outputs[:2]
            eval_loss += loss.item()

            nb_eval_steps += 1
            nb_eval_examples += labels.size(0)
        
            if idx % 100==0:
                loss_step = eval_loss/nb_eval_steps
                print(f"Validation loss per 100 evaluation steps: {loss_step}")
              
            # compute evaluation accuracy
            flattened_targets = labels.view(-1) # shape (batch_size * seq_len,)
            active_logits = eval_logits.view(-1, model.num_labels) # shape (batch_size * seq_len, num_labels)
            flattened_predictions = torch.argmax(active_logits, axis=1) # shape (batch_size * seq_len,)
            
            # only compute accuracy at active labels
            active_accuracy = labels.view(-1) != -100 # shape (batch_size, seq_len)
        
            labels = torch.masked_select(flattened_targets, active_accuracy)
            predictions = torch.masked_select(flattened_predictions, active_accuracy)
            
            eval_labels.extend(labels)
            eval_preds.extend(predictions)
            
            tmp_eval_accuracy = accuracy_score(labels.cpu().numpy(), predictions.cpu().numpy())
            eval_accuracy += tmp_eval_accuracy

    labels = [ids_to_labels[id.item()] for id in eval_labels]
    predictions = [ids_to_labels[id.item()] for id in eval_preds]
    
    eval_loss = eval_loss / nb_eval_steps
    eval_accuracy = eval_accuracy / nb_eval_steps
    print(f"Validation Loss: {eval_loss}")
    print(f"Validation Accuracy: {eval_accuracy}")

    return eval_loss,eval_accuracy,labels, predictions

In [19]:
import time
from tqdm import tqdm

best_valid_loss = float('inf')
tot_t_loss, tot_v_loss =[],[]

for epoch in tqdm(range(EPOCHS)):
    print(f"Training epoch: {epoch + 1}")
    tr_loss,tr_acc = train(model,training_loader)
    tot_t_loss.append(tr_loss)
  
    val_loss,val_acc,lab,pred = valid(model,training_loader)
    tot_v_loss.append(val_loss)

    if val_loss < best_valid_loss:
        best_valid_loss = val_loss
        best_pred, best_tgt = pred, lab
        torch.save(model.state_dict(), 'model_least_loss.pt')
        print("\nBest Model Saved !!")


  0%|          | 0/4 [00:00<?, ?it/s]

Training epoch: 1
Training loss per 100 training steps: 1.7322723865509033
Training loss per 100 training steps: 0.5903826715922592
Training loss per 100 training steps: 0.5454123844554768
Training loss per 100 training steps: 0.5283894411155156
Training loss per 100 training steps: 0.4982061693235823
Training loss per 100 training steps: 0.48927266836285355
Training loss per 100 training steps: 0.4783386262924025
Training loss per 100 training steps: 0.4704015492733977
Training loss per 100 training steps: 0.4603166669495543
Training loss per 100 training steps: 0.45164103215595464
Training loss per 100 training steps: 0.44676138226921625
Training loss per 100 training steps: 0.44603362756964837
Training loss per 100 training steps: 0.4438009641282912
Training loss per 100 training steps: 0.44238541043613344
Training loss per 100 training steps: 0.4383783196175549
Training loss epoch: 0.4372823708872455
Training accuracy epoch: 0.8613771558873634
Validation loss per 100 evaluation ste

 25%|██▌       | 1/4 [06:47<20:23, 407.75s/it]


Best Model Saved !!
Training epoch: 2
Training loss per 100 training steps: 0.347013384103775
Training loss per 100 training steps: 0.3275090201693301
Training loss per 100 training steps: 0.3331622930234344
Training loss per 100 training steps: 0.33787147253999283
Training loss per 100 training steps: 0.33908755991821576
Training loss per 100 training steps: 0.33627297663075956
Training loss per 100 training steps: 0.3399132711753968
Training loss per 100 training steps: 0.3385573400878022
Training loss per 100 training steps: 0.3380909941197558
Training loss per 100 training steps: 0.33735152636422105
Training loss per 100 training steps: 0.33488793973322517
Training loss per 100 training steps: 0.33356522141594597
Training loss per 100 training steps: 0.3312512896768904
Training loss per 100 training steps: 0.33103670790608253
Training loss per 100 training steps: 0.3297820247092858
Training loss epoch: 0.3298362520865565
Training accuracy epoch: 0.877101923495139
Validation loss p

 50%|█████     | 2/4 [13:42<13:44, 412.07s/it]


Best Model Saved !!
Training epoch: 3
Training loss per 100 training steps: 0.1566944420337677
Training loss per 100 training steps: 0.19372164446971205
Training loss per 100 training steps: 0.1916151647432823
Training loss per 100 training steps: 0.19824805961878494
Training loss per 100 training steps: 0.2063998081407828
Training loss per 100 training steps: 0.20259466366898157
Training loss per 100 training steps: 0.19817341038160732
Training loss per 100 training steps: 0.20250470834530454
Training loss per 100 training steps: 0.20291500664624232
Training loss per 100 training steps: 0.2042209602162122
Training loss per 100 training steps: 0.20439688554875643
Training loss per 100 training steps: 0.20541985389957548
Training loss per 100 training steps: 0.20685348517670887
Training loss per 100 training steps: 0.20586636790889046
Training loss per 100 training steps: 0.20785650636034908
Training loss epoch: 0.20624696115777022
Training accuracy epoch: 0.9238643033895038
Validation

 75%|███████▌  | 3/4 [20:37<06:53, 413.28s/it]


Best Model Saved !!
Training epoch: 4
Training loss per 100 training steps: 0.08992920815944672
Training loss per 100 training steps: 0.10771080309820205
Training loss per 100 training steps: 0.1174458674762157
Training loss per 100 training steps: 0.10902382545343343
Training loss per 100 training steps: 0.11158711828716433
Training loss per 100 training steps: 0.1073307871150258
Training loss per 100 training steps: 0.10719211019084365
Training loss per 100 training steps: 0.10780607601833635
Training loss per 100 training steps: 0.10958905634194142
Training loss per 100 training steps: 0.11113922803088522
Training loss per 100 training steps: 0.11248353692774124
Training loss per 100 training steps: 0.11117532565125215
Training loss per 100 training steps: 0.11215249657815866
Training loss per 100 training steps: 0.11322001981945189
Training loss per 100 training steps: 0.1134504855263594
Training loss epoch: 0.11348385264018104
Training accuracy epoch: 0.9609681604943088
Validatio

100%|██████████| 4/4 [27:35<00:00, 413.99s/it]


Best Model Saved !!





## Testing by loading best model

In [20]:
tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')
model = BertForTokenClassification.from_pretrained('bert-base-uncased', num_labels=len(labels_to_ids))
model.to(device)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForTokenClassification: ['cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-u

BertForTokenClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwis

In [21]:
model.load_state_dict(torch.load("model_least_loss.pt"))
model.eval()

BertForTokenClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwis

In [22]:
def predict(sentence):
    sentence = sentence

    inputs = tokenizer(sentence.split(),
                      is_split_into_words=True,  
                      return_offsets_mapping=True, 
                      padding='max_length', 
                      truncation=True, 
                      max_length=MAX_LEN,
                      return_tensors="pt")

  # move to gpu
    ids = inputs["input_ids"].to(device)
    mask = inputs["attention_mask"].to(device)
  # forward pass
    outputs = model(ids, attention_mask=mask)
    logits = outputs[0]

    active_logits = logits.view(-1, model.num_labels) 
    flattened_predictions = torch.argmax(active_logits, axis=1) 

    tokens = tokenizer.convert_ids_to_tokens(ids.squeeze().tolist())
    token_predictions = [ids_to_labels[i] for i in flattened_predictions.cpu().numpy()]
    wp_preds = list(zip(tokens, token_predictions))

    prediction = []
    for token_pred, mapping in zip(wp_preds, inputs["offset_mapping"].squeeze().tolist()):
        if mapping[0] == 0 and mapping[1] != 0:
            prediction.append(token_pred[1])
        else:
            continue

    # print(sentence.split())
    # print(prediction)
    return prediction

In [23]:
data_dev= pd.read_csv(r'/content/gdrive/MyDrive/titans_milestone3/codebase/Task-SI/processed_data/dev_SI_labels_bioe_PNP.csv')
data_test=pd.read_csv(r'/content/gdrive/MyDrive/titans_milestone3/codebase/Task-SI/processed_data/test_SI_data.csv')

In [24]:
data_dev.head()

Unnamed: 0,Id,Text,Labels,tok_idx,lentext,lenLabels,BIOE_LABELS,BIOE_LABELS_LEN
0,730081389,Police previously gone home Ohio patrol office...,NP NP NP NP NP NP NP NP,0 11 22 30 41 46 53 67,8,8,O O O O O O O O,8
1,730081389,CLEVELAND,NP,75,1,1,O,1
2,730081389,Police invstigating domestic disputes previous...,NP NP NP NP NP NP NP NP NP NP NP NP NP NP NP N...,87 94 107 116 129 140 152 165 169 177 182 186 ...,23,23,O O O O O O O O O O O O O O O O O O O O O O O,23
3,730081389,Westerville Officers Eric Joering Anthony More...,NP NP NP NP NP NP NP NP NP NP NP NP NP NP NP NP,312 324 333 338 355 363 381 388 402 407 424 43...,16,16,O O O O O O O O O O O O O O O O,16
4,730081389,suspect old Quentin Smith shot wounded officer...,NP NP NP NP NP NP NP NP NP NP NP NP NP NP NP N...,490 507 511 519 530 539 554 567 576 581 587 59...,17,17,O O O O O O O O O O O O O O O O O,17


In [25]:
data_test.head()

Unnamed: 0,Id,Text,tok_idx
0,813452859,EU Profits Trading UK,0 3 16 29
1,813452859,London Loses Money Political Campaigner,38 45 51 59 69
2,813452859,Parliamentary vote British Prime Minister Ther...,90 104 112 120 126 135 143 149 156 161 171 176...
3,813452859,chance May deal make parliament fails could on...,332 344 350 360 376 397 408 419 427 437 446 45...
4,813452859,Sputnik spoke political campaigner Michael Swa...,470 478 489 499 510 518


In [26]:
sentences_dev = data_dev['Text'].values
sentences_test = data_test['Text'].values

In [27]:
predictions_dev=[]
for sentence in sentences_dev:
    pred=predict(sentence)
    predictions_dev.append(pred)

In [28]:
predictions_test=[]
for sentence in sentences_test:
    pred=predict(sentence)
    predictions_test.append(pred)

In [29]:
print(len(predictions_test))
print(len(sentences_test))

print(len(predictions_dev))
print(len(sentences_dev))

3455
3455
3589
3589


In [30]:
data_dev["pred_labels"]=predictions_dev
data_test["pred_labels"]=predictions_test

In [31]:
data_dev.head()

Unnamed: 0,Id,Text,Labels,tok_idx,lentext,lenLabels,BIOE_LABELS,BIOE_LABELS_LEN,pred_labels
0,730081389,Police previously gone home Ohio patrol office...,NP NP NP NP NP NP NP NP,0 11 22 30 41 46 53 67,8,8,O O O O O O O O,8,"[O, O, O, O, O, O, O, O]"
1,730081389,CLEVELAND,NP,75,1,1,O,1,[O]
2,730081389,Police invstigating domestic disputes previous...,NP NP NP NP NP NP NP NP NP NP NP NP NP NP NP N...,87 94 107 116 129 140 152 165 169 177 182 186 ...,23,23,O O O O O O O O O O O O O O O O O O O O O O O,23,"[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."
3,730081389,Westerville Officers Eric Joering Anthony More...,NP NP NP NP NP NP NP NP NP NP NP NP NP NP NP NP,312 324 333 338 355 363 381 388 402 407 424 43...,16,16,O O O O O O O O O O O O O O O O,16,"[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O]"
4,730081389,suspect old Quentin Smith shot wounded officer...,NP NP NP NP NP NP NP NP NP NP NP NP NP NP NP N...,490 507 511 519 530 539 554 567 576 581 587 59...,17,17,O O O O O O O O O O O O O O O O O,17,"[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."


In [32]:
data_test.head()

Unnamed: 0,Id,Text,tok_idx,pred_labels
0,813452859,EU Profits Trading UK,0 3 16 29,"[O, O, O, O]"
1,813452859,London Loses Money Political Campaigner,38 45 51 59 69,"[O, O, O, O, O]"
2,813452859,Parliamentary vote British Prime Minister Ther...,90 104 112 120 126 135 143 149 156 161 171 176...,"[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."
3,813452859,chance May deal make parliament fails could on...,332 344 350 360 376 397 408 419 427 437 446 45...,"[B, I, I, I, I, I, I, I, I, E, I, I, E]"
4,813452859,Sputnik spoke political campaigner Michael Swa...,470 478 489 499 510 518,"[O, O, O, O, O, O]"


In [33]:
data_dev.to_csv(r'/content/gdrive/MyDrive/titans_milestone3/codebase/Task-SI/processed_data/data_dev_SI_results.csv',index=False)

In [34]:
data_test.to_csv(r'/content/gdrive/MyDrive/titans_milestone3/codebase/Task-SI/processed_data/data_test_SI_results.csv',index=False)

## Post Processing

In [35]:
result_dev = pd.read_csv('/content/gdrive/MyDrive/titans_milestone3/codebase/Task-SI/processed_data/data_dev_SI_results.csv')
result_test = pd.read_csv('/content/gdrive/MyDrive/titans_milestone3/codebase/Task-SI/processed_data/data_test_SI_results.csv')

In [36]:
def span_formation(df):
    pred_labels = df['pred_labels'].values
    
    pred_labels_sent=[]
    for value in pred_labels:
        s=value
        s=s.replace("[","").replace("]","").replace("\'","").replace(",","")
        pred_labels_sent.append(s)
    
    df['pred_labels_sent']=pred_labels_sent
    
    ids = df['Id'].values
    pred = df['pred_labels_sent'].values
    tok_idx = df['tok_idx'].values
    text = df['Text'].values
    id_arr=[]
    span_arr_start=[]
    span_arr_end=[]
    for i in range(len(ids)):
        id_i = ids[i]
        text_i = text[i].split(" ")
        pred_i = pred[i].split(" ")
        tok_i = tok_idx[i].split(" ")
        b=[]
        e=[]
        i_t=[]
        index_arr=[]
        start=-1
        end=-1
        for j in range(len(pred_i)):
            if pred_i[j]=='B' or pred_i[j]=='E' or pred_i[j]=='I':
                s=int(tok_i[j])
                e=len(text_i[j])
                index_arr.append((s,e))

        if len(index_arr)!=0:
            start=index_arr[0][0]
            end = index_arr[0][0]+index_arr[0][1]
            for k in range(1,len(index_arr)):
                if index_arr[k][0]-end<(12*5):
       
                    end=index_arr[k][0]+index_arr[k][1]
             
                else:
                 
                    span_arr_start.append(start)
                    span_arr_end.append(end)
         
                    id_arr.append(id_i)
                    start=index_arr[k][0]
                    end = index_arr[k][0]+index_arr[k][1]
            span_arr_start.append(start)
            span_arr_end.append(end)
            id_arr.append(id_i)
   
            
    return id_arr,span_arr_start,span_arr_end

In [37]:
id_arr,span_arr_start,span_arr_end = span_formation(result_dev)
dict1 = {'Id': id_arr, 'Start': span_arr_start, 'End':span_arr_end}     
df = pd.DataFrame(dict1)
df.to_csv('/content/gdrive/MyDrive/titans_milestone3/codebase/Result-SI/Result_dev_SI_Final.tsv',index=False,header=False,sep="\t")

In [38]:
id_arr,span_arr_start,span_arr_end = span_formation(result_test)
dict1 = {'Id': id_arr, 'Start': span_arr_start, 'End':span_arr_end}     
df = pd.DataFrame(dict1)
df.to_csv('/content/gdrive/MyDrive/titans_milestone3/codebase/Result-SI/Result_test_SI_Final.tsv',index=False,header=False,sep="\t")

In [None]:
## Download the two tsv files and run the scorer