Import Dependencies

In [None]:
# !pip install transformers datasets sentencepiece seqeval
import pandas as pd
from sklearn.metrics import accuracy_score
import torch
from torch.utils.data import Dataset, DataLoader

In [None]:
from torch import cuda
device = 'cuda' if cuda.is_available() else 'cpu'
print(device)

cpu


Import Data

In [None]:
data = pd.read_excel("/content/drive/MyDrive/data/Raw_data.xlsx")
data.head()

Unnamed: 0,Sentence #,Word,Tag
0,Sentence: 1,1)पालिकेचे,O
1,,नाव,O
2,,मुंबई,BLOC
3,,मनपाइतर,O
4,,वर्णन,O


In [None]:
data.Tag.unique()

array(['O', 'BLOC', 'BUNITNO', 'BFLOOR', 'IFLOOR', 'BNAME', 'INAME',
       'ILOC', 'BCTS', 'ICTS', 'IUNITNO'], dtype=object)

In [None]:
data.Tag.dtype

dtype('O')

In [None]:
print("Number of tags: {}".format(len(data.Tag.unique())))
frequencies = data.Tag.value_counts()
frequencies

Number of tags: 11


O          5805
INAME       241
BLOC        221
ILOC        194
BCTS        137
ICTS        129
BNAME       114
BFLOOR      109
BUNITNO     107
IFLOOR       53
IUNITNO       1
Name: Tag, dtype: int64

In [None]:
data.head(50)

Unnamed: 0,Sentence #,Word,Tag
0,Sentence: 1,1)पालिकेचे,O
1,,नाव,O
2,,मुंबई,BLOC
3,,मनपाइतर,O
4,,वर्णन,O
5,,,O
6,,सदनिका,O
7,,नं,O
8,,,O
9,,शॉप,O


In [None]:
data.dropna(subset=['Word'], how='all', inplace=True)
data

Unnamed: 0,Sentence #,Word,Tag
0,Sentence: 1,1)पालिकेचे,O
1,,नाव,O
2,,मुंबई,BLOC
3,,मनपाइतर,O
4,,वर्णन,O
...,...,...,...
7102,,Number,O
7105,,777,BCTS
7106,,;,O
7108,,),O


In [None]:
# pandas has a very handy "forward fill" function to fill missing values based on the last upper non-nan value
data = data.fillna(method='ffill')
data.head()

Unnamed: 0,Sentence #,Word,Tag
0,Sentence: 1,1)पालिकेचे,O
1,Sentence: 1,नाव,O
2,Sentence: 1,मुंबई,BLOC
3,Sentence: 1,मनपाइतर,O
4,Sentence: 1,वर्णन,O


In [None]:
# let's create a new column called "sentence" which groups the words by sentence
data['sentence'] = data[['Sentence #','Word','Tag']].groupby(['Sentence #'])['Word'].transform(lambda x: ' '.join(x))
# let's also create a new column called "word_labels" which groups the tags by sentence
data['word_labels'] = data[['Sentence #','Word','Tag']].groupby(['Sentence #'])['Tag'].transform(lambda x: ','.join(x))
data.head()

Unnamed: 0,Sentence #,Word,Tag,sentence,word_labels
0,Sentence: 1,1)पालिकेचे,O,1)पालिकेचे नाव मुंबई मनपाइतर वर्णन सदनिका नं श...,"O,O,BLOC,O,O,O,O,O,O,BUNITNO,O,O,BFLOOR,IFLOOR..."
1,Sentence: 1,नाव,O,1)पालिकेचे नाव मुंबई मनपाइतर वर्णन सदनिका नं श...,"O,O,BLOC,O,O,O,O,O,O,BUNITNO,O,O,BFLOOR,IFLOOR..."
2,Sentence: 1,मुंबई,BLOC,1)पालिकेचे नाव मुंबई मनपाइतर वर्णन सदनिका नं श...,"O,O,BLOC,O,O,O,O,O,O,BUNITNO,O,O,BFLOOR,IFLOOR..."
3,Sentence: 1,मनपाइतर,O,1)पालिकेचे नाव मुंबई मनपाइतर वर्णन सदनिका नं श...,"O,O,BLOC,O,O,O,O,O,O,BUNITNO,O,O,BFLOOR,IFLOOR..."
4,Sentence: 1,वर्णन,O,1)पालिकेचे नाव मुंबई मनपाइतर वर्णन सदनिका नं श...,"O,O,BLOC,O,O,O,O,O,O,BUNITNO,O,O,BFLOOR,IFLOOR..."


In [None]:
label2id = {k: v for v, k in enumerate(data.Tag.unique())}
id2label = {v: k for v, k in enumerate(data.Tag.unique())}
label2id

{'O': 0,
 'BLOC': 1,
 'BUNITNO': 2,
 'BFLOOR': 3,
 'IFLOOR': 4,
 'BNAME': 5,
 'INAME': 6,
 'ILOC': 7,
 'BCTS': 8,
 'ICTS': 9,
 'IUNITNO': 10}

In [None]:
data = data[["sentence", "word_labels"]].drop_duplicates().reset_index(drop=True)
data.head()

Unnamed: 0,sentence,word_labels
0,1)पालिकेचे नाव मुंबई मनपाइतर वर्णन सदनिका नं श...,"O,O,BLOC,O,O,O,O,O,O,BUNITNO,O,O,BFLOOR,IFLOOR..."
1,1)पालिकेचे नाव मुंबई मनपाइतर वर्णन सदनिका नं श...,"O,O,ILOC,O,O,O,O,O,BUNITNO,O,O,BFLOOR,O,O,BNAM..."
2,1)पालिकेचे नाव मुंबई मनपाइतर वर्णन सदनिका नं श...,"O,O,BLOC,O,O,O,O,O,BUNITNO,O,O,BFLOOR,IFLOOR,O..."
3,1)पालिकेचे नाव मुंबई मनपाइतर वर्णन सदनिका नं श...,"O,O,BLOC,O,O,O,O,O,O,BUNITNO,O,O,BFLOOR,IFLOOR..."
4,1)पालिकेचे नाव मुंबई मनपाइतर वर्णन सदनिका नं ऑ...,"O,O,BLOC,O,O,O,O,O,O,BUNITNO,O,O,BFLOOR,IFLOOR..."


In [None]:
len(data)

108

In [None]:
data.iloc[41].sentence

'1)पालिकेचे नाव मुंबई मनपाइतर वर्णन सदनिका नं 701 माळा नं 7 वा मजला इमारतीचे नाव अंबर पार्क को ऑप हौ सो लि ब्लॉक नं अंधेरी पश्चिम मुंबई 400058 रोड नं सी डी बर्फीवाला मार्ग इतर माहिती व सोबत 4 स्टील्ट मेकेनिकल कार पार्किंग( ( C.T.S. Number 263 D ; ) )'

In [None]:
data.iloc[41].word_labels

'O,O,BLOC,O,O,O,O,BUNITNO,O,O,BFLOOR,IFLOOR,IFLOOR,O,O,BNAME,INAME,INAME,INAME,INAME,INAME,INAME,O,O,BLOC,ILOC,ILOC,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,BCTS,ICTS,O,O,O'

In [None]:
MAX_LEN = 500
TRAIN_BATCH_SIZE = 4
VALID_BATCH_SIZE = 2
EPOCHS = 20
LEARNING_RATE = 1e-05
MAX_GRAD_NORM = 10
# tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [None]:
# Import necessary libraries
from transformers import AutoTokenizer
from seqeval.metrics import f1_score

In [None]:
# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained("ai4bharat/indic-bert")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/507 [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/5.65M [00:00<?, ?B/s]

In [None]:
def tokenize_and_preserve_labels(sentence, text_labels, tokenizer):
    """
    Word piece tokenization makes it difficult to match word labels
    back up with individual word pieces. This function tokenizes each
    word one at a time so that it is easier to preserve the correct
    label for each subword. It is, of course, a bit slower in processing
    time, but it will help our model achieve higher accuracy.
    """

    tokenized_sentence = []
    labels = []

    sentence = sentence.strip()

    for word, label in zip(sentence.split(), text_labels.split(",")):

        # Tokenize the word and count # of subwords the word is broken into
        tokenized_word = tokenizer.tokenize(word)
        n_subwords = len(tokenized_word)

        # Add the tokenized word to the final tokenized word list
        tokenized_sentence.extend(tokenized_word)

        # Add the same label to the new list of labels `n_subwords` times
        labels.extend([label] * n_subwords)

    return tokenized_sentence, labels

In [None]:
class dataset(Dataset):
    def __init__(self, dataframe, tokenizer, max_len):
        self.len = len(dataframe)
        self.data = dataframe
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __getitem__(self, index):
        # step 1: tokenize (and adapt corresponding labels)
        sentence = self.data.sentence[index]
        word_labels = self.data.word_labels[index]
        tokenized_sentence, labels = tokenize_and_preserve_labels(sentence, word_labels, self.tokenizer)

        # step 2: add special tokens (and corresponding labels)
        tokenized_sentence = ["[CLS]"] + tokenized_sentence + ["[SEP]"] # add special tokens
        labels.insert(0, "O") # add outside label for [CLS] token
        labels.insert(-1, "O") # add outside label for [SEP] token

        # step 3: truncating/padding
        maxlen = self.max_len

        if (len(tokenized_sentence) > maxlen):
          # truncate
          tokenized_sentence = tokenized_sentence[:maxlen]
          labels = labels[:maxlen]
        else:
          # pad
          tokenized_sentence = tokenized_sentence + ['[PAD]'for _ in range(maxlen - len(tokenized_sentence))]
          labels = labels + ["O" for _ in range(maxlen - len(labels))]

        # step 4: obtain the attention mask
        attn_mask = [1 if tok != '[PAD]' else 0 for tok in tokenized_sentence]

        # step 5: convert tokens to input ids
        ids = self.tokenizer.convert_tokens_to_ids(tokenized_sentence)

        label_ids = [label2id[label] for label in labels]
        # the following line is deprecated
        #label_ids = [label if label != 0 else -100 for label in label_ids]

        return {
              'ids': torch.tensor(ids, dtype=torch.long),
              'mask': torch.tensor(attn_mask, dtype=torch.long),
              #'token_type_ids': torch.tensor(token_ids, dtype=torch.long),
              'targets': torch.tensor(label_ids, dtype=torch.long)
        }

    def __len__(self):
        return self.len

In [None]:
train_size = 0.8
train_dataset = data.sample(frac=train_size,random_state=200)
test_dataset = data.drop(train_dataset.index).reset_index(drop=True)
train_dataset = train_dataset.reset_index(drop=True)

print("FULL Dataset: {}".format(data.shape))
print("TRAIN Dataset: {}".format(train_dataset.shape))
print("TEST Dataset: {}".format(test_dataset.shape))

training_set = dataset(train_dataset, tokenizer, MAX_LEN)
testing_set = dataset(test_dataset, tokenizer, MAX_LEN)

FULL Dataset: (108, 2)
TRAIN Dataset: (86, 2)
TEST Dataset: (22, 2)


In [None]:
training_set[0]

{'ids': tensor([     2,    151,    129,  29001,    507,    232,   6151,   2092,   1793,
           1038,   1520,   1551,   6566,   9327,      8, 178192,    339,   2278,
          10225,    229,      8, 171320, 168046,  88438,   2863,  25716,   8430,
            494,   4157, 108632,    236,     30,    494,  15774,   4086,   4769,
           1729,    655,  20272,   2543,   2700, 104943,   2092,   1793,   1038,
             13, 138179,  14095,   1301,   5244,   2828,    236,     30, 191999,
            130, 122680,     13,   4874,  20272,  32571,    507,     13,   4874,
          20272,  64556,  46474,    232,  42418,    367,     13,   2977,  13133,
           3779,  37758,    367,  94163,   1325,  77084,  14463,  57678,  77084,
           5034, 166474,   2155,  41670,    741,  79615,   2805,   1279,    487,
            487,  57972,    507,   6448,  76214,      8, 169426,  10368,  13223,
          40462,   1732,     20,   6364,      9,    580,      9,     32,      9,
           2140,  712

In [None]:
training_set[0]["ids"]

tensor([     2,    151,    129,  29001,    507,    232,   6151,   2092,   1793,
          1038,   1520,   1551,   6566,   9327,      8, 178192,    339,   2278,
         10225,    229,      8, 171320, 168046,  88438,   2863,  25716,   8430,
           494,   4157, 108632,    236,     30,    494,  15774,   4086,   4769,
          1729,    655,  20272,   2543,   2700, 104943,   2092,   1793,   1038,
            13, 138179,  14095,   1301,   5244,   2828,    236,     30, 191999,
           130, 122680,     13,   4874,  20272,  32571,    507,     13,   4874,
         20272,  64556,  46474,    232,  42418,    367,     13,   2977,  13133,
          3779,  37758,    367,  94163,   1325,  77084,  14463,  57678,  77084,
          5034, 166474,   2155,  41670,    741,  79615,   2805,   1279,    487,
           487,  57972,    507,   6448,  76214,      8, 169426,  10368,  13223,
         40462,   1732,     20,   6364,      9,    580,      9,     32,      9,
          2140,  71221,     87,     28, 

In [None]:
# print the first 30 tokens and corresponding labels
for token, label in zip(tokenizer.convert_ids_to_tokens(training_set[0]["ids"][:30]), training_set[0]["targets"][:30]):
  print('{0:10}  {1}'.format(token, id2label[label.item()]))

[CLS]       O
▁1          O
)           O
पल          O
क           O
च           O
▁नव         O
▁म          BLOC
ब           BLOC
ई           BLOC
▁मन         O
प           O
इ           O
तर          O
▁           O
वरण         O
न           O
▁इतर        O
▁मह         O
त           O
▁           BNAME
एमआय        BNAME
डस          BNAME
▁सट         INAME
फ           INAME
▁कव         INAME
टर          INAME
स           INAME
▁बल         INAME
डग          INAME


In [None]:
train_params = {'batch_size': TRAIN_BATCH_SIZE,
                'shuffle': True,
                'num_workers': 0
                }

test_params = {'batch_size': VALID_BATCH_SIZE,
                'shuffle': True,
                'num_workers': 0
                }

training_loader = DataLoader(training_set, **train_params)
testing_loader = DataLoader(testing_set, **test_params)

In [None]:
from transformers import BertTokenizer, BertForTokenClassification
model = BertForTokenClassification.from_pretrained('ai4bharat/indic-bert', num_labels=len(id2label),
                                                   id2label=id2label,
                                                   label2id=label2id)

You are using a model of type albert to instantiate a model of type bert. This is not supported for all configurations of models and can yield errors.


pytorch_model.bin:   0%|          | 0.00/135M [00:00<?, ?B/s]

Some weights of BertForTokenClassification were not initialized from the model checkpoint at ai4bharat/indic-bert and are newly initialized: ['encoder.layer.5.attention.self.query.weight', 'encoder.layer.0.attention.output.dense.bias', 'encoder.layer.11.attention.self.query.bias', 'encoder.layer.6.attention.self.value.weight', 'encoder.layer.11.attention.self.key.bias', 'encoder.layer.6.intermediate.dense.weight', 'embeddings.LayerNorm.weight', 'classifier.bias', 'encoder.layer.6.attention.output.dense.bias', 'encoder.layer.0.attention.output.dense.weight', 'encoder.layer.1.attention.self.value.bias', 'encoder.layer.5.attention.output.dense.weight', 'encoder.layer.1.attention.self.value.weight', 'encoder.layer.6.attention.self.key.weight', 'encoder.layer.8.attention.self.query.weight', 'encoder.layer.8.attention.self.key.weight', 'encoder.layer.9.attention.self.query.weight', 'encoder.layer.0.output.LayerNorm.weight', 'encoder.layer.1.attention.output.LayerNorm.bias', 'encoder.layer.7.

In [None]:
ids = training_set[0]["ids"].unsqueeze(0)
mask = training_set[0]["mask"].unsqueeze(0)
targets = training_set[0]["targets"].unsqueeze(0)
ids = ids.to(device)
mask = mask.to(device)
targets = targets.to(device)
outputs = model(input_ids=ids, attention_mask=mask, labels=targets)
initial_loss = outputs[0]
initial_loss

tensor(2.3841, grad_fn=<NllLossBackward0>)

In [None]:
tr_logits = outputs[1]
tr_logits.shape

torch.Size([1, 128, 11])

In [None]:
optimizer = torch.optim.Adam(params=model.parameters(), lr=LEARNING_RATE)

In [None]:
# Defining the training function on the 80% of the dataset for tuning the bert model
def train(epoch):
    tr_loss, tr_accuracy = 0, 0
    nb_tr_examples, nb_tr_steps = 0, 0
    tr_preds, tr_labels = [], []
    # put model in training mode
    model.train()

    for idx, batch in enumerate(training_loader):

        ids = batch['ids'].to(device, dtype = torch.long)
        mask = batch['mask'].to(device, dtype = torch.long)
        targets = batch['targets'].to(device, dtype = torch.long)

        outputs = model(input_ids=ids, attention_mask=mask, labels=targets)
        loss, tr_logits = outputs.loss, outputs.logits
        tr_loss += loss.item()

        nb_tr_steps += 1
        nb_tr_examples += targets.size(0)

        if idx % 100==0:
            loss_step = tr_loss/nb_tr_steps
            print(f"Training loss per 100 training steps: {loss_step}")

        # compute training accuracy
        flattened_targets = targets.view(-1) # shape (batch_size * seq_len,)
        active_logits = tr_logits.view(-1, model.num_labels) # shape (batch_size * seq_len, num_labels)
        flattened_predictions = torch.argmax(active_logits, axis=1) # shape (batch_size * seq_len,)
        # now, use mask to determine where we should compare predictions with targets (includes [CLS] and [SEP] token predictions)
        active_accuracy = mask.view(-1) == 1 # active accuracy is also of shape (batch_size * seq_len,)
        targets = torch.masked_select(flattened_targets, active_accuracy)
        predictions = torch.masked_select(flattened_predictions, active_accuracy)

        tr_preds.extend(predictions)
        tr_labels.extend(targets)

        tmp_tr_accuracy = accuracy_score(targets.cpu().numpy(), predictions.cpu().numpy())
        tr_accuracy += tmp_tr_accuracy

        # gradient clipping
        torch.nn.utils.clip_grad_norm_(
            parameters=model.parameters(), max_norm=MAX_GRAD_NORM
        )

        # backward pass
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    epoch_loss = tr_loss / nb_tr_steps
    tr_accuracy = tr_accuracy / nb_tr_steps
    print(f"Training loss epoch: {epoch_loss}")
    print(f"Training accuracy epoch: {tr_accuracy}")

In [None]:
for epoch in range(EPOCHS):
    print(f"Training epoch: {epoch + 1}")
    train(epoch)

Training epoch: 1
Training loss per 100 training steps: 0.040197938680648804
Training loss epoch: 0.04565109210935506
Training accuracy epoch: 0.97632473459294
Training epoch: 2
Training loss per 100 training steps: 0.06593344360589981
Training loss epoch: 0.042588056149807846
Training accuracy epoch: 0.9796917049988743
Training epoch: 3
Training loss per 100 training steps: 0.03274357691407204
Training loss epoch: 0.03910218949683688
Training accuracy epoch: 0.9807924354868042
Training epoch: 4
Training loss per 100 training steps: 0.034359510987997055
Training loss epoch: 0.03702806444330649
Training accuracy epoch: 0.9815705093637075
Training epoch: 5
Training loss per 100 training steps: 0.04204487428069115
Training loss epoch: 0.039146585474637424
Training accuracy epoch: 0.9793076722446554
Training epoch: 6
Training loss per 100 training steps: 0.02521318942308426
Training loss epoch: 0.03602663402191617
Training accuracy epoch: 0.9804346920123799
Training epoch: 7
Training loss 

In [None]:
def valid(model, testing_loader):
    # put model in evaluation mode
    model.eval()

    eval_loss, eval_accuracy = 0, 0
    nb_eval_examples, nb_eval_steps = 0, 0
    eval_preds, eval_labels = [], []

    with torch.no_grad():
        for idx, batch in enumerate(testing_loader):

            ids = batch['ids'].to(device, dtype = torch.long)
            mask = batch['mask'].to(device, dtype = torch.long)
            targets = batch['targets'].to(device, dtype = torch.long)

            outputs = model(input_ids=ids, attention_mask=mask, labels=targets)
            loss, eval_logits = outputs.loss, outputs.logits

            eval_loss += loss.item()

            nb_eval_steps += 1
            nb_eval_examples += targets.size(0)

            if idx % 100==0:
                loss_step = eval_loss/nb_eval_steps
                print(f"Validation loss per 100 evaluation steps: {loss_step}")

            # compute evaluation accuracy
            flattened_targets = targets.view(-1) # shape (batch_size * seq_len,)
            active_logits = eval_logits.view(-1, model.num_labels) # shape (batch_size * seq_len, num_labels)
            flattened_predictions = torch.argmax(active_logits, axis=1) # shape (batch_size * seq_len,)
            # now, use mask to determine where we should compare predictions with targets (includes [CLS] and [SEP] token predictions)
            active_accuracy = mask.view(-1) == 1 # active accuracy is also of shape (batch_size * seq_len,)
            targets = torch.masked_select(flattened_targets, active_accuracy)
            predictions = torch.masked_select(flattened_predictions, active_accuracy)

            eval_labels.extend(targets)
            eval_preds.extend(predictions)

            tmp_eval_accuracy = accuracy_score(targets.cpu().numpy(), predictions.cpu().numpy())
            eval_accuracy += tmp_eval_accuracy

    #print(eval_labels)
    #print(eval_preds)

    labels = [id2label[id.item()] for id in eval_labels]
    predictions = [id2label[id.item()] for id in eval_preds]

    #print(labels)
    #print(predictions)

    eval_loss = eval_loss / nb_eval_steps
    eval_accuracy = eval_accuracy / nb_eval_steps
    print(f"Validation Loss: {eval_loss}")
    print(f"Validation Accuracy: {eval_accuracy}")

    return labels, predictions

In [None]:
labels, predictions = valid(model, testing_loader)

Validation loss per 100 evaluation steps: 0.39221614599227905
Validation Loss: 0.36037581346251746
Validation Accuracy: 0.9010104066551361


In [None]:

from seqeval.metrics import classification_report

print(classification_report([labels], [predictions]))

              precision    recall  f1-score   support

         CTS       0.43      0.50      0.46        46
       FLOOR       0.48      0.56      0.52        25
         LOC       0.74      0.84      0.79       119
        NAME       0.45      0.59      0.51        56
      UNITNO       0.86      0.75      0.80        48

   micro avg       0.62      0.70      0.66       294
   macro avg       0.59      0.65      0.62       294
weighted avg       0.63      0.70      0.66       294





In [None]:
sentence = "1)पालिकेचे नाव:मुंबई म.न.पा.इतर वर्णन :सदनिका नं: 507, माळा नं: 5,बील्डींग नं.3, इमारतीचे नाव: त्रिमूर्ति सीएचएस लिमिटेड,एमएमआरडीए कॉलनी,, ब्लॉक नं: स्टेशन रोड कांजुर मार्ग् पश्चिम, रोड नं: मुंबई-400078(  (     C.T.S. Number : 586,587,591 ;  )  )"

inputs = tokenizer(sentence, padding='max_length', truncation=True, max_length=MAX_LEN, return_tensors="pt")

# move to gpu
ids = inputs["input_ids"].to(device)
mask = inputs["attention_mask"].to(device)
# forward pass
outputs = model(ids, mask)
logits = outputs[0]

active_logits = logits.view(-1, model.num_labels) # shape (batch_size * seq_len, num_labels)
flattened_predictions = torch.argmax(active_logits, axis=1) # shape (batch_size*seq_len,) - predictions at the token level

tokens = tokenizer.convert_ids_to_tokens(ids.squeeze().tolist())
token_predictions = [id2label[i] for i in flattened_predictions.cpu().numpy()]
wp_preds = list(zip(tokens, token_predictions)) # list of tuples. Each tuple = (wordpiece, prediction)

word_level_predictions = []
for pair in wp_preds:
  if (pair[0].startswith(" ##")) or (pair[0] in ['[CLS]', '[SEP]', '[PAD]']):
    # skip prediction
    continue
  else:
    word_level_predictions.append(pair[1])

# we join tokens, if they are not special ones
str_rep = " ".join([t[0] for t in wp_preds if t[0] not in ['[CLS]', '[SEP]', '[PAD]']]).replace(" ##", "")
print(str_rep)
print(word_level_predictions)

▁1 ) पल क च ▁नव : म ब ई ▁म . न . प . इ तर ▁ वरण न ▁: सदन क ▁न : ▁50 7 , ▁मळ ▁न : ▁5 , बल डग ▁न .3 , ▁इमर तच ▁नव : ▁तर मर त ▁स एचएस ▁ल मट ड , एमएम आर ड ए ▁कल न , , ▁ बलक ▁न : ▁सट शन ▁रड ▁क जर ▁मर ग ▁प श चम , ▁रड ▁न : ▁म ब ई - 4000 78 ( ▁( ▁c . t . s . ▁number ▁: ▁58 6,5 8 7,5 91 ▁; ▁) ▁) <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad

In [None]:
print(len(str_rep))
print(len(word_level_predictions))

2663
498


In [None]:
result = zip(str_rep, word_level_predictions)
print(result)

<zip object at 0x78c11012b3c0>


In [None]:
sample_results = pd.DataFrame(result,
                                columns=['words','entity'])

sample_results

Unnamed: 0,words,entity
0,▁,O
1,1,O
2,,O
3,),O
4,,O
...,...,...
493,p,O
494,a,O
495,d,O
496,>,O


In [None]:
# sample_results.to_csv('/content/drive/MyDrive/data/NER_sample_output.csv')