In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# for dirname, _, filenames in os.walk('/kaggle/input'):
#     for filename in filenames:
#         print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# **Inference on the Test files**

## **Load the Model**
- Load the pre-trained model.
- Define the class labels.

In [2]:
from sklearn.metrics import accuracy_score
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizerFast, BertConfig, BertForTokenClassification

In [3]:
from torch import cuda
# Run on the GPU when torch can see one; otherwise stay on the CPU.
if cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
print(device)

cpu


In [4]:
import pprint as pprint
# BIO tag -> class id. Each discourse type owns two consecutive ids
# (B- on the even id, I- on the following odd id); 'O' is the last id.
label_to_idx = {
    'B-Claim': 0,
    'I-Claim': 1,
    'B-Evidence': 2,
    'I-Evidence': 3,
    'B-Position': 4,
    'I-Position': 5,
    'B-Concluding Statement': 6,
    'I-Concluding Statement': 7,
    'B-Lead': 8,
    'I-Lead': 9,
    'B-Counterclaim': 10,
    'I-Counterclaim': 11,
    'B-Rebuttal': 12,
    'I-Rebuttal': 13,
    'O': 14,
}
# Inverse lookup: class id -> BIO tag.
idx_to_label = {class_id: tag for tag, class_id in label_to_idx.items()}
idx_to_label

{0: 'B-Claim',
 1: 'I-Claim',
 2: 'B-Evidence',
 3: 'I-Evidence',
 4: 'B-Position',
 5: 'I-Position',
 6: 'B-Concluding Statement',
 7: 'I-Concluding Statement',
 8: 'B-Lead',
 9: 'I-Lead',
 10: 'B-Counterclaim',
 11: 'I-Counterclaim',
 12: 'B-Rebuttal',
 13: 'I-Rebuttal',
 14: 'O'}

In [5]:
# Every essay is padded or truncated to this many tokens before it reaches the model.
MAX_LEN = 512

# Tokenizer files are read from the attached dataset directory.
tokenizer = BertTokenizerFast.from_pretrained(
    '../input/feedback-nb3/bert-base-uncased-tokenizer'
)

In [6]:
# Load the saved BertForTokenClassification checkpoint and move it to `device`.
# `.to()` hands back the module itself, so the trailing bare name still
# renders the architecture summary as the cell output.
model = BertForTokenClassification.from_pretrained(
    '../input/feedback-nb3/feedback-bert-uncased-model1'
).to(device)
model

BertForTokenClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwis

In [7]:
def model_pred(content):
    """Run the token-classification model on one essay.

    The essay is split on whitespace, tokenized as pre-split words, padded or
    truncated to ``MAX_LEN`` tokens and pushed through ``model`` once.
    Anything beyond ``MAX_LEN`` tokens is cut off and receives no prediction.

    Args:
        content: Raw essay text.

    Returns:
        A ``(tr_logits_mod, tokens_words)`` tuple:

        * ``tr_logits_mod`` -- tensor of predicted class ids (argmax over the
          label axis of the logits), one row for the single input sequence.
        * ``tokens_words`` -- list with one entry per token position holding
          the id of the word that token belongs to, or ``-1`` for positions
          whose offset is ``(0, 0)`` (special and padding tokens).
    """
    words = content.strip().split()
    encoding = tokenizer(words,
                         is_split_into_words=True,
                         return_offsets_mapping=True,
                         padding='max_length',
                         truncation=True,
                         max_length=MAX_LEN,
                         return_tensors="pt")

    input_ids = encoding["input_ids"].to(device)
    attention_mask = encoding["attention_mask"].to(device)

    # Pure inference: skip autograd bookkeeping so no graph is kept per essay.
    with torch.no_grad():
        outputs = model(input_ids, attention_mask=attention_mask)
    tr_logits_mod = torch.argmax(outputs[0], dim=2)

    # Map every sub-word token back to the word it came from.
    # Converting the offsets once avoids comparing 0-dim tensors in the loop.
    offsets = encoding['offset_mapping'][0].tolist()
    tokens_words = [-1] * len(offsets)

    # With pre-split input the offsets are relative to each word, so a token
    # whose start is 0 (and end is not) opens a new word.
    # NOTE(review): word ids start at 1 because the counter is bumped before
    # it is stored. If the consumer expects 0-based positions into
    # content.split(), every id is shifted by one -- confirm against the
    # submission format before changing.
    word_id = 0
    for position, (start, end) in enumerate(offsets):
        if end == 0:
            # (0, 0) offset: special or padding token, stays -1.
            continue
        if start == 0:
            word_id += 1
        tokens_words[position] = word_id

    return tr_logits_mod, tokens_words

In [8]:
def get_predictions(tr_logits_mod, tokens_words, id2label=None, min_words=5):
    """Group per-token class ids into word-level discourse spans.

    A span starts at a token tagged with a ``B-`` class and extends over the
    directly following tokens tagged with the matching ``I-`` class. The
    matching ``I-`` id is assumed to be the ``B-`` id plus one, as in the
    notebook's label map. Each word id is kept once per span, ``-1`` entries
    are dropped, and spans shorter than ``min_words`` words are discarded.

    Args:
        tr_logits_mod: Predicted class ids; only the first row is used.
        tokens_words: Word id for every token position, ``-1`` for tokens
            that belong to no word.
        id2label: Optional class-id -> tag mapping. Defaults to the
            notebook-level ``idx_to_label``.
        min_words: Minimum number of words a span needs to be kept.

    Returns:
        Dict mapping a discourse type (tag without the ``B-`` prefix) to a
        list of space-separated word-id strings, one string per span.
    """
    if id2label is None:
        id2label = idx_to_label

    tag_ids = tr_logits_mod[0]
    # One bulk conversion instead of touching 0-dim tensors on every step.
    tag_ids = tag_ids.tolist() if hasattr(tag_ids, "tolist") else list(tag_ids)
    # Bound the scan by the data actually passed in, not by a global constant.
    n_tokens = min(len(tag_ids), len(tokens_words))

    # Ids that open a span, derived from the label map ('O' is not one of them).
    begin_ids = {idx for idx, tag in id2label.items() if tag.startswith("B-")}

    all_predictions = {}
    i = 0
    while i < n_tokens:
        begin_id = tag_ids[i]
        if begin_id not in begin_ids:
            i += 1
            continue

        label_mod = id2label[begin_id].replace("B-", "")
        span = [tokens_words[i]]
        seen = {tokens_words[i]}  # O(1) duplicate check for sub-word tokens
        i += 1

        # Consume the run of matching "I-" tokens that follows the "B-" token.
        while i < n_tokens and tag_ids[i] == begin_id + 1:
            word = tokens_words[i]
            if word not in seen:
                seen.add(word)
                span.append(word)
            i += 1

        span = [word for word in span if word != -1]
        if len(span) >= min_words:
            all_predictions.setdefault(label_mod, []).append(
                ' '.join(str(word) for word in span)
            )

    return all_predictions

In [9]:
row_df = []  # one dict per predicted span; becomes the rows of the submission frame
folder = "../input/feedback-prize-2021/test"

# sorted() makes the row order reproducible; os.listdir order is arbitrary.
for file_name in sorted(os.listdir(folder)):
    # Skip anything that is not an essay file. Without this guard the code
    # below would run on the previous essay's text (or on an undefined name)
    # and emit rows under the wrong id.
    if not file_name.endswith(".txt"):
        continue

    with open(os.path.join(folder, file_name), "r") as fin:
        content = fin.read()

    # Strip only the extension; the rest of the file name is the essay id.
    essay_id = os.path.splitext(file_name)[0]

    # get the predictions from the model
    tr_logits_mod, tokens_words = model_pred(content)

    # convert the predictions into the desired format
    all_predictions = get_predictions(tr_logits_mod, tokens_words)

    # store the predictions in the list
    for label, pred_strings in all_predictions.items():
        for pred_str in pred_strings:
            row_df.append({
                'id': essay_id,
                'class': label,
                'predictionstring': pred_str,
            })

In [10]:
# Pin the column set so that an empty `row_df` (no span survived filtering)
# still yields a frame -- and later a CSV header -- with the three columns.
sub_df = pd.DataFrame(row_df, columns=['id', 'class', 'predictionstring'])
sub_df.head()

Unnamed: 0,id,class,predictionstring
0,0FB0700DAF44,Lead,1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 1...
1,0FB0700DAF44,Position,42 43 44 45 46 47 48
2,0FB0700DAF44,Claim,109 110 111 112 113 114 115 116 117 118 119 120
3,0FB0700DAF44,Claim,121 122 123 124 125 126 127 128 129 130 131 13...
4,0FB0700DAF44,Claim,315 316 317 318 319 320 321 322 323 324 325 32...


In [11]:
# Write the submission into the current working directory; index=False keeps
# the pandas row index out of the file.
sub_df.to_csv("submission.csv", index=False)

In [12]:
# Sanity check: (number of predicted spans, number of columns).
sub_df.shape

(43, 3)