# HuggingFace Inference Baseline

Training notebook: https://www.kaggle.com/thedrcat/feedback-prize-huggingface-baseline-training

In [17]:
import os 
from pathlib import Path

import pandas as pd
from collections import defaultdict

import torch
from transformers import AutoTokenizer
from transformers import AutoModelForTokenClassification, TrainingArguments, Trainer
from transformers import DataCollatorForTokenClassification
from datasets import Dataset

In [7]:
# Config
batch_size = 1
min_tokens = 5
tok_checkpoint = '../input/longformer/model'
model_checkpoint = '../input/feedback-prize-huggingface-baseline-training/longformer-base-4096-4/pytorch_model.bin'

In [2]:
# Load data

train = pd.read_csv('../input/feedback-prize-2021/train.csv')
train.head(1)

test = pd.read_csv('../input/feedback-prize-2021/sample_submission.csv')
test.head(1)

Unnamed: 0,id,class,predictionstring
0,18409261F5C2,,


In [3]:
# Setup dictionaries
classes = train.discourse_type.unique().tolist()

tags = defaultdict()
for i, c in enumerate(classes):
    tags[f'B-{c}'] = i
    tags[f'I-{c}'] = i + len(classes)
tags[f'O'] = len(classes) * 2
tags[f'Special'] = -100
l2i = dict(tags)

i2l = defaultdict()
for k, v in l2i.items(): 
    i2l[v] = k
i2l[-100] = 'Special'
i2l = dict(i2l)

In [18]:
# Helper functions

test_path = Path('../input/feedback-prize-huggingface-baseline-training/test')

def get_test_text(ids):
    with open(test_path/f'{ids}.txt', 'r') as file: data = file.read()
    return data

In [5]:
# Tokenizer
    
tokenizer = AutoTokenizer.from_pretrained(tok_checkpoint, add_prefix_space=True)

In [8]:
# Load model


model = AutoModelForTokenClassification.from_pretrained(tok_checkpoint, num_labels=len(i2l)-1)

model.load_state_dict(torch.load(model_checkpoint))
model.eval();

In [12]:
data_collator = DataCollatorForTokenClassification(tokenizer)

In [13]:
# We'll use trainer with the loaded model to run inference on test set
trainer = Trainer(
    model,
    data_collator=data_collator,
    tokenizer=tokenizer,
)

In [14]:
# code that will convert our predictions into prediction strings. we'll skip visualization here. 
# this most likely requires some refactoring

def get_class(c):
    if c == 14: return 'Other'
    else: return i2l[c][2:]

def pred2span(pred, example, viz=False, test=False):
    example_id = example['id']
    n_tokens = len(example['input_ids'])
    classes = []
    all_span = []
    for i, c in enumerate(pred.tolist()):
        if i == n_tokens-1:
            break
        if i == 0:
            cur_span = example['offset_mapping'][i]
            classes.append(get_class(c))
        elif i > 0 and (c == pred[i-1] or (c-7) == pred[i-1]):
            cur_span[1] = example['offset_mapping'][i][1]
        else:
            all_span.append(cur_span)
            cur_span = example['offset_mapping'][i]
            classes.append(get_class(c))
    all_span.append(cur_span)
    
    if test: text = get_test_text(example_id)
    else: text = get_raw_text(example_id)
        
    # map token ids to word (whitespace) token ids
    predstrings = []
    for span in all_span:
        span_start = span[0]
        span_end = span[1]
        before = text[:span_start]
        token_start = len(before.split())
        if len(before) == 0: token_start = 0
        elif before[-1] != ' ': token_start -= 1
        num_tkns = len(text[span_start:span_end+1].split())
        tkns = [str(x) for x in range(token_start, token_start+num_tkns)]
        predstring = ' '.join(tkns)
        predstrings.append(predstring)
                    
    rows = []
    for c, span, predstring in zip(classes, all_span, predstrings):
        e = {
            'id': example_id,
            'discourse_type': c,
            'predictionstring': predstring,
            'discourse_start': span[0],
            'discourse_end': span[1],
            'discourse': text[span[0]:span[1]+1]
        }
        rows.append(e)


    df = pd.DataFrame(rows)
    df['length'] = df['discourse'].apply(lambda t: len(t.split()))
    
    # short spans are likely to be false positives, we can choose a min number of tokens based on validation
    df = df[df.length > min_tokens].reset_index(drop=True)

    return df

In [20]:
# Load test data

files = os.listdir('../input/feedback-prize-huggingface-baseline-training/test')
ids = [x.split('.')[0] for x in files]

df_test = pd.DataFrame()
df_test['id'] = ids
df_test['text'] = df_test['id'].apply(get_test_text)
df_test

Unnamed: 0,id,text
0,0FB0700DAF44,"During a group project, have you ever asked a ..."
1,18409261F5C2,80% of Americans believe seeking multiple opin...
2,D46BCB48440A,"When people ask for advice,they sometimes talk..."
3,D72CB1C11673,Making choices in life can be very difficult. ...
4,DF920E0A7337,Have you ever asked more than one person for h...


In [21]:
test_ds = Dataset.from_pandas(df_test)
test_ds

Dataset({
    features: ['id', 'text'],
    num_rows: 5
})

In [22]:
def tokenize_for_test(examples):

    o = tokenizer(examples['text'], truncation=True, return_offsets_mapping=True, max_length=4096)
  
    return o

In [23]:
tokenized_test = test_ds.map(tokenize_for_test)
tokenized_test

0ex [00:00, ?ex/s]

Dataset({
    features: ['id', 'text', 'input_ids', 'attention_mask', 'offset_mapping'],
    num_rows: 5
})

In [24]:
predictions, _, _ = trainer.predict(tokenized_test)

The following columns in the test set  don't have a corresponding argument in `LongformerForTokenClassification.forward` and have been ignored: offset_mapping, id, text.
***** Running Prediction *****
  Num examples = 5
  Batch size = 8
Input ids are automatically padded from 1304 to 1536 to be a multiple of `config.attention_window`: 512


In [25]:
import numpy as np

preds = np.argmax(predictions, axis=-1)
predictions.shape, preds.shape

((5, 1304, 15), (5, 1304))

In [26]:
dfs = []
for i in range(len(tokenized_test)):
    dfs.append(pred2span(preds[i], tokenized_test[i], test=True))

pred_df = pd.concat(dfs, axis=0)
pred_df['class'] = pred_df['discourse_type']

In [27]:
sub = pred_df[['id', 'class', 'predictionstring']]

In [28]:
sub.to_csv('submission.csv', index=False)

In [29]:
sub

Unnamed: 0,id,class,predictionstring
0,0FB0700DAF44,Lead,0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18...
1,0FB0700DAF44,Position,41 42 43 44 45 46 47
2,0FB0700DAF44,Claim,49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64
3,0FB0700DAF44,Claim,66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 8...
4,0FB0700DAF44,Claim,84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 9...
5,0FB0700DAF44,Position,110 111 112 113 114 115
6,0FB0700DAF44,Evidence,119 120 121 122 123 124 125 126 127 128 129 13...
7,0FB0700DAF44,Claim,315 316 317 318 319 320 321 322 323 324 325 32...
8,0FB0700DAF44,Evidence,341 342 343 344 345 346 347 348 349 350 351 35...
9,0FB0700DAF44,Concluding Statement,559 560 561 562 563 564 565 566 567 568 569 57...
