This notebook is copied from Phung's Kaggle notebook 'Pytorch BERT for Named Entity Recognition'. Changes are marked in __*italic bold*__.

This notebook shows how to fine-tune a BERT model (from huggingface) for our dataset recognition task.

Note that internet is needed during the training phase (for downloading the bert-base-cased model). Internet can be turned off during prediction.

## Install packages

In [None]:
# Install the NER dependencies from files under the `coleridge-packages` input directory.
# `datasets` is resolved only from the local folder given by --find-links (--no-index disables the package index);
# seqeval, tokenizers and transformers are installed directly from the listed wheel files.
!pip install datasets --no-index --find-links=file:///kaggle/input/coleridge-packages/packages/datasets
!pip install ../input/coleridge-packages/seqeval-1.2.2-py3-none-any.whl
!pip install ../input/coleridge-packages/tokenizers-0.10.1-cp37-cp37m-manylinux1_x86_64.whl
!pip install ../input/coleridge-packages/transformers-4.5.0.dev0-py3-none-any.whl

# Import

In [None]:
import os
import re
import json
import time
import datetime
import random
import glob
import importlib

import numpy as np
import pandas as pd

from tqdm import tqdm
import matplotlib.pyplot as plt
import seaborn as sns

# Seed the global `random` and NumPy generators; the random.shuffle / random.sample
# calls further down draw from the former.
random.seed(123)
np.random.seed(456)

In [None]:
# Copy my_seqeval.py into the working directory because the input directory is not writable.
# NOTE(review): presumably the training script loads it from here - confirm in kaggle_run_ner.py.
!cp /kaggle/input/coleridge-packages/my_seqeval.py ./

# Hyper-parameters

In [None]:
MAX_LENGTH = 64 # maximum number of whitespace-separated words per training sentence
OVERLAP = 20 # when a sentence exceeds MAX_LENGTH it is split into chunks; consecutive chunks start MAX_LENGTH - OVERLAP words apart

MAX_SAMPLE = None # number of raw train.csv rows to keep: a small number for experimentation, None for production (all rows)

# Load data

In [None]:
# Locations of the label table and of the folder holding one JSON file per publication.
train_path = '../input/coleridgeinitiative-show-us-the-data/train.csv'
paper_train_folder = '../input/coleridgeinitiative-show-us-the-data/train'

# Read the label table and keep only the first MAX_SAMPLE rows (every row when MAX_SAMPLE is None).
train = pd.read_csv(train_path).iloc[:MAX_SAMPLE]
print(f'No. raw training rows: {len(train)}')

Group the rows by publication so that the training labels have the same form as the expected output.

In [None]:
# Collapse the table to one row per publication: keep the first publication title
# and join every title/label column of the group with '|'.
pipe_join = '|'.join
per_publication_aggs = {
    'pub_title': 'first',
    'dataset_title': pipe_join,
    'dataset_label': pipe_join,
    'cleaned_label': pipe_join,
}
train = train.groupby('Id').agg(per_publication_aggs).reset_index()

print(f'No. grouped training rows: {len(train)}')

In [None]:
# Map each publication Id to the parsed content of its JSON file.
papers = {}
for paper_id in train['Id'].unique():
    paper_file = f'{paper_train_folder}/{paper_id}.json'
    with open(paper_file, 'r') as handle:
        papers[paper_id] = json.load(handle)

_**Load gov dataset and randomly shuffle.**_

In [None]:
# Read the additional gov dataset table and drop exact duplicate rows, then preview it.
gov_dataDF = pd.read_csv('../input/gov-data/additional_gov_datasets_popular.csv')
gov_dataDF = gov_dataDF.drop_duplicates()
gov_dataDF.head()

In [None]:
# Flatten the table row by row into one list of cell values, shuffle it in place
# with the seeded `random` generator, and preview the first five entries.
# NOTE(review): every column is included; if the CSV holds missing or non-text
# cells they end up in this list too - confirm against the file.
gov_data = [cell for row in gov_dataDF.values for cell in row]
random.shuffle(gov_data)
gov_data[:5]

In [None]:
# Number of flattened gov entries; the training-data loop below indexes into this list.
len(gov_data)

# Transform data

In [None]:
def clean_training_text(txt):
    """
    Normalise text for training while keeping the original letter case.

    The input is converted with ``str``; every run of characters other than
    ASCII letters and digits is replaced by a single space, and leading and
    trailing whitespace is stripped.
    """
    as_text = str(txt)
    collapsed = re.sub('[^A-Za-z0-9]+', ' ', as_text)
    return collapsed.strip()

def shorten_sentences(sentences, max_length=None, overlap=None):
    """
    Split over-long sentences into overlapping chunks of words.

    Parameters
    ----------
    sentences : iterable of str
        Sentences to process. A sentence with at most ``max_length``
        whitespace-separated words is passed through unchanged.
    max_length : int, optional
        Maximum number of words per chunk. Defaults to the module-level
        ``MAX_LENGTH``, looked up at call time.
    overlap : int, optional
        Consecutive chunks start ``max_length - overlap`` words apart.
        Defaults to the module-level ``OVERLAP``, looked up at call time.

    Returns
    -------
    list of str
        The short sentences, in input order. Chunks of a split sentence are
        re-joined with single spaces; the last chunks may be shorter than
        ``max_length``.

    Raises
    ------
    ValueError
        If ``overlap`` is not smaller than ``max_length``. The window would
        then not advance, and long sentences would be dropped or fail
        part-way through the loop.
    """
    if max_length is None:
        max_length = MAX_LENGTH
    if overlap is None:
        overlap = OVERLAP

    step = max_length - overlap
    if step <= 0:
        raise ValueError(
            f'overlap ({overlap}) must be smaller than max_length ({max_length})'
        )

    short_sentences = []
    for sentence in sentences:
        words = sentence.split()
        if len(words) > max_length:
            # Slide a window of max_length words forward by `step` words at a time.
            for start in range(0, len(words), step):
                short_sentences.append(' '.join(words[start:start + max_length]))
        else:
            short_sentences.append(sentence)
    return short_sentences

def find_sublist(big_list, small_list):
    """
    Return every start index at which small_list occurs as a contiguous
    slice of big_list, in ascending order (an empty list when there is none).
    """
    width = len(small_list)
    last_start = len(big_list) - width
    return [
        start
        for start in range(last_start + 1)
        if small_list == big_list[start:start + width]
    ]

def tag_sentence(sentence, labels): # requirement: both sentence and labels are already cleaned
    """
    Tag every word of a sentence with 'B', 'I' or 'O' according to the labels.

    Parameters
    ----------
    sentence : str
        Sentence to tag; it is split on whitespace into words.
    labels : iterable of str or None
        Dataset labels to look for. Each label is split on whitespace and
        matched against consecutive sentence words. ``None`` or an empty
        collection yields an all-'O' tagging.

    Returns
    -------
    list of (str, str)
        One ``(word, tag)`` pair per sentence word. The first word of a found
        label gets 'B', its remaining words 'I', everything else 'O'.

    Notes
    -----
    * Labels are matched on whole words rather than with a regular expression
      built from the label text. For cleaned input the result is the same,
      but a label that contains regex metacharacters can no longer raise
      ``re.error`` or be silently missed.
    * Longer labels are tagged first, and a span that overlaps an already
      tagged entity is skipped. A label nested inside a longer one therefore
      cannot overwrite an 'I' with a 'B'.
    * Empty labels are ignored; they would otherwise match at every position.
    """
    sentence_words = sentence.split()
    nes = ['O'] * len(sentence_words)

    # Stable sort: among labels with the same number of words the caller's order is kept.
    candidate_labels = sorted(
        (label.split() for label in (labels or [])),
        key=len,
        reverse=True,
    )

    for label_words in candidate_labels:
        if not label_words:
            continue  # empty / whitespace-only label

        for pos in find_sublist(sentence_words, label_words):
            end = pos + len(label_words)
            if any(tag != 'O' for tag in nes[pos:end]):
                continue  # overlaps an entity that has already been tagged
            nes[pos] = 'B'
            for i in range(pos + 1, end):
                nes[i] = 'I'

    return list(zip(sentence_words, nes))

__*Create the training data.*__
* _tagged_list1_ consists of positive sentences with a dataset label, filtered so that they contain no other capital letters (except at the beginning of the sentence).
* _tagged_list2_ consists of negative sentences containing the word ‘data’ or ‘study’, filtered so that they contain no capital letters (except at the beginning of the sentence).
* _tagged_list3_ consists of negative sentences containing neither ‘data’ nor ‘study’, but with capital letters.
* _tagged_list4_ consists of positive sentences taken from _tagged_list1_ in which the dataset label is replaced by one of the external GOV dataset labels.

__*Variations in methods are the following:*__
* The list with 'data' and 'study' is expanded with 'project', 'program' and 'survey'.
* _tagged_list1_ consists of all positive sentences, not filtered on capital letters (code for this can be found in Phung's original notebook).
* _tagged_list2_ consists of all negative sentences containing the word 'data' or 'study', not filtered on capital letters (code for this can be found in Phung's original notebook).

In [None]:
# Build the four pools of tagged sentences described above.
#   tagged_list1: sentences containing a known label and no other capitals
#   tagged_list2: sentences mentioning 'data'/'study' with no capitals
#   tagged_list3: sentences not mentioning 'data'/'study' that do have capitals
#   tagged_list4: copies of tagged_list1 sentences with the label swapped for a gov label
tagged_list1 = []
tagged_list2 = []
tagged_list3 = []
tagged_list4 = []
cntNO, cnt1L, cntYES, cntGOV = 0, 0, 0, 0

# Substrings whose presence marks a sentence as "mentioning data".
data_keywords = ['data', 'study']

pbar = tqdm(total=len(train))
for _, paper_id, dataset_label in train[['Id', 'dataset_label']].itertuples():
    # paper
    paper = papers[paper_id]

    # labels: cleaned like the sentences; labels that clean to an empty string are
    # dropped because an empty pattern would match every sentence.
    labels = [clean_training_text(label) for label in dataset_label.split('|')]
    labels = [label for label in labels if label]

    # sentences: de-duplicate while keeping document order. Iterating a set of
    # strings depends on hash randomisation, which would make the generated
    # training data differ between runs despite the fixed random seeds.
    sentences = list(dict.fromkeys(
        clean_training_text(sentence)
        for section in paper
        for sentence in section['text'].split('.')
    ))
    sentences = shorten_sentences(sentences) # make sentences short
    sentences = [sentence for sentence in sentences if len(sentence) > 10] # only accept sentences with length > 10 chars

    # for each sentence
    for sentence in sentences:
        # sentences with label
        sentence_has_label = any(re.findall(f'\\b{label}\\b', sentence) for label in labels)
        if sentence_has_label:
            # sentences with label and no other capitals, most likely to not have unknown dataset labels
            for label in labels:
                # Lower-case the label and the first character; if what remains is
                # all lower case, the label was the only capitalised part.
                probe = sentence.replace(label, label.lower())
                probe = probe[0].lower() + probe[1:]
                if not probe.islower():
                    continue

                cnt1L += 1
                tagged_list1.append(tag_sentence(sentence, labels))

                # add sentences with another dataset from external dataset:
                # up to three variants, using the gov entries at 1x, 2x and 3x the running count.
                for multiplier in (1, 2, 3):
                    gov_index = cnt1L * multiplier
                    if gov_index >= len(gov_data) - 1:
                        continue
                    # tag_sentence requires cleaned labels, so clean the gov label the
                    # same way as the sentence before inserting it.
                    gov_label = clean_training_text(gov_data[gov_index])
                    if not gov_label:
                        continue
                    cntGOV += 1
                    swapped_sentence = sentence.replace(label, gov_label)
                    tagged_list4.append(tag_sentence(swapped_sentence, [gov_label]))

        lowered_sentence = sentence.lower()
        mentions_data = any(word in lowered_sentence for word in data_keywords)
        # do not take first character of sentence into account
        rest_is_lower = (sentence[0].lower() + sentence[1:]).islower()

        # sentences with no capitals when data or study is mentioned, most likely to not have unknown dataset labels
        if mentions_data and rest_is_lower:
            cntNO += 1
            tagged_list2.append(tag_sentence(sentence, labels))

        # sentences with capitals but not including data or study
        if not mentions_data and not rest_is_lower:
            cntYES += 1
            tagged_list3.append(tag_sentence(sentence, labels))

    # process bar
    pbar.update(1)
    pbar.set_description(f"Training data size: {cnt1L} one label, {cntNO} no label, {cntYES} basic, {cntGOV} gov label")

__*Merge the tagged_lists together with desired sizes (differs per method).*__

In [None]:
# Merge the pools: all of tagged_list1 and tagged_list4, plus an equal-sized random
# sample from tagged_list2 and tagged_list3 (half the size of tagged_list2 each).
# The tagged_list3 sample is capped at the list's length, because random.sample
# raises ValueError when asked for more items than the population holds
# (e.g. when MAX_SAMPLE is small).
n_negative = len(tagged_list2) // 2
tagged_list = (
    tagged_list1
    + random.sample(tagged_list2, n_negative)
    + random.sample(tagged_list3, min(n_negative, len(tagged_list3)))
    + tagged_list4
)

#shuffling
random.shuffle(tagged_list)

In [None]:
# Report the size of each pool, followed by the size of the merged training list.
for tagged_pool in (tagged_list1, tagged_list2, tagged_list3, tagged_list4, tagged_list):
    print(len(tagged_pool))

In [None]:
# Peek at the first five tagged sentences; each is a list of (word, tag) pairs.
print(tagged_list[:5])

Write the data to file.

In [None]:
# Serialise the training set as JSON Lines: one object per sentence holding the
# parallel 'tokens' and 'tags' sequences.
with open('train_ner.json', 'w') as out_file:
    for tagged_sentence in tagged_list:
        tokens, tags = zip(*tagged_sentence)
        record = {'tokens' : tokens, 'tags' : tags}
        out_file.write(json.dumps(record))
        out_file.write('\n')

# Fine-tune a BERT model for NER

In [None]:
# Fine-tune 'bert-base-cased' on train_ner.json with the kaggle_run_ner.py script:
# one epoch, batch size 8, seed 123, output written to ./output.
# NOTE(review): --validation_file points at the training file itself and only
# --do_train is passed; presumably no evaluation is run - confirm in kaggle_run_ner.py.
!python ../input/kaggle-ner-utils/kaggle_run_ner.py \
--model_name_or_path 'bert-base-cased' \
--train_file './train_ner.json' \
--validation_file './train_ner.json' \
--num_train_epochs 1 \
--per_device_train_batch_size 8 \
--per_device_eval_batch_size 8 \
--save_steps 15000 \
--output_dir './output' \
--report_to 'none' \
--seed 123 \
--do_train 

After the tuning finishes, we should find our model in './output'.