In [1]:
#!pip install scispacy
#!pip install https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.2.3/en_core_sci_sm-0.2.3.tar.gz
#!pip install nltk
#!pip install tensorflow
#!pip install --user sentencepiece
    
import os
import sys
import json
#import nltk
import random
import logging
import sentencepiece as spm
#import scispacy

In [None]:
# Print the last line of pubmed_dump.txt as a quick look at the file's contents.
!tail -n 1 pubmed_dump.txt

In [None]:
import spacy
# Sample paragraph of text; a later cell lower-cases it and runs it through the spaCy pipeline as a quick check.
SENT = "The study was carried out in a borough of London where a there is a disparity of wealth and a large ethnic minority population and therefore may be different to many other areas of the UK. However, the findings have similarities to those of other UK studies examining the patient perspective of provision and uptake of care (17,19), which agree that GPs could provide more information and be more proactive in respect of preconception care provision. In addition, we found that GPs were more likely to provide preconception care to women with medical conditions, and this targeted approach highlighted issues similar to those found by Mortagy et al. (46) who interviewed GPs and secondary care health professionals focusing on women with diabetes."

In [None]:
# Load the "en_core_sci_sm" spaCy model. The resulting `nlp` pipeline is a
# module-level global that the `worker` function defined further down calls.
nlp = spacy.load("en_core_sci_sm")

In [None]:
# Quick check: run the pipeline on the lower-cased sample text; as the last
# expression of the cell, the result is displayed as the cell's output.
nlp(SENT.lower())

In [None]:
# NOTE(review): this module import is immediately rebound by the
# `from tqdm import tqdm` two lines below, so `tqdm` ends up naming the
# progress-bar callable, not the module.
import tqdm

from tqdm import tqdm
# Passed as `total=` to tqdm further down in this cell so the bar can show progress.
total_lines = 15809286 # counted via wc -l (presumably on the input file read below -- TODO confirm)
import multiprocessing as mp
import uuid

# Used both as the process-pool size and as the maxsize of the job queue
# created further down in this cell.
NUM_WORKERS = 19

def worker(job_q):
    """Consume paragraphs from ``job_q`` and write their processed text to a file.

    Each call opens its own output file, ``data/<random uuid4>``, so workers
    running in parallel never write to the same file. Every paragraph taken
    from the queue is lower-cased and passed through the module-level ``nlp``
    pipeline; the ``.text`` of the result, stripped of surrounding whitespace,
    is written as one line.

    Args:
        job_q: queue-like object whose ``get()`` returns paragraph strings and
            finally ``None``, which tells this worker to stop.
    """
    # Create the output directory if it is missing; exist_ok keeps this safe
    # when several workers start at the same time.
    os.makedirs("data", exist_ok=True)

    # Write UTF-8 explicitly instead of relying on the platform's default
    # encoding: the paragraphs are read from a UTF-8 file and may contain
    # non-ASCII characters.
    with open(f"data/{uuid.uuid4()}", "w", encoding="utf-8") as file_obj:
        # iter(callable, sentinel) keeps calling job_q.get() until it
        # returns the None sentinel.
        for paragraph in iter(job_q.get, None):
            processed_text = nlp(paragraph.lower())
            file_obj.write(processed_text.text.strip() + '\n')

# Bounded queue: once NUM_WORKERS paragraphs are pending, put() blocks, so the
# input file is streamed instead of being loaded into memory.
job_queue = mp.Queue(maxsize=NUM_WORKERS)

# NOTE: `worker` is passed as the pool *initializer*, so every pool process
# runs the consume loop on start-up and only returns once it reads a None.
pool = mp.Pool(NUM_WORKERS, initializer=worker, initargs=(job_queue,))

try:
    # Named `input_file` rather than `input`, which would shadow the builtin
    # for every cell executed afterwards in this kernel.
    with open("xaa", encoding='utf-8') as input_file:
        for paragraph in tqdm(input_file, total=total_lines):
            job_queue.put(paragraph)
finally:
    # Always send one sentinel per worker, even if reading fails or is
    # interrupted, so the worker processes exit instead of blocking on the
    # queue forever.
    for _ in range(NUM_WORKERS):
        job_queue.put(None)

    pool.close()
    pool.join()

In [2]:
MODEL_PREFIX = "tokenizer" #@param {type: "string"}
VOC_SIZE = 32000 #@param {type:"integer"}
SUBSAMPLE_SIZE = 20000000 #@param {type:"integer"}
NUM_PLACEHOLDERS = 256 #@param {type:"integer"}
PRC_DATA_FPATH = "data/pubmed_processed.txt"

# Trainer flags, one per entry, joined with single spaces into the command
# string. The requested vocabulary is NUM_PLACEHOLDERS entries smaller than
# VOC_SIZE.
SPM_COMMAND = " ".join([
    f"--input={PRC_DATA_FPATH}",
    f"--model_prefix={MODEL_PREFIX}",
    f"--vocab_size={VOC_SIZE - NUM_PLACEHOLDERS}",
    f"--input_sentence_size={SUBSAMPLE_SIZE}",
    "--shuffle_input_sentence=true",
    "--bos_id=-1",
    "--eos_id=-1",
    "--num_threads=8",
])

spm.SentencePieceTrainer.Train(SPM_COMMAND)

True

In [None]:
# Sentence that the last cell of the notebook passes to the tokenizer built from the new vocabulary.
testcase = "Colorless geothermal substations are generating furiously"

In [None]:
!head -n 100 tokenizer.vocab

In [None]:
def read_sentencepiece_vocab(filepath):
  """Return the entries of a SentencePiece ``.vocab`` file, minus the first one.

  The file is read as UTF-8. For every line, only the text before the first
  tab is kept. The first entry (the ``<unk>`` token) is then dropped.

  Args:
    filepath: path to the ``.vocab`` file.

  Returns:
    List of the remaining entries, in file order.
  """
  with open(filepath, encoding='utf-8') as vocab_file:
    pieces = [row.split("\t")[0] for row in vocab_file]
  # Everything except the leading <unk> entry.
  return pieces[1:]

# Load the entries from "<MODEL_PREFIX>.vocab".
vocab_path = "{}.vocab".format(MODEL_PREFIX)
snt_vocab = read_sentencepiece_vocab(vocab_path)

# Report how many entries were read, then show ten drawn at random
# (no seed is set, so the sample differs between runs).
print("Learnt vocab size: {}".format(len(snt_vocab)))
sample_tokens = random.sample(snt_vocab, 10)
print("Sample tokens: {}".format(sample_tokens))

As we can see, SentencePiece marks tokens in the opposite way to WordPiece. From the documentation:

SentencePiece first escapes the whitespace with a meta-symbol "▁" (U+2581) as follows:

Hello▁World.

Then, this text is segmented into small pieces, for example:

[Hello] [▁Wor] [ld] [.]

Subwords that occur after whitespace (which are also the ones most words begin with) are prefixed with '▁', while the others are left unchanged. The exception is subwords that occur only at the beginning of sentences and nowhere else; these cases should be quite rare, however.

So, to obtain a vocabulary analogous to WordPiece's, we need to perform a simple conversion: strip the leading "▁" from the tokens that start with it, and prepend "##" to the ones that don't.

In [None]:
def parse_sentencepiece_token(token):
    """Convert one SentencePiece token to WordPiece-style notation.

    A token that starts with the "▁" marker is returned without that leading
    marker. Any other token is returned with "##" prepended.

    Args:
        token: a single token string.

    Returns:
        The converted token string.
    """
    word_start_marker = "▁"
    if not token.startswith(word_start_marker):
        return "##" + token
    # Drop only the leading marker; the rest of the token is kept as is.
    return token[len(word_start_marker):]

In [None]:
# Convert every SentencePiece token to its WordPiece-style form, keeping the order.
bert_vocab = [parse_sentencepiece_token(piece) for piece in snt_vocab]

In [None]:
# Special tokens placed at the front of the vocabulary.
ctrl_symbols = ["[PAD]","[UNK]","[CLS]","[SEP]","[MASK]"]
# Guard so that re-running this cell does not prepend the symbols a second
# time (which would shift every token and grow the vocabulary).
if bert_vocab[:len(ctrl_symbols)] != ctrl_symbols:
    bert_vocab = ctrl_symbols + bert_vocab

In [None]:
# Pad the vocabulary in place with "[UNUSED_i]" placeholders until it holds
# VOC_SIZE entries (nothing is added if it is already that long or longer).
padding_needed = VOC_SIZE - len(bert_vocab)
bert_vocab.extend("[UNUSED_{}]".format(idx) for idx in range(padding_needed))
print(len(bert_vocab))

In [None]:
VOC_FNAME = "vocab.txt" #@param {type:"string"}

# Write the vocabulary one token per line. The encoding is set to UTF-8
# explicitly: the tokens were read from a UTF-8 file and can contain non-ASCII
# characters, so the platform default encoding must not be relied on.
with open(VOC_FNAME, "w", encoding="utf-8") as fo:
  for token in bert_vocab:
    fo.write(token+"\n")

In [None]:
# Build a tokenizer from the vocabulary file at VOC_FNAME and tokenize the test
# sentence; as the last expression of the cell, the result is displayed.
# NOTE(review): `tokenization` is not imported anywhere in the visible notebook
# -- presumably the BERT repository's tokenization.py; confirm and add the import.
bert_tokenizer = tokenization.FullTokenizer(VOC_FNAME)
bert_tokenizer.tokenize(testcase)