# corpus bootstrapping with LM perplexity

toy test of Ramaswamy, Printz, Gopalakrishnan: *A Bootstrap Technique for Building Domain-Dependent Langauge Models*
http://mirlab.org/conference_papers/International_Conference/ICSLP%201998/PDF/SCAN/SL980611.PDF

- uses KenLM n-gram LM with backoff and smoothing
- in-domain data from Jane Austen
- 'unlabeled' corpus of Austen,Carroll and Melville sentences

In [1]:
from collections import Counter
import kenlm
import math
import nltk
import random
import re
import subprocess

## setup paths here

In [2]:
# setup paths - hi kcrong
corpus_path = '/home/derek/PycharmProjects/perplexitybootstrapping/'
process_path = '/home/derek/PycharmProjects/perplexitybootstrapping/'
kenlm_path = '/home/derek/kcrong_stuff/kenlm/build/bin/'

# in-domain corpus data

load the three jane austen texts as tokenized sentences, preprocess (lowercase, remove punctuation etc, add `<s>` and `</s>` tags)

In [3]:
# read corpora
austen1 = nltk.corpus.gutenberg.sents('austen-emma.txt')
austen2 = nltk.corpus.gutenberg.sents('austen-persuasion.txt')
austen3 = nltk.corpus.gutenberg.sents('austen-sense.txt')
data = austen1 + austen2 + austen3
print(len(data))

16498


In [4]:
%%time
# shuffle data and withhold random set
indices = [i for i in range(len(data))]
random.shuffle(indices)
data = [data[i] for i in indices]

test_idx = int(len(data)*0.25)
corpus = data[:test_idx]
withheld = data[test_idx:]
data = None # clear
print(len(corpus), len(withheld))

4124 12374
CPU times: user 6.01 s, sys: 84 ms, total: 6.09 s
Wall time: 6.09 s


## preprocessing

remove sents of len < 5 (words)

In [5]:
%%time
# preprocess data (with function)
def preprocess(tokens):
    processed = []
    for sent in tokens:
        if len(sent) > 6:
            this_sent = []
            for word in sent:
                if re.findall(r'[0-9A-Za-z]+', word):
                    this_sent.append(word.lower())
            processed.append(this_sent)
    return processed

CPU times: user 0 ns, sys: 0 ns, total: 0 ns
Wall time: 3.81 µs


In [6]:
corpus = preprocess(corpus)
withheld = preprocess(withheld)
print("seed corpus:", len(corpus), "withheld:", len(withheld))

seed corpus: 3601 withheld: 10894


## write corpus file 

In [7]:
# save the text file for training
def writecorpus(lol, filename='corpus.txt'):
    with open(filename, 'w') as f:
        for line in lol:
            f.write(' '.join(line))
            f.write('\n')
            
    subprocess.run(["bzip2", filename])
    return

In [8]:
writecorpus(corpus)

# functions for language model

we will constuct this as a function so we can iterate

In [9]:
# save bashscript
def trainmodel(filename="corpus"):
    
    with open("train.sh", "w") as f:
        f.write
        f.write("bzcat " + filename + ".txt.bz2 | python process.py | " + kenlm_path + "lmplz -o 3 > "+filename+".arpa")
    
    subprocess.run(["sh", "train.sh"])
    
    subprocess.run([kenlm_path + "build_binary", filename+".arpa", filename+".klm"])
    
    return

In [10]:
# test
trainmodel()

# external corpus

this is data from an external source that (hopefully) includes some sentences that we can use for data augmentation.

here we (artificially) create a mixed id/ood corpus by mixing our withheld data in with some text from another source. we will use moby dick because it is one of the NLTK prose texts closer in time to Jane Austen

for testing, we will label each sentence according to source

In [11]:
withheld = [('austen', ' '.join(s)) for s in withheld]
len(withheld)

10894

In [12]:
melville = nltk.corpus.gutenberg.sents('melville-moby_dick.txt')
melville = preprocess(melville)
melville = [('melville', ' '.join(s)) for s in melville]

carroll = nltk.corpus.gutenberg.sents('carroll-alice.txt')
carroll = preprocess(carroll)
carroll = [('carroll', ' '.join(s)) for s in carroll]

len(melville), len(carroll)

(8213, 1360)

In [13]:
unlabeled = withheld + melville + carroll
len(unlabeled)

20467

# test: sort by perplexity score

as we can see, in-domain answers are at the top. of course it is not the case that necessarily all *(true)* in-domain sentences are at the top of the list.

In [14]:
model = kenlm.LanguageModel('corpus.klm')

In [15]:
model.perplexity('hello i do not know')

88.77515938004238

In [16]:
unlabeled[0]

('austen',
 'upon my soul it is was his answer with a warmth which brought all the former willoughby to her remembrance and in spite of herself made her think him sincere')

In [17]:
%%time
# get perplexities
perplexities = [model.perplexity(s[1]) for s in unlabeled]

CPU times: user 92 ms, sys: 0 ns, total: 92 ms
Wall time: 91.9 ms


In [18]:
# sort by perplexity (lower = better)
[(x[0], y) for x, y in sorted(zip(unlabeled, perplexities), key=lambda pair: pair[1])][:20]

[('austen', 6.3236573447714175),
 ('austen', 6.817210747019258),
 ('austen', 7.574451666930565),
 ('austen', 7.936022492080543),
 ('austen', 8.79034807095688),
 ('austen', 8.881279896552586),
 ('austen', 8.89872936126102),
 ('austen', 8.980324484642574),
 ('austen', 8.980849898236679),
 ('austen', 9.065218827220512),
 ('austen', 9.093119024819847),
 ('austen', 9.107500910419441),
 ('austen', 9.211499683269667),
 ('austen', 9.299379676002628),
 ('austen', 9.690128370193325),
 ('austen', 9.864419361959225),
 ('austen', 9.954134993801132),
 ('austen', 10.10226018695627),
 ('austen', 10.13428233410811),
 ('austen', 10.148590197400718)]

# iterate

this is meant to be an iterative algorithm, so we add the top sentences (using threshold) to the original training data, make a new language model, and calculate new perplexity scores over the outside data.

In [19]:
iters = 500

add_corpus = corpus[:]       # the expanding id-corpus
rem_unlabeled = unlabeled[:] # the shrinking unlabeled data
additions = []               # track additions to lm corpus
threshhold = 50.0            # perplexity threshhold
cutoff = 20                  # cutoff for added sents, make large to 'ignore'

for i in range(iters):
    
    # EarlyStopping
    cnt = 0
    
    # indices to remove from unlabeled data
    remove_idx = []
    
    # write corpus
    writecorpus(add_corpus, filename='addcorpus.txt')
    
    # build language model
    trainmodel(filename='addcorpus')
    
    # load model
    model = kenlm.LanguageModel('addcorpus.klm')
    
    # get perplexities
    perplexities = [model.perplexity(s[1]) for s in rem_unlabeled]
    
    # indices, sort perplexities
    indices = [i for i in range(len(perplexities))]
    sorted_perplexities = [(x, y) for x, y in sorted(zip(indices, perplexities), key=lambda pair: pair[1])]
    
    # take top sents
    add = 0
    for jdx, tup in enumerate(sorted_perplexities):
        idx = tup[0]
        perp = tup[1]
        if perp < threshhold:
            additions.append(rem_unlabeled[idx])
            add_corpus.append(rem_unlabeled[idx][1])
            remove_idx.append(idx)
            cnt += 1
            add += 1
        if add == cutoff:
            break
    
    # filter out additions
    rem_unlabeled = [rem_unlabeled[i] for i in range(len(rem_unlabeled)) if i not in remove_idx]
    
    # if no added sents, terminate
    if cnt == 0:
        print("no added sentences, stopping...\n")
#         debug = [(x[0], ' '.join(x[1]), y) for x, y in sorted(zip(rem_unlabeled, perplexities), key=lambda pair: pair[1])][:100]
#         for d in debug[:10]:
#             print(d)
        break
    
    if i > 0 and i % 10 == 0:
        print("iter", i, ": total added", len(additions), "sents")
            

iter 10 : total added 220 sents
iter 20 : total added 420 sents
iter 30 : total added 620 sents
iter 40 : total added 820 sents
iter 50 : total added 1020 sents
iter 60 : total added 1220 sents
iter 70 : total added 1420 sents
iter 80 : total added 1620 sents
iter 90 : total added 1820 sents
iter 100 : total added 2020 sents
iter 110 : total added 2220 sents
iter 120 : total added 2420 sents
iter 130 : total added 2620 sents
iter 140 : total added 2820 sents
iter 150 : total added 3020 sents
no added sentences, stopping...



## evaluation

In [20]:
diff = len(add_corpus) - len(corpus)
totl = len(unlabeled)
print("sents found:", diff, "(%", diff*100/totl, "of unlabeled)")

sents found: 3151 (% 15.39551473103044 of unlabeled)


In [21]:
labels = [t[0] for t in additions]
corrects = [t[0] for t in additions if t[0]=='austen']
print("precision of found sents: ", len(corrects)/len(labels))

precision of found sents:  0.9920660107902253


In [22]:
punlabeled = len(withheld)/len(withheld + melville + carroll)
print("percentage of trues in unlabeled:", punlabeled)

percentage of trues in unlabeled: 0.5322714613768506


In [23]:
recall = len(corrects)/len(withheld)
print("recall of unlabeled austen sents: ", recall)

recall of unlabeled austen sents:  0.2869469432715256
