# Create Wordlist

Create a vocabulary list for Bookworm, optimized to include top language-specific tokens.

In [None]:
import dask.dataframe as dd
import numpy as np
import pandas as pd

In [None]:
import os

# On-disk size of the sorted token store, expressed in GiB (2**30 bytes)
st = os.stat('/notebooks/data2/final/final-sorted.h5')
st.st_size / 2**30

4.959613915532827

In [None]:
# Row count of every language table in the store, largest table first
with pd.HDFStore('/notebooks/data2/final/final-sorted.h5') as store:
    keys = store.keys()
    sizes = list(map(lambda key: store.get_storer(key).nrows, keys))
tablesizes = pd.Series(data=sizes, index=keys)
tablesizes = tablesizes.sort_values(ascending=False)
tablesizes.head(5)

/eng    79005095
/ger    30446373
/fre    17440715
/lat    13691932
/rus    10851839
dtype: int64

In [None]:
# Load the complete Greek ('/gre') token table and report its (rows, columns) size.
# The prefix filter that used to end this cell is what the following cell displays.
with pd.HDFStore('/notebooks/data2/final/final-sorted.h5') as store:
    df = store.select('/gre')
df.shape

(2230249, 1)

In [None]:
# Show every token whose text starts with 'Ελλάδα'; the index holds the token strings.
# NOTE(review): assumes `df` still holds the '/gre' table loaded in the previous cell.
df[df.index.str.startswith('Ελλάδα')]

Unnamed: 0_level_0,count
token,Unnamed: 1_level_1
Ελλάδα,302969
Ελλάδας,53219
Ελλάδα»,4525
Ελλάδας»,771
Ελλάδα...,507
"Ελλάδα""",313
Ελλάδα*,162
Ελλάδας-Τουρκίας,107
Ελλάδαδ,102
Ελλάδα...»,80


# Determine a trimming policy for each lang

Each language contributes top $N_{lang}$ tokens to the word list. $N_{lang}$ is selected according to the following rules:

    3.5% of the language's saved vocabulary, to a minimum of 20k, with hard-coded overrides for the eight biggest languages (eng=900k, ger=650k, fre=400k, lat=360k, rus=280k, jpn=300k, {ita,spa}=220k). Any language with fewer than 100k tokens *total* is assumed to be a junk language, or one that BW is not useful for to begin with, so it contributes no tokens.

In [None]:
# Reference for how many top words to keep: hard-coded overrides for the largest
# languages, keyed by language code without the leading '/' of the HDF key
top_words_ref = dict(eng=900000, ger=650000,
                     fre=400000, lat=360000, rus=280000,
                     jpn=300000, ita=220000, spa=220000)

def trim_topwords(row):
    """Return how many of a language's top tokens go into the word list.

    Parameters
    ----------
    row : sequence or pandas.Series
        First element is the language's HDF key (e.g. '/eng'), second element
        is the size of that language's saved vocabulary. Extra elements are ignored.

    Returns
    -------
    int
        The hard-coded override when the language is in `top_words_ref`;
        0 when the vocabulary has fewer than 100000 tokens; otherwise the
        greater of 20000 and 3.5% of the vocabulary size.
    """
    # Unpack by iteration instead of `row[0]` / `row[1]`: integer keys on a
    # label-indexed Series are a deprecated positional fallback in pandas.
    lang, vocab_size = tuple(row)[:2]

    override = top_words_ref.get(lang[1:])
    if override is not None:
        return override
    if vocab_size < 100000:
        # Ignore langs with practically no words as likely duds, or at the very least
        # something BW wouldn't be useful for
        return 0
    # Other languages: keep greater of 20k or 3.5% of vocab
    mincount = 20000
    percentagetrim = int(vocab_size * 0.035)
    return max(percentagetrim, mincount)

# One row per language: HDF key ('lang'), vocabulary size ('count') and the number of
# top tokens the trimming policy keeps ('retain_count')
cutoff_list = tablesizes.reset_index()
cutoff_list = cutoff_list.rename(columns={'index': 'lang', 0: 'count'})
cutoff_list['retain_count'] = cutoff_list.apply(trim_topwords, axis=1)
total_retained = cutoff_list['retain_count'].sum()
print("Total tokens (including possible dupes)", total_retained)
cutoff_list.head(10)

Total tokens (including possible dupes) 6588484


Unnamed: 0,lang,count,retain_count
0,/eng,79005095,900000
1,/ger,30446373,650000
2,/fre,17440715,400000
3,/lat,13691932,360000
4,/rus,10851839,280000
5,/jpn,8333906,300000
6,/ita,7069154,220000
7,/spa,7027856,220000
8,/chi,5210120,182354
9,/und,4886094,171013


In [None]:
# Survey: for every language that keeps at least one token, count how many of its
# top 1m tokens start or end with a zero-width space (\u200b).
for _, lang_row in cutoff_list.iterrows():
    if lang_row['retain_count'] == 0:
        # Language is dropped entirely by the trimming policy
        continue
    top_tokens = pd.read_hdf('/notebooks/data2/final/final-sorted.h5', lang_row['lang'], stop=1000000)

    edge_zwsp = top_tokens.index.str.startswith('\u200b') | top_tokens.index.str.endswith("\u200b")
    n_zwsp = top_tokens[edge_zwsp].shape[0]
    if n_zwsp != 0:
        print("%s has %d tokens withs \\u200b in the top 1m" % (lang_row['lang'], n_zwsp))

Opening /notebooks/data2/final/final-sorted.h5 in read-only mode
/eng has 927 tokens withs \u200b in the top 1m
Opening /notebooks/data2/final/final-sorted.h5 in read-only mode
/ger has 195 tokens withs \u200b in the top 1m
Opening /notebooks/data2/final/final-sorted.h5 in read-only mode
/fre has 455 tokens withs \u200b in the top 1m
Opening /notebooks/data2/final/final-sorted.h5 in read-only mode
/lat has 280 tokens withs \u200b in the top 1m
Opening /notebooks/data2/final/final-sorted.h5 in read-only mode
/rus has 146 tokens withs \u200b in the top 1m
Opening /notebooks/data2/final/final-sorted.h5 in read-only mode
/jpn has 280105 tokens withs \u200b in the top 1m
Opening /notebooks/data2/final/final-sorted.h5 in read-only mode
/ita has 398 tokens withs \u200b in the top 1m
Opening /notebooks/data2/final/final-sorted.h5 in read-only mode
/spa has 686 tokens withs \u200b in the top 1m
Opening /notebooks/data2/final/final-sorted.h5 in read-only mode
/chi has 267566 tokens withs \u200b 

In [None]:
# Languages whose tokens with a leading/trailing \u200b char are saved separately
zwsp_langs = ['/jpn', '/chi', '/kor', '/arm', '/urd']

def _sum_by_token(frames):
    """Stack the frames, total the counts of repeated tokens, most frequent first."""
    stacked = pd.concat(frames)
    return stacked.groupby(level='token').sum().sort_values('count', ascending=False)

dfs, problem_dfs = [], []
for i, row in cutoff_list.iterrows():
    if row['retain_count'] != 0:
        # Read only the top `retain_count` rows of this language's table
        df = pd.read_hdf('/notebooks/data2/final/final-sorted.h5', row['lang'], stop=row['retain_count'])
        dfs.append(df)
        if row['lang'] in zwsp_langs:
            edge_zwsp = df.index.str.startswith('\u200b') | df.index.str.endswith("\u200b")
            problem_dfs.append(df[edge_zwsp])

asn_probchars = _sum_by_token(problem_dfs)
wordlist = _sum_by_token(dfs)
print("Final wordlist using top N trim criteria: ", wordlist.shape)

Opening /notebooks/data2/final/final-sorted.h5 in read-only mode
Opening /notebooks/data2/final/final-sorted.h5 in read-only mode
Opening /notebooks/data2/final/final-sorted.h5 in read-only mode
Opening /notebooks/data2/final/final-sorted.h5 in read-only mode
Opening /notebooks/data2/final/final-sorted.h5 in read-only mode
Opening /notebooks/data2/final/final-sorted.h5 in read-only mode
Opening /notebooks/data2/final/final-sorted.h5 in read-only mode
Opening /notebooks/data2/final/final-sorted.h5 in read-only mode
Opening /notebooks/data2/final/final-sorted.h5 in read-only mode
Opening /notebooks/data2/final/final-sorted.h5 in read-only mode
Opening /notebooks/data2/final/final-sorted.h5 in read-only mode
Opening /notebooks/data2/final/final-sorted.h5 in read-only mode
Opening /notebooks/data2/final/final-sorted.h5 in read-only mode
Opening /notebooks/data2/final/final-sorted.h5 in read-only mode
Opening /notebooks/data2/final/final-sorted.h5 in read-only mode
Opening /notebooks/data2/

## Testing trim policy

In [None]:
# Read a 1000-row slice of one language's table, beginning at row 'start'
lang = '/eng'
start = 900000
test_tokens = pd.read_hdf(
    '/notebooks/data2/final/final-sorted.h5',
    lang,
    start=start,
    stop=start + 1000,
)

Opening /notebooks/data2/final/final-sorted.h5 in read-only mode


In [None]:
# Randomly sample 10 rows of the chunk, so that you're not biased towards the
# same ten tokens at the start of the list. No seed is set, so re-running this
# cell shows a different selection each time.
test_tokens.sample(10) 

Unnamed: 0_level_0,count
token,Unnamed: 1_level_1
3ul,12451.0
Clcrk,12447.0
Weckesser,12436.0
|Feb.,12449.0
Griiber,12441.0
unpeople,12442.0
(8-20,12445.0
18.(,12447.0
palstaves,12439.0
25084,12449.0


## Testing rules for removing likely junk

In [None]:
# The re module is broken for hindi and similar characters, need to use regex
import regex

# Boolean rule masks, one entry per token of the combined wordlist
tokens = wordlist.index
hyphenated = tokens.str.contains(r"-")
#alpha = tokens.str.isalpha() # Faster, but bad for some languages
# Token consists of word characters only, as judged by the `regex` module
alphaadv = tokens.map(lambda x: bool(regex.search("^\\w+$", x)))
# Optional currency sign, digits with . or , separators, optional unit/ordinal suffix.
# The currency signs sit in a character class: inside `(£|$|€)` a bare `$` is the
# end-of-string anchor, so dollar amounts such as "$5" were never recognised.
number = tokens.str.contains(r"^[£$€]?[\d.,]+(?:st|nd|rd|th|s|C|F|c|m|°|¥)?$")
singlequote = tokens.str.contains(r"[\'’]")
# A non-digit word character followed by one or more non-digit word characters or
# periods. Non-capturing group: str.contains warns when a pattern has match groups.
abbr = tokens.str.contains(r"^[^\W\d](?:[^\W\d]|\.)+$")
endwithperiod = tokens.str.endswith('.')
# This shows up for many asian characters, should be dealt with *before* wordlist is created
blankchar = (tokens.str.startswith('\u200b') | tokens.str.endswith("\u200b"))
tlen = tokens.str.len()



In [None]:
# Read at most the first 1,000,000 rows of the 'hin' table; the cells below apply
# the junk rules to this single language.
df = pd.read_hdf('/notebooks/data2/final/final-sorted.h5', 'hin', stop=1000000)
df.head()

Opening /notebooks/data2/final/final-sorted.h5 in read-only mode


Unnamed: 0_level_0,count
token,Unnamed: 1_level_1
े,318993347
ा,307119616
ी,254023272
क,209518823
ं,187441034


In [None]:
# Boolean rule masks, one entry per token of `df`
tokens = df.index
hyphenated = tokens.str.contains(r"-")
alpha = tokens.str.isalpha() # Faster, but bad for some languages
#alphaadv = tokens.map(lambda x: not not regex.search("^\\w+$", x))
# Optional currency sign, digits with . or , separators, optional unit/ordinal suffix.
# The currency signs sit in a character class: inside `(£|$|€)` a bare `$` is the
# end-of-string anchor, so dollar amounts such as "$5" were never recognised.
number = tokens.str.contains(r"^[£$€]?[\d.,]+(?:st|nd|rd|th|s|C|F|c|m|°|¥)?$")
singlequote = tokens.str.contains(r"[\'’]")
# A non-digit word character followed by one or more non-digit word characters or
# periods. Non-capturing group: str.contains warns when a pattern has match groups.
abbr = tokens.str.contains(r"^[^\W\d](?:[^\W\d]|\.)+$")
endwithperiod = tokens.str.endswith('.')
# This shows up for many asian characters, should be dealt with *before* wordlist is created
blankchar = (tokens.str.startswith('\u200b') | tokens.str.endswith("\u200b"))
tlen = tokens.str.len()



In [None]:
# First 100 tokens that are at least two characters long and that none of the rules match
rule_hit = hyphenated | alpha | endwithperiod | singlequote | number | abbr | blankchar
df[~rule_hit & (tlen >= 2)].index.values[:100]

array(['मे', 'है', 'नही', 'होत', 'जात', 'किय', 'लिए', 'मै', 'हुए', 'कुछ',
       'रूप', 'हुआ', 'हैं', 'दिय', 'होन', 'मेर', 'किस', 'कोई', 'साथ',
       'जान', 'प्रकार', 'लिय', 'बात', 'वाल', 'हुई', 'क्य', 'फिर', 'द्वार',
       'जैस', 'कारण', 'देत', 'जीवन', 'बहुत', 'देख', 'नाम', 'मुझ', 'होग',
       'दोनो', 'श्र', 'हू', 'उन्होंन', 'अधिक', 'जिस', 'यहा', 'बाद', 'देन',
       'दूसर', 'हिन्द', 'दिन', 'प्राप्त', 'मिल', 'मान', 'हमार', 'उन्हे',
       'तुम', 'लेकिन', 'बार', 'था', 'काम', 'राज', 'साहित्य', 'किन्त',
       'लिख', 'हो', 'केवल', 'जाय', 'होकर', 'व्यक्त', 'यही', 'पास', 'भाष',
       'भारत', 'देश', 'शब्द', 'अन्य', 'चाहिए', 'लोग', 'अनेक', 'दृष्ट',
       'मैंन', 'थी', 'जिसक', 'अर्थ', 'कही', 'मिलत', 'काल', 'हाथ', 'चाहत',
       'स्थान', 'प्रत', 'सार', 'स्थित', 'परन्त', 'चुक', 'भारतीय', 'भाव',
       'प्रयोग', 'वही', 'विचार', 'वहा'], dtype=object)

### Expected junk

Top of the list:

In [None]:
# Expected junk: tokens of two or more characters that none of the rules match
junk_mask = (~hyphenated & ~alphaadv & (tlen >= 2) & ~endwithperiod
             & ~singlequote & ~number & ~abbr & ~blankchar)
junk = wordlist[junk_mask]
junk.head(10)

Unnamed: 0_level_0,count
token,Unnamed: 1_level_1
&c,85989610.0
.e,69607602.0
है,69227316.0
)(,66416410.0
मे,65622944.0
2d,63129683.0
.),60533303.0
**,54022272.0
n°,53972710.0
8vo,48993113.0


And a random selection:

In [None]:
# Ten randomly chosen junk tokens; no seed is set, so every run shows a different selection
junk.sample(10)

Unnamed: 0_level_0,count
token,Unnamed: 1_level_1
«ll«,29142.0
de¿,15773.0
小^,2912.0
fact—which,14752.0
",ſcd",5747.0
l<u3,101.0
.Kjobenhavn,5371.0
.+,907535.0
re^ulam,7423.0
nach»,236133.0


### JPN and CHI fix

Any tokens in the /jpn and /chi lists (and likewise /kor, /arm and /urd) that start or end with a zero-width space (`\u200b`) will be added, but with the id of the cleaned version. If there is no cleaned version in the word list, one is added.

In [None]:
# Keep every token that at least one rule matches, or that is shorter than two characters
keep_mask = (hyphenated | alphaadv | (tlen < 2) | endwithperiod
             | singlequote | number | abbr | blankchar)
final_candidate = wordlist[keep_mask].reset_index()

In [None]:
# Pair each problem token with its cleaned form: 'broken' keeps the token exactly as it
# occurs in the data, 'token' is the same text with every \u200b removed.
prob_chars = asn_probchars.reset_index()
# assign() builds a new frame; writing columns onto a filtered frame triggers
# pandas' SettingWithCopyWarning.
prob_chars = prob_chars.assign(broken=prob_chars['token'],
                               token=prob_chars['token'].str.replace('\u200b', ''))
# Drop tokens with nothing left after cleaning. This covers the lone "\u200b" token and
# also tokens made of several \u200b, which would otherwise become an empty token.
prob_chars = prob_chars[prob_chars['token'] != '']

In [None]:
# Problem characters that are not in the wordlist: add them (as fixed version).
# Series.isin selects the same rows as a set difference against the wordlist tokens,
# without needing numpy in this cell.
missing = ~prob_chars['token'].isin(final_candidate['token'])
new_lines = prob_chars.loc[missing, ['token', 'count']]
# Several broken tokens can clean to the same text, so sum the counts per token. The
# position in the count-sorted list becomes the token id (the 'index' column).
final = (pd.concat([final_candidate, new_lines])
         .groupby('token', as_index=False).sum()
         .sort_values('count', ascending=False)
         .reset_index(drop=True)
         .reset_index())

In [None]:
# The ids for the fixed characters. When we encounter the broken tokens in the dataset,
# we'll encode them with the id ('index' column of `final`) of the fixed token
fixes_lookup = prob_chars[['token', 'broken']]
problemchar_indices = final.merge(fixes_lookup, on='token')[['index', 'broken']]
problemchar_indices.sample(3)

Unnamed: 0,index,broken
123379,3204727,​절감​하고
37404,1636808,​투
72499,2700091,​一​‥一


## Save final list

Also, save /jpn and /chi fixes

In [None]:
# OVERWRITE MODE: the store is opened with mode='w'
with pd.HDFStore('/notebooks/data2/final/wordlist.h5', mode='w', complevel=9, complib='blosc') as store:
    for key, frame in (('/final', final), ('/fixes', problemchar_indices)):
        store.append(key, frame)

# Test against dictionary

In [None]:
from htrc_features import FeatureReader, utils

# Two copies of the same dictionary, Laird and Lee's Webster's. Headwords are printed in
# capitals, so look for all-caps tokens that turn up more than once.
dicts = ['loc.ark:/13960/t84j1sb5j', 'loc.ark:/13960/t3xs70k06']
paths = []
for volid in dicts:
    paths.append('/notebooks/features/' + utils.id_to_rsync(volid))
fr = FeatureReader(paths)

tokenlist = []
for vol in fr.volumes():
    tokenlist.extend(vol.tokens())
tokens = pd.Series(tokenlist)

# Capital letters only, hyphens allowed after the first letter
is_caps = tokens.str.contains(r"^[A-Z][A-Z\-]*$")
dictionary_words = tokens[is_caps].value_counts()
repeated = dictionary_words[dictionary_words > 1]
shortlist = repeated.index.str.lower().values

In [None]:
# Distinct lowercased tokens of the final wordlist, compared against the lowercased
# dictionary shortlist in the next cell
unique_final_lower = final['token'].str.lower().unique()

In [None]:
# Dictionary words (lowercased) that have no case-insensitive match in the final wordlist.
# np.setdiff1d returns the sorted unique values of the first array not found in the second.
extradictwords = np.setdiff1d(shortlist, unique_final_lower)

In [None]:
# Random sample of 50 dictionary words missing from the wordlist (unseeded)
pd.Series(extradictwords).sample(50)

4488           ostracizing
6507              vaticide
5673             spikefish
5296        salubriousness
5484            shellpboof
3131            faith-cure
5981         tassel-flower
4336               nautili
3820    irreconcilableness
359             appeasable
2326          dermatophone
3607               hushaby
318             antitheism
3621             hygienics
441               ascidium
5796         stormy-petrel
1965          crossed-wire
4636            peccancies
2855               dunfish
4539               ozonous
5959              tanistry
2538        discontinuable
3821         irreligiously
1395           cheese-cake
5985              tatizing
6146       through-lighted
6545            vesiculous
3922               keratol
5495         ship-chandler
6330               tu-whit
905               blazoner
1507            clangoring
178            alleviatory
4997               quizzer
3372          germiculture
240              analepsis
6317           tropic-bird
6

In [None]:
# (dictionary words checked, words missing from the wordlist, share of dictionary words found)
n_dict, n_missing = shortlist.shape[0], extradictwords.shape[0]
n_dict, n_missing, 1 - n_missing / n_dict

(44169, 6814, 0.8457289048880436)

## Junk filter testing, here be dragons

Keeping this here as an example of how I tested various matching criteria.

In [None]:
# Read the '/eng' table through dask with stop=2000000 and evaluate the lazy frame with
# compute(). NOTE(review): expected to yield the top 2m English tokens — confirm the row count.
eng_2m = dd.read_hdf('/notebooks/data2/final/final-sorted.h5', '/eng', stop=2000000).compute()

In [None]:
# Candidate rule masks, one entry per token of `eng_2m`
tokens = eng_2m.index
alpha = tokens.str.isalpha()
digit = tokens.str.isdigit()
tlen = tokens.str.len()
endwithperiod = tokens.str.endswith('.')
quotes = (tokens.str.startswith('"') | tokens.str.endswith('"')) # | tokens.str.startswith('\'') | tokens.str.endswith('\''))
endash = (tokens.str.startswith('—') | tokens.str.endswith('—'))
# Number of non-word characters in the token. Raw string: '\W' in a normal string literal
# is an invalid escape sequence that Python warns about.
punccount = tokens.str.count(r'[\W]')
# Any character / any digit repeated four or more times in a row
repeating = tokens.str.contains(r"(([\w\W])\2{3,})")
repeatingdigit = tokens.str.contains(r"((\d)\2{3,})")
# A non-digit word character followed by one or more non-digit word characters or periods
abbr = tokens.str.contains(r"^[^\W\d](?:[^\W\d]|\.)+$")
singlequote = tokens.str.contains(r"\'")
hyphenated = tokens.str.contains(r"-")
# Optional currency sign, digits with . or , separators, optional unit/ordinal suffix.
# The currency signs sit in a character class: inside `(£|$|€)` a bare `$` is the
# end-of-string anchor, so dollar amounts such as "$5" were never recognised.
number = tokens.str.contains(r"^[£$€]?[\d.,]+(?:st|nd|rd|th|s|C|F|c|m|°|¥)?$")



In [None]:
# First 10 tokens that are not purely alphabetic, are longer than one character, do not
# end with a period and contain at least one non-word character
eng_2m[~alpha & (tlen > 1) & ~endwithperiod & (punccount >= 1)].head(10)

Unnamed: 0_level_0,count
token,Unnamed: 1_level_1
's,2447435000.0
n't,265175800.0
'd,138601300.0
--,81639210.0
&c,68468410.0
.e,61694320.0
'll,60031690.0
'',58697510.0
)(,57121120.0
.1,53534810.0


In [None]:
# Tab-separated dump of the first 300 tokens that none of the rules match
unmatched_mask = (~hyphenated & ~alpha & (tlen >= 2) & ~endwithperiod
                  & ~singlequote & ~number & ~abbr)
a = eng_2m[unmatched_mask]
print("\t".join(a.index.values[:300]))

ValueError: operands could not be broadcast together with shapes (4047440,) (2000000,) 

In [None]:
# Tokens matching the abbreviation pattern that are not purely alphabetic, do not end
# with a period and are longer than two characters
eng_2m[~alpha & abbr & ~endwithperiod & (tlen > 2)]

Unnamed: 0_level_0,count
token,Unnamed: 1_level_1
N.Y,19090114.0
e.g,9622439.0
N.J,5433056.0
___,4738726.0
a.d,4416815.0
N.C,4102392.0
S.Ct,3522386.0
m.p,2893462.0
n.d,2620175.0
F.Supp,2236074.0


In [None]:
# Re-evaluates the abbreviation mask on the current `tokens`; the pattern is the same one
# used for `abbr` earlier in this section (a non-digit word character followed by one or
# more non-digit word characters or periods)
abbr = tokens.str.contains(r"^[^\W\d]([^\W\d]|\.)+$")

  if __name__ == '__main__':


In [None]:
# Number of tokens that none of the rules match, divided by 2,000,000
unmatched_mask = (~hyphenated & ~abbr & ~alpha & (tlen >= 2) & ~endwithperiod
                  & ~singlequote & ~number)
eng_2m[unmatched_mask].shape[0] / 2000000

0.176438

In [None]:
# Digit-only tokens that are longer than 10 characters
eng_2m[digit & (tlen > 10)]

Unnamed: 0_level_0,count
token,Unnamed: 1_level_1
000000000000,266011.0
11111111111,156061.0
00000000000,146507.0
111111111111,132444.0
00000000000000,101206.0
0000000000000,96323.0
1111111111111,90377.0
000000000000000,67591.0
11111111111111,60583.0
0000000000000000,57728.0


In [None]:
# Long words: for every token length, the alphabetic token with the highest count
# (the longest length is left out of the display)
pd.set_option('display.float_format', lambda value: '%.0f' % value)
longwords = eng_2m[alpha].reset_index().copy()
longwords['chars'] = longwords['token'].str.len()
top_per_length = longwords.groupby('chars').apply(lambda grp: grp.sort_values('count').iloc[-1])
top_per_length[['token', 'count']][:-1]

Unnamed: 0_level_0,token,count
chars,Unnamed: 1_level_1,Unnamed: 2_level_1
1,a,13254850187
2,of,34517225417
3,the,49027500884
4,that,7243618116
5,which,3549593200
6,should,864776997
7,between,612977820
8,American,304614658
9,following,335142138
10,University,241811028


In [None]:
# English tokens that are not purely alphabetic
is_alpha_token = eng_2m.index.str.isalpha()
eng_nonalpha = eng_2m[~is_alpha_token]
eng_nonalpha.shape

(912474, 1)

In [None]:
# Of the non-alphabetic tokens, those that are not purely digits either
is_digit_token = eng_nonalpha.index.str.isdigit()
eng_nonalphanumeric = eng_nonalpha[~is_digit_token]
eng_nonalphanumeric.shape

(51845610, 1)