In [1]:
using CorpusLoaders, DataDeps, WordTokenizers
using BenchmarkTools

In [2]:
# Gather every sentence from the IMDB corpus files into one flat vector.
files = IMDB()

sents = String[]
for path in files.filepaths
    # Read the whole file, split it into sentences, and append them all.
    append!(sents, split_sentences(read(path, String)))
end

length(sents)

126678

In [3]:
################################
#                              #
#      WordTokenizers.jl       #
#                              #
################################
using WordTokenizers

In [4]:
########################
#  NLTK Word Tokenize  #
########################

# Time three consecutive passes over the corpus; the first pass also pays any
# one-off compilation cost, so the later passes are the representative ones.
for _ in 1:3
    @time nltk_word_tokenize.(sents)
end
println()

  6.566557 seconds (14.83 M allocations: 783.295 MiB, 12.03% gc time)
  6.086671 seconds (14.02 M allocations: 743.694 MiB, 13.42% gc time)
  6.116448 seconds (14.02 M allocations: 743.694 MiB, 13.40% gc time)



In [5]:
# Interpolate the global `sents` with `$` so BenchmarkTools measures the
# tokenizer itself rather than the cost of accessing an untyped global.
@benchmark nltk_word_tokenize.($sents)

BenchmarkTools.Trial: 
  memory estimate:  743.69 MiB
  allocs estimate:  14019441
  --------------
  minimum time:     6.223 s (12.20% GC)
  median time:      6.223 s (12.20% GC)
  mean time:        6.223 s (12.20% GC)
  maximum time:     6.223 s (12.20% GC)
  --------------
  samples:          1
  evals/sample:     1

In [6]:
########################
#    Tweet Tokenizer   #
########################

# Time three consecutive passes over the corpus; the first pass also pays any
# one-off compilation cost, so the later passes are the representative ones.
for _ in 1:3
    @time tweet_tokenize.(sents)
end
println()

  6.638738 seconds (45.11 M allocations: 2.907 GiB, 23.30% gc time)
  5.374411 seconds (43.04 M allocations: 2.808 GiB, 30.34% gc time)
  5.483399 seconds (43.04 M allocations: 2.808 GiB, 31.05% gc time)



In [7]:

# Interpolate the global `sents` with `$` so BenchmarkTools measures the
# tokenizer itself rather than the cost of accessing an untyped global.
@benchmark tweet_tokenize.($sents)

BenchmarkTools.Trial: 
  memory estimate:  2.81 GiB
  allocs estimate:  43040936
  --------------
  minimum time:     5.286 s (29.07% GC)
  median time:      5.286 s (29.07% GC)
  mean time:        5.286 s (29.07% GC)
  maximum time:     5.286 s (29.07% GC)
  --------------
  samples:          1
  evals/sample:     1

In [8]:
########################
#   TokTok tokenizer   #
########################

# Time three consecutive passes over the corpus; the first pass also pays any
# one-off compilation cost, so the later passes are the representative ones.
for _ in 1:3
    @time tokenize.(sents)
end
println()

 23.802682 seconds (64.58 M allocations: 4.216 GiB, 10.52% gc time)
 23.483375 seconds (63.77 M allocations: 4.177 GiB, 10.04% gc time)
 24.013465 seconds (63.77 M allocations: 4.177 GiB, 10.73% gc time)



In [9]:
# Interpolate the global `sents` with `$` so BenchmarkTools measures the
# tokenizer itself rather than the cost of accessing an untyped global.
@benchmark tokenize.($sents)

BenchmarkTools.Trial: 
  memory estimate:  4.18 GiB
  allocs estimate:  63768415
  --------------
  minimum time:     24.109 s (9.67% GC)
  median time:      24.109 s (9.67% GC)
  mean time:        24.109 s (9.67% GC)
  maximum time:     24.109 s (9.67% GC)
  --------------
  samples:          1
  evals/sample:     1

In [10]:
########################
# Reversible Tokenizer #
########################

# Time three consecutive passes over the corpus; the first pass also pays any
# one-off compilation cost, so the later passes are the representative ones.
for _ in 1:3
    @time rev_tokenize.(sents)
end
println()

  3.084606 seconds (13.39 M allocations: 503.096 MiB, 28.98% gc time)
  2.374276 seconds (12.96 M allocations: 482.001 MiB, 17.61% gc time)
  1.928716 seconds (12.96 M allocations: 482.001 MiB)



In [11]:
# Interpolate the global `sents` with `$` so BenchmarkTools measures the
# tokenizer itself rather than the cost of accessing an untyped global.
@benchmark rev_tokenize.($sents)

BenchmarkTools.Trial: 
  memory estimate:  482.00 MiB
  allocs estimate:  12955494
  --------------
  minimum time:     1.926 s (0.00% GC)
  median time:      2.380 s (20.46% GC)
  mean time:        2.260 s (15.21% GC)
  maximum time:     2.474 s (22.01% GC)
  --------------
  samples:          3
  evals/sample:     1

In [12]:
################################
#                              #
#            SpaCy             #
#                              #
################################

# Build a standalone spaCy English tokenizer through PyCall so that it can be
# called from Julia on the same sentences as the Julia tokenizers.
using PyCall
# Import the Python module `spacy.lang.en` and instantiate its `English` class.
en = pyimport("spacy.lang.en")
nlp = en.English()
# NOTE(review): `Defaults.create_tokenizer` looks like the spaCy v2 API; newer
# releases appear to expose the tokenizer as `nlp.tokenizer` — confirm against
# the installed spaCy version.
spacy_tokenizer = nlp.Defaults.create_tokenizer(nlp)

PyObject <spacy.tokenizer.Tokenizer object at 0x7f5d954942a0>

In [13]:
# Time three consecutive passes of the spaCy tokenizer over the corpus; the
# first pass also pays any one-off warm-up cost.
for _ in 1:3
    @time spacy_tokenizer.(sents)
end
println()

 27.217917 seconds (1.08 M allocations: 35.951 MiB, 0.06% gc time)
 23.444356 seconds (886.25 k allocations: 26.087 MiB, 0.13% gc time)
 22.038129 seconds (886.25 k allocations: 26.087 MiB, 0.12% gc time)



In [14]:
# Interpolate the global `sents` with `$` so BenchmarkTools measures the
# tokenizer call itself rather than the cost of accessing an untyped global.
@benchmark spacy_tokenizer.($sents)

BenchmarkTools.Trial: 
  memory estimate:  26.09 MiB
  allocs estimate:  886243
  --------------
  minimum time:     21.979 s (0.08% GC)
  median time:      21.979 s (0.08% GC)
  mean time:        21.979 s (0.08% GC)
  maximum time:     21.979 s (0.08% GC)
  --------------
  samples:          1
  evals/sample:     1

In [15]:
################################
#                              #
#             NLTK             #
#                              #
################################

# Bind Python NLTK's `word_tokenize` (from the `nltk.tokenize` module) through
# PyCall so that it can be called from Julia.
using PyCall
nltk_tok = pyimport("nltk.tokenize")
nltk_tokenizer = nltk_tok.word_tokenize

PyObject <function word_tokenize at 0x7f5d93a6e1e0>

In [16]:
# Time three consecutive passes of Python NLTK's tokenizer over the corpus;
# the first pass also pays any one-off warm-up cost.
for _ in 1:3
    @time nltk_tokenizer.(sents)
end
println()

 32.385245 seconds (17.41 M allocations: 485.114 MiB, 2.10% gc time)
 32.642742 seconds (16.76 M allocations: 451.949 MiB, 2.34% gc time)
 32.291003 seconds (16.76 M allocations: 451.949 MiB)



In [17]:
# Interpolate the global `sents` with `$` so BenchmarkTools measures the
# tokenizer call itself rather than the cost of accessing an untyped global.
@benchmark nltk_tokenizer.($sents)

BenchmarkTools.Trial: 
  memory estimate:  451.95 MiB
  allocs estimate:  16761585
  --------------
  minimum time:     34.053 s (1.35% GC)
  median time:      34.053 s (1.35% GC)
  mean time:        34.053 s (1.35% GC)
  maximum time:     34.053 s (1.35% GC)
  --------------
  samples:          1
  evals/sample:     1