In [1]:
using CorpusLoaders, DataDeps, WordTokenizers

In [2]:
files = IMDB()

sents = Array{String,1}()
for file in files.filepaths
    open(file) do fileio
        for sent in split_sentences(read(fileio, String))
            push!(sents, sent)
        end
    end
end

length(sents)

126678

In [3]:
################################
#                              #
#      WordTokenizers.jl       #
#                              #
################################

In [4]:
using WordTokenizers
@time nltk_word_tokenize.(sents)

  6.488322 seconds (14.83 M allocations: 783.291 MiB, 11.65% gc time)


126678-element Array{Array{String,1},1}:
 ["Bromwell", "High", "is", "a", "cartoon", "comedy", "."]                                                                                                                             
 ["It", "ran", "at", "the", "same", "time", "as", "some", "other", "programs", "about", "school", "life", ",", "such", "as", "``", "Teachers", "''", "."]                              
 ["My", "35", "years", "in", "the", "teaching", "profession", "lead", "me", "to"  …  "much", "closer", "to", "reality", "than", "is", "``", "Teachers", "''", "."]                     
 ["The", "scramble", "to", "survive", "financially", ",", "the", "insightful", "students", "who"  …  "me", "of", "the", "schools", "I", "knew", "and", "their", "students", "."]       
 ["When", "I", "saw", "the", "episode", "in", "which", "a", "student", "repeatedly"  …  "immediately", "recalled", "...", "...", "...", "at", "...", "...", "...", "."]                
 ["High", "."]                         

In [5]:
################################
#                              #
#            SpaCy             #
#                              #
################################

using PyCall
en = pyimport("spacy.lang.en")
nlp = en.English()
spacy_tokenizer = nlp.Defaults.create_tokenizer(nlp)
@time spacy_tokenizer.(sents)

 24.139985 seconds (1.08 M allocations: 36.156 MiB, 0.14% gc time)


126678-element Array{PyObject,1}:
 PyObject Bromwell High is a cartoon comedy.                                                                                                                                                                                                                                                                                                                                                                                                                                                                                       
 PyObject It ran at the same time as some other programs about school life, such as "Teachers".                                                                                                                                                                                                                                                                                                                                                                   

In [6]:
################################
#                              #
#             NLTK             #
#                              #
################################

using PyCall
nltk_tok = pyimport("nltk.tokenize")
nltk_tokenizer = nltk_tok.word_tokenize
@time nltk_tokenizer.(sents)

 32.180763 seconds (17.63 M allocations: 495.891 MiB, 4.39% gc time)


126678-element Array{Array{String,1},1}:
 ["Bromwell", "High", "is", "a", "cartoon", "comedy", "."]                                                                                                                             
 ["It", "ran", "at", "the", "same", "time", "as", "some", "other", "programs", "about", "school", "life", ",", "such", "as", "``", "Teachers", "''", "."]                              
 ["My", "35", "years", "in", "the", "teaching", "profession", "lead", "me", "to"  …  "much", "closer", "to", "reality", "than", "is", "``", "Teachers", "''", "."]                     
 ["The", "scramble", "to", "survive", "financially", ",", "the", "insightful", "students", "who"  …  "me", "of", "the", "schools", "I", "knew", "and", "their", "students", "."]       
 ["When", "I", "saw", "the", "episode", "in", "which", "a", "student", "repeatedly"  …  "immediately", "recalled", "...", "...", "...", "at", "...", "...", "...", "."]                
 ["High", "."]                         

In [7]:
##
# Multi - lingual tok-tok tokenizer of WordTokenizers.jl
##

@time tokenize.(sents)

 24.634809 seconds (64.59 M allocations: 4.216 GiB, 14.98% gc time)


126678-element Array{Array{String,1},1}:
 ["Bromwell", "High", "is", "a", "cartoon", "comedy", "."]                                                                                                                             
 ["It", "ran", "at", "the", "same", "time", "as", "some", "other", "programs", "about", "school", "life", ",", "such", "as", "\"", "Teachers", "\"", "."]                              
 ["My", "35", "years", "in", "the", "teaching", "profession", "lead", "me", "to"  …  "much", "closer", "to", "reality", "than", "is", "\"", "Teachers", "\"", "."]                     
 ["The", "scramble", "to", "survive", "financially", ",", "the", "insightful", "students", "who"  …  "me", "of", "the", "schools", "I", "knew", "and", "their", "students", "."]       
 ["When", "I", "saw", "the", "episode", "in", "which", "a", "student", "repeatedly"  …  "to", "burn", "down", "the", "school", ",", "I", "immediately", "recalled", "at"]              
 ["High", "."]                         