## NLTK Word Tokenize

In [None]:
import nltk

# Fetch the 'punkt' tokenizer data package (unzipped under tokenizers/, per the cell output).
nltk.download('punkt')

from nltk.tokenize import (
    MWETokenizer,
    TreebankWordTokenizer,
    TweetTokenizer,
    sent_tokenize,
    word_tokenize,
    wordpunct_tokenize,
)

# Sample sentence consumed by the word, sentence and punctuation tokenizer cells.
# NOTE(review): "that" looks like a typo for "than"; it is kept as-is because the
# recorded outputs of those cells echo this exact string.
text = "Hope, is the only thing stronger that fear! #Hope #Amal.M"

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [None]:
# Word tokenizer: split the sample text into word and punctuation tokens.
word_tokens = word_tokenize(text)
print(word_tokens)

['Hope', ',', 'is', 'the', 'only', 'thing', 'stronger', 'that', 'fear', '!', '#', 'Hope', '#', 'Amal.M']


In [None]:
# Sentence tokenizer: split the sample text into sentences.
sentences = sent_tokenize(text)
print(sentences)

['Hope, is the only thing stronger that fear!', '#Hope #Amal.M']


In [None]:
# Punctuation-based tokenizer: punctuation is split off even inside a word
# (the output shows 'Amal.M' becoming 'Amal', '.', 'M').
punct_tokens = wordpunct_tokenize(text)
print(punct_tokens)

['Hope', ',', 'is', 'the', 'only', 'thing', 'stronger', 'that', 'fear', '!', '#', 'Hope', '#', 'Amal', '.', 'M']


In [None]:
# Treebank word tokenizer: contractions are split apart
# (the output shows "don't" becoming 'do', "n't").
text = "What you don't want to be done to yourself, don't do to others..."
tokenizer = TreebankWordTokenizer()
treebank_tokens = tokenizer.tokenize(text)
print(treebank_tokens)

['What', 'you', 'do', "n't", 'want', 'to', 'be', 'done', 'to', 'yourself', ',', 'do', "n't", 'do', 'to', 'others', '...']


In [None]:
# Tweet tokenizer: the output keeps "Don't" whole and emits each emoji as its own token.
tweet = "Don't take cryptocurrency advice from people on Twitter 😅👌"
tokenizer = TweetTokenizer()
tweet_tokens = tokenizer.tokenize(tweet)
print(tweet_tokens)

["Don't", 'take', 'cryptocurrency', 'advice', 'from', 'people', 'on', 'Twitter', '😅', '👌']


In [None]:
#MWE (multi-word expression) tokenizer
# MWETokenizer only merges the expressions it has been given; constructed with
# no expressions it returns its input unchanged. Register ('Hunger', 'Games')
# so the two tokens are re-joined with the default '_' separator.
text= "Hope, is the only thing stronger than fear! Hunger Games #Hope"
tokenizer= MWETokenizer([('Hunger', 'Games')])
# The MWE tokenizer works on an already-tokenized list, hence word_tokenize first.
print(tokenizer.tokenize(word_tokenize(text)))

['Hope', ',', 'is', 'the', 'only', 'thing', 'stronger', 'than', 'fear', '!', 'Hunger_Games', '#', 'Hope']


## TextBlob Word Tokenize

In [None]:
from textblob import TextBlob

In [None]:
# Sample passage for the TextBlob demo, assembled from adjacent string literals;
# the leading and trailing spaces are part of the original text.
text = (
    " But I'm glad you'll see me as I am. "
    "Above all, I wouldn't want people to think that I want to prove anything. "
    "I don't want to prove anything, I just want to live; "
    "to cause no evil to anyone but myself. "
    "I have that right, haven't I? Leo Tolstoy "
)

In [None]:
# Wrap the passage in a TextBlob and take its word-level tokens.
blob_object = TextBlob(text)
text_words = blob_object.words
# Show every token, then the token count on the following line.
print(text_words, len(text_words), sep="\n")

['But', 'I', "'m", 'glad', 'you', "'ll", 'see', 'me', 'as', 'I', 'am', 'Above', 'all', 'I', 'would', "n't", 'want', 'people', 'to', 'think', 'that', 'I', 'want', 'to', 'prove', 'anything', 'I', 'do', "n't", 'want', 'to', 'prove', 'anything', 'I', 'just', 'want', 'to', 'live', 'to', 'cause', 'no', 'evil', 'to', 'anyone', 'but', 'myself', 'I', 'have', 'that', 'right', 'have', "n't", 'I', 'Leo', 'Tolstoy']
55


## spaCy Tokenizer

In [None]:
!pip install spacy
!python3 -m spacy download en_core_web_sm

[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('en_core_web_sm')


In [None]:
from spacy.lang.en import English

# Blank English pipeline built directly from the language class.
nlp = English()
text = "All happy families are alike; each unhappy family is unhappy in its own way!!! 👌👌 #Leo Tolstoy"
doc = nlp(text)

# Print each token next to its character offset in `text`.
for token in doc:
    print(f"{token.text} {token.idx}")

All 0
happy 4
families 10
are 19
alike 23
; 28
each 30
unhappy 35
family 43
is 50
unhappy 53
in 61
its 64
own 68
way 72
! 75
! 76
! 77
👌 79
👌 80
# 82
Leo 83
Tolstoy 87


## Gensim Word Tokenizer

In [None]:
from gensim.utils import tokenize

# `tokenize` is consumed lazily here; materialize it so the cell displays the tokens.
# The recorded output contains alphabetic tokens only (no punctuation, '#', or emoji).
[word for word in tokenize(text)]

['All',
 'happy',
 'families',
 'are',
 'alike',
 'each',
 'unhappy',
 'family',
 'is',
 'unhappy',
 'in',
 'its',
 'own',
 'way',
 'Leo',
 'Tolstoy']

## Tokenization with Keras

In [None]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.text import text_to_word_sequence

In [None]:
# Keras tokenizer with num_words=20.
# NOTE(review): per the Keras docs this caps the vocabulary by word frequency - confirm.
ntoken = Tokenizer(
    num_words=20,
)

In [None]:
# fit_on_texts expects a list of documents. A bare string would be iterated
# character by character, building a vocabulary of letters instead of words,
# so wrap the single document in a list.
ntoken.fit_on_texts([text])
# Split the text into a word list; the recorded output is lower-cased with
# punctuation and '#' removed.
list_words = text_to_word_sequence(text)
print(list_words)

['all', 'happy', 'families', 'are', 'alike', 'each', 'unhappy', 'family', 'is', 'unhappy', 'in', 'its', 'own', 'way', '👌👌', 'leo', 'tolstoy']
