# Tokenization

In [1]:
# 1. prefix - character(s) at the begining = $(".)
# 2. suffix - character(s) at the end = km).!"
# 3. Infix character(s) in between = --/... -
# 4. Exception Special-case rule to split a string into several tokens or
#    prevent a token from being split when punctuation rules are applied = Let's

In [2]:
# Approach the tokenization using the split function
# 1. word tokenization
sent1 = "I am going to delhi"
sent1.split()

['I', 'am', 'going', 'to', 'delhi']

In [4]:
# sentence tokenization
sent2 = "I am going to delhi. I will stay there for 3 days. Let\'s hope the trip to be great"
sent2.split('.')

['I am going to delhi',
 ' I will stay there for 3 days',
 " Let's hope the trip to be great"]

In [5]:
# Problems with split function
sent3 = "I am going to delhi!"
sent3.split()

['I', 'am', 'going', 'to', 'delhi!']

In [6]:
# in the sent3 problem, there is seeing that! is include with delhi. SO, we so, we have to remove it by r

In [7]:
import re

sent3 = "I am going to delhi"
tokens = re.findall("[\w]+", sent3)
sent3

  tokens = re.findall("[\w]+", sent3)


'I am going to delhi'

In [8]:
# 1. NLTK

In [9]:
from nltk.tokenize import word_tokenize, sent_tokenize

In [10]:
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [11]:
import nltk
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [12]:
sent1 = "I am going to visit delhi"
word_tokenize(sent1)

['I', 'am', 'going', 'to', 'visit', 'delhi']

In [13]:
sent2 = 'I have a Ph.D. in A.I.'
sent3 = "We 're here to help! mail us at nks@gmail.com "
sent4 = 'A 5km ride cost $10.50'

In [14]:
word_tokenize(sent2)

['I', 'have', 'a', 'Ph.D.', 'in', 'A.I', '.']

In [15]:
#  stemming

In [19]:
# 1. Inflection ==> "In grammar inflection is the modification of a word
#                   to express different grametical categories such as tense, case voice
#                   aspect person, number gender and mood."
# 2. Stemming ==> Stemming is the process of reducing inflection in words to
#                 their root forms such as mapping a group of words to the same stem even if the
#                 stem itslef is not valid word in the Language

In [20]:
from nltk.stem.porter import PorterStemmer

In [21]:
ps = PorterStemmer()

def stem_words (text):
  return " ".join([ps.stem(word) for word in text.split()])

In [22]:
sample = "walk walks walking walked"
stem_words(sample)

'walk walk walk walk'

In [25]:
# stemming mapping a group pf words to the same stem even if the stem itself
# is not valid word in Language. So this is the big issue of stemming .
text = 'probably my alltime favourite movie a story of selflessness sacrifice and dedication.'
print(text)

probably my alltime favourite movie a story of selflessness sacrifice and dedication.


In [26]:
stem_words(text)

'probabl my alltim favourit movi a stori of selfless sacrific and dedication.'

In [27]:
# So , we have to use lemmatization instead of stemming.

# note - stemming is fast and Lemmatization is slow

In [28]:
# Lemmatization unlike Stemming, reduces the inflected words properly
# ensuring that
# the root word belongs to the Language.
# In Lemitization root word is called Lemma.
# Alemma (plural Lemmas or Lemmata) is the canonical form, dictionary form, or
# citation form of a set of words

# Stemming works on beehalf algorithm so it is fast and Lemmatization is using
# wordnet package so it's slow.

In [29]:
from nltk.stem import WordNetLemmatizer
import nltk
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [30]:
wordnet_lemmatizer = WordNetLemmatizer()

In [31]:
sentence = 'He was running and eating at the same time He has bad habbit of swimming after playing long hours in the sun'
punctuation = '?:!.,;'
sentence_words = nltk.word_tokenize(sentence)
for word in sentence_words:
  sentence_words.remove(word)

sentence_words
print("{0:20}{1:20}".format("Word", "Lemma"))
for word in sentence_words:
  print("{0:20}{1:20}".format(word, wordnet_lemmatizer.lemmatize(word)))

Word                Lemma               
was                 wa                  
and                 and                 
at                  at                  
same                same                
He                  He                  
bad                 bad                 
of                  of                  
after               after               
long                long                
in                  in                  
sun                 sun                 
