# Tokenization

In [1]:
text= """ India is my country and all Indians are my brothers
and sisters. I love my country and I am proud of its
rich and varied heritage. I shall always strive to be
worthy of it. I shall give respect to my parents,
teachers and elders and treat everyone with
courtesy. To my country and my people, I pledge
my devotion. In their well being and prosperity
alone, lies my happiness."""

1. Tokenization using Python’s split() function

#word tokenization

In [2]:
#split at space
#text.split()

In [3]:
##split at comma
text.split(',')

[' India is my country and all Indians are my brothers\nand sisters. I love my country and I am proud of its\nrich and varied heritage. I shall always strive to be\nworthy of it. I shall give respect to my parents',
 '\nteachers and elders and treat everyone with\ncourtesy. To my country and my people',
 ' I pledge\nmy devotion. In their well being and prosperity\nalone',
 ' lies my happiness.']

#Sentence Tokenization

In [4]:
#split at full stop
text.split('.')

[' India is my country and all Indians are my brothers\nand sisters',
 ' I love my country and I am proud of its\nrich and varied heritage',
 ' I shall always strive to be\nworthy of it',
 ' I shall give respect to my parents,\nteachers and elders and treat everyone with\ncourtesy',
 ' To my country and my people, I pledge\nmy devotion',
 ' In their well being and prosperity\nalone, lies my happiness',
 '']

In [5]:
text=""" # India_  @  is  %% my country and all Indians are my brothers
and sisters. I love my country and I am proud of its
rich and varied heritage. I shall %% always strive to be
worthy of it. I shall give respect to my parents,
teachers and elders and treat everyone with
courtesy.To my country and my people, I pledge
my devotion. In their well being and prosperity
alone, lies my happiness. """

#split at '@' symbol
text.split('@')

[' # India_  ',
 '  is  %% my country and all Indians are my brothers\nand sisters. I love my country and I am proud of its\nrich and varied heritage. I shall %% always strive to be\nworthy of it. I shall give respect to my parents,\nteachers and elders and treat everyone with\ncourtesy.To my country and my people, I pledge\nmy devotion. In their well being and prosperity\nalone, lies my happiness. ']

2. Tokenization using Regular Expressions (RegEx)

#Word Tokenization

In [6]:
import re
text=""" # India_  @  is  %% my country and all Indians are my brothers
and sisters. I love my country and I am proud of its
rich and varied heritage. I shall %% always strive to be
worthy of it. I shall give respect to my parents,
teachers and elders and treat everyone with
courtesy.To my country and my people, I pledge
my devotion. In their well being and prosperity
alone, lies my happiness. """

In [7]:
#- [\%%]+ signals that the code should find all the %% characters until any other character is encountered.
tokens=re.findall("[\%%]+",text)
tokens

['%%', '%%']

In [8]:
#-[\%%]+ signals that the code should find all the %% characters until any other character is encountered.
tokens=re.findall("[\@]+",text)
tokens

['@']

In [9]:
#-[\w’]+ signals that the code should find all the alphanumeric characters until any other character is encountered
#- The “\w” represents “any word character” which usually means alphanumeric (letters, numbers) and underscore (_). ‘+’ means any number of times
tokens=re.findall("[\w]+",text)
#tokens

#Word Tokenization

In [10]:
text= """ India is my country and all Indians are my brothers
and sisters! I love my country and I am proud of its
rich and varied heritage. I shall always strive to be
worthy of it. I shall give respect to my parents?
teachers and elders and treat everyone with
courtesy?? To my country and my people, I pledge
my devotion. In their well/ being and prosperity
alone, lies my happiness.  """

#split the text into sentences by passing a pattern into it.


sentence=re.compile('[.!?/]').split(text)
sentence

[' India is my country and all Indians are my brothers\nand sisters',
 ' I love my country and I am proud of its\nrich and varied heritage',
 ' I shall always strive to be\nworthy of it',
 ' I shall give respect to my parents',
 '\nteachers and elders and treat everyone with\ncourtesy',
 '',
 ' To my country and my people, I pledge\nmy devotion',
 ' In their well',
 ' being and prosperity\nalone, lies my happiness',
 '  ']

3. Tokenization using NLTK

In [11]:
text=""" India is my country and all Indians are my brothers
and sisters. I love my country and I am proud of its
rich and varied heritage. I shall always strive to be
worthy of it. I shall give respect to my parents,
teachers and elders and treat everyone with
courtesy. To my country and my people, I pledge
my devotion. In their well being and prosperity
alone, lies my happiness. """

In [12]:
from nltk.tokenize import word_tokenize
#word_tokenize(text)

In [13]:
from nltk.tokenize import sent_tokenize
sent_tokenize(text)

[' India is my country and all Indians are my brothers\nand sisters.',
 'I love my country and I am proud of its\nrich and varied heritage.',
 'I shall always strive to be\nworthy of it.',
 'I shall give respect to my parents,\nteachers and elders and treat everyone with\ncourtesy.',
 'To my country and my people, I pledge\nmy devotion.',
 'In their well being and prosperity\nalone, lies my happiness.']

# POS Tagging

In [14]:
import nltk
sentence="""ndia is my country and all Indians are my brothers and sisters. I love my country and I am proud of its rich and varied heritage. I shall always strive to be worthy of it. I shall give respect to my parents, teachers and elders and treat everyone with courtesy """
tokens=nltk.word_tokenize(sentence)
tagged=nltk.pos_tag(tokens)
#tagged

In [15]:
from nltk.tag import tnt

In [16]:
from nltk.corpus import indian
train_data=indian.tagged_sents('hindi.pos')

In [17]:
tnt_pos_tagger=tnt.TnT()
tnt_pos_tagger.train(train_data)


In [18]:
words=" भारत हमारा देश है। हम सब भारतवासी भाई-बहन हैं। हमें अपना देश प्राणों से भी प्यारा है। इसकी समृद्धि और विविध संस्कृति पर हमें गर्व है।  हम इसके सुयोग्य अधिकारी बनने का प्रयन्त सदा करते रहेंगे। हम अपने माता-पिता, शिक्षकों और गुरुजनों का आदर करेंगे और सबके साथ शिष्ठता का व्यवहार करेंगे। हम अपने देश और देशवासियों के प्रति वफ़ादार रहने की प्रतिज्ञा करते हैं। उनके कल्याण और समृद्धि में ही हमारा सुख निहित है। जय हिन्द। "
tagged_words=(tnt_pos_tagger.tag(nltk.word_tokenize(words)))
print(tagged_words)

[('भारत', 'NNP'), ('हमारा', 'Unk'), ('देश', 'NN'), ('है।', 'Unk'), ('हम', 'PRP'), ('सब', 'INTF'), ('भारतवासी', 'Unk'), ('भाई-बहन', 'Unk'), ('हैं।', 'Unk'), ('हमें', 'PRP'), ('अपना', 'PRP'), ('देश', 'NN'), ('प्राणों', 'Unk'), ('से', 'PREP'), ('भी', 'RP'), ('प्यारा', 'Unk'), ('है।', 'Unk'), ('इसकी', 'PRP'), ('समृद्धि', 'Unk'), ('और', 'CC'), ('विविध', 'Unk'), ('संस्कृति', 'Unk'), ('पर', 'PREP'), ('हमें', 'PRP'), ('गर्व', 'Unk'), ('है।', 'Unk'), ('हम', 'PRP'), ('इसके', 'PRP'), ('सुयोग्य', 'Unk'), ('अधिकारी', 'NN'), ('बनने', 'Unk'), ('का', 'PREP'), ('प्रयन्त', 'Unk'), ('सदा', 'Unk'), ('करते', 'VJJ'), ('रहेंगे।', 'Unk'), ('हम', 'PRP'), ('अपने', 'PRP'), ('माता-पिता', 'Unk'), (',', 'PUNC'), ('शिक्षकों', 'Unk'), ('और', 'CC'), ('गुरुजनों', 'Unk'), ('का', 'PREP'), ('आदर', 'Unk'), ('करेंगे', 'VFM'), ('और', 'CC'), ('सबके', 'NN'), ('साथ', 'PREP'), ('शिष्ठता', 'Unk'), ('का', 'PREP'), ('व्यवहार', 'Unk'), ('करेंगे।', 'Unk'), ('हम', 'PRP'), ('अपने', 'PRP'), ('देश', 'NN'), ('और', 'CC'), ('देशवासियों', 'U

# Stop words removal

In [19]:
import nltk
from nltk.corpus import stopwords
print(stopwords.words('english'))

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

#Remove stopwords

In [20]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
example=""" India is my country and all Indians are my brothers and sisters. I love my country and I am proud of its rich and varied heritage. I shall always strive to be worthy of it. I shall give respect to my parents, teachers and elders and treat everyone with courtesy. To my country and my people, I pledge my devotion. In their well being and prosperity alone, lies my happiness """
stop_words=set(stopwords.words('english'))
word_tokens=word_tokenize(example)
filtered_sentence=[j for j in word_tokens if not j.lower() in stop_words]
print(filtered_sentence)

['India', 'country', 'Indians', 'brothers', 'sisters', '.', 'love', 'country', 'proud', 'rich', 'varied', 'heritage', '.', 'shall', 'always', 'strive', 'worthy', '.', 'shall', 'give', 'respect', 'parents', ',', 'teachers', 'elders', 'treat', 'everyone', 'courtesy', '.', 'country', 'people', ',', 'pledge', 'devotion', '.', 'well', 'prosperity', 'alone', ',', 'lies', 'happiness']


In [21]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
li=[]
ex=""" My name is Aishwarya .
        I am pursuing my Bachelor of Computer Engineering from AISSMS's Institute of Informaion Technology"""
stop_words=set(stopwords.words('english'))
word_tokens=word_tokenize(ex)
for i in word_tokens:
    if i not in stop_words:
        li.append(i)
        
print(li)

['My', 'name', 'Aishwarya', '.', 'I', 'pursuing', 'Bachelor', 'Computer', 'Engineering', 'AISSMS', "'s", 'Institute', 'Informaion', 'Technology']


# Stemming 

In [22]:
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
ps = PorterStemmer()
words = ["Relatively", "relate", "realize", "related", "relative"]
 
for w in words:
    print(w, " : ", ps.stem(w))

Relatively  :  rel
relate  :  relat
realize  :  realiz
related  :  relat
relative  :  rel


# Lemmatization

In [23]:
# import these modules
from nltk.stem import WordNetLemmatizer
 
lemmatizer = WordNetLemmatizer()
 
print("books :", lemmatizer.lemmatize("books"))
print("insertion :", lemmatizer.lemmatize("insertion"))
 
# a denotes adjective in "pos"
print("Bad :", lemmatizer.lemmatize("Bad", pos ="a"))

books : book
insertion : insertion
Bad : Bad
