# Tokenization

In [1]:
from nltk.tokenize import sent_tokenize



In [2]:
import nltk
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [9]:
text = "Hello everyone. Welcome to the competetion. My favourite students are the ones who win!!!"


In [4]:
sent_tokenize(text)

['Hello everyone.',
 'Welcome to Ethans.',
 'My favourite students are the ones who ask questions!!',
 '!']

# Stemming

In [10]:

from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
ps = PorterStemmer()

# Pass set of words
set_of_words = ['run','running','ran','ruining','randomized','afforded']

# choose some words to be stemmed
for s in set_of_words:
  print(s, ':', ps.stem(s))

run : run
running : run
ran : ran
ruining : ruin
randomized : random
afforded : afford


In [8]:
# importing modules
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize

ps = PorterStemmer()
text_1 = "Hey there, I'm working on stabilizing the drone to have fluent passage above ground level in vision of delivering with quality."
words = word_tokenize(text_1)

for w in words:
    print(w, " : ", ps.stem(w))

Hey  :  hey
there  :  there
,  :  ,
I  :  i
'm  :  'm
working  :  work
on  :  on
stabilizing  :  stabil
the  :  the
drone  :  drone
to  :  to
have  :  have
fluent  :  fluent
passage  :  passag
above  :  abov
ground  :  ground
level  :  level
in  :  in
vision  :  vision
of  :  of
delivering  :  deliv
with  :  with
quality  :  qualiti
.  :  .


# Lemmatization

In [17]:
from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()

print("rocks :", lemmatizer.lemmatize("rocks"))
print("corpora :", lemmatizer.lemmatize("corpora"))
print("capitals :", lemmatizer.lemmatize("capitals"))

# a denotes adjective in "pos"
print("better :", lemmatizer.lemmatize("better", pos ="a"))

rocks : rock
corpora : corpus
capitals : capital
better : good


# StopWords

In [18]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

example_sentense = """This is a sample sentence,
                  showing off the stop words filtration."""

stop_words = set(stopwords.words('english'))

word_tokens = word_tokenize(example_sentense)
# Converts the words in word_tokens to lower case and then checks whether
# They are present in stop_words or not
filtered_sentence = [w for w in word_tokens if not w.lower() in stop_words]
# With no lower case conversion
filtered_sentence = []

for w in word_tokens:
    if w not in stop_words:
        filtered_sentence.append(w)

print(word_tokens)
print(filtered_sentence)

['This', 'is', 'a', 'sample', 'sentence', ',', 'showing', 'off', 'the', 'stop', 'words', 'filtration', '.']
['This', 'sample', 'sentence', ',', 'showing', 'stop', 'words', 'filtration', '.']


# TF IDF


# Bag of Words

In [19]:
from sklearn.feature_extraction.text import CountVectorizer
# Multiple documents
text = ["It was the best of times", "it was the worst of times", "it was the age of wisdom", "it was the age of foolishness"]
# create the transform
vectorizer = CountVectorizer()
# tokenize and build vocab
vectorizer.fit(text)
# summarize
print(sorted(vectorizer.vocabulary_))

['age', 'best', 'foolishness', 'it', 'of', 'the', 'times', 'was', 'wisdom', 'worst']


In [20]:
# encode document
vector = vectorizer.transform(text)
# summarize encoded vector
print(vector.shape)
print(vector.toarray())

(4, 10)
[[0 1 0 1 1 1 1 1 0 0]
 [0 0 0 1 1 1 1 1 0 1]
 [1 0 0 1 1 1 0 1 1 0]
 [1 0 1 1 1 1 0 1 0 0]]


In [21]:
# encode another document
text2 = ["the the the times"]
vector = vectorizer.transform(text2)
print(vector.toarray())

[[0 0 0 0 0 3 1 0 0 0]]


In [None]:
# Term Frequency: This summarizes how often a given word appears within a document.
# Inverse Document Frequency: This downscales words that appear a lot across documents.

In [22]:
from sklearn.feature_extraction.text import TfidfVectorizer
# list of text documents
text = ["It was the best of times", "it was the worst of times", "it was the age of wisdom", "it was the age of foolishness"]
# create the transform
vectorizer = TfidfVectorizer()
# tokenize and build vocab
vectorizer.fit(text)
# summarize
print(sorted(vectorizer.vocabulary_))
# encode document
vector = vectorizer.transform([text[0]])

['age', 'best', 'foolishness', 'it', 'of', 'the', 'times', 'was', 'wisdom', 'worst']


In [23]:
print(vectorizer.idf_)

[1.51082562 1.91629073 1.91629073 1.         1.         1.
 1.51082562 1.         1.91629073 1.91629073]


# N_GRAMS

In [24]:
from nltk import ngrams
sentence = 'I reside in Bengaluru.'
n = 2
unigrams = ngrams(sentence.split(), n)
for grams in unigrams:
  print(grams)

('I', 'reside')
('reside', 'in')
('in', 'Bengaluru.')
