In [1]:
import numpy as np
import pandas as pd
import nltk


In [2]:
# nltk.download('all')

# Different Tokenizers

In [3]:
sentence = "If you want to know what a man’s like, take a good looks at how he treats his inferiors, not his equals. It is our choices, Harry, that show what we truly are, far more than our abilities!"

## 1. WhiteSpace Tokenization

In [4]:
from nltk.tokenize import WhitespaceTokenizer
whitespace_tokenized = WhitespaceTokenizer().tokenize(sentence)
print(whitespace_tokenized)


['If', 'you', 'want', 'to', 'know', 'what', 'a', 'man’s', 'like,', 'take', 'a', 'good', 'looks', 'at', 'how', 'he', 'treats', 'his', 'inferiors,', 'not', 'his', 'equals.', 'It', 'is', 'our', 'choices,', 'Harry,', 'that', 'show', 'what', 'we', 'truly', 'are,', 'far', 'more', 'than', 'our', 'abilities!']


## 2. TreeBankWord Tokenization

In [5]:
from nltk.tokenize import TreebankWordTokenizer
treebank_tokenized = TreebankWordTokenizer().tokenize(sentence)
print(treebank_tokenized)

['If', 'you', 'want', 'to', 'know', 'what', 'a', 'man’s', 'like', ',', 'take', 'a', 'good', 'looks', 'at', 'how', 'he', 'treats', 'his', 'inferiors', ',', 'not', 'his', 'equals.', 'It', 'is', 'our', 'choices', ',', 'Harry', ',', 'that', 'show', 'what', 'we', 'truly', 'are', ',', 'far', 'more', 'than', 'our', 'abilities', '!']


## 3. MWE Tokenization

In [6]:
from nltk.tokenize import MWETokenizer
mwe_tokenizer = MWETokenizer([('he'),('take','a','good')])
mwe_tokenizer.add_mwe(('Harry'))
mwe_tokenized = mwe_tokenizer.tokenize(sentence)
print(mwe_tokenized)

['I', 'f', ' ', 'y', 'o', 'u', ' ', 'w', 'a', 'n', 't', ' ', 't', 'o', ' ', 'k', 'n', 'o', 'w', ' ', 'w', 'h', 'a', 't', ' ', 'a', ' ', 'm', 'a', 'n', '’', 's', ' ', 'l', 'i', 'k', 'e', ',', ' ', 't', 'a', 'k', 'e', ' ', 'a', ' ', 'g', 'o', 'o', 'd', ' ', 'l', 'o', 'o', 'k', 's', ' ', 'a', 't', ' ', 'h', 'o', 'w', ' ', 'h_e', ' ', 't', 'r', 'e', 'a', 't', 's', ' ', 'h', 'i', 's', ' ', 'i', 'n', 'f', 'e', 'r', 'i', 'o', 'r', 's', ',', ' ', 'n', 'o', 't', ' ', 'h', 'i', 's', ' ', 'e', 'q', 'u', 'a', 'l', 's', '.', ' ', 'I', 't', ' ', 'i', 's', ' ', 'o', 'u', 'r', ' ', 'c', 'h', 'o', 'i', 'c', 'e', 's', ',', ' ', 'H_a_r_r_y', ',', ' ', 't', 'h', 'a', 't', ' ', 's', 'h', 'o', 'w', ' ', 'w', 'h', 'a', 't', ' ', 'w', 'e', ' ', 't', 'r', 'u', 'l', 'y', ' ', 'a', 'r', 'e', ',', ' ', 'f', 'a', 'r', ' ', 'm', 'o', 'r', 'e', ' ', 't', 'h', 'a', 'n', ' ', 'o', 'u', 'r', ' ', 'a', 'b', 'i', 'l', 'i', 't', 'i', 'e', 's', '!']


## 4. Tweet Tokenization

In [7]:
from nltk.tokenize import TweetTokenizer
tweet_tokenized = TweetTokenizer().tokenize(sentence)
print(tweet_tokenized)

['If', 'you', 'want', 'to', 'know', 'what', 'a', 'man', '’', 's', 'like', ',', 'take', 'a', 'good', 'looks', 'at', 'how', 'he', 'treats', 'his', 'inferiors', ',', 'not', 'his', 'equals', '.', 'It', 'is', 'our', 'choices', ',', 'Harry', ',', 'that', 'show', 'what', 'we', 'truly', 'are', ',', 'far', 'more', 'than', 'our', 'abilities', '!']


## 5. Punctuation Based Word Tokenization

In [8]:
from nltk.tokenize import wordpunct_tokenize
print(wordpunct_tokenize(sentence))

['If', 'you', 'want', 'to', 'know', 'what', 'a', 'man', '’', 's', 'like', ',', 'take', 'a', 'good', 'looks', 'at', 'how', 'he', 'treats', 'his', 'inferiors', ',', 'not', 'his', 'equals', '.', 'It', 'is', 'our', 'choices', ',', 'Harry', ',', 'that', 'show', 'what', 'we', 'truly', 'are', ',', 'far', 'more', 'than', 'our', 'abilities', '!']


## 6. Punctuation Based Sentence Tokenization

In [9]:
sent_detector = nltk.data.load('tokenizers/punkt/english.pickle')
sent_tokenized = sent_detector.tokenize(sentence)
print(sent_tokenized)

['If you want to know what a man’s like, take a good looks at how he treats his inferiors, not his equals.', 'It is our choices, Harry, that show what we truly are, far more than our abilities!']


# Different Stemming Techniques

## 1. SnowBallStemming

In [10]:
from nltk.stem.snowball import SnowballStemmer
snowballstemmer = SnowballStemmer(language='english')
stem_words = []
for w in tweet_tokenized:
    x = snowballstemmer.stem(w)
    stem_words.append(x)
print('Before Snowball Stemming')
print(tweet_tokenized,'\n')

print('After Snowball Stemming')
print(stem_words)


Before Snowball Stemming
['If', 'you', 'want', 'to', 'know', 'what', 'a', 'man', '’', 's', 'like', ',', 'take', 'a', 'good', 'looks', 'at', 'how', 'he', 'treats', 'his', 'inferiors', ',', 'not', 'his', 'equals', '.', 'It', 'is', 'our', 'choices', ',', 'Harry', ',', 'that', 'show', 'what', 'we', 'truly', 'are', ',', 'far', 'more', 'than', 'our', 'abilities', '!'] 

After Snowball Stemming
['if', 'you', 'want', 'to', 'know', 'what', 'a', 'man', '’', 's', 'like', ',', 'take', 'a', 'good', 'look', 'at', 'how', 'he', 'treat', 'his', 'inferior', ',', 'not', 'his', 'equal', '.', 'it', 'is', 'our', 'choic', ',', 'harri', ',', 'that', 'show', 'what', 'we', 'truli', 'are', ',', 'far', 'more', 'than', 'our', 'abil', '!']


## 2. Porter Stemming

In [11]:
from nltk.stem import PorterStemmer
porterstemmer = PorterStemmer()
stem_words = []
for w in tweet_tokenized:
    x = porterstemmer.stem(w)
    stem_words.append(x)
print('Before Potter Stemming')
print(tweet_tokenized,'\n')

print('After Potter Stemming')
print(stem_words)


Before Potter Stemming
['If', 'you', 'want', 'to', 'know', 'what', 'a', 'man', '’', 's', 'like', ',', 'take', 'a', 'good', 'looks', 'at', 'how', 'he', 'treats', 'his', 'inferiors', ',', 'not', 'his', 'equals', '.', 'It', 'is', 'our', 'choices', ',', 'Harry', ',', 'that', 'show', 'what', 'we', 'truly', 'are', ',', 'far', 'more', 'than', 'our', 'abilities', '!'] 

After Potter Stemming
['if', 'you', 'want', 'to', 'know', 'what', 'a', 'man', '’', 's', 'like', ',', 'take', 'a', 'good', 'look', 'at', 'how', 'he', 'treat', 'hi', 'inferior', ',', 'not', 'hi', 'equal', '.', 'it', 'is', 'our', 'choic', ',', 'harri', ',', 'that', 'show', 'what', 'we', 'truli', 'are', ',', 'far', 'more', 'than', 'our', 'abil', '!']


# Lemmatization

In [12]:
from nltk.stem import WordNetLemmatizer

wordnet_lemmatizer = WordNetLemmatizer()
stem_words = []
for w in tweet_tokenized:
    x = wordnet_lemmatizer.lemmatize(w)
    stem_words.append(x)
print('Before Lemmatization')
print(tweet_tokenized,'\n')

print('After Lemmatization')
print(stem_words)

Before Lemmatization
['If', 'you', 'want', 'to', 'know', 'what', 'a', 'man', '’', 's', 'like', ',', 'take', 'a', 'good', 'looks', 'at', 'how', 'he', 'treats', 'his', 'inferiors', ',', 'not', 'his', 'equals', '.', 'It', 'is', 'our', 'choices', ',', 'Harry', ',', 'that', 'show', 'what', 'we', 'truly', 'are', ',', 'far', 'more', 'than', 'our', 'abilities', '!'] 

After Lemmatization
['If', 'you', 'want', 'to', 'know', 'what', 'a', 'man', '’', 's', 'like', ',', 'take', 'a', 'good', 'look', 'at', 'how', 'he', 'treat', 'his', 'inferior', ',', 'not', 'his', 'equal', '.', 'It', 'is', 'our', 'choice', ',', 'Harry', ',', 'that', 'show', 'what', 'we', 'truly', 'are', ',', 'far', 'more', 'than', 'our', 'ability', '!']
