In [3]:
import warnings

import nltk

# Silence every warning category for the rest of the session so cell outputs stay readable.
warnings.filterwarnings("ignore")

# Fetch the Punkt tokenizer data used by word_tokenize (a no-op when already cached).
nltk.download("punkt")

sentence = (
    "Alice was beginning to get very tired of sitting by her sister "
    "on the bank, and of having nothing to do."
)

# Word-level tokenization: punctuation marks come back as separate tokens.
sentence_tokenized = nltk.word_tokenize(sentence)
print(sentence_tokenized)

['Alice', 'was', 'beginning', 'to', 'get', 'very', 'tired', 'of', 'sitting', 'by', 'her', 'sister', 'on', 'the', 'bank', ',', 'and', 'of', 'having', 'nothing', 'to', 'do', '.']


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\ANKIT\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [4]:
# Multi-line excerpt kept exactly as typed, including the leading indentation
# and the typographic quotes, which the tokenizer emits as their own tokens.
paragraph = """
            Alice was beginning to get very tired of sitting by her sister on the bank, and of having nothing to do: once or twice she had peeped into the book her sister was reading, 
            but it had no pictures or conversations in it, “and what is the use of a book,” thought Alice “without pictures or conversations?”
"""

# Same word-level tokenizer as for the single sentence, applied to the whole excerpt.
para_tokenized = nltk.word_tokenize(paragraph)
print(para_tokenized)

['Alice', 'was', 'beginning', 'to', 'get', 'very', 'tired', 'of', 'sitting', 'by', 'her', 'sister', 'on', 'the', 'bank', ',', 'and', 'of', 'having', 'nothing', 'to', 'do', ':', 'once', 'or', 'twice', 'she', 'had', 'peeped', 'into', 'the', 'book', 'her', 'sister', 'was', 'reading', ',', 'but', 'it', 'had', 'no', 'pictures', 'or', 'conversations', 'in', 'it', ',', '“', 'and', 'what', 'is', 'the', 'use', 'of', 'a', 'book', ',', '”', 'thought', 'Alice', '“', 'without', 'pictures', 'or', 'conversations', '?', '”']


## Normalization

In [5]:
from nltk.corpus import stopwords

# Fetch the English stop-word list (a no-op when it is already cached locally).
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

tokens = nltk.word_tokenize(paragraph)

# The NLTK English stop words are lower-case, so compare on the lower-cased
# token; a case-sensitive test would let capitalised stop words such as
# "The" or "And" slip through. The token itself is kept in its original case.
clean_paragraph = [word for word in tokens if word.lower() not in stop_words]

# No try/except here: printing cannot fail in a way worth catching, and a bare
# `except:` would only hide real errors behind a misleading message.
print("tokens are ")
print(clean_paragraph)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ANKIT\AppData\Roaming\nltk_data...


print tokens 
['Alice', 'beginning', 'get', 'tired', 'sitting', 'sister', 'bank', ',', 'nothing', ':', 'twice', 'peeped', 'book', 'sister', 'reading', ',', 'pictures', 'conversations', ',', '“', 'use', 'book', ',', '”', 'thought', 'Alice', '“', 'without', 'pictures', 'conversations', '?', '”']


[nltk_data]   Package stopwords is already up-to-date!


In [6]:
sentence = "Wow this is cool ! Wow this is cool"

# Case normalization in both directions from the same source string.
lit_sentence, upp_sentence = sentence.lower(), sentence.upper()

# One call, newline-separated, prints the same two lines as two separate prints.
print(lit_sentence, upp_sentence, sep="\n")

wow this is cool ! wow this is cool
WOW THIS IS COOL ! WOW THIS IS COOL


## Stemming and Lemmatization

In [7]:
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize

stemmer = PorterStemmer()

sentence = "There mighe be some sense in your knocking"

# Tokenize first, then reduce each token to its Porter stem.
tokens = word_tokenize(sentence)
word_stemmed = list(map(stemmer.stem, tokens))

# Heading and list go out on consecutive lines.
print("List of tokens: ", tokens, sep="\n")

# Blank spacer lines between the two listings.
print("\n")
print("List of Stemmed World: ", word_stemmed, sep="\n")

List of tokens: 
['There', 'mighe', 'be', 'some', 'sense', 'in', 'your', 'knocking']


List of Stemmed World: 
['there', 'migh', 'be', 'some', 'sens', 'in', 'your', 'knock']


In [9]:
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

# WordNet data backs the lemmatizer; the download is a no-op when already cached.
nltk.download("wordnet")

sentence = "There mighe be some sense in your knocking"
Lemmatizer = WordNetLemmatizer()

tokens = word_tokenize(sentence)

# lemmatize() is called without a part-of-speech argument, so every token is
# looked up with NLTK's default POS (noun); verb forms such as "knocking"
# therefore come back unchanged.
lemma = list(map(Lemmatizer.lemmatize, tokens))

print(lemma)

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\ANKIT\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


['There', 'mighe', 'be', 'some', 'sense', 'in', 'your', 'knocking']


## Bag Of Words

In [10]:
# One-document corpus, wrapped in a list because it is handed to
# CountVectorizer.fit in the next cell.
corpus = [
    "Emma was a Catholic because her mother was a Catholic, "
    "and Nory’s mother was a Catholic because her father was a Catholic, "
    "and her father was a Catholic because his mother was a Catholic, "
    "or had been."
]

In [11]:
from sklearn.feature_extraction.text import CountVectorizer
# Bag-of-words vectorizer built with all default settings.
corpus_vectorizer = CountVectorizer()
# fit() learns the vocabulary from `corpus`; as the cell's last expression,
# its return value (the fitted vectorizer) is what the cell displays.
corpus_vectorizer.fit(corpus)

CountVectorizer()

In [12]:
# vocabulary_ maps each learned term to its column index in the count matrix
# (indices follow the alphabetical order of the terms). It does NOT hold word
# frequencies: e.g. 'was' maps to 12 here although it occurs 6 times.
corpus_vectorizer.vocabulary_

{'emma': 4,
 'was': 12,
 'catholic': 3,
 'because': 1,
 'her': 7,
 'mother': 9,
 'and': 0,
 'nory': 10,
 'father': 5,
 'his': 8,
 'or': 11,
 'had': 6,
 'been': 2}

In [13]:
# The same sentence as a plain string this time, so that the next cell can
# split it by hand with str.split().
corpus = (
    "Emma was a Catholic because her mother was a Catholic, "
    "and Nory’s mother was a Catholic because her father was a Catholic, "
    "and her father was a Catholic because his mother was a Catholic, "
    "or had been."
)

In [14]:
#Python code without any nlp library
# Whitespace splitting only: case is preserved and punctuation stays attached,
# so "Catholic" and "Catholic," are counted as two different tokens.
tokens = corpus.split()
#Uncomment to see all the tokens
#print (tokens)

# token -> number of occurrences, keyed in first-seen order
dic = {}

#save frequency of each token in dictionary
for token in tokens:
  # get() returns 0 for an unseen token, so first and repeat sightings share one code path
  dic[token] = dic.get(token, 0) + 1

print(dic)

{'Emma': 1, 'was': 6, 'a': 6, 'Catholic': 3, 'because': 3, 'her': 3, 'mother': 3, 'Catholic,': 3, 'and': 2, 'Nory’s': 1, 'father': 2, 'his': 1, 'or': 1, 'had': 1, 'been.': 1}


## TF-IDF

In [16]:
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd

doc_1 = "She is cute"
doc_2 = "He is very cute"
doc_3 = "Angelina is very very cute"

corpus = [doc_1, doc_2, doc_3]

# Lower-case every document before vectorizing.
corpus_preprocess = [doc.lower() for doc in corpus]

# norm=None turns off row normalization, so the table shows raw tf * idf products.
corpus_vectorizer = TfidfVectorizer(norm=None)
tf_idf_scores = corpus_vectorizer.fit_transform(corpus_preprocess)

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2.
# Prefer its replacement and fall back only on older installations.
if hasattr(corpus_vectorizer, "get_feature_names_out"):
    feature_names = list(corpus_vectorizer.get_feature_names_out())
else:
    feature_names = corpus_vectorizer.get_feature_names()

# Not used in this cell; kept in case a later cell reads it (TODO confirm, then drop).
corpus_index = list(corpus_preprocess)

# One column label per document, derived from the corpus size instead of hard-coded.
doc_labels = ["doc_{}".format(position) for position in range(1, len(corpus_preprocess) + 1)]

print("TF-IDF table of corpus")
print("\n")

# Transpose so terms are rows and documents are columns; toarray() yields a
# plain ndarray rather than the legacy np.matrix returned by todense().
print(pd.DataFrame(tf_idf_scores.T.toarray(), index=feature_names, columns=doc_labels))

TF-IDF table of corpus


             doc_1     doc_2     doc_3
angelina  0.000000  0.000000  1.693147
cute      1.000000  1.000000  1.000000
he        0.000000  1.693147  0.000000
is        1.000000  1.000000  1.000000
she       1.693147  0.000000  0.000000
very      0.000000  1.287682  2.575364


In [17]:
#Part 1 - Calculating Term Frequencies
from sklearn.feature_extraction.text import CountVectorizer

# Raw term counts: one row per document, one column per vocabulary term.
count_vectorizer = CountVectorizer()
term_frequency = count_vectorizer.fit_transform(corpus_preprocess)

# Take the row labels from this cell's own vectorizer so they are guaranteed to
# line up with the columns of term_frequency. get_feature_names() was removed in
# scikit-learn 1.2, hence the fallback for older installations only.
if hasattr(count_vectorizer, "get_feature_names_out"):
    term_names = list(count_vectorizer.get_feature_names_out())
else:
    term_names = count_vectorizer.get_feature_names()

print("Term Frequency Matrix")
print("=====================")
print(term_frequency.toarray())
print("=====================")

#Part 2 - Calculating Inverse Document Frequency
from sklearn.feature_extraction.text import TfidfTransformer

# norm=None turns off row normalization so the scores stay raw tf * idf products.
tfidf_transformer = TfidfTransformer(norm=None)
tfidf_transformer.fit(term_frequency)

# This cell used to leave `vectorizer` bound to the fitted transformer; keep
# that binding in case a later cell still refers to it (TODO confirm).
vectorizer = tfidf_transformer

idf = tfidf_transformer.idf_

df = pd.DataFrame(idf, index=term_names, columns=['IDF'])
print("\nIDF table")
print("=========")
print(df)

#Part 3 - Combining both into TF-IDF scores
# The transformer is already fitted on these counts, so transform() is enough;
# there is no need to refit it on a dense copy of the same matrix.
tf_idf_scores = tfidf_transformer.transform(term_frequency)

print("\nTF-IDF matrix")
print("=============\n")
print(tf_idf_scores.toarray())

Term Frequency Matrix
[[0 1 0 1 1 0]
 [0 1 1 1 0 1]
 [1 1 0 1 0 2]]

IDF table
               IDF
angelina  1.693147
cute      1.000000
he        1.693147
is        1.000000
she       1.693147
very      1.287682

TF-IDF matrix

[[0.         1.         0.         1.         1.69314718 0.        ]
 [0.         1.         1.69314718 1.         0.         1.28768207]
 [1.69314718 1.         0.         1.         0.         2.57536414]]
