In [1]:
import nltk
#nltk.download()

In [2]:
#Tokenization-
from nltk.tokenize import word_tokenize, sent_tokenize
#Removal of stopwords-
from nltk.corpus import stopwords
#Stemming and Lemmatization
from nltk.stem import PorterStemmer, WordNetLemmatizer
#Vectorization
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
#BOW - Box of Words
#Tf - Term Frequency
#IDf - Inverse Document Frequency
from sklearn.feature_extraction.text import TfidfTransformer

# Tokenization

In [3]:
sentences = """Ram was playing cricket and he saw a dog running behind him.
So Shyam started running and went home but Ram kept quiet.
"""

In [4]:
words = word_tokenize(sentences)

In [5]:
print(words)

['Ram', 'was', 'playing', 'cricket', 'and', 'he', 'saw', 'a', 'dog', 'running', 'behind', 'him', '.', 'So', 'Shyam', 'started', 'running', 'and', 'went', 'home', 'but', 'Ram', 'kept', 'quiet', '.']


In [6]:
sentence = sent_tokenize(sentences)

In [7]:
print(sentence)

['Ram was playing cricket and he saw a dog running behind him.', 'So Shyam started running and went home but Ram kept quiet.']


# Removal of Stopwords

In [8]:
print(stopwords.words('english'))

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

In [9]:
stopwords_list = list(stopwords.words('english'))

In [10]:
stopwords_list.extend([',','.'])

In [11]:
print(stopwords_list)

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

In [12]:
newWords = []
for word in  words:
    if word not in stopwords_list:
        newWords.append(word)

print(newWords)

['Ram', 'playing', 'cricket', 'saw', 'dog', 'running', 'behind', 'So', 'Shyam', 'started', 'running', 'went', 'home', 'Ram', 'kept', 'quiet']


# Stemming and Lemmatization

In [13]:
stemmedWords = []

In [14]:
p = PorterStemmer()

In [15]:
for word in newWords:
    word1 = p.stem(word)
    stemmedWords.append(word)

In [16]:
print(stemmedWords)

['Ram', 'playing', 'cricket', 'saw', 'dog', 'running', 'behind', 'So', 'Shyam', 'started', 'running', 'went', 'home', 'Ram', 'kept', 'quiet']


In [17]:
lemmatizer = WordNetLemmatizer()

In [18]:
del stemmedWords

In [19]:
stemmedWords = []

In [20]:
for word in newWords:
    word1 = lemmatizer.lemmatize(word,pos='v')
    print("Actual:{} , Stem:{} ".format(word,word1))
    stemmedWords.append(word1)

Actual:Ram , Stem:Ram 
Actual:playing , Stem:play 
Actual:cricket , Stem:cricket 
Actual:saw , Stem:saw 
Actual:dog , Stem:dog 
Actual:running , Stem:run 
Actual:behind , Stem:behind 
Actual:So , Stem:So 
Actual:Shyam , Stem:Shyam 
Actual:started , Stem:start 
Actual:running , Stem:run 
Actual:went , Stem:go 
Actual:home , Stem:home 
Actual:Ram , Stem:Ram 
Actual:kept , Stem:keep 
Actual:quiet , Stem:quiet 


# Vectorization

In [21]:
print(stemmedWords[0])

Ram


In [22]:
import numpy as np

In [23]:
stemmedWords = np.asarray(stemmedWords)
stemmedWords = ' '.join(stemmedWords)
wordsList = []
wordsList.append(stemmedWords)

In [24]:
cv = CountVectorizer()

In [25]:
vector = cv.fit_transform(wordsList)

In [26]:
print(vector)

  (0, 7)	1
  (0, 5)	1
  (0, 4)	1
  (0, 3)	1
  (0, 13)	1
  (0, 11)	1
  (0, 12)	1
  (0, 0)	1
  (0, 9)	2
  (0, 2)	1
  (0, 10)	1
  (0, 1)	1
  (0, 6)	1
  (0, 8)	2


In [27]:
vector.toarray()

array([[1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 1, 1]], dtype=int64)

In [28]:
cv.inverse_transform(vector)

[array(['quiet', 'keep', 'home', 'go', 'start', 'shyam', 'so', 'behind',
        'run', 'dog', 'saw', 'cricket', 'play', 'ram'], dtype='<U7')]

In [29]:
cv.vocabulary_

{'ram': 8,
 'play': 6,
 'cricket': 1,
 'saw': 10,
 'dog': 2,
 'run': 9,
 'behind': 0,
 'so': 12,
 'shyam': 11,
 'start': 13,
 'go': 3,
 'home': 4,
 'keep': 5,
 'quiet': 7}

In [30]:
tf = TfidfVectorizer()

In [32]:
vector = tf.fit_transform(wordsList)

In [33]:
vector.toarray()

array([[0.2236068, 0.2236068, 0.2236068, 0.2236068, 0.2236068, 0.2236068,
        0.2236068, 0.2236068, 0.4472136, 0.4472136, 0.2236068, 0.2236068,
        0.2236068, 0.2236068]])

In [34]:
tf.inverse_transform(vector)

[array(['ram', 'play', 'cricket', 'saw', 'dog', 'run', 'behind', 'so',
        'shyam', 'start', 'go', 'home', 'keep', 'quiet'], dtype='<U7')]

In [35]:
tft = TfidfTransformer()

In [36]:
vector = tft.fit_transform(vector)

In [37]:
print(vector)

  (0, 7)	0.22360679774997896
  (0, 5)	0.22360679774997896
  (0, 4)	0.22360679774997896
  (0, 3)	0.22360679774997896
  (0, 13)	0.22360679774997896
  (0, 11)	0.22360679774997896
  (0, 12)	0.22360679774997896
  (0, 0)	0.22360679774997896
  (0, 9)	0.4472135954999579
  (0, 2)	0.22360679774997896
  (0, 10)	0.22360679774997896
  (0, 1)	0.22360679774997896
  (0, 6)	0.22360679774997896
  (0, 8)	0.4472135954999579


In [38]:
vector.toarray()

array([[0.2236068, 0.2236068, 0.2236068, 0.2236068, 0.2236068, 0.2236068,
        0.2236068, 0.2236068, 0.4472136, 0.4472136, 0.2236068, 0.2236068,
        0.2236068, 0.2236068]])