# Import Libraries

In [2]:
import nltk

# Downloading the required packages


In [3]:
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


True

# Initialize the Text


In [47]:
text = "The Universe contains billions of galaxies, each containing millions or billions of stars. The space between the stars and galaxies is largely empty. However, even places far from stars and planets contain scattered particles of dust or a few hydrogen atoms per cubic centimeter."

# Perform Tokenization

## Word Tokenization

In [48]:
from nltk.tokenize import word_tokenize

wordTokenized = word_tokenize(text.lower())
wordTokenized

['the',
 'universe',
 'contains',
 'billions',
 'of',
 'galaxies',
 ',',
 'each',
 'containing',
 'millions',
 'or',
 'billions',
 'of',
 'stars',
 '.',
 'the',
 'space',
 'between',
 'the',
 'stars',
 'and',
 'galaxies',
 'is',
 'largely',
 'empty',
 '.',
 'however',
 ',',
 'even',
 'places',
 'far',
 'from',
 'stars',
 'and',
 'planets',
 'contain',
 'scattered',
 'particles',
 'of',
 'dust',
 'or',
 'a',
 'few',
 'hydrogen',
 'atoms',
 'per',
 'cubic',
 'centimeter',
 '.']

## Sentence Tokenization


In [49]:
from nltk.tokenize import sent_tokenize

sentenceTokenized = sent_tokenize(text.lower())
sentenceTokenized

['the universe contains billions of galaxies, each containing millions or billions of stars.',
 'the space between the stars and galaxies is largely empty.',
 'however, even places far from stars and planets contain scattered particles of dust or a few hydrogen atoms per cubic centimeter.']

# Removing Stop Words and Punctations

In [50]:
from nltk.corpus import stopwords

stop_words=set(stopwords.words("english"))
print(stop_words)

{'haven', 'himself', 'been', 'our', 'further', 'won', 'couldn', 'nor', 'not', 'didn', 'who', 'both', 'other', 'o', 'do', 'more', 'so', "won't", 'itself', "mustn't", 'they', 're', 'then', 'under', 'this', 'them', 'hasn', 'shouldn', 'it', 'their', 'or', 'on', "don't", 'me', 'mightn', 'most', "wasn't", 'themselves', 'over', 'wouldn', 'wasn', 'where', 'some', "you've", 'my', 'any', 'we', 'its', 'by', 'if', 'about', 'no', 'very', 'will', 'm', 'after', "aren't", 'am', 'down', 'aren', 'has', 'll', 'out', "doesn't", 'of', 's', 'just', 'don', 'd', 'ma', "that'll", 'have', 've', 'did', 'all', 'weren', "it's", 'as', 'up', "you're", 'an', 'had', 'i', "weren't", 'you', 'being', 'into', 'here', 'yours', 'only', 'doing', "mightn't", 'yourself', 'in', 'he', 'are', "hadn't", 'y', "she's", 'mustn', 'but', 'hers', 'from', 'against', "you'd", 'shan', 'too', 'ain', "needn't", 't', 'doesn', 'that', 'before', 'ours', 'him', 'hadn', 'herself', 'there', "shan't", "should've", 'were', 'needn', 'why', 'his', 'to

In [51]:
tokens = wordTokenized
punctuations = [".",",","?","!"]
filteredText=[]

for token in tokens:
  if token not in stop_words and token not in punctuations:
    filteredText.append(token)

print("Tokenized Sentence:",tokens)
print("Filterd Sentence:",filteredText)

Tokenized Sentence: ['the', 'universe', 'contains', 'billions', 'of', 'galaxies', ',', 'each', 'containing', 'millions', 'or', 'billions', 'of', 'stars', '.', 'the', 'space', 'between', 'the', 'stars', 'and', 'galaxies', 'is', 'largely', 'empty', '.', 'however', ',', 'even', 'places', 'far', 'from', 'stars', 'and', 'planets', 'contain', 'scattered', 'particles', 'of', 'dust', 'or', 'a', 'few', 'hydrogen', 'atoms', 'per', 'cubic', 'centimeter', '.']
Filterd Sentence: ['universe', 'contains', 'billions', 'galaxies', 'containing', 'millions', 'billions', 'stars', 'space', 'stars', 'galaxies', 'largely', 'empty', 'however', 'even', 'places', 'far', 'stars', 'planets', 'contain', 'scattered', 'particles', 'dust', 'hydrogen', 'atoms', 'per', 'cubic', 'centimeter']


# Perform Stemming

In [52]:
from nltk.stem import PorterStemmer

ps = PorterStemmer()

stemmedWords = []

for word in filteredText:
  stemmedWords.append(ps.stem(word))

stemmedWords

['univers',
 'contain',
 'billion',
 'galaxi',
 'contain',
 'million',
 'billion',
 'star',
 'space',
 'star',
 'galaxi',
 'larg',
 'empti',
 'howev',
 'even',
 'place',
 'far',
 'star',
 'planet',
 'contain',
 'scatter',
 'particl',
 'dust',
 'hydrogen',
 'atom',
 'per',
 'cubic',
 'centimet']

# Perform Lammetization

In [53]:
from nltk.stem import WordNetLemmatizer

wordnet_lemmatizer = WordNetLemmatizer()

lemmatizedWords = []

for word in filteredText:
  lemmatizedWords.append(wordnet_lemmatizer.lemmatize(word))

lemmatizedWords

['universe',
 'contains',
 'billion',
 'galaxy',
 'containing',
 'million',
 'billion',
 'star',
 'space',
 'star',
 'galaxy',
 'largely',
 'empty',
 'however',
 'even',
 'place',
 'far',
 'star',
 'planet',
 'contain',
 'scattered',
 'particle',
 'dust',
 'hydrogen',
 'atom',
 'per',
 'cubic',
 'centimeter']

# Apply POS to text

In [54]:
import nltk

posText = []

for word in filteredText:
  posText.append(nltk.pos_tag([word]))

posText

[[('universe', 'NN')],
 [('contains', 'NNS')],
 [('billions', 'NNS')],
 [('galaxies', 'NNS')],
 [('containing', 'VBG')],
 [('millions', 'NNS')],
 [('billions', 'NNS')],
 [('stars', 'NNS')],
 [('space', 'NN')],
 [('stars', 'NNS')],
 [('galaxies', 'NNS')],
 [('largely', 'RB')],
 [('empty', 'JJ')],
 [('however', 'RB')],
 [('even', 'RB')],
 [('places', 'NNS')],
 [('far', 'RB')],
 [('stars', 'NNS')],
 [('planets', 'NNS')],
 [('contain', 'NN')],
 [('scattered', 'VBN')],
 [('particles', 'NNS')],
 [('dust', 'NN')],
 [('hydrogen', 'NN')],
 [('atoms', 'NNS')],
 [('per', 'IN')],
 [('cubic', 'NN')],
 [('centimeter', 'NN')]]

# Creating representation of document by calculating TFIDF

# Importing Libraries

In [55]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer

# Initializing the documents

In [56]:
documentA = 'Jupiter is the largest Planet'
documentB = 'Mars is the fourth planet from the Sun'

# Creating Bag of Words for Document A and B

In [57]:
bagOfWordsA = documentA.split(' ')
bagOfWordsB = documentB.split(' ')

In [58]:
bagOfWordsA

['Jupiter', 'is', 'the', 'largest', 'Planet']

In [59]:
bagOfWordsB

['Mars', 'is', 'the', 'fourth', 'planet', 'from', 'the', 'Sun']

# Collecting Unique words from Document A and B.

In [60]:
uniqueWords = set(bagOfWordsA).union(set(bagOfWordsB))
uniqueWords

{'Jupiter',
 'Mars',
 'Planet',
 'Sun',
 'fourth',
 'from',
 'is',
 'largest',
 'planet',
 'the'}

# Creating a dictionary of words and their occurrence for each document in the corpus

In [61]:
numOfWordsA = dict.fromkeys(uniqueWords, 0)

for word in bagOfWordsA:
  numOfWordsA[word] += 1

numOfWordsA

{'is': 1,
 'planet': 0,
 'Planet': 1,
 'largest': 1,
 'fourth': 0,
 'from': 0,
 'Sun': 0,
 'Jupiter': 1,
 'the': 1,
 'Mars': 0}

In [62]:
numOfWordsB = dict.fromkeys(uniqueWords, 0)

for word in bagOfWordsB:
  numOfWordsB[word] += 1

numOfWordsB

{'is': 1,
 'planet': 1,
 'Planet': 0,
 'largest': 0,
 'fourth': 1,
 'from': 1,
 'Sun': 1,
 'Jupiter': 0,
 'the': 2,
 'Mars': 1}

# Computing the term frequency for each of our documents

In [63]:
def computeTF(wordDict, bagOfWords):
  tfDict = {}
  bagOfWordsCount = len(bagOfWords)
  for word, count in wordDict.items():
    tfDict[word] = count / float(bagOfWordsCount)
  return tfDict

tfA = computeTF(numOfWordsA, bagOfWordsA)
tfB = computeTF(numOfWordsB, bagOfWordsB)

In [64]:
tfA

{'is': 0.2,
 'planet': 0.0,
 'Planet': 0.2,
 'largest': 0.2,
 'fourth': 0.0,
 'from': 0.0,
 'Sun': 0.0,
 'Jupiter': 0.2,
 'the': 0.2,
 'Mars': 0.0}

In [65]:
tfB

{'is': 0.125,
 'planet': 0.125,
 'Planet': 0.0,
 'largest': 0.0,
 'fourth': 0.125,
 'from': 0.125,
 'Sun': 0.125,
 'Jupiter': 0.0,
 'the': 0.25,
 'Mars': 0.125}

# Computing the term Inverse Document Frequency

In [66]:
import math

def computeIDF(documents):
  N = len(documents)
  idfDict = dict.fromkeys(documents[0].keys(), 0)
  for document in documents:
    for word, val in document.items():
      if val > 0:
        idfDict[word] += 1

  for word, val in idfDict.items():
    idfDict[word] = math.log(N / float(val))

  return idfDict

idfs = computeIDF([numOfWordsA, numOfWordsB])

idfs

{'is': 0.0,
 'planet': 0.6931471805599453,
 'Planet': 0.6931471805599453,
 'largest': 0.6931471805599453,
 'fourth': 0.6931471805599453,
 'from': 0.6931471805599453,
 'Sun': 0.6931471805599453,
 'Jupiter': 0.6931471805599453,
 'the': 0.0,
 'Mars': 0.6931471805599453}

# Computing the term TF/IDF for all words

In [67]:
def computeTFIDF(tfBagOfWords, idfs):
  tfidf = {}
  for word, val in tfBagOfWords.items():
    tfidf[word] = val * idfs[word]
    return tfidf

tfidfA = computeTFIDF(tfA, idfs)
tfidfB = computeTFIDF(tfB, idfs)
df = pd.DataFrame([tfidfA, tfidfB])
df

Unnamed: 0,is
0,0.0
1,0.0
