In [None]:
'''
1. Extract a sample document and apply the following document
preprocessing methods: tokenization, POS tagging, stop-word
removal, stemming, and lemmatization.
2. Create a representation of the document by calculating Term
Frequency and Inverse Document Frequency.
'''

import nltk
import re

# Fetch the NLTK data packages this notebook relies on. Packages that are
# already installed are reported as up to date rather than re-downloaded.
# presumably the tokenizer models behind sent_tokenize/word_tokenize - confirm
nltk.download('punkt_tab')
# corpus read later through nltk.corpus.stopwords
nltk.download('stopwords')
# presumably the data backing WordNetLemmatizer - confirm
nltk.download('wordnet')
# Tagger models, presumably for nltk.pos_tag. Both names are requested;
# NOTE(review): likely to cover different NLTK versions - confirm.
nltk.download('averaged_perceptron_tagger')
# As the cell's last expression, this call's return value is what the cell displays.
nltk.download('averaged_perceptron_tagger_eng')

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger_eng.zip.


True

In [None]:
# Sample document used by the tokenization cells.
text= "Tokenization is the first step in text analytics. The process of breaking down a text paragraph into smaller chunks such as words or sentences is called Tokenization."

# To read the document from a file instead, uncomment the lines below.
# (Kept as '#' comments: a bare triple-quoted string at the end of a cell is
# the cell's last expression and would be echoed as its output.)
# with open("/content/data.txt","r") as file:
#   text = file.read()
# print(text)

In [None]:
#Sentence Tokenization
# Split `text` into a list of sentence strings and print that list.
from nltk.tokenize import sent_tokenize
tokenized_text=sent_tokenize(text)
print(tokenized_text)

['Tokenization is the first step in text analytics.', 'The process of breaking down a text paragraph into smaller chunks such as words or sentences is called Tokenization.']


In [None]:
#Word Tokenization
# Split `text` into word-level tokens and print them; as the printed result
# shows, punctuation such as '.' comes back as its own token.
from nltk.tokenize import word_tokenize
tokenized_words=word_tokenize(text)
print(tokenized_words)

['Tokenization', 'is', 'the', 'first', 'step', 'in', 'text', 'analytics', '.', 'The', 'process', 'of', 'breaking', 'down', 'a', 'text', 'paragraph', 'into', 'smaller', 'chunks', 'such', 'as', 'words', 'or', 'sentences', 'is', 'called', 'Tokenization', '.']


In [None]:
#Stop Words Removal
# Load NLTK's English stop-word list into a set so later cells can test
# membership with `in`, then print the whole set.
from nltk.corpus import stopwords
stop_words=set(stopwords.words("english"))
print(stop_words)

{'who', 'm', "aren't", "we've", 'we', "that'll", 'she', "we'd", 'at', 'can', 'and', 'ourselves', 'mustn', "hasn't", 'up', 'has', 'after', 'doing', 'theirs', 'wasn', 'but', 'myself', 'why', 'because', 'only', "won't", 'mightn', "i'd", 'o', 'hasn', "they're", "you've", 'been', 'its', 'of', 'same', 'my', 'yourself', "mustn't", 'nor', 'as', "he's", 'it', 'shan', 'some', 'd', "shan't", 'those', 'don', 'if', 'there', 'you', "shouldn't", 'by', "needn't", 'that', "he'll", 'this', 'during', 'into', 'other', "she'd", "wouldn't", "weren't", 'until', "you'd", 'with', 'off', 'against', 'all', 'what', 'while', 'here', "you're", "didn't", "i'm", 'now', "it'd", 'for', 'our', 'an', 'itself', 'most', 'no', 'few', 'aren', "she's", 'be', 'or', 'too', 'under', 'y', 'more', 'hadn', 'about', 'were', 'a', "you'll", "i'll", 'then', 'he', "haven't", "it'll", 'does', 'needn', 'shouldn', "mightn't", "it's", 'out', 'any', "don't", 'down', 'not', 'me', 'whom', "we're", 'having', 'each', "couldn't", 's', 'the', 'do'

In [None]:
text= "How to remove stop words with NLTK library in Python?"
# Replace every non-letter character with a space, then tokenize the lowercased text.
text=re.sub('[^a-zA-Z]',' ',text)
tokens=word_tokenize(text.lower())
# Keep only the tokens that are not in the English stop-word set.
filtered_text=[token for token in tokens if token not in stop_words]
print(filtered_text)

['remove', 'stop', 'words', 'nltk', 'library', 'python']


In [None]:
#Stemming
from nltk.stem import PorterStemmer
ps=PorterStemmer()
words=['wait','waiting','waited','standing','sit','sitting']
# Stem every word once and keep each distinct stem in order of first
# appearance (dict keys preserve insertion order and drop duplicates).
rootword=list(dict.fromkeys(ps.stem(w) for w in words))
print(rootword)

['wait', 'stand', 'sit']


In [None]:
#Lemmatization
from nltk.stem import WordNetLemmatizer
wordnet_lemmatizer=WordNetLemmatizer()
text = "studies studying cries cry"
words=word_tokenize(text)
# Collect each distinct lemma once, in order of first appearance.
# lemmatize() is called without a part-of-speech argument here.
rootword=[]
for w in words:
  lemma=wordnet_lemmatizer.lemmatize(w)
  if lemma not in rootword:
    rootword.append(lemma)
print(rootword)

study
studying
cry
cry


In [None]:
#POS Tagging
data="The pink sweater fit her perfectly"
words=word_tokenize(data)
# Tag the whole token list in a single call so the tagger sees each word
# together with its neighbours. Tagging one-word lists in isolation removes
# that context (it labelled both 'pink' and 'fit' as NN).
for tagged_word in nltk.pos_tag(words):
  print(tagged_word)

[('The', 'DT')]
[('pink', 'NN')]
[('sweater', 'NN')]
[('fit', 'NN')]
[('her', 'PRP$')]
[('perfectly', 'RB')]


In [None]:
#TFIDF
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
#Initialize documents
# Two short example documents used for the hand-computed TF-IDF that follows.
documentA = 'Jupiter is the largest planet'
documentB = 'Mars is the fourth planet from the Sun'

In [None]:
#Create bag of words
# Split each document on single spaces. Tokens keep their original case, so
# differently-cased spellings of a word would be counted as different words.
bagOfWordsA = documentA.split(' ')
bagOfWordsB = documentB.split(' ')

In [None]:
#Create collection of unique words from A and B
# The vocabulary is the set union of the tokens of both documents.
uniqueWords = set(bagOfWordsA) | set(bagOfWordsB)
print(uniqueWords)

{'the', 'Mars', 'Jupiter', 'from', 'largest', 'fourth', 'is', 'planet', 'Sun'}


In [None]:
#Creating Dictionary
# Start every vocabulary word at 0 for both documents, then count the
# occurrences of each token in the corresponding bag of words.
numOfWordsA = dict.fromkeys(uniqueWords,0)
numOfWordsB = dict.fromkeys(uniqueWords,0)
for counts, bag in ((numOfWordsA, bagOfWordsA), (numOfWordsB, bagOfWordsB)):
  for token in bag:
    counts[token] += 1

print(numOfWordsA)
print(numOfWordsB)

{'the': 1, 'Mars': 0, 'Jupiter': 1, 'from': 0, 'largest': 1, 'fourth': 0, 'is': 1, 'planet': 1, 'Sun': 0}
{'the': 2, 'Mars': 1, 'Jupiter': 0, 'from': 1, 'largest': 0, 'fourth': 1, 'is': 1, 'planet': 1, 'Sun': 1}


In [None]:
#Compute TF for each document
def computeTF(wordDict,bagOfWords):
  """Compute the term frequency of every word in `wordDict`.

  Args:
    wordDict: dict mapping word -> number of occurrences in the document.
    bagOfWords: list of the document's tokens; its length is the divisor.

  Returns:
    dict mapping word -> count / len(bagOfWords) as a float. When
    `bagOfWords` is empty, every word maps to 0.0 instead of raising
    ZeroDivisionError.
  """
  bagOfWordsCount=len(bagOfWords)
  if bagOfWordsCount==0:
    # An empty document has no terms, so every frequency is zero.
    return {word:0.0 for word in wordDict}
  return {word:count/float(bagOfWordsCount) for word,count in wordDict.items()}

# Term frequencies for each document: per-word counts divided by the
# document's token count.
tfA=computeTF(numOfWordsA,bagOfWordsA)
tfB=computeTF(numOfWordsB,bagOfWordsB)

print(tfA)
print(tfB)

{'the': 0.2, 'Mars': 0.0, 'Jupiter': 0.2, 'from': 0.0, 'largest': 0.2, 'fourth': 0.0, 'is': 0.2, 'planet': 0.2, 'Sun': 0.0}
{'the': 0.25, 'Mars': 0.125, 'Jupiter': 0.0, 'from': 0.125, 'largest': 0.0, 'fourth': 0.125, 'is': 0.125, 'planet': 0.125, 'Sun': 0.125}


In [None]:
#compute IDF
import math
def computeIDF(documents):
  """Compute the inverse document frequency of every word in `documents`.

  Args:
    documents: list of dicts, each mapping word -> count in that document.
      The dicts do not need to share the same keys.

  Returns:
    dict mapping word -> log(N / df), where N is the number of documents
    and df is the number of documents whose count for the word is > 0.
    A word whose count is 0 in every document maps to 0. An empty
    `documents` list yields an empty dict.
  """
  N=len(documents)
  idfDict={}

  # Count how many documents contain each word. Every word is registered on
  # first sight, so words missing from the first document are handled too.
  for document in documents:
    for word, val in document.items():
      idfDict.setdefault(word,0)
      if(val>0):
        idfDict[word]+=1

  # Turn document counts into IDF values (only values change, never keys).
  for word, val in idfDict.items():
      idfDict[word] = math.log(N / float(val)) if val > 0 else 0  # Avoid division by zero

  return idfDict

# IDF over the two-document corpus; words present in both documents get log(2/2) = 0.
idf=computeIDF([numOfWordsA,numOfWordsB])
print(idf)

{'the': 0.0, 'Mars': 0.6931471805599453, 'Jupiter': 0.6931471805599453, 'from': 0.6931471805599453, 'largest': 0.6931471805599453, 'fourth': 0.6931471805599453, 'is': 0.0, 'planet': 0.0, 'Sun': 0.6931471805599453}


In [None]:
#Compute TFIDF for all words
def computeTFIDF(tfBagOfWords, idfs):
  """Multiply each word's term frequency by its inverse document frequency.

  Args:
    tfBagOfWords: dict mapping word -> term frequency for one document.
    idfs: dict mapping word -> IDF; must contain every word in `tfBagOfWords`.

  Returns:
    dict mapping word -> tf * idf, in the key order of `tfBagOfWords`.
  """
  return {term: weight * idfs[term] for term, weight in tfBagOfWords.items()}

# TF-IDF weights per document, assembled into a table with one row per
# document and one column per word.
tfidfA = computeTFIDF(tfA, idf)
tfidfB = computeTFIDF(tfB, idf)
df = pd.DataFrame([tfidfA, tfidfB])
# Leaving `df` as the last expression makes the notebook render the table.
df

Unnamed: 0,the,Mars,Jupiter,from,largest,fourth,is,planet,Sun
0,0.0,0.0,0.138629,0.0,0.138629,0.0,0.0,0.0,0.0
1,0.0,0.086643,0.0,0.086643,0.0,0.086643,0.0,0.0,0.086643
