In [1]:
import math

import nltk
import pandas as pd
from nltk import pos_tag
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer,WordNetLemmatizer
from nltk.tokenize import word_tokenize,sent_tokenize

In [3]:
# Toy two-sentence corpus used by every later cell.
documentA, documentB = (
    "Jupiter is the largest Planet",
    "Mars is the fourth planet from the Sun",
)

In [4]:
# Break each document into word-level tokens with NLTK's tokenizer.
tokensA, tokensB = (word_tokenize(text) for text in (documentA, documentB))
print(tokensA, "\n", tokensB)

['Jupiter', 'is', 'the', 'largest', 'Planet'] 
 ['Mars', 'is', 'the', 'fourth', 'planet', 'from', 'the', 'Sun']


In [6]:
# English stop-word list as a set for fast membership checks.
stop_words = set(stopwords.words("english"))

# Tokens are lower-cased only for the stop-word check; kept tokens retain
# their original casing.
filtered_tokensA = list(filter(lambda tok: tok.lower() not in stop_words, tokensA))
print(filtered_tokensA)

filtered_tokensB = list(filter(lambda tok: tok.lower() not in stop_words, tokensB))
print(filtered_tokensB)

['Jupiter', 'largest', 'Planet']
['Mars', 'fourth', 'planet', 'Sun']


In [7]:
# Part-of-speech tagging runs on the full token lists (stop words included),
# not on the filtered ones.
posA, posB = map(pos_tag, (tokensA, tokensB))
print(posA, "\n", posB)

[('Jupiter', 'NN'), ('is', 'VBZ'), ('the', 'DT'), ('largest', 'JJS'), ('Planet', 'NN')] 
 [('Mars', 'NNS'), ('is', 'VBZ'), ('the', 'DT'), ('fourth', 'JJ'), ('planet', 'NN'), ('from', 'IN'), ('the', 'DT'), ('Sun', 'NNP')]


In [8]:
# Porter stemming of the stop-word-filtered tokens.
stemmer = PorterStemmer()
stemmed_tokensA = list(map(stemmer.stem, filtered_tokensA))
stemmed_tokensB = list(map(stemmer.stem, filtered_tokensB))
print(stemmed_tokensA, "\n", stemmed_tokensB)

['jupit', 'largest', 'planet'] 
 ['mar', 'fourth', 'planet', 'sun']


In [9]:
# WordNet lemmatization of the stop-word-filtered tokens. No POS tag is
# passed, so the lemmatizer falls back to its default part of speech.
lemmatizer = WordNetLemmatizer()
lemmatized_tokensA = list(map(lemmatizer.lemmatize, filtered_tokensA))
lemmatized_tokensB = list(map(lemmatizer.lemmatize, filtered_tokensB))
print(lemmatized_tokensA, "\n", lemmatized_tokensB)

['Jupiter', 'largest', 'Planet'] 
 ['Mars', 'fourth', 'planet', 'Sun']


In [13]:
# Re-assemble the lemmatized tokens into space-separated strings.
preprocessed_textA, preprocessed_textB = (
    " ".join(tokens) for tokens in (lemmatized_tokensA, lemmatized_tokensB)
)
print(preprocessed_textA, "\n", preprocessed_textB)

Jupiter largest Planet 
 Mars fourth planet Sun


In [17]:
# Bag-of-words per document, split on single spaces.
# NOTE(review): these are built from the raw documents, not from the
# preprocessed text above, and the split is case-sensitive -- so 'Planet'
# and 'planet' end up as distinct terms in the TF-IDF vocabulary.
bagOfWordsA, bagOfWordsB = (doc.split(" ") for doc in (documentA, documentB))
bagOfWordsA

['Jupiter', 'is', 'the', 'largest', 'Planet']

In [21]:
# Vocabulary: every distinct term that occurs in either document.
uniqueWords = set(bagOfWordsA) | set(bagOfWordsB)
uniqueWords

{'Jupiter',
 'Mars',
 'Planet',
 'Sun',
 'fourth',
 'from',
 'is',
 'largest',
 'planet',
 'the'}

In [28]:
# Raw term counts per document over the shared vocabulary; terms absent
# from a document keep a count of 0.
numWordsA = {term: 0 for term in uniqueWords}
numWordsB = {term: 0 for term in uniqueWords}
for counts, bag in ((numWordsA, bagOfWordsA), (numWordsB, bagOfWordsB)):
    for word in bag:
        counts[word] = counts[word] + 1
print(numWordsA, "\n", numWordsB)

{'Jupiter': 1, 'the': 1, 'planet': 0, 'largest': 1, 'Sun': 0, 'is': 1, 'from': 0, 'fourth': 0, 'Mars': 0, 'Planet': 1} 
 {'Jupiter': 0, 'the': 2, 'planet': 1, 'largest': 0, 'Sun': 1, 'is': 1, 'from': 1, 'fourth': 1, 'Mars': 1, 'Planet': 0}


In [29]:
def computeTF(wordDict,bagOfWords):
    """Compute the term frequency of every term in ``wordDict``.

    Args:
        wordDict: Mapping of term -> raw occurrence count in the document.
        bagOfWords: Sequence of the document's tokens; only its length is
            used, as the normalising denominator.

    Returns:
        dict mapping each term of ``wordDict`` to ``count / len(bagOfWords)``.
        When ``bagOfWords`` is empty every term gets ``0.0`` instead of
        raising ``ZeroDivisionError``.
    """
    bagOfWordsCount=len(bagOfWords)
    if bagOfWordsCount==0:
        # An empty document contains no term at all, so every frequency is 0.
        return dict.fromkeys(wordDict,0.0)
    return {word:count/bagOfWordsCount for word,count in wordDict.items()}

# Term frequencies for both documents; display document A's.
tfA, tfB = (
    computeTF(counts, bag)
    for counts, bag in ((numWordsA, bagOfWordsA), (numWordsB, bagOfWordsB))
)
tfA

{'Jupiter': 0.2,
 'the': 0.2,
 'planet': 0.0,
 'largest': 0.2,
 'Sun': 0.0,
 'is': 0.2,
 'from': 0.0,
 'fourth': 0.0,
 'Mars': 0.0,
 'Planet': 0.2}

In [26]:
# Display the term frequencies computed for document B.
tfB

{'Jupiter': 0.0,
 'the': 0.25,
 'planet': 0.125,
 'largest': 0.0,
 'Sun': 0.125,
 'is': 0.125,
 'from': 0.125,
 'fourth': 0.125,
 'Mars': 0.125,
 'Planet': 0.0}

In [36]:
def computeIDF(documents):
    """Compute the inverse document frequency of every term in the corpus.

    Args:
        documents: Sequence of dicts, one per document, each mapping
            term -> raw count. A term counts as present in a document when
            its count is greater than 0.

    Returns:
        dict mapping each term found in any document to
        ``math.log(N / df)``, where ``N`` is the number of documents and
        ``df`` the number of documents containing the term. Terms that are
        present in no document get ``0.0`` rather than triggering a
        division by zero. An empty ``documents`` sequence yields ``{}``.
    """
    N=len(documents)
    # Collect document frequencies over the union of all documents' keys so
    # a term missing from the first document cannot raise KeyError.
    docFreq={}
    for document in documents:
        for word,value in document.items():
            docFreq.setdefault(word,0)
            if value>0:
                docFreq[word]+=1
    idf={}
    for word,freq in docFreq.items():
        # freq == 0 means the term never occurs; its TF is 0 everywhere, so
        # a weight of 0.0 keeps TF-IDF well defined.
        idf[word]=math.log(N/freq) if freq else 0.0
    return idf

# IDF weights over the two-document corpus of raw count dictionaries.
corpusCounts = [numWordsA, numWordsB]
idfs = computeIDF(corpusCounts)
idfs

{'Jupiter': 0.6931471805599453,
 'the': 0.0,
 'planet': 0.6931471805599453,
 'largest': 0.6931471805599453,
 'Sun': 0.6931471805599453,
 'is': 0.0,
 'from': 0.6931471805599453,
 'fourth': 0.6931471805599453,
 'Mars': 0.6931471805599453,
 'Planet': 0.6931471805599453}

In [37]:
def computeTFIDF(tf,idf):
    """Combine term frequencies with IDF weights.

    Args:
        tf: Mapping of term -> term frequency for one document.
        idf: Mapping of term -> inverse document frequency; it must contain
            every key of ``tf`` (a missing key raises ``KeyError``).

    Returns:
        dict with the keys of ``tf``, each mapped to ``tf[term] * idf[term]``.
    """
    return {term: frequency * idf[term] for term, frequency in tf.items()}

# TF-IDF vectors for both documents, weighted by the shared IDF table.
tfidfA, tfidfB = (computeTFIDF(tf, idfs) for tf in (tfA, tfB))

In [38]:
# One row per document (A first, then B), one column per vocabulary term.
tfidfRows = [tfidfA, tfidfB]
df = pd.DataFrame(tfidfRows)

In [39]:
# Display the TF-IDF matrix (rows: documents, columns: terms).
df

Unnamed: 0,Jupiter,the,planet,largest,Sun,is,from,fourth,Mars,Planet
0,0.138629,0.0,0.0,0.138629,0.0,0.0,0.0,0.0,0.0,0.138629
1,0.0,0.0,0.086643,0.0,0.086643,0.0,0.086643,0.086643,0.086643,0.0
