In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer

In [8]:
from nltk.probability import FreqDist

In [2]:
# Two one-sentence documents that make up the toy corpus for this walkthrough.
documentA, documentB = (
    'Jupiter is the largest Planet',
    'Mars is the fourth planet from the Sun',
)

In [3]:
# Tokenize on any run of whitespace. Unlike split(' '), a bare split() never
# produces empty-string tokens for repeated, leading, or trailing spaces, and
# it also breaks on tabs and newlines.
bagOfWordsA = documentA.split()
bagOfWordsB = documentB.split()


In [4]:
# Vocabulary: every distinct (case-sensitive) token found in either document.
uniqueWords = {*bagOfWordsA, *bagOfWordsB}
uniqueWords


{'Jupiter',
 'Mars',
 'Planet',
 'Sun',
 'fourth',
 'from',
 'is',
 'largest',
 'planet',
 'the'}

In [10]:
# Raw term counts for document A over the whole vocabulary; words that do not
# occur in A get a count of 0.
dictOfWordsA = {term: bagOfWordsA.count(term) for term in uniqueWords}
dictOfWordsA


{'from': 0,
 'Jupiter': 1,
 'Mars': 0,
 'Planet': 1,
 'planet': 0,
 'the': 1,
 'largest': 1,
 'is': 1,
 'Sun': 0,
 'fourth': 0}

In [9]:
# The same counts for document A via NLTK's FreqDist; only tokens that occur
# in A are listed.
bagOfWordsA_freq = FreqDist(bagOfWordsA)
for token, occurrences in bagOfWordsA_freq.items():
    print(f"{token} {occurrences}")

Jupiter 1
is 1
the 1
largest 1
Planet 1


In [11]:
# Raw term counts for document B over the whole vocabulary; words that do not
# occur in B get a count of 0.
dictOfWordsB = {term: bagOfWordsB.count(term) for term in uniqueWords}
dictOfWordsB


{'from': 1,
 'Jupiter': 0,
 'Mars': 1,
 'Planet': 0,
 'planet': 1,
 'the': 2,
 'largest': 0,
 'is': 1,
 'Sun': 1,
 'fourth': 1}

In [12]:
def computeTF(wordDict, bagOfWords):
    """Compute the term frequency of every vocabulary word for one document.

    Args:
        wordDict: mapping of word -> raw count of that word in the document.
        bagOfWords: the document's token list; its length is the normaliser.

    Returns:
        dict mapping each key of ``wordDict`` to ``count / len(bagOfWords)``
        as a float. When ``bagOfWords`` is empty every frequency is 0.0
        rather than a ZeroDivisionError.
    """
    bagOfWordsCount = len(bagOfWords)
    if bagOfWordsCount == 0:
        # An empty document has no terms, so every frequency is zero.
        return {word: 0.0 for word in wordDict}
    return {
        word: count / float(bagOfWordsCount)
        for word, count in wordDict.items()
    }


In [15]:
# Term frequencies of each document, each normalised by its own token count.
TF_of_A, TF_of_B = (
    computeTF(wordDict=dictOfWordsA, bagOfWords=bagOfWordsA),
    computeTF(wordDict=dictOfWordsB, bagOfWords=bagOfWordsB),
)


In [16]:
# Display the term frequencies computed for document A.
TF_of_A

{'from': 0.0,
 'Jupiter': 0.2,
 'Mars': 0.0,
 'Planet': 0.2,
 'planet': 0.0,
 'the': 0.2,
 'largest': 0.2,
 'is': 0.2,
 'Sun': 0.0,
 'fourth': 0.0}

In [17]:
# Display the term frequencies computed for document B.
TF_of_B

{'from': 0.125,
 'Jupiter': 0.0,
 'Mars': 0.125,
 'Planet': 0.0,
 'planet': 0.125,
 'the': 0.25,
 'largest': 0.0,
 'is': 0.125,
 'Sun': 0.125,
 'fourth': 0.125}

In [18]:
  import math

In [19]:
def computeIDF(documents):
    """Compute the inverse document frequency of every word in a corpus.

    Args:
        documents: list of dicts, one per document, mapping word -> raw count.

    Returns:
        dict mapping each word to ``math.log(N / df)``, where ``N`` is the
        number of documents and ``df`` is the number of documents in which
        the word's count is greater than 0. A word with ``df == 0`` gets 0.0.
        An empty ``documents`` list gives an empty dict.
    """
    N = len(documents)
    # Build the vocabulary from every document, not only the first one, so a
    # word that is absent from documents[0] cannot raise KeyError.
    idfDict = {}
    for document in documents:
        for word, val in document.items():
            idfDict[word] = idfDict.get(word, 0) + (1 if val > 0 else 0)
    # Show the intermediate document-frequency counts before the log transform.
    print(idfDict)
    # A word that occurs in no document has no defined IDF; 0.0 avoids the
    # division by zero and keeps its TF-IDF weight at zero.
    return {
        word: (math.log(N / float(df)) if df > 0 else 0.0)
        for word, df in idfDict.items()
    }

In [19]:
# Sanity check: with a single argument, math.log is the natural logarithm (ln 100 ~ 4.605).
print(math.log(100))

4.605170185988092


In [12]:
# IDF over the corpus, using the per-document count dicts dictOfWordsA and
# dictOfWordsB defined earlier in this notebook.
idfs = computeIDF([dictOfWordsA, dictOfWordsB])
idfs


{'Sun': 1, 'Mars': 1, 'planet': 1, 'is': 2, 'from': 1, 'Planet': 1, 'Jupiter': 1, 'fourth': 1, 'largest': 1, 'the': 2}


{'Sun': 0.6931471805599453,
 'Mars': 0.6931471805599453,
 'planet': 0.6931471805599453,
 'is': 0.0,
 'from': 0.6931471805599453,
 'Planet': 0.6931471805599453,
 'Jupiter': 0.6931471805599453,
 'fourth': 0.6931471805599453,
 'largest': 0.6931471805599453,
 'the': 0.0}

In [14]:
def computeTFIDF(tfBagofWords, idfs):
    """Weight each term frequency by that word's inverse document frequency.

    Args:
        tfBagofWords: mapping of word -> term frequency for one document.
        idfs: mapping of word -> inverse document frequency; it must contain
            every word of ``tfBagofWords`` (a missing word raises KeyError).

    Returns:
        dict mapping each word of ``tfBagofWords`` to ``tf * idf``.
    """
    return {
        term: frequency * idfs[term]
        for term, frequency in tfBagofWords.items()
    }

In [15]:
# TF-IDF weights per document, built from the term frequencies TF_of_A and
# TF_of_B computed earlier in this notebook.
tfDifA = computeTFIDF(TF_of_A, idfs)
tfDifB = computeTFIDF(TF_of_B, idfs)

In [16]:
# Display the TF-IDF weights of document A.
tfDifA


{'Mars': 0.0,
 'from': 0.0,
 'is': 0.0,
 'planet': 0.0,
 'the': 0.0,
 'fourth': 0.0,
 'Jupiter': 0.13862943611198905,
 'Planet': 0.13862943611198905,
 'largest': 0.13862943611198905,
 'Sun': 0.0}

In [17]:
# Display the TF-IDF weights of document B.
tfDifB


{'Mars': 0.08664339756999316,
 'from': 0.08664339756999316,
 'is': 0.0,
 'planet': 0.08664339756999316,
 'the': 0.0,
 'fourth': 0.08664339756999316,
 'Jupiter': 0.0,
 'Planet': 0.0,
 'largest': 0.0,
 'Sun': 0.08664339756999316}

In [18]:
# Tabulate the TF-IDF weights: one row per document (0 = A, 1 = B) and one
# column per vocabulary word.
df = pd.DataFrame(data=[tfDifA, tfDifB])
df

Unnamed: 0,Mars,from,is,planet,the,fourth,Jupiter,Planet,largest,Sun
0,0.0,0.0,0.0,0.0,0.0,0.0,0.138629,0.138629,0.138629,0.0
1,0.086643,0.086643,0.0,0.086643,0.0,0.086643,0.0,0.0,0.0,0.086643


In [18]:
import nltk
from nltk.tokenize import word_tokenize

# Toy corpus for the NLTK-based term-frequency example.
documents = [
    "The cat sat on the mat.",
    "The dog played in the park."
]

# Split each document into tokens; punctuation comes out as its own token.
tokenized_documents = list(map(word_tokenize, documents))

print(tokenized_documents)

# Term frequency per document: occurrences of a token divided by the number
# of tokens in that document.
tf_values = [
    {token: count / len(tokens) for token, count in nltk.FreqDist(tokens).items()}
    for tokens in tokenized_documents
]

# Report every document followed by its token / TF pairs.
for text, frequencies in zip(documents, tf_values):
    print("Document:", text)
    for token, score in frequencies.items():
        print(token, ":", score)
    print()


[['The', 'cat', 'sat', 'on', 'the', 'mat', '.'], ['The', 'dog', 'played', 'in', 'the', 'park', '.']]
Document: The cat sat on the mat.
The : 0.14285714285714285
cat : 0.14285714285714285
sat : 0.14285714285714285
on : 0.14285714285714285
the : 0.14285714285714285
mat : 0.14285714285714285
. : 0.14285714285714285

Document: The dog played in the park.
The : 0.14285714285714285
dog : 0.14285714285714285
played : 0.14285714285714285
in : 0.14285714285714285
the : 0.14285714285714285
park : 0.14285714285714285
. : 0.14285714285714285

