In [60]:
docA = 'the cat sat on my face'
docB = 'the dog sat on my bed'

#docA = 'the man went out for a walk'
#docB = 'the children sat around the fire'

In [61]:
bagOfWordsA = docA.split(' ')
bagOfWordsB = docB.split(' ')

In [62]:
bagOfWordsA

['the', 'cat', 'sat', 'on', 'my', 'face']

In [63]:
bagOfWordsB

['the', 'dog', 'sat', 'on', 'my', 'bed']

In [64]:
uniqueWordSet = set(bagOfWordsA).union(set(bagOfWordsB))

In [65]:
uniqueWordSet

{'bed', 'cat', 'dog', 'face', 'my', 'on', 'sat', 'the'}

In [66]:
wordDictA = dict.fromkeys(uniqueWordSet, 0) 
wordDictB = dict.fromkeys(uniqueWordSet, 0) 

In [67]:
wordDictA

{'dog': 0, 'my': 0, 'the': 0, 'bed': 0, 'face': 0, 'cat': 0, 'on': 0, 'sat': 0}

In [68]:
wordDictB

{'dog': 0, 'my': 0, 'the': 0, 'bed': 0, 'face': 0, 'cat': 0, 'on': 0, 'sat': 0}

In [69]:
for word in bagOfWordsA:
    wordDictA[word]+=1
    
for word in bagOfWordsB:
    wordDictB[word]+=1

In [70]:
wordDictA

{'dog': 0, 'my': 1, 'the': 1, 'bed': 0, 'face': 1, 'cat': 1, 'on': 1, 'sat': 1}

In [71]:
wordDictB

{'dog': 1, 'my': 1, 'the': 1, 'bed': 1, 'face': 0, 'cat': 0, 'on': 1, 'sat': 1}

In [72]:
import pandas as pd
pd.DataFrame([wordDictA, wordDictB])

Unnamed: 0,bed,cat,dog,face,my,on,sat,the
0,0,1,0,1,1,1,1,1
1,1,0,1,0,1,1,1,1


In [87]:
#import nltk
#nltk.download()
#from nltk.corpus import stopwords
#stopwords.words('english')

In [88]:
def computeTF(wordDict, BoW):
    tfDict = {}
    bowCount = len(BoW)
    for word, count in wordDict.items():
        tfDict[word] = count/float(bowCount)
    return tfDict

In [89]:
tfBowA = computeTF(wordDictA, bagOfWordsA)
tfBowB = computeTF(wordDictB, bagOfWordsB)

In [90]:
tfBowA

{'dog': 0.0,
 'my': 0.16666666666666666,
 'the': 0.16666666666666666,
 'bed': 0.0,
 'face': 0.16666666666666666,
 'cat': 0.16666666666666666,
 'on': 0.16666666666666666,
 'sat': 0.16666666666666666}

In [81]:
tfBowB

{'dog': 0.16666666666666666,
 'my': 0.16666666666666666,
 'the': 0.16666666666666666,
 'bed': 0.16666666666666666,
 'face': 0.0,
 'cat': 0.0,
 'on': 0.16666666666666666,
 'sat': 0.16666666666666666}

In [82]:
def computeIDF(docList):
    import math
    idfDict = {}
    N = len(docList)
    
    idfDict = dict.fromkeys(docList[0].keys(), 0)
    for doc in docList:
        for word, val in doc.items():
            if val > 0:
                idfDict[word] += 1
    
    for word, val in idfDict.items():
        idfDict[word] = math.log10(N / float(val))
        
    return idfDict

In [83]:
idfs = computeIDF([wordDictA, wordDictB])

In [84]:
def computeTFIDF(tfBow, idfs):
    tfidf = {}
    for word, val in tfBow.items():
        tfidf[word] = val*idfs[word]
    return tfidf

In [85]:
tfidfBowA = computeTFIDF(tfBowA, idfs)
tfidfBowB = computeTFIDF(tfBowB, idfs)

In [86]:
import pandas as pd
pd.DataFrame([tfidfBowA, tfidfBowB])

Unnamed: 0,bed,cat,dog,face,my,on,sat,the
0,0.0,0.050172,0.0,0.050172,0.0,0.0,0.0,0.0
1,0.050172,0.0,0.050172,0.0,0.0,0.0,0.0,0.0


## Implementing TF-IDF Provided by sklearn.

In [94]:
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer()
vectors = vectorizer.fit_transform([docA, docB])
feature_names = vectorizer.get_feature_names()
dense = vectors.todense()
denselist = dense.tolist()
df = pd.DataFrame(denselist, columns=feature_names)

In [95]:
df

Unnamed: 0,bed,cat,dog,face,my,on,sat,the
0,0.0,0.498446,0.0,0.498446,0.354649,0.354649,0.354649,0.354649
1,0.498446,0.0,0.498446,0.0,0.354649,0.354649,0.354649,0.354649


The values differ slightly because sklearn uses a smoothed version idf and various other little optimizations. In an example with more text, the score for the word the would be greatly reduced.

In [96]:
print(vectors.shape)

(2, 8)


In [97]:
print(vectorizer.get_feature_names())

['bed', 'cat', 'dog', 'face', 'my', 'on', 'sat', 'the']


In [98]:
for col in vectors.nonzero()[1]:
    print (feature_names[col], ' - ', vectors[0, col])

the  -  0.35464863330313684
cat  -  0.49844627974580596
sat  -  0.35464863330313684
on  -  0.35464863330313684
my  -  0.35464863330313684
face  -  0.49844627974580596
the  -  0.35464863330313684
sat  -  0.35464863330313684
on  -  0.35464863330313684
my  -  0.35464863330313684
dog  -  0.0
bed  -  0.0


In [102]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [103]:
corpus = [
...     'This is the first document.',
...     'This document is the second document.',
...     'And this is the third one.',
...     'Is this the first document?',
... ]

In [104]:
corpus

['This is the first document.',
 'This document is the second document.',
 'And this is the third one.',
 'Is this the first document?']

In [105]:
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(corpus)

In [106]:
print(vectorizer.get_feature_names())

['and', 'document', 'first', 'is', 'one', 'second', 'the', 'third', 'this']


In [107]:
print(X.shape)

(4, 9)
