In [1]:
import pandas as pd

In [2]:
def computeTF(wordDict, bagOfWords):
    """Compute the term frequency (TF) of every word for one document.

    The term frequency of a word is its occurrence count divided by the
    total number of tokens in the document.

    Args:
        wordDict: Mapping of word -> occurrence count in the document.
        bagOfWords: Sequence of the document's tokens; only its length is
            used (as the normalising denominator).

    Returns:
        dict mapping every key of ``wordDict`` to ``count / len(bagOfWords)``
        as a float. When ``bagOfWords`` is empty every frequency is ``0.0``.
    """
    totalTokens = len(bagOfWords)
    if totalTokens == 0:
        # An empty document contains no terms; report zero frequencies
        # instead of failing with ZeroDivisionError.
        return {word: 0.0 for word in wordDict}
    return {word: count / float(totalTokens) for word, count in wordDict.items()}

In [3]:
def computeIDF(documents):
    """Compute the inverse document frequency (IDF) of every word.

    For a word that occurs in ``d`` of the ``N`` documents the IDF is
    ``log(N / d)`` (natural logarithm).

    Args:
        documents: List of dicts, one per document, each mapping
            word -> occurrence count in that document.

    Returns:
        dict mapping every word found in any document to its IDF. A word
        whose count is 0 in every document gets an IDF of ``0.0``. An empty
        ``documents`` list yields an empty dict.
    """
    import math  # function-scope import keeps this cell self-contained

    N = len(documents)

    # Document frequency: in how many documents does each word occur?
    # The vocabulary is collected from *all* documents, not just the first,
    # so documents with differing key sets do not raise KeyError.
    docFrequency = {}
    for document in documents:
        for word, val in document.items():
            docFrequency.setdefault(word, 0)
            if val > 0:
                docFrequency[word] += 1

    idfDict = {}
    for word, docCount in docFrequency.items():
        # A word that never occurs would otherwise divide by zero.
        idfDict[word] = math.log(N / float(docCount)) if docCount > 0 else 0.0
    return idfDict

In [4]:
def computeTFIDF(tfBagOfWords, idfs):
    """Multiply each term frequency by the matching inverse document frequency.

    Args:
        tfBagOfWords: Mapping of word -> term frequency for one document.
        idfs: Mapping of word -> inverse document frequency; it must contain
            every key of ``tfBagOfWords``.

    Returns:
        dict mapping every word of ``tfBagOfWords`` to ``tf * idf``.
    """
    return {term: freq * idfs[term] for term, freq in tfBagOfWords.items()}

In [5]:
# Two short English sample documents used by the rest of the notebook.
documentA = "the man went out for a walk"
documentB = "the children sat around the fire"

# Alternative Arabic sample documents (uncomment to use them instead).
# Rough English gloss:
#   A: "Mohamed went to the university to study physics and chemistry"
#   B: "Mona studied mathematics and physics at the university"
#documentA = 'ذهب محمد الي الجامعة ليدرس الفيزياء و الكيمياء'
#documentB = 'ذاكرت مني الرياضيات و الفيزياء في الجامعة'

In [6]:
# Tokenise each document on runs of whitespace. Calling split() without an
# argument (unlike split(' ')) never produces empty-string tokens when the
# text contains repeated, leading or trailing spaces.
bagOfWordsA = documentA.split()
bagOfWordsB = documentB.split()

In [7]:
# Vocabulary of the corpus: every distinct token seen in either document.
uniqueWords = set(bagOfWordsA) | set(bagOfWordsB)
uniqueWords

{'a',
 'around',
 'children',
 'fire',
 'for',
 'man',
 'out',
 'sat',
 'the',
 'walk',
 'went'}

In [8]:
# Start every vocabulary word at a count of zero for document A.
numOfWordsA = {vocabWord: 0 for vocabWord in uniqueWords}
print(numOfWordsA, '\n')

# Tally each token of document A.
for token in bagOfWordsA:
    numOfWordsA[token] = numOfWordsA[token] + 1

print(numOfWordsA)

{'sat': 0, 'fire': 0, 'for': 0, 'children': 0, 'man': 0, 'out': 0, 'a': 0, 'walk': 0, 'around': 0, 'the': 0, 'went': 0} 

{'sat': 0, 'fire': 0, 'for': 1, 'children': 0, 'man': 1, 'out': 1, 'a': 1, 'walk': 1, 'around': 0, 'the': 1, 'went': 1}


In [9]:
# Start every vocabulary word at a count of zero for document B.
numOfWordsB = {vocabWord: 0 for vocabWord in uniqueWords}
print(numOfWordsB, '\n')

# Tally each token of document B.
for token in bagOfWordsB:
    numOfWordsB[token] = numOfWordsB[token] + 1

print(numOfWordsB)

{'sat': 0, 'fire': 0, 'for': 0, 'children': 0, 'man': 0, 'out': 0, 'a': 0, 'walk': 0, 'around': 0, 'the': 0, 'went': 0} 

{'sat': 1, 'fire': 1, 'for': 0, 'children': 1, 'man': 0, 'out': 0, 'a': 0, 'walk': 0, 'around': 1, 'the': 2, 'went': 0}


In [10]:
# Term frequencies of document A: count of each word / number of tokens in A.
tfA = computeTF(wordDict=numOfWordsA, bagOfWords=bagOfWordsA)
tfA

{'sat': 0.0,
 'fire': 0.0,
 'for': 0.14285714285714285,
 'children': 0.0,
 'man': 0.14285714285714285,
 'out': 0.14285714285714285,
 'a': 0.14285714285714285,
 'walk': 0.14285714285714285,
 'around': 0.0,
 'the': 0.14285714285714285,
 'went': 0.14285714285714285}

In [11]:
# Term frequencies of document B: count of each word / number of tokens in B.
tfB = computeTF(wordDict=numOfWordsB, bagOfWords=bagOfWordsB)
tfB

{'sat': 0.16666666666666666,
 'fire': 0.16666666666666666,
 'for': 0.0,
 'children': 0.16666666666666666,
 'man': 0.0,
 'out': 0.0,
 'a': 0.0,
 'walk': 0.0,
 'around': 0.16666666666666666,
 'the': 0.3333333333333333,
 'went': 0.0}

In [12]:
# Inverse document frequencies over the two-document corpus.
idfs = computeIDF(documents=[numOfWordsA, numOfWordsB])
idfs

{'sat': 0.6931471805599453,
 'fire': 0.6931471805599453,
 'for': 0.6931471805599453,
 'children': 0.6931471805599453,
 'man': 0.6931471805599453,
 'out': 0.6931471805599453,
 'a': 0.6931471805599453,
 'walk': 0.6931471805599453,
 'around': 0.6931471805599453,
 'the': 0.0,
 'went': 0.6931471805599453}

In [13]:
# TF-IDF scores of document A: term frequency weighted by corpus-wide IDF.
tfidfA = computeTFIDF(tfBagOfWords=tfA, idfs=idfs)
tfidfA

{'sat': 0.0,
 'fire': 0.0,
 'for': 0.09902102579427789,
 'children': 0.0,
 'man': 0.09902102579427789,
 'out': 0.09902102579427789,
 'a': 0.09902102579427789,
 'walk': 0.09902102579427789,
 'around': 0.0,
 'the': 0.0,
 'went': 0.09902102579427789}

In [14]:
# TF-IDF scores of document B: term frequency weighted by corpus-wide IDF.
tfidfB = computeTFIDF(tfBagOfWords=tfB, idfs=idfs)
tfidfB

{'sat': 0.11552453009332421,
 'fire': 0.11552453009332421,
 'for': 0.0,
 'children': 0.11552453009332421,
 'man': 0.0,
 'out': 0.0,
 'a': 0.0,
 'walk': 0.0,
 'around': 0.11552453009332421,
 'the': 0.0,
 'went': 0.0}

In [15]:
# Tabulate the hand-computed scores: one row per document, one column per word.
df = pd.DataFrame(data=[tfidfA, tfidfB])
df.head()

Unnamed: 0,sat,fire,for,children,man,out,a,walk,around,the,went
0,0.0,0.0,0.099021,0.0,0.099021,0.099021,0.099021,0.099021,0.0,0.0,0.099021
1,0.115525,0.115525,0.0,0.115525,0.0,0.0,0.0,0.0,0.115525,0.0,0.0


In [16]:
## Using Library
from sklearn.feature_extraction.text import TfidfVectorizer

# Fit scikit-learn's TF-IDF vectorizer on the two raw documents and print the
# resulting document-term matrix.
vectorizer = TfidfVectorizer()
vectors = vectorizer.fit_transform([documentA, documentB])
print(vectors)

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2.
# Prefer get_feature_names_out() and fall back only on older releases.
if hasattr(vectorizer, 'get_feature_names_out'):
    # .tolist() converts the returned NumPy array into a plain list of str,
    # so the printed output and downstream usage match the old API.
    feature_names = vectorizer.get_feature_names_out().tolist()
else:
    feature_names = vectorizer.get_feature_names()
print(feature_names)

  (0, 8)	0.42615959880289433
  (0, 3)	0.42615959880289433
  (0, 5)	0.42615959880289433
  (0, 9)	0.42615959880289433
  (0, 4)	0.42615959880289433
  (0, 7)	0.3032160644503863
  (1, 2)	0.40740123733358447
  (1, 0)	0.40740123733358447
  (1, 6)	0.40740123733358447
  (1, 1)	0.40740123733358447
  (1, 7)	0.5797386715376657
['around', 'children', 'fire', 'for', 'man', 'out', 'sat', 'the', 'walk', 'went']


In [17]:
# Convert the sparse TF-IDF matrix into a dense NumPy array, then into nested
# Python lists (one inner list per document). toarray() is used instead of
# todense() because todense() returns the discouraged numpy.matrix type.
dense = vectors.toarray()
denselist = dense.tolist()

In [18]:
# Tabulate the scikit-learn scores: one row per document, one column per
# vocabulary term reported by the vectorizer.
df = pd.DataFrame(data=denselist, columns=feature_names)
df.head()

Unnamed: 0,around,children,fire,for,man,out,sat,the,walk,went
0,0.0,0.0,0.0,0.42616,0.42616,0.42616,0.0,0.303216,0.42616,0.42616
1,0.407401,0.407401,0.407401,0.0,0.0,0.0,0.407401,0.579739,0.0,0.0
