In [1]:
# Statistical methods to process data
# TF-IDF > term frequency-inverse document frequency
# TF     > number of times a word appears in a document, divided by the number of words in that document
# IDF    > log2(N/n), where N is the number of documents and n is the number of documents a word appears in


#Cosine similarity

In [2]:
# import nltk

In [3]:
import math
import pandas as pd

In [4]:
# Two example "documents": lower-case, punctuation-free, single-space-separated text.
# (A shorter alternative for the first one: "in the beginning god created")
ex_sent1, ex_sent2 = (
    "in the beginning god created the heavens and the earth and the earth was without form and void",
    "and god said let there be light and there was light and god saw that the light was good",
)

In [5]:
# Tokenize each sentence into a list of words by splitting on single spaces.
# The names are rebound from str to list, so the split is guarded on the type:
# without the guard, re-running this cell raises
# AttributeError ('list' object has no attribute 'split').
if isinstance(ex_sent1, str):
    ex_sent1 = ex_sent1.split(" ")
if isinstance(ex_sent2, str):
    ex_sent2 = ex_sent2.split(" ")

In [6]:
# Inspect the token list produced for the first sentence.
print(ex_sent1)

['in', 'the', 'beginning', 'god', 'created', 'the', 'heavens', 'and', 'the', 'earth', 'and', 'the', 'earth', 'was', 'without', 'form', 'and', 'void']


In [7]:
# Print both tokenized sentences, one per line.
print(
    f'first sentence: {ex_sent1}',
    f'second sentence: {ex_sent2}',
    sep='\n',
)

first sentence: ['in', 'the', 'beginning', 'god', 'created', 'the', 'heavens', 'and', 'the', 'earth', 'and', 'the', 'earth', 'was', 'without', 'form', 'and', 'void']
second sentence: ['and', 'god', 'said', 'let', 'there', 'be', 'light', 'and', 'there', 'was', 'light', 'and', 'god', 'saw', 'that', 'the', 'light', 'was', 'good']


In [8]:
# Vocabulary: every distinct word occurring in either sentence (a set, despite the name).
list1 = set(ex_sent1) | set(ex_sent2)

# One counter dictionary per sentence, keyed by the shared vocabulary,
# with every count starting at 0.
wDictA = {word: 0 for word in list1}
wDictB = {word: 0 for word in list1}

In [9]:
# Count how often each word occurs in each sentence.
# The counters are reset to zero first so that re-running this cell does not
# add the occurrences a second time on top of the previous run's totals.
for counts, tokens in ((wDictA, ex_sent1), (wDictB, ex_sent2)):
    counts.update(dict.fromkeys(counts, 0))
    for word in tokens:
        counts[word] += 1

In [10]:
# Show the per-word counts for the first sentence (0 = the word occurs only in the second sentence).
wDictA

{'good': 0,
 'god': 1,
 'heavens': 1,
 'saw': 0,
 'in': 1,
 'and': 3,
 'be': 0,
 'light': 0,
 'form': 1,
 'there': 0,
 'let': 0,
 'the': 4,
 'was': 1,
 'beginning': 1,
 'without': 1,
 'created': 1,
 'earth': 2,
 'said': 0,
 'void': 1,
 'that': 0}

In [11]:
# Visualize both count dictionaries as a table: one row per sentence, one column per word.
pd.DataFrame([wDictA, wDictB])

Unnamed: 0,good,god,heavens,saw,in,and,be,light,form,there,let,the,was,beginning,without,created,earth,said,void,that
0,0,1,1,0,1,3,0,0,1,0,0,4,1,1,1,1,2,0,1,0
1,1,2,0,1,0,3,1,3,0,2,1,1,2,0,0,0,0,1,0,1


# TF-IDF

In [12]:
def tf(wordDict, doc):
    """Compute the term frequency of every word in ``wordDict``.

    Args:
        wordDict: Mapping of word -> number of occurrences in ``doc``.
        doc: The tokenized document (any sized collection); only its
            length is used.

    Returns:
        dict mapping each word of ``wordDict`` to ``count / len(doc)``.
        For an empty ``doc`` every frequency is 0.0 instead of raising
        ZeroDivisionError.
    """
    docCount = len(doc)
    if docCount == 0:
        # No tokens at all: define every term frequency as 0.0 rather than
        # dividing by zero.
        return {word: 0.0 for word in wordDict}
    return {word: count / docCount for word, count in wordDict.items()}

In [13]:
# Term-frequency dictionaries for the first and the second sentence.
tf1, tf2 = tf(wDictA, ex_sent1), tf(wDictB, ex_sent2)

In [14]:
# Term frequencies as a table: row 0 is the first sentence, row 1 the second.
pd.DataFrame([tf1, tf2])

Unnamed: 0,good,god,heavens,saw,in,and,be,light,form,there,let,the,was,beginning,without,created,earth,said,void,that
0,0.0,0.055556,0.055556,0.0,0.055556,0.166667,0.0,0.0,0.055556,0.0,0.0,0.222222,0.055556,0.055556,0.055556,0.055556,0.111111,0.0,0.055556,0.0
1,0.052632,0.105263,0.0,0.052632,0.0,0.157895,0.052632,0.157895,0.0,0.105263,0.052632,0.052632,0.105263,0.0,0.0,0.0,0.0,0.052632,0.0,0.052632


In [15]:
# IDF
def idf(docList):
    """Compute base-2 inverse document frequencies for a list of documents.

    Args:
        docList: List of dicts, one per document, each mapping
            word -> occurrence count in that document.

    Returns:
        pandas.DataFrame with columns ``'word'`` and ``'idf weight'``, one row
        per distinct word found in any document (first-seen order). The weight
        is ``log2(N / n)`` with ``N = len(docList)`` and ``n`` the number of
        documents in which the word has a positive count. A word with no
        positive count anywhere gets weight 0.0 (the original raised
        ZeroDivisionError). An empty ``docList`` yields an empty frame with
        the same two columns.
    """
    N = len(docList)

    # Document frequency: in how many documents each word has a positive count.
    # Walking every document (not just the first) keeps words that only occur
    # in later documents and avoids a KeyError when key sets differ.
    docFreq = {}
    for d in docList:
        for word, count in d.items():
            docFreq[word] = docFreq.get(word, 0) + (1 if count > 0 else 0)

    idfDict = {
        # n == 0 would divide by zero; such a word carries no weight.
        word: math.log2(N / n) if n > 0 else 0.0
        for word, n in docFreq.items()
    }

    idf_df = pd.DataFrame(list(idfDict.items()), columns=['word', 'idf weight'])

    return idf_df

In [16]:

# IDF weight table computed over both sentences' word-count dictionaries.
idf_df = idf([wDictA, wDictB])

In [17]:
def TFIDF(tf, idf):
    """Multiply each term frequency by the matching IDF weight.

    Args:
        tf: Mapping of word -> term frequency for one document.
        idf: pandas.DataFrame with a ``'word'`` column and an
            ``'idf weight'`` column.

    Returns:
        dict mapping every word of ``tf`` to ``tf[word] * idf weight``.

    Raises:
        IndexError: if a word of ``tf`` has no row in ``idf`` (same exception
            type the original per-word ``.values[0]`` lookup produced).
    """
    # Build the word -> weight lookup once instead of scanning the whole frame
    # with a boolean mask for every word. setdefault keeps the FIRST row for a
    # duplicated word, matching the original `.values[0]` behaviour.
    weights = {}
    for word, weight in zip(idf['word'].values, idf['idf weight'].values):
        weights.setdefault(word, weight)

    tfidf = {}
    for word, val in tf.items():
        if word not in weights:
            raise IndexError(f"no idf weight found for word {word!r}")
        tfidf[word] = val * weights[word]
    return tfidf

In [18]:
# Weight each sentence's term frequencies by the shared IDF table.
tfidf1, tfidf2 = (TFIDF(freqs, idf_df) for freqs in (tf1, tf2))

# One row per sentence, one column per word.
pd.DataFrame([tfidf1, tfidf2])

Unnamed: 0,good,god,heavens,saw,in,and,be,light,form,there,let,the,was,beginning,without,created,earth,said,void,that
0,0.0,0.0,0.055556,0.0,0.055556,0.0,0.0,0.0,0.055556,0.0,0.0,0.0,0.0,0.055556,0.055556,0.055556,0.111111,0.0,0.055556,0.0
1,0.052632,0.0,0.0,0.052632,0.0,0.0,0.052632,0.157895,0.0,0.105263,0.052632,0.0,0.0,0.0,0.0,0.0,0.0,0.052632,0.0,0.052632
