<a href="https://colab.research.google.com/github/Avishek2020/nlp-notebooks/blob/master/TF_IDF.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Introduction: TF-IDF

Reference :- https://towardsdatascience.com/natural-language-processing-feature-engineering-using-tf-idf-e8b9d00e7e76

TF-IDF stands for “Term Frequency — Inverse Document Frequency”.

TF stands for Term Frequency and is calculated as

`TF = (Frequency of the word in the sentence) / (Total number of words in the sentence)`

IDF refers to inverse document frequency and is calculated as 

`IDF = log((Total number of sentences (documents)) / (Number of sentences (documents) containing the word))`

In [0]:
import math

import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer

In [0]:
class tfidf:
    """Hand-rolled term-frequency and inverse-document-frequency calculations.

    Parameters
    ----------
    worddict : dict
        Maps each vocabulary word to its raw count in one document.
    bagOfWords : list
        The tokens of that same document. Only its length is used, as the
        denominator of the term frequency.
    documents : list of dict, or None
        One word-count dict per document in the corpus. Only ``idf()`` reads
        it, so ``None`` is acceptable when just ``tf()`` is needed.
    """

    def __init__(self,worddict,bagOfWords, documents):
        self.worddict   = worddict
        self.bagOfWords = bagOfWords
        self.documents  = documents

    def tf(self):
        """Return ``{word: count / len(bagOfWords)}`` for every word in ``worddict``.

        An empty bag of words gives 0.0 for every word instead of raising
        ZeroDivisionError.
        """
        totalWordsCount = len(self.bagOfWords)
        if totalWordsCount == 0:
            return {word: 0.0 for word in self.worddict}
        return {
            word: count / float(totalWordsCount)
            for word, count in self.worddict.items()
        }

    def idf(self):
        """Return ``{word: log(N / df)}`` where N is the number of documents
        and df is the number of documents in which the word's count is > 0.

        The vocabulary is the union of the keys of all documents, in
        first-seen order. A word that no document actually contains (df == 0)
        gets an IDF of 0.0 rather than triggering a division by zero. An
        empty ``documents`` list gives an empty dict.
        """
        N = len(self.documents)
        # The corpus size is echoed, as the original implementation did.
        print(N)

        # Document frequency per word. setdefault registers words that are
        # absent from the first document instead of raising KeyError.
        docFrequency = {}
        for document in self.documents:
            for word, val in document.items():
                docFrequency.setdefault(word, 0)
                if val > 0:
                    docFrequency[word] += 1

        return {
            word: (math.log(N / float(df)) if df > 0 else 0.0)
            for word, df in docFrequency.items()
        }


In [0]:
 Stanza1 = "Two roads diverged in a yellow wood,"\
           "And sorry I could not travel both "\
           "And be one traveler, long I stood "\
           "And looked down one as far as I could "\
           "To where it bent in the undergrowth "
Stanza2 =   "Then took the other, as just as fair ,"\
            "And having perhaps the better claim ,"\
            "Because it was grassy and wanted wear ;"\
            "Though as for that the passing there "\
            "Had worn them really about the same "

In [503]:
# Tokenise on single spaces: punctuation stays attached to its word and the
# trailing space of the stanza yields one empty-string token at the end.
bagOfWordsStanza1 = Stanza1.split(sep=' ')
(bagOfWordsStanza1, len(bagOfWordsStanza1))

(['Two',
  'roads',
  'diverged',
  'in',
  'a',
  'yellow',
  'wood,And',
  'sorry',
  'I',
  'could',
  'not',
  'travel',
  'both',
  'And',
  'be',
  'one',
  'traveler,',
  'long',
  'I',
  'stood',
  'And',
  'looked',
  'down',
  'one',
  'as',
  'far',
  'as',
  'I',
  'could',
  'To',
  'where',
  'it',
  'bent',
  'in',
  'the',
  'undergrowth',
  ''],
 37)

In [504]:
# Same single-space tokenisation for the second stanza.
bagOfWordsStanza2 = Stanza2.split(sep=' ')
(bagOfWordsStanza2, len(bagOfWordsStanza2))

(['Then',
  'took',
  'the',
  'other,',
  'as',
  'just',
  'as',
  'fair',
  ',And',
  'having',
  'perhaps',
  'the',
  'better',
  'claim',
  ',Because',
  'it',
  'was',
  'grassy',
  'and',
  'wanted',
  'wear',
  ';Though',
  'as',
  'for',
  'that',
  'the',
  'passing',
  'there',
  'Had',
  'worn',
  'them',
  'really',
  'about',
  'the',
  'same',
  ''],
 36)

In [505]:
# Vocabulary: every distinct token that occurs in either stanza.
uniqueWords = set(bagOfWordsStanza1) | set(bagOfWordsStanza2)
(uniqueWords, len(uniqueWords))

({'',
  ',And',
  ',Because',
  ';Though',
  'And',
  'Had',
  'I',
  'Then',
  'To',
  'Two',
  'a',
  'about',
  'and',
  'as',
  'be',
  'bent',
  'better',
  'both',
  'claim',
  'could',
  'diverged',
  'down',
  'fair',
  'far',
  'for',
  'grassy',
  'having',
  'in',
  'it',
  'just',
  'long',
  'looked',
  'not',
  'one',
  'other,',
  'passing',
  'perhaps',
  'really',
  'roads',
  'same',
  'sorry',
  'stood',
  'that',
  'the',
  'them',
  'there',
  'took',
  'travel',
  'traveler,',
  'undergrowth',
  'wanted',
  'was',
  'wear',
  'where',
  'wood,And',
  'worn',
  'yellow'},
 57)

##### Now we need to create a dictionary of words and their occurrences

In [506]:
# Start every vocabulary word at a count of zero, so words that occur only in
# the other stanza are still present (with count 0) in this dictionary.
dictWordsStanza1 = {word: 0 for word in uniqueWords}
# Tally how often each token occurs in stanza 1.
for token in bagOfWordsStanza1:
    dictWordsStanza1[token] = dictWordsStanza1[token] + 1
dictWordsStanza1

{'': 1,
 ',And': 0,
 ',Because': 0,
 ';Though': 0,
 'And': 2,
 'Had': 0,
 'I': 3,
 'Then': 0,
 'To': 1,
 'Two': 1,
 'a': 1,
 'about': 0,
 'and': 0,
 'as': 2,
 'be': 1,
 'bent': 1,
 'better': 0,
 'both': 1,
 'claim': 0,
 'could': 2,
 'diverged': 1,
 'down': 1,
 'fair': 0,
 'far': 1,
 'for': 0,
 'grassy': 0,
 'having': 0,
 'in': 2,
 'it': 1,
 'just': 0,
 'long': 1,
 'looked': 1,
 'not': 1,
 'one': 2,
 'other,': 0,
 'passing': 0,
 'perhaps': 0,
 'really': 0,
 'roads': 1,
 'same': 0,
 'sorry': 1,
 'stood': 1,
 'that': 0,
 'the': 1,
 'them': 0,
 'there': 0,
 'took': 0,
 'travel': 1,
 'traveler,': 1,
 'undergrowth': 1,
 'wanted': 0,
 'was': 0,
 'wear': 0,
 'where': 1,
 'wood,And': 1,
 'worn': 0,
 'yellow': 1}

In [507]:
# Same zero-initialised vocabulary, then tally the tokens of stanza 2.
dictWordsStanza2 = {word: 0 for word in uniqueWords}
for token in bagOfWordsStanza2:
    dictWordsStanza2[token] = dictWordsStanza2[token] + 1
dictWordsStanza2

{'': 1,
 ',And': 1,
 ',Because': 1,
 ';Though': 1,
 'And': 0,
 'Had': 1,
 'I': 0,
 'Then': 1,
 'To': 0,
 'Two': 0,
 'a': 0,
 'about': 1,
 'and': 1,
 'as': 3,
 'be': 0,
 'bent': 0,
 'better': 1,
 'both': 0,
 'claim': 1,
 'could': 0,
 'diverged': 0,
 'down': 0,
 'fair': 1,
 'far': 0,
 'for': 1,
 'grassy': 1,
 'having': 1,
 'in': 0,
 'it': 1,
 'just': 1,
 'long': 0,
 'looked': 0,
 'not': 0,
 'one': 0,
 'other,': 1,
 'passing': 1,
 'perhaps': 1,
 'really': 1,
 'roads': 0,
 'same': 1,
 'sorry': 0,
 'stood': 0,
 'that': 1,
 'the': 4,
 'them': 1,
 'there': 1,
 'took': 1,
 'travel': 0,
 'traveler,': 0,
 'undergrowth': 0,
 'wanted': 1,
 'was': 1,
 'wear': 1,
 'where': 0,
 'wood,And': 0,
 'worn': 1,
 'yellow': 0}

In [508]:
# Term frequencies for stanzas 1 and 2.
# tf() never reads `documents`, so None is passed for it here.

Obj_Stanza1 = tfidf(
    worddict=dictWordsStanza1,
    bagOfWords=bagOfWordsStanza1,
    documents=None,
)
tf1 = Obj_Stanza1.tf()
Obj_Stanza2 = tfidf(
    worddict=dictWordsStanza2,
    bagOfWords=bagOfWordsStanza2,
    documents=None,
)
tf2 = Obj_Stanza2.tf()
tf2

{'': 0.027777777777777776,
 ',And': 0.027777777777777776,
 ',Because': 0.027777777777777776,
 ';Though': 0.027777777777777776,
 'And': 0.0,
 'Had': 0.027777777777777776,
 'I': 0.0,
 'Then': 0.027777777777777776,
 'To': 0.0,
 'Two': 0.0,
 'a': 0.0,
 'about': 0.027777777777777776,
 'and': 0.027777777777777776,
 'as': 0.08333333333333333,
 'be': 0.0,
 'bent': 0.0,
 'better': 0.027777777777777776,
 'both': 0.0,
 'claim': 0.027777777777777776,
 'could': 0.0,
 'diverged': 0.0,
 'down': 0.0,
 'fair': 0.027777777777777776,
 'far': 0.0,
 'for': 0.027777777777777776,
 'grassy': 0.027777777777777776,
 'having': 0.027777777777777776,
 'in': 0.0,
 'it': 0.027777777777777776,
 'just': 0.027777777777777776,
 'long': 0.0,
 'looked': 0.0,
 'not': 0.0,
 'one': 0.0,
 'other,': 0.027777777777777776,
 'passing': 0.027777777777777776,
 'perhaps': 0.027777777777777776,
 'really': 0.027777777777777776,
 'roads': 0.0,
 'same': 0.027777777777777776,
 'sorry': 0.0,
 'stood': 0.0,
 'that': 0.027777777777777776,
 

In [509]:
# Inverse document frequencies over the two-stanza corpus.
# idf() only reads `documents`; the word dict and bag of words are passed
# just to satisfy the constructor.
documents = [dictWordsStanza1, dictWordsStanza2]
Obj_Stanza12 = tfidf(
    worddict=dictWordsStanza1,
    bagOfWords=bagOfWordsStanza1,
    documents=documents,
)
idfs = Obj_Stanza12.idf()
idfs

2


{'': 0.0,
 ',And': 0.6931471805599453,
 ',Because': 0.6931471805599453,
 ';Though': 0.6931471805599453,
 'And': 0.6931471805599453,
 'Had': 0.6931471805599453,
 'I': 0.6931471805599453,
 'Then': 0.6931471805599453,
 'To': 0.6931471805599453,
 'Two': 0.6931471805599453,
 'a': 0.6931471805599453,
 'about': 0.6931471805599453,
 'and': 0.6931471805599453,
 'as': 0.0,
 'be': 0.6931471805599453,
 'bent': 0.6931471805599453,
 'better': 0.6931471805599453,
 'both': 0.6931471805599453,
 'claim': 0.6931471805599453,
 'could': 0.6931471805599453,
 'diverged': 0.6931471805599453,
 'down': 0.6931471805599453,
 'fair': 0.6931471805599453,
 'far': 0.6931471805599453,
 'for': 0.6931471805599453,
 'grassy': 0.6931471805599453,
 'having': 0.6931471805599453,
 'in': 0.6931471805599453,
 'it': 0.0,
 'just': 0.6931471805599453,
 'long': 0.6931471805599453,
 'looked': 0.6931471805599453,
 'not': 0.6931471805599453,
 'one': 0.6931471805599453,
 'other,': 0.6931471805599453,
 'passing': 0.6931471805599453,
 '

In [0]:
# Now we have TF and IDF, by multiplying both we get TF-IDF

def computeTFIDF(tfBagsofWords, idfs):
    """Multiply each word's term frequency by its inverse document frequency.

    Parameters
    ----------
    tfBagsofWords : dict
        Maps word -> term frequency for one document.
    idfs : dict
        Maps word -> inverse document frequency; it must contain every word
        of ``tfBagsofWords`` (a missing word raises KeyError).

    Returns
    -------
    dict
        Maps word -> tf * idf, in the key order of ``tfBagsofWords``.
    """
    return {
        word: tfValue * idfs[word]
        for word, tfValue in tfBagsofWords.items()
    }




In [511]:
# TF-IDF per stanza: each stanza's own term frequencies weighted by the
# shared IDF table. Only the second stanza's result is displayed.
tfidfStanza1, tfidfStanza2 = (
    computeTFIDF(tfScores, idfs) for tfScores in (tf1, tf2)
)
tfidfStanza2

{'': 0.0,
 ',And': 0.01925408834888737,
 ',Because': 0.01925408834888737,
 ';Though': 0.01925408834888737,
 'And': 0.0,
 'Had': 0.01925408834888737,
 'I': 0.0,
 'Then': 0.01925408834888737,
 'To': 0.0,
 'Two': 0.0,
 'a': 0.0,
 'about': 0.01925408834888737,
 'and': 0.01925408834888737,
 'as': 0.0,
 'be': 0.0,
 'bent': 0.0,
 'better': 0.01925408834888737,
 'both': 0.0,
 'claim': 0.01925408834888737,
 'could': 0.0,
 'diverged': 0.0,
 'down': 0.0,
 'fair': 0.01925408834888737,
 'far': 0.0,
 'for': 0.01925408834888737,
 'grassy': 0.01925408834888737,
 'having': 0.01925408834888737,
 'in': 0.0,
 'it': 0.0,
 'just': 0.01925408834888737,
 'long': 0.0,
 'looked': 0.0,
 'not': 0.0,
 'one': 0.0,
 'other,': 0.01925408834888737,
 'passing': 0.01925408834888737,
 'perhaps': 0.01925408834888737,
 'really': 0.01925408834888737,
 'roads': 0.0,
 'same': 0.01925408834888737,
 'sorry': 0.0,
 'stood': 0.0,
 'that': 0.01925408834888737,
 'the': 0.0,
 'them': 0.01925408834888737,
 'there': 0.0192540883488873

In [0]:
# One row per stanza, one column per vocabulary word.
df = pd.DataFrame(data=[tfidfStanza1, tfidfStanza2])

In [515]:
# Show the TF-IDF table: row 0 is Stanza1, row 1 is Stanza2.
df

Unnamed: 0,Unnamed: 1,"other,",yellow,worn,Had,stood,both,looked,one,Two,",Because",took,sorry,as,a,down,diverged,passing,perhaps,undergrowth,for,And,travel,claim,I,wanted,bent,was,fair,just,roads,Then,wear,could,"wood,And",better,about,be,really,;Though,the,long,"traveler,",To,and,them,in,not,grassy,",And",same,that,there,it,far,where,having
0,0.0,0.0,0.018734,0.0,0.0,0.018734,0.018734,0.018734,0.037467,0.018734,0.0,0.0,0.018734,0.0,0.018734,0.018734,0.018734,0.0,0.0,0.018734,0.0,0.037467,0.018734,0.0,0.056201,0.0,0.018734,0.0,0.0,0.0,0.018734,0.0,0.0,0.037467,0.018734,0.0,0.0,0.018734,0.0,0.0,0.0,0.018734,0.018734,0.018734,0.0,0.0,0.037467,0.018734,0.0,0.0,0.0,0.0,0.0,0.0,0.018734,0.018734,0.0
1,0.0,0.019254,0.0,0.019254,0.019254,0.0,0.0,0.0,0.0,0.0,0.019254,0.019254,0.0,0.0,0.0,0.0,0.0,0.019254,0.019254,0.0,0.019254,0.0,0.0,0.019254,0.0,0.019254,0.0,0.019254,0.019254,0.019254,0.0,0.019254,0.019254,0.0,0.0,0.019254,0.019254,0.0,0.019254,0.019254,0.0,0.0,0.0,0.0,0.019254,0.019254,0.0,0.0,0.019254,0.019254,0.019254,0.019254,0.019254,0.0,0.0,0.0,0.019254
