# TF-IDF Explanation:



    TF: Term Frequency, which measures how frequently a term occurs in a document. Since every document is different in length, it is possible that a term would appear many more times in long documents than in shorter ones. Thus, the term frequency is often divided by the document length (a.k.a. the total number of terms in the document) as a way of normalization:

    TF(t) = (Number of times term t appears in a document) / (Total number of terms in the document).

    IDF: Inverse Document Frequency, which measures how important a term is. While computing TF, all terms are considered equally important. However, it is known that certain terms, such as "is", "of", and "that", may appear many times but have little importance. Thus we need to weigh down the frequent terms while scaling up the rare ones, by computing the following:

    IDF(t) = log_e(Total number of documents / Number of documents with term t in it).

In [1]:
from sklearn.feature_extraction.text import TfidfVectorizer,CountVectorizer
import numpy as mp
import pandas as pd

In [2]:
# Toy corpus: four short sentences separated by newlines.
# Each line is treated as a separate document in the cells that follow.
data = '''Time flies like an arrow
Fruit flies like a banana,
Sam sat on the cat
The cat is white.'''

print(data)

Time flies like an arrow
Fruit flies like a banana,
Sam sat on the cat
The cat is white.


### Consider each sentence as a document. Split the data into a list of documents, one per line

In [3]:
# One list element per line of the corpus, i.e. one element per document.
dataset = data.split('\n')
dataset

['Time flies like an arrow',
 'Fruit flies like a banana,',
 'Sam sat on the cat',
 'The cat is white.']

### Get the TF (raw term count) matrix.

In [7]:
# CountVectorizer with ngram_range=(1,1) builds a vocabulary of single words
# (unigrams) and returns raw per-document term counts -- not the
# length-normalized TF defined in the introduction.
tf_vectorizer = CountVectorizer(ngram_range=(1,1))
tf = tf_vectorizer.fit_transform(dataset)
# Prints one "(document index, vocabulary index)  count" entry per non-zero cell.
print(tf)
# Only the last expression of a cell is displayed, so this result is discarded.
type(tf)
type(tf.toarray())

  (0, 1)	1
  (0, 0)	1
  (0, 7)	1
  (0, 4)	1
  (0, 12)	1
  (1, 2)	1
  (1, 5)	1
  (1, 7)	1
  (1, 4)	1
  (2, 3)	1
  (2, 11)	1
  (2, 8)	1
  (2, 10)	1
  (2, 9)	1
  (3, 13)	1
  (3, 6)	1
  (3, 3)	1
  (3, 11)	1


numpy.ndarray

In [8]:
# Show the count matrix with the vocabulary words as column labels.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# get_feature_names_out() is its replacement.
pd.DataFrame(tf.toarray(), columns=tf_vectorizer.get_feature_names_out())

Unnamed: 0,an,arrow,banana,cat,flies,fruit,is,like,on,sam,sat,the,time,white
0,1,1,0,0,1,0,0,1,0,0,0,0,1,0
1,0,0,1,0,1,1,0,1,0,0,0,0,0,0
2,0,0,0,1,0,0,0,0,1,1,1,1,0,0
3,0,0,0,1,0,0,1,0,0,0,0,1,0,1


### Get TF-IDFs.

In [9]:
# TfidfVectorizer tokenizes the documents and weights each term count by IDF.
# NOTE: with scikit-learn's default settings the IDF is smoothed and every row
# is L2-normalized, so these values do not match the plain TF * IDF formula
# from the introduction.
tfidf_vectorizer = TfidfVectorizer(ngram_range=(1,1))
tfidf = tfidf_vectorizer.fit_transform(dataset)
# Not the last expression of the cell, so its value is not displayed.
type(tfidf)
print(tfidf)

  (0, 12)	0.4854606118156975
  (0, 4)	0.3827427224171519
  (0, 7)	0.3827427224171519
  (0, 0)	0.4854606118156975
  (0, 1)	0.4854606118156975
  (1, 4)	0.43779123108611473
  (1, 7)	0.43779123108611473
  (1, 5)	0.5552826649411127
  (1, 2)	0.5552826649411127
  (2, 9)	0.4854606118156975
  (2, 10)	0.4854606118156975
  (2, 8)	0.4854606118156975
  (2, 11)	0.3827427224171519
  (2, 3)	0.3827427224171519
  (3, 11)	0.43779123108611473
  (3, 3)	0.43779123108611473
  (3, 6)	0.5552826649411127
  (3, 13)	0.5552826649411127


In [10]:
# Show the tf-idf matrix with the vocabulary words as column labels.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# get_feature_names_out() is its replacement.
pd.DataFrame(tfidf.toarray(), columns=tfidf_vectorizer.get_feature_names_out())

Unnamed: 0,an,arrow,banana,cat,flies,fruit,is,like,on,sam,sat,the,time,white
0,0.485461,0.485461,0.0,0.0,0.382743,0.0,0.0,0.382743,0.0,0.0,0.0,0.0,0.485461,0.0
1,0.0,0.0,0.555283,0.0,0.437791,0.555283,0.0,0.437791,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.382743,0.0,0.0,0.0,0.0,0.485461,0.485461,0.485461,0.382743,0.0,0.0
3,0.0,0.0,0.0,0.437791,0.0,0.0,0.555283,0.0,0.0,0.0,0.0,0.437791,0.0,0.555283


# Manual TF and TF-IDF functions. 

In [11]:
# Two tiny documents that differ in only two words, used for the
# hand-rolled TF / IDF calculation below.
docA = "the cat sat on my sofa"
docB = "the dog sat on my bed"

### Split each document into a list of words

In [12]:
# Tokenize on single spaces; each document becomes a list of words
# (its "bag of words").
bowA = docA.split(" ")
bowB = docB.split(" ")
type(bowA)

list

In [13]:
# Display the tokens of document A.
bowA

['the', 'cat', 'sat', 'on', 'my', 'sofa']

### Vocabulary in the corpus

In [14]:
# Vocabulary = every distinct word that occurs in either document.
wordSet = set(bowA) | set(bowB)
wordSet

{'bed', 'cat', 'dog', 'my', 'on', 'sat', 'sofa', 'the'}

### Dictionaries to keep the word count in each bag of words

In [15]:
# Start every vocabulary word at a count of zero, one dictionary per document.
wordDictA = {word: 0 for word in wordSet}
wordDictB = {word: 0 for word in wordSet}

In [16]:
# Display the zero-initialized counts for document A.
wordDictA

{'sat': 0, 'on': 0, 'sofa': 0, 'bed': 0, 'cat': 0, 'my': 0, 'dog': 0, 'the': 0}

In [17]:
# Count the frequency of each word in each document's dictionary.
# Every word is already a key (the dictionaries were initialized from the full
# vocabulary), so += never hits a missing key. The counts are updated in
# place, so re-running this cell without re-creating the dictionaries adds
# the counts a second time.
for word in bowA:
    wordDictA[word]+=1
    
for word in bowB:
    wordDictB[word]+=1

In [18]:
# Show the raw word counts for document A, then document B.
print(wordDictA)
print(wordDictB)

{'sat': 1, 'on': 1, 'sofa': 1, 'bed': 0, 'cat': 1, 'my': 1, 'dog': 0, 'the': 1}
{'sat': 1, 'on': 1, 'sofa': 0, 'bed': 1, 'cat': 0, 'my': 1, 'dog': 1, 'the': 1}


In [19]:
import pandas as pd  # already imported in the first cell; repeating it is harmless
# Stack the two count dictionaries into a document-term matrix:
# one row per document, one column per vocabulary word.
pd.DataFrame([wordDictA,wordDictB])

Unnamed: 0,bed,cat,dog,my,on,sat,sofa,the
0,0,1,0,1,1,1,1,1
1,1,0,1,1,1,1,0,1


### Python function to compute term frequency

In [20]:
def computeTF(wordDict, bow):
    """Return the term frequency of every vocabulary word in one document.

    TF(t) = (count of t in the document) / (total number of terms in the
    document).

    Args:
        wordDict: Mapping of word -> raw count of that word in the document.
        bow: The document as a list of tokens; only its length is used.

    Returns:
        A new dict mapping each key of ``wordDict`` to its count divided by
        ``len(bow)``. For an empty ``bow`` every frequency is 0.0.
    """
    bowCount = len(bow)
    if bowCount == 0:
        # An empty document has no terms; report 0.0 instead of raising
        # ZeroDivisionError.
        return {word: 0.0 for word in wordDict}
    return {word: count / float(bowCount) for word, count in wordDict.items()}

In [21]:
# Term frequencies for the 1st document (docA).
tfbowA = computeTF(wordDictA,bowA)
tfbowA

{'sat': 0.16666666666666666,
 'on': 0.16666666666666666,
 'sofa': 0.16666666666666666,
 'bed': 0.0,
 'cat': 0.16666666666666666,
 'my': 0.16666666666666666,
 'dog': 0.0,
 'the': 0.16666666666666666}

In [22]:
# Term frequencies for the 2nd document (docB).
tfbowB = computeTF(wordDictB,bowB)
tfbowB

{'sat': 0.16666666666666666,
 'on': 0.16666666666666666,
 'sofa': 0.0,
 'bed': 0.16666666666666666,
 'cat': 0.0,
 'my': 0.16666666666666666,
 'dog': 0.16666666666666666,
 'the': 0.16666666666666666}

##### Compute IDF(t) = log(number of documents / number of documents in which term t occurs)

In [23]:
import math
def computeIDF(docList):
    """Return the inverse document frequency of every word in the corpus.

    IDF(t) = log_e(N / df(t)), where N is the number of documents and df(t)
    is the number of documents in which t has a count greater than zero.

    Args:
        docList: List of dicts, one per document, each mapping
            word -> raw count of that word in the document.

    Returns:
        A dict mapping every word that appears as a key in any document to
        its IDF. A word whose count is zero in every document gets 0.0
        (its TF is zero everywhere, so its TF * IDF is zero regardless).
        An empty ``docList`` yields an empty dict.
    """
    N = len(docList)

    # Document frequency: in how many documents does each word occur?
    # The vocabulary is the union of all documents' keys, so documents with
    # differing key sets are handled as well.
    docFreq = {}
    for doc in docList:
        for word, val in doc.items():
            docFreq[word] = docFreq.get(word, 0) + (1 if val > 0 else 0)

    # Divide N by the document frequency and take the natural log; guard
    # against words that never occur to avoid a division by zero.
    idfDict = {}
    for word, df in docFreq.items():
        idfDict[word] = math.log(N / float(df)) if df else 0.0

    return idfDict

In [24]:
# Call computeIDF() with the per-document word-count dictionaries as a list.
idfs = computeIDF([wordDictA,wordDictB])
idfs

{'sat': 0.0,
 'on': 0.0,
 'sofa': 0.6931471805599453,
 'bed': 0.6931471805599453,
 'cat': 0.6931471805599453,
 'my': 0.0,
 'dog': 0.6931471805599453,
 'the': 0.0}

### Compute TF * IDF

In [25]:
def computeTFIDF(tfBow,idfs):
    """Return the TF * IDF weight of every word in one document.

    Args:
        tfBow: Mapping of word -> term frequency for the document.
        idfs: Mapping of word -> inverse document frequency; it must contain
            every key of ``tfBow`` (a missing word raises KeyError).

    Returns:
        A new dict with the same keys as ``tfBow``, each mapped to the
        product of its term frequency and its IDF.
    """
    return {term: freq * idfs[term] for term, freq in tfBow.items()}

### TFIDF for terms in document A

In [27]:
# Multiply document A's term frequencies by the corpus-wide IDFs.
tfIDFA = computeTFIDF(tfbowA,idfs)
tfIDFA

{'sat': 0.0,
 'on': 0.0,
 'sofa': 0.11552453009332421,
 'bed': 0.0,
 'cat': 0.11552453009332421,
 'my': 0.0,
 'dog': 0.0,
 'the': 0.0}

### TFIDF for terms in document B

In [28]:
# Multiply document B's term frequencies by the corpus-wide IDFs.
tfIDFB = computeTFIDF(tfbowB, idfs)
tfIDFB

{'sat': 0.0,
 'on': 0.0,
 'sofa': 0.0,
 'bed': 0.11552453009332421,
 'cat': 0.0,
 'my': 0.0,
 'dog': 0.11552453009332421,
 'the': 0.0}

In [29]:
# One row per document, one column per word, for side-by-side comparison.
pd.DataFrame([tfIDFA,tfIDFB])

Unnamed: 0,bed,cat,dog,my,on,sat,sofa,the
0,0.0,0.115525,0.0,0.0,0.0,0.0,0.115525,0.0
1,0.115525,0.0,0.115525,0.0,0.0,0.0,0.0,0.0


## _Notes_

* https://nlp.stanford.edu/IR-book/html/htmledition/tf-idf-weighting-1.html

 + The tf-idf weight of a term $t$ in a document is highest when $t$ occurs many times within a small number of documents
 + lower when the term occurs fewer times in a document, or occurs in many documents
 + lowest when the term occurs in virtually all documents