<a href="https://colab.research.google.com/github/Deepjyot-ML-workspace/NLP-Learning/blob/main/TF_IDF_using_Python.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## TF-IDF scores using Python

### Prepare Docs

In [7]:
import sys
import re
import math

# Toy corpus: one document per line.
raw_text = '''I'd like an apple.
An apple a day keeps the doctor away.
Never compare an apple to an orange.
I prefer scikit-learn to orange.'''


def strip_punctuation(text):
    """Return `text` without any character that is neither a word character nor whitespace.

    Note that apostrophes and hyphens are removed too, so "I'd" becomes
    "Id" and "scikit-learn" becomes "scikitlearn".
    """
    # Inside a character class only a leading '^' negates; a second '^' is a
    # literal caret and would exempt '^' from removal, so it is left out here.
    return re.sub(r'[^\w\s]', '', text)


# One cleaned string per document, in corpus order.
docs = [strip_punctuation(line) for line in raw_text.split("\n")]

print(docs)

['Id like an apple', 'An apple a day keeps the doctor away', 'Never compare an apple to an orange', 'I prefer scikitlearn to orange']


### Calculate term frequencies and document frequencies

In [35]:
from collections import defaultdict

# Document frequency: term -> number of documents that contain the term.
df = defaultdict(int)

# Term frequencies: one {term: count} mapping per document, in corpus order.
tfs = []

for document in docs:
  # split() with no argument collapses runs of whitespace and returns [] for
  # an empty line, so an empty string is never counted as a term.
  words = document.strip('.').split()
  tf = defaultdict(int)
  for w in words:
    # `w` is not yet in this document's counts, so this is its first
    # occurrence here: count the document once towards the term's df.
    if w not in tf:
      df[w] += 1
    tf[w] += 1

  tfs.append(tf)

print('TFS: ', tfs)
print('DF: ', df)
# Use .get() rather than df['Id']: indexing a defaultdict inserts a missing
# key, which would add a phantom term to the vocabulary built from `df`.
print('DF[i',df.get('Id', 0))




TFS:  [defaultdict(<class 'int'>, {'Id': 1, 'like': 1, 'an': 1, 'apple': 1}), defaultdict(<class 'int'>, {'An': 1, 'apple': 1, 'a': 1, 'day': 1, 'keeps': 1, 'the': 1, 'doctor': 1, 'away': 1}), defaultdict(<class 'int'>, {'Never': 1, 'compare': 1, 'an': 2, 'apple': 1, 'to': 1, 'orange': 1}), defaultdict(<class 'int'>, {'I': 1, 'prefer': 1, 'scikitlearn': 1, 'to': 1, 'orange': 1})]
DF:  defaultdict(<class 'int'>, {'Id': 1, 'like': 1, 'an': 2, 'apple': 3, 'An': 1, 'a': 1, 'day': 1, 'keeps': 1, 'the': 1, 'doctor': 1, 'away': 1, 'Never': 1, 'compare': 1, 'to': 2, 'orange': 2, 'I': 1, 'prefer': 1, 'scikitlearn': 1})
DF[i 1


### Create Vocab

In [36]:
# Map each term to a column index, numbering terms in the order they appear
# as keys of `df`.
vocab_id = dict(zip(df, range(len(df))))

print(vocab_id)

{'Id': 0, 'like': 1, 'an': 2, 'apple': 3, 'An': 4, 'a': 5, 'day': 6, 'keeps': 7, 'the': 8, 'doctor': 9, 'away': 10, 'Never': 11, 'compare': 12, 'to': 13, 'orange': 14, 'I': 15, 'prefer': 16, 'scikitlearn': 17}


### Document vectors (TF-IDF)

In [37]:
# Zero-filled matrix with one row per document and one column per vocabulary
# term; each row is its own list.
docs_vec = [[0.] * len(vocab_id) for _ in docs]

rows = len(docs_vec)
cols = len(docs_vec[0])

print('Rows, ',rows,' cols: ',cols)

Rows,  4  cols:  18


In [38]:
"""
Populate Tf-IDF vector: docs_vec

Formuala: 
(Term-Frequncy/sum of all terms) * loge(total documents/no of docs with term)
"""
import math

for d in range(len(docs)):
  doc_vec  = docs_vec[d]
  for t,tf in tfs[d].items():
    # print("t is: ",t)
    doc_vec[vocab_id[t]] = (tf*1.0)/sum(tfs[d].values())
    # print('df[t]: ', df[t])
    doc_vec[vocab_id[t]] *= math.log(len(docs)*1.0 / df[t])

t is:  Id
df[t]:  1
t is:  like
df[t]:  1
t is:  an
df[t]:  2
t is:  apple
df[t]:  3
t is:  An
df[t]:  1
t is:  apple
df[t]:  3
t is:  a
df[t]:  1
t is:  day
df[t]:  1
t is:  keeps
df[t]:  1
t is:  the
df[t]:  1
t is:  doctor
df[t]:  1
t is:  away
df[t]:  1
t is:  Never
df[t]:  1
t is:  compare
df[t]:  1
t is:  an
df[t]:  2
t is:  apple
df[t]:  3
t is:  to
df[t]:  2
t is:  orange
df[t]:  2
t is:  I
df[t]:  1
t is:  prefer
df[t]:  1
t is:  scikitlearn
df[t]:  1
t is:  to
df[t]:  2
t is:  orange
df[t]:  2


In [39]:
import pandas as pd

# Label the matrix for display: vocabulary terms as column names and
# "D0", "D1", ... (one per document) as row labels.
row_names = [f"D{i}" for i in range(len(docs))]
col_names = vocab_id.keys()

df2 = pd.DataFrame(docs_vec, index=row_names, columns=col_names)

print(df2)

          Id      like        an     apple        An         a       day  \
D0  0.346574  0.346574  0.173287  0.071921  0.000000  0.000000  0.000000   
D1  0.000000  0.000000  0.000000  0.035960  0.173287  0.173287  0.173287   
D2  0.000000  0.000000  0.198042  0.041097  0.000000  0.000000  0.000000   
D3  0.000000  0.000000  0.000000  0.000000  0.000000  0.000000  0.000000   

       keeps       the    doctor      away     Never   compare        to  \
D0  0.000000  0.000000  0.000000  0.000000  0.000000  0.000000  0.000000   
D1  0.173287  0.173287  0.173287  0.173287  0.000000  0.000000  0.000000   
D2  0.000000  0.000000  0.000000  0.000000  0.198042  0.198042  0.099021   
D3  0.000000  0.000000  0.000000  0.000000  0.000000  0.000000  0.138629   

      orange         I    prefer  scikitlearn  
D0  0.000000  0.000000  0.000000     0.000000  
D1  0.000000  0.000000  0.000000     0.000000  
D2  0.099021  0.000000  0.000000     0.000000  
D3  0.138629  0.277259  0.277259     0.277259 