In [1]:
def tokenize(text:str)->list:
    return text.lower().split()

def count_frequency(tokens:list)->list:
    """
    Input -> Tokens List of words
    Return -> dictionary - {word:freq}
    """
    freq = {}

    for word in tokens:
        if word not in freq:
            freq[word] = 1
        else:
            freq[word] += 1

    return freq

In [2]:

t1 = "I love NLP" 
t2 = "NLP is amazing"
t3 = "This is why nlp is my research interest"
texts = [t1,t2,t3]

text = []
for t in texts :
    text.append(tokenize(t))

# text is list of tokens - list of list
text

[['i', 'love', 'nlp'],
 ['nlp', 'is', 'amazing'],
 ['this', 'is', 'why', 'nlp', 'is', 'my', 'research', 'interest']]

In [3]:

def create_doc(text:list)->dict:
    """
    Input --> list of lists
    returns -> dict of dict
    """
    docs = {}

    # enumerate -> add counter to each token
    for i , tokens in enumerate(text):
        
        freq = count_frequency(tokens)
        docs[i] = freq
    
    return docs

docs = create_doc(text)
docs

{0: {'i': 1, 'love': 1, 'nlp': 1},
 1: {'nlp': 1, 'is': 1, 'amazing': 1},
 2: {'this': 1,
  'is': 2,
  'why': 1,
  'nlp': 1,
  'my': 1,
  'research': 1,
  'interest': 1}}

In [4]:

def represent(docs:dict)->dict:
    '''
    Input --> dict of dict
    return -> dict of list
    '''

    matrix = {"index" : [] 
             }

    # docs - dictionary of dictionary
    # doc - dictionary { word : frequency }
    
    for doc in docs :
        for word in docs[doc]:
            if word not in matrix["index"]:
                matrix["index"].append(word)
    
    for doc in docs :

        # Creates List of Zeros for all documents
        # We will modify this list later
        matrix[f'doc{doc}'] = [0]*(len(matrix['index']))
        
        for word , frequency in docs[doc].items():
            
            # Get index of Word
            # We can replace Frequency of word at that index - In matrix
            index = matrix['index'].index(word)

            # Replace
            matrix[f'doc{doc}'][index] = frequency


    return matrix

matrix = represent(docs)
matrix

{'index': ['i',
  'love',
  'nlp',
  'is',
  'amazing',
  'this',
  'why',
  'my',
  'research',
  'interest'],
 'doc0': [1, 1, 1, 0, 0, 0, 0, 0, 0, 0],
 'doc1': [0, 0, 1, 1, 1, 0, 0, 0, 0, 0],
 'doc2': [0, 0, 1, 2, 0, 1, 1, 1, 1, 1]}

In [5]:
import pandas as pd

df = pd.DataFrame(matrix,index=None)
df.set_index('index' , inplace=True)
df

Unnamed: 0_level_0,doc0,doc1,doc2
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
i,1,0,0
love,1,0,0
nlp,1,1,1
is,0,1,2
amazing,0,1,0
this,0,0,1
why,0,0,1
my,0,0,1
research,0,0,1
interest,0,0,1


In [6]:
def TF(term:str , document : list , all_words):

    N = len(document)
    number_term = document[all_words.index(term)]
    
    return number_term / N

In [37]:
tf = TF("research" , matrix['doc2'],matrix['index'])
tf

0.1

In [38]:
import math

def IDF( term , N , matrix ):

    term_index = matrix["index"].index(term)
    d_with_t = 0
    for doc in matrix:
        # Checking if word is present in that list
        if matrix[doc][term_index] != 0:
            d_with_t += 1

    # Due to above condition - if will consider matrix[index] also
    d_with_t = d_with_t-1

    return math.log(N/(d_with_t))

In [39]:
idf = IDF("research" , 3 , matrix)
idf

1.0986122886681098

In [40]:
def tf_idf(tf , idf ):
    return tf*idf

In [41]:
tf_idf(tf,idf)

0.10986122886681099