In [1]:
import pandas as pd
import numpy as np

In [2]:
corpus = ['data science is one of the most important fields of science',
          'this is one of the best data science courses',
          'data scientists analyze data' ]

In [3]:
words_set = set()

for doc in  corpus:
    words = doc.split(' ')
    words_set = words_set.union(set(words))
    
print('Number of words in the corpus:',len(words_set))
print('The words in the corpus: \n', words_set)

Number of words in the corpus: 14
The words in the corpus: 
 {'analyze', 'science', 'courses', 'scientists', 'important', 'most', 'data', 'one', 'of', 'the', 'best', 'fields', 'this', 'is'}


In [4]:
n_docs = len(corpus)         #·Number of documents in the corpus
n_words_set = len(words_set) #·Number of unique words in the 

df_tf = pd.DataFrame(np.zeros((n_docs, n_words_set)), columns=words_set)

# Compute Term Frequency (TF)
for i in range(n_docs):
    words = corpus[i].split(' ') # Words in the document
    for w in words:
        df_tf[w][i] = df_tf[w][i] + (1 / len(words))
        
df_tf

Unnamed: 0,analyze,science,courses,scientists,important,most,data,one,of,the,best,fields,this,is
0,0.0,0.181818,0.0,0.0,0.090909,0.090909,0.090909,0.090909,0.181818,0.090909,0.0,0.090909,0.0,0.090909
1,0.0,0.111111,0.111111,0.0,0.0,0.0,0.111111,0.111111,0.111111,0.111111,0.111111,0.0,0.111111,0.111111
2,0.25,0.0,0.0,0.25,0.0,0.0,0.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [5]:
print("IDF of: ")

idf = {}

for w in words_set:
    k = 0    # number of documents in the corpus that contain this word
    
    for i in range(n_docs):
        if w in corpus[i].split():
            k += 1
            
    idf[w] =  np.log10(n_docs / k)
    
    print(f'{w:>15}: {idf[w]:>10}' )

IDF of: 
        analyze: 0.47712125471966244
        science: 0.17609125905568124
        courses: 0.47712125471966244
     scientists: 0.47712125471966244
      important: 0.47712125471966244
           most: 0.47712125471966244
           data:        0.0
            one: 0.17609125905568124
             of: 0.17609125905568124
            the: 0.17609125905568124
           best: 0.47712125471966244
         fields: 0.47712125471966244
           this: 0.47712125471966244
             is: 0.17609125905568124


In [6]:
df_tf_idf = df_tf.copy()

for w in words_set:
    for i in range(n_docs):
        df_tf_idf[w][i] = df_tf[w][i] * idf[w]
        
df_tf_idf

Unnamed: 0,analyze,science,courses,scientists,important,most,data,one,of,the,best,fields,this,is
0,0.0,0.032017,0.0,0.0,0.043375,0.043375,0.0,0.016008,0.032017,0.016008,0.0,0.043375,0.0,0.016008
1,0.0,0.019566,0.053013,0.0,0.0,0.0,0.0,0.019566,0.019566,0.019566,0.053013,0.0,0.053013,0.019566
2,0.11928,0.0,0.0,0.11928,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
