# Week-10

# Aim: Write a program to implement TF-IDF for any corpus.

In [2]:
import math 
import nltk 
# Toy corpus: each string in the list is treated as one document.
corpus = [
    "This is the first document.",
    "This document is the second document.",
    "And this is the third one.",
    "Is this the first document?",
]

In [3]:
# Lowercase each document first, then split it into tokens with NLTK.
# Punctuation marks come back as tokens of their own (e.g. '.' and '?').
tokenized_corpus = [nltk.word_tokenize(text) for text in map(str.lower, corpus)]
tokenized_corpus

[['this', 'is', 'the', 'first', 'document', '.'],
 ['this', 'document', 'is', 'the', 'second', 'document', '.'],
 ['and', 'this', 'is', 'the', 'third', 'one', '.'],
 ['is', 'this', 'the', 'first', 'document', '?']]

In [4]:
# Document frequency (df): for every term, the number of documents containing it.
df = {}
for tokens in tokenized_corpus:
    # set() collapses repeated tokens so a document adds at most 1 per term.
    for token in set(tokens):
        df[token] = df.get(token, 0) + 1
print(df)

{'is': 4, 'document': 3, '.': 3, 'this': 4, 'the': 4, 'first': 2, 'second': 1, 'third': 1, 'and': 1, 'one': 1, '?': 1}


In [10]:
# Inverse document frequency: idf(t) = ln(N / df(t)), with N the number of documents.
# A term that occurs in every document therefore gets an idf of 0.
N = len(tokenized_corpus)
print(N)
idf = {term: math.log(N / doc_count) for term, doc_count in df.items()}
print(idf)

4
{'is': 0.0, 'document': 0.28768207245178085, '.': 0.28768207245178085, 'this': 0.0, 'the': 0.0, 'first': 0.6931471805599453, 'second': 1.3862943611198906, 'third': 1.3862943611198906, 'and': 1.3862943611198906, 'one': 1.3862943611198906, '?': 1.3862943611198906}


In [6]:
# Term frequency (tf): raw occurrence count of each term, keyed by document index.
tf = {}
for doc_index, tokens in enumerate(tokenized_corpus):
    counts = {}
    for token in tokens:
        counts[token] = counts.get(token, 0) + 1
    tf[doc_index] = counts
print(tf)

{0: {'this': 1, 'is': 1, 'the': 1, 'first': 1, 'document': 1, '.': 1}, 1: {'this': 1, 'document': 2, 'is': 1, 'the': 1, 'second': 1, '.': 1}, 2: {'and': 1, 'this': 1, 'is': 1, 'the': 1, 'third': 1, 'one': 1, '.': 1}, 3: {'is': 1, 'this': 1, 'the': 1, 'first': 1, 'document': 1, '?': 1}}


In [7]:
# TF-IDF: weight each term's in-document count by its corpus-wide idf.
# Result is keyed by document index, then by term.
tfidf = {
    doc_index: {token: tf[doc_index][token] * idf[token] for token in tokens}
    for doc_index, tokens in enumerate(tokenized_corpus)
}
print(tfidf)

{0: {'this': 0.0, 'is': 0.0, 'the': 0.0, 'first': 0.6931471805599453, 'document': 0.28768207245178085, '.': 0.28768207245178085}, 1: {'this': 0.0, 'document': 0.5753641449035617, 'is': 0.0, 'the': 0.0, 'second': 1.3862943611198906, '.': 0.28768207245178085}, 2: {'and': 1.3862943611198906, 'this': 0.0, 'is': 0.0, 'the': 0.0, 'third': 1.3862943611198906, 'one': 1.3862943611198906, '.': 0.28768207245178085}, 3: {'is': 0.0, 'this': 0.0, 'the': 0.0, 'first': 0.6931471805599453, 'document': 0.28768207245178085, '?': 1.3862943611198906}}


In [9]:
# Report the TF-IDF scores, one document at a time, one term per line.
for i in range(len(tokenized_corpus)):
    print(f"Document {i}:")
    for term, score in tfidf[i].items():
        print(f" {term}: {score}")

Document 0:
 this: 0.0
 is: 0.0
 the: 0.0
 first: 0.6931471805599453
 document: 0.28768207245178085
 .: 0.28768207245178085
Document 1:
 this: 0.0
 document: 0.5753641449035617
 is: 0.0
 the: 0.0
 second: 1.3862943611198906
 .: 0.28768207245178085
Document 2:
 and: 1.3862943611198906
 this: 0.0
 is: 0.0
 the: 0.0
 third: 1.3862943611198906
 one: 1.3862943611198906
 .: 0.28768207245178085
Document 3:
 is: 0.0
 this: 0.0
 the: 0.0
 first: 0.6931471805599453
 document: 0.28768207245178085
 ?: 1.3862943611198906
