# Calculating Term Frequency(tf) and Inverse Document Frequency(idf) in Information Retrieval

In [1]:
import nltk
import numpy as np
nltk.download('stopwords')
from nltk.tokenize import RegexpTokenizer
from sklearn.feature_extraction.stop_words import ENGLISH_STOP_WORDS
stopWords = set(list(ENGLISH_STOP_WORDS) + nltk.corpus.stopwords.words('english'))
regex = RegexpTokenizer('\w+')

[nltk_data] Downloading package stopwords to C:\Users\Divay
[nltk_data]     Sharma\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
file = open("assi3_data.txt","r").readlines()
file

["1.Boolean model (Doc1) '''The (standard) Boolean model of information retrieval (BIR)[1] is a classical information retrieval (IR) model and, at the same time, the first and most-adopted one. It is used by many IR systems to this day.[citation needed] The BIR is based on Boolean logic and classical set theory in that both the documents to be searched and the user's query are conceived as sets of terms. Retrieval is based on whether or not the documents contain the query terms. '''\n",
 "2.probabilistic relevance model(doc2) '''The probabilistic relevance model[1][2] was devised by Stephen E. Robertson and Karen SpÃ¤rck Jones as a framework for probabilistic models to come. It is a formalism of information retrieval useful to derive ranking functions used by search engines and web search engines in order to rank matching documents according to their relevance to a given search query.It is a theoretical model estimating the probability that a document dj is relevant to a query q. The m

The tf-idf weight is a weight often used in information retrieval and text mining. This weight is a statistical measure used to evaluate how important a word is to a document in a collection or corpus. The importance increases proportionally to the number of times a word appears in the document but is offset by the frequency of the word in the corpus (data-set).

#### Term Frequency
This measures the frequency of a word in a document. This highly depends on the length of the document and the generality of word. TF is individual to each document and word, hence we can formulate TF as follows.

#### Document Frequency
This measures the importance of document in whole set of corpus, this is very similar to TF. The only difference is that TF is frequency counter for a term t in document d, where as DF is the count of occurrences of term t in the document set N.

#### Inverse Document Frequency
IDF is the inverse of the document frequency which measures the informativeness of term t.

\begin{align}
tf-idf(t, d) = tf(t, d) * log(N/(df + 1))
\end{align}

In [3]:
def calculate_tf_idf(file):
    N = len(file)
    tf = []
    word_count = []
    idf = {}
    for doc in file:
        words = regex.tokenize(doc)
        doc_dict = {}
        count = 0
        for word in words:
            word = word.lower()
            if word not in stopWords:
                if word not in idf:
                    idf[word] = 0
                count = count + 1
                if word not in doc_dict:
                    doc_dict[word] = 1
                else:
                    doc_dict[word] = doc_dict[word] + 1
        tf.append(doc_dict)
        word_count.append(count)
        
    #Calculate tf
    for i in range(N):
        for word in tf[i]:
            tf[i][word] = tf[i][word]/word_count[i]
    
    #Calculate tdf
    for word in idf:
        for i in range(N):
            if word in tf[i]:
                idf[word] = idf[word]+1
        idf[word] = np.log(N/(idf[word]+1) +1)

    #Calculate tf-idf
    tf_idf = tf
    for i in range(N):
        for word in tf_idf[i]:
            tf_idf[i][word] = tf[i][word]*idf[word]

    return tf_idf

In [4]:
tf_idf = calculate_tf_idf(file)

tf-idf value for 1st document

In [8]:
tf_idf[0]

{'1': 0.03150669002545206,
 'boolean': 0.06247436808232875,
 'model': 0.03815562190468791,
 'doc1': 0.020824789360776252,
 'standard': 0.020824789360776252,
 'information': 0.02543708126979194,
 'retrieval': 0.03815562190468791,
 'bir': 0.041649578721552505,
 'classical': 0.041649578721552505,
 'ir': 0.041649578721552505,
 'time': 0.020824789360776252,
 'adopted': 0.020824789360776252,
 'used': 0.01271854063489597,
 'systems': 0.020824789360776252,
 'day': 0.020824789360776252,
 'citation': 0.020824789360776252,
 'needed': 0.020824789360776252,
 'based': 0.041649578721552505,
 'logic': 0.020824789360776252,
 'set': 0.01575334501272603,
 'theory': 0.020824789360776252,
 'documents': 0.02543708126979194,
 'searched': 0.020824789360776252,
 'user': 0.01575334501272603,
 'query': 0.03150669002545206,
 'conceived': 0.020824789360776252,
 'sets': 0.020824789360776252,
 'terms': 0.03150669002545206,
 'contain': 0.020824789360776252}

In [24]:
def chcek_query(tf_idf,query):
    N = len(tf_idf)
    vocab = []
    query = regex.tokenize(query)
    for word in query:
        word = word.lower()
        if word not in stopWords:
            vocab.append(word)
    doc_score = []
    for i in range(N):
        score = 0
        for word in vocab:
            if word in tf_idf[i]:
                score = score + tf_idf[i][word]
        doc_score.append(score)
    print(doc_score)
    return doc_score.index(max(doc_score))

Testing with queries

In [37]:
query = "Vector space model"
print("Doc " + str(chcek_query(tf_idf,query)))

[0.03815562190468791, 0.024331121214583595, 0.10631987785261035]
Doc 2


In [38]:
query = "Stephen E. Robertson"
print("Doc " + str(chcek_query(tf_idf,query)))

[0, 0.029879045604592015, 0]
Doc 1


In [39]:
query = "classical information retrieval"
print("Doc " + str(chcek_query(tf_idf,query)))

[0.10524228189603235, 0.012165560607291797, 0.03108976599641237]
Doc 0
