# IT550 - Information Retrieval
## Assignment 2

## Importing the necessary libraries

In [1]:
import nltk
import os
import re
import json
import numpy as np
from bs4 import BeautifulSoup

nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


True

## Preprocessing the business dataset

In [2]:
# Create a class which implements functions to preprocess the business dataset documents
class PreProcess:
    
    def __init__(self, loadDict=None):
        '''
        Initialises a dictionary which will store document IDs or No.s as keys and the text content as their corresponding values.
        '''
        if loadDict is None:
            self.docText = dict()
        else:
            self.docText = loadDict.copy()
    
    def extractDocText(self, path, fileList):
        '''
        Extracts doc no. and text tag contents with digits and punctuation removed from the list of files "fileList" and stores it in a dictionary. 
        '''
        for idx, file in enumerate(fileList):
            with open(os.path.join(path, file), encoding="utf8") as fileData:
                soup = BeautifulSoup(fileData, features="html.parser")

                docNo = soup.find('docno').text
                text = soup.find('text').text

                cleanedText = re.sub(r'[^a-zA-Z\s]', '', text)
                cleanedText = "".join(filter(lambda x: not x.isdigit(), cleanedText))

                self.docText[docNo] = cleanedText
            
            if idx % 100 == 0:
                print(f"{idx+1} documents extracted.")
    
    def tokenizeText(self):
        '''
        Tokenizes the text contents of extracted docs using the NLTK library and stores a list for the docNo key in dictionary. 
        '''
        stopwords = nltk.corpus.stopwords.words("english")
        self.docText = {docNo: [word.lower() for word in nltk.word_tokenize(text) if not word.lower() in stopwords] for docNo, text in self.docText.items()}
    
    def lemmatizeText(self):
        '''
        Lemmatizes text words using NLTK library
        '''
        # Creating WordNetLemmatizer object for lemmatization of words
        lemmatizer = nltk.stem.WordNetLemmatizer()

        # Defining a function to return an appropriate POS tag to pass
        # as a parameter to lemmatizer.lemmatize()
        def get_wordnet_pos(word):
            """Map POS tag to first character lemmatize() accepts"""
            tag = nltk.pos_tag([word])[0][1][0].upper()
            tag_dict = {"J": nltk.corpus.wordnet.ADJ,
                        "N": nltk.corpus.wordnet.NOUN,
                        "V": nltk.corpus.wordnet.VERB,
                        "R": nltk.corpus.wordnet.ADV}

            return tag_dict.get(tag, nltk.corpus.wordnet.NOUN)
        
        self.docText = {docNo: [lemmatizer.lemmatize(token, get_wordnet_pos(token)) for token in tokens] for docNo, tokens in self.docText.items()}


In [3]:
folder = "drive/MyDrive/business"
files = [file for file in os.listdir(folder) if re.search("index", file) is None]
print(len(files), len(os.listdir(folder)))
print(files)

preProcessor = PreProcess()
preProcessor.extractDocText(folder, files)
print("extractDocText finished.")

1994 2115
['1041206_business_story_4091977.utf8', '1041207_business_story_4095767.utf8', '1041203_business_story_4080195.utf8', '1041207_business_story_4095406.utf8', '1041204_business_story_4085018.utf8', '1041205_business_story_4088444.utf8', '1041204_business_story_4083864.utf8', '1041204_business_story_4082772.utf8', '1041204_business_story_4085046.utf8', '1041204_business_story_4085047.utf8', '1041204_business_story_4084143.utf8', '1041209_business_story_4103326.utf8', '1041209_business_story_4105021.utf8', '1041208_business_story_4100555.utf8', '1041207_business_story_4095832.utf8', '1041208_business_story_4100496.utf8', '1041209_business_story_4104957.utf8', '1041209_business_story_4104958.utf8', '1041208_business_story_4100564.utf8', '1041208_business_story_4100557.utf8', '1041208_business_story_4093886.utf8', '1041207_business_story_4095871.utf8', '1041208_business_story_4100554.utf8', '1041209_business_story_4105465.utf8', '1041208_business_story_4100556.utf8', '1041209_busin

In [None]:
with open("drive/MyDrive/docText.json", "w") as docTextJson:
    docTextJson.write(json.dumps(preProcessor.docText, indent=2))

In [4]:
preProcessor.tokenizeText()
print("tokenizeText finished.")

tokenizeText finished.


In [5]:
preProcessor.lemmatizeText()
print("lemmatizeText finished.")

lemmatizeText finished.


In [6]:
'''
This code cell creates a nested dictionary of frequencies of each unique words in the respective document with outer key being the <DOCNO> content
and its corresponding value being a dictionary with words as keys and their frequencies as values.
The totalCount attribute stores total count of terms/words for each document and is for convenience.
The commented code saves the dictionary as JSON for convenience.
'''
docTextDict = preProcessor.docText
wordCountDict = {docNo: {"totalCount": 0, "wordCount": dict()} for docNo, words in docTextDict.items()}
for docNo, words in docTextDict.items():
    for word in words:
        wordCountDict[docNo]["wordCount"][word] = wordCountDict[docNo]["wordCount"].get(word, 0) + 1    
    
    wordCountDict[docNo]["totalCount"] = len(wordCountDict[docNo]["wordCount"].keys())


# with open("drive/MyDrive/wordCountJson.json", "w") as wordCountJson:
#     wordCountJson.write(json.dumps(wordCountDict, indent=2))

## Creating Term-Document Matrix based on TF-IDF score from scratch.
A class named `CustomTFIDFVectorizer` is created for this assignment which will take `wordCountDict` dictionary created in the previous code cell as input and will create a **Term-Document Matrix** based on the **TF-IDF score**.

The implementation style is inspired from `TfidfVectorizer` methods of `sklearn` library.

In [None]:
# This code cell is for convenience.
with open("drive/MyDrive/wordCountJson.json") as wordCountJson:
    wordCountDict = json.loads(wordCountJson.read())

In [7]:
class CustomTFIDFVectorizer:

    def __init__(self):
        self.wordToID = dict()
        self.docNoToID = dict()
        self._tfidfMatrix = 0
        self._tfIdfTransform = dict()
    
    def fit(self, wordCountDict):
        '''
        Returns a term-document numpy matrix from the given wordCountDict dictionary.
        '''
        ################## Compute Term Frequency TF ##################
        ## Creates a term frequency nested dictionary of similar style as wordCountDict but without totalCount attribute.
        ## tfDict = {docNo1: {word_docNo1: tf, word_docNo1: tf, ...}, docNo2: {word_docNo2: tf, word_docNo2: tf, ...}, ...}
        tfDict = {docNo: wcDict["wordCount"].copy() for docNo, wcDict in wordCountDict.items()}
        
        for docNo in tfDict.keys():
            tot = float(wordCountDict[docNo]["totalCount"])
            for word in tfDict[docNo].keys():
                tfDict[docNo][word] = tfDict[docNo].get(word, 0.0) / tot

        # print(tfDict.get("1041206_business_story_4091977.utf8", {}))

        ################## Compute Inverse Document Frequency IDF ##################
        ## Create a words corpus dictionary with their respective IDF scores
        ## idfDict = {word1: idf, word2: idf, ...}
        import math
        idfDict = dict()
        N = len(wordCountDict.keys())

        for wordsDict in tfDict.values():
            for word in wordsDict.keys():
                idfDict[word] = idfDict.get(word, 0) + 1
        
        idfDict = {word: math.log10(N / float(freq)) for word, freq in idfDict.items()}
        
        ################## Compute TF-IDF Score for each word respectively for each document ##################
        ## Creates TF-IDF nested dictionary of similar style as wordCountDict but now each word has its respective TF-IDF score
        ## tfDict = {docNo1: {word_docNo1: tf-idf, word_docNo1: tf-idf, ...}, docNo2: {word_docNo2: tf-idf, word_docNo2: tf-idf, ...}, ...}
        tfidfDict = tfDict.copy()

        for docNo in tfDict.keys():
            for word, tf in tfDict[docNo].items():
                tfidfDict[docNo][word] = tf * idfDict[word]
                # if docNo == "1041206_business_story_4091977.utf8":
                #     print(word, tf, idfDict[word])
        
        # print(self._tfidfMatrix.get("1041206_business_story_4091977.utf8", {}))
        
        wordCorpusLength = len(idfDict.keys())
        self._tfidfMatrix = np.zeros((N, wordCorpusLength))

        self.wordToID = {word: id for id, word in enumerate(sorted(idfDict))}
        self.docNoToID = {docNo: id for id, docNo in enumerate(sorted(tfidfDict.keys()))}

        for docNo, idd in sorted(self.docNoToID.items(), key= lambda id: id[1]):
            for word, idw in sorted(self.wordToID.items(), key= lambda id: id[1]):
                self._tfidfMatrix[idd, idw] = tfidfDict[docNo].get(word, 0.0)

        ## Some initialization for the transform() method 
        self._tfIdfTransform = {docNo: dict() for docNo in sorted(wordCountDict.keys())}
        
        ## self._tfidfMatrix = array([[0.0 0.0 tf-idf ... 0.0 tf-idf 0.0], [0.0 tf-idf 0.0 ... tf-idf 0.0 0.0], ...])
        ## It will be a sparse matrix
        return self._tfidfMatrix
    
    def transform(self):
        '''
        Returns a nested dictionary with each row of tfidfMatrix converted to a dictionary with their words and scores as the value for the docno as key.
        Format: {docNo1: {term1: tf-idf, term2: tf-idf, ...}, docNo2: {term1: tf-idf, term2: tf-idf, ...}, ...}
        '''
        if type(self._tfidfMatrix) == int:
            raise ValueError("Call fit(wordCountDict) first or instead use fit_transform(wordCountDict).")

        for docNo, idd in sorted(self.docNoToID.items(), key= lambda id: id[1]):
            for word, idw in sorted(self.wordToID.items(), key= lambda id: id[1]):
                self._tfIdfTransform[docNo][word] = self._tfidfMatrix[idd, idw]

        return self._tfIdfTransform
    
    def fit_transform(self, wordCountDict):
        '''
        A better way instead of calling both the functions separately.
        '''
        if type(self._tfidfMatrix) != int:
            return self.transform()
        
        self.fit(wordCountDict)
        return self.transform()


In [8]:
customTFIDF = CustomTFIDFVectorizer()
tfIdfMatrix = customTFIDF.fit(wordCountDict)
print(tfIdfMatrix.shape)

(1994, 19219)


In [9]:
# print(customTFIDF.docNoToID['1040901_business_story_3700171.utf8'])
# print(tfIdfMatrix[0])
tfIdfDict = customTFIDF.fit_transform(wordCountDict)
print(len(tfIdfDict), len(tfIdfDict['1040901_business_story_3700171.utf8']))

1994 19219


## Creating Term-Document Matrix based on TF-IDF Score using `sklearn` library.

In [10]:
from sklearn.feature_extraction.text import TfidfVectorizer

def tokenize(text):
    tokens = [word.lower() for word in nltk.word_tokenize(text)]
    # Creating WordNetLemmatizer object for lemmatization of words
    lemmatizer = nltk.stem.WordNetLemmatizer()

    # Defining a function to return an appropriate POS tag to pass
    # as a parameter to lemmatizer.lemmatize()
    def get_wordnet_pos(word):
        """Map POS tag to first character lemmatize() accepts"""
        tag = nltk.pos_tag([word])[0][1][0].upper()
        tag_dict = {"J": nltk.corpus.wordnet.ADJ,
                    "N": nltk.corpus.wordnet.NOUN,
                    "V": nltk.corpus.wordnet.VERB,
                    "R": nltk.corpus.wordnet.ADV}

        return tag_dict.get(tag, nltk.corpus.wordnet.NOUN)
    
    lemmaTokens = [lemmatizer.lemmatize(token, get_wordnet_pos(token)) for token in tokens]
    return lemmaTokens

with open("drive/MyDrive/docText.json") as docTextJson:
    docTextDict = json.loads(docTextJson.read())

vectorizer = TfidfVectorizer(tokenizer=tokenize, stop_words=nltk.corpus.stopwords.words("english"))
response = vectorizer.fit_transform(docTextDict.values())

print(response.shape)

  'stop_words.' % sorted(inconsistent))


(1994, 19201)


### Showing words collected using both custom approach and sklearn's `TfidfVectorizer` approach
Here we can observe that TfidfVectorizer has 18 less words in its features as compared to 19219 words from the custom tf-idf vectorizer.

In [11]:
print(sorted(customTFIDF.wordToID))
print()
print(vectorizer.get_feature_names())




## Picking first five documents from the list and showing the TF-IDF scores of top 5 words representing that documents

In [12]:
# tfIdfDict = {docNo1: {word1: score, word2: score, ...}, docNo2: {word1: score, word2: score, ...}, ...}
# topFiveForDocs = {docNo1: [(top_word1, score), (top_word2, score), ... 5 tuples],
#                   docNo2: [(top_word1, score), (top_word2, score), ... 5 tuples], ... 5 pairs}
topFiveForDocs = {docNo: sorted(tfIdfDict[docNo].items(), key=lambda item: item[1], reverse=True)[:5]
                  for docNo in sorted(tfIdfDict)[:5]}

print("Top five words with their respective tf-idf scores for first five documents:\n")
for docs, topFive in topFiveForDocs.items():
    print(docs, ":")
    for word, score in topFive:
        print("\t", word, ":", score)
    print()

Top five words with their respective tf-idf scores for first five documents:

1040901_business_story_3700171.utf8 :
	 kanoria : 0.02971161999216815
	 hikal : 0.02839647539629131
	 iso : 0.027376369996206504
	 consolidation : 0.026635915219830025
	 library : 0.02654288319570519

1040901_business_story_3700827.utf8 :
	 policy : 0.038379036962406764
	 maidan : 0.03433620410280522
	 pragati : 0.03433620410280522
	 trade : 0.033158177959627305
	 export : 0.03140103024196917

1040901_business_story_3701515.utf8 :
	 patni : 0.10159199803279757
	 centre : 0.05328544516458861
	 anna : 0.051558205530869324
	 salai : 0.051558205530869324
	 patnis : 0.04685461184861962

1040901_business_story_3701518.utf8 :
	 bharat : 0.07536056714323537
	 petro : 0.06394091537786792
	 kochi : 0.04483654977782154
	 behuria : 0.04342467537316884
	 refinery : 0.038471400115950134

1040901_business_story_3701887.utf8 :
	 uti : 0.17701796202229733
	 mip : 0.10133189442932573
	 hdfc : 0.0858268906774775
	 plus : 0.0776

### Repeating the above task but now with `TfidfVectorizer`.
Different results for top 5 words and their tf-idf scores are observed in custom tf-idf vectorizer and `TfidfVectorizer`.

The scores are different as TfidfVectorizer uses `smooth_idf` so that words with very low tf-idf score are not suppressed entirely as per the documentation.

Here we use the `pandas` library to get the documents and the words more easily (`nlargest()` to get top 5 words) for the TfidfVectorizer's response.

In [13]:
# words = vectorizer.get_feature_names()
# [sorted(response.A[i], reverse=True)[:5] for i in range(5)]
import pandas as pd
tfIdfPd = pd.DataFrame(response.A.T, vectorizer.get_feature_names(), docTextDict.keys())

In [14]:
top5Docs = sorted(tfIdfDict)[:5]
docScores = [tfIdfPd[doc] for doc in top5Docs]
for dScore in docScores:
    print(dScore.nlargest(), end='\n\n')

consolidation    0.181709
content          0.179970
kanoria          0.179741
solution         0.177517
idbi             0.177504
Name: 1040901_business_story_3700171.utf8, dtype: float64

policy     0.278013
trade      0.236067
export     0.227465
maidan     0.183362
pragati    0.183362
Name: 1040901_business_story_3700827.utf8, dtype: float64

patni     0.452203
centre    0.314644
anna      0.210076
salai     0.210076
patnis    0.199301
Name: 1040901_business_story_3701515.utf8, dtype: float64

bharat         0.436715
petro          0.346689
kochi          0.232765
refinery       0.216753
corporation    0.213856
Name: 1040901_business_story_3701518.utf8, dtype: float64

uti       0.481479
mip       0.263963
hdfc      0.233444
plus      0.209383
equity    0.197967
Name: 1040901_business_story_3701887.utf8, dtype: float64

