In [135]:
#Libraries
import glob
import nltk
nltk.download('popular');
from nltk.corpus import stopwords
from nltk import word_tokenize
import string
from collections import Counter
import numpy as np
from collections import OrderedDict
import multiprocessing 
from multiprocessing import Pool

[nltk_data] Downloading collection 'popular'
[nltk_data]    | 
[nltk_data]    | Downloading package cmudict to
[nltk_data]    |     /Users/dia.srivastava.12/nltk_data...
[nltk_data]    |   Package cmudict is already up-to-date!
[nltk_data]    | Downloading package gazetteers to
[nltk_data]    |     /Users/dia.srivastava.12/nltk_data...
[nltk_data]    |   Package gazetteers is already up-to-date!
[nltk_data]    | Downloading package genesis to
[nltk_data]    |     /Users/dia.srivastava.12/nltk_data...
[nltk_data]    |   Package genesis is already up-to-date!
[nltk_data]    | Downloading package gutenberg to
[nltk_data]    |     /Users/dia.srivastava.12/nltk_data...
[nltk_data]    |   Package gutenberg is already up-to-date!
[nltk_data]    | Downloading package inaugural to
[nltk_data]    |     /Users/dia.srivastava.12/nltk_data...
[nltk_data]    |   Package inaugural is already up-to-date!
[nltk_data]    | Downloading package movie_reviews to
[nltk_data]    |     /Users/dia.srivastava.1

In [206]:
Stop = stopwords.words('english') + list(string.punctuation) + ['\n']

In [234]:
def give_path(fld_path):                             #give path of the folder containing all documents
    dic = {}
    file_names = glob.glob(fld_path)
    files_150 = file_names[0:10]
    for file in files_150:
        name = file.split('/')[-1]
        with open(file, 'r', errors='ignore') as f:
            data = f.read()
        dic[name] = data
    return dic


In [235]:
def wordList_removePuncs(doc_dict):

    wordList = []
    words_in_doc = {}
    for doc_id, doc in doc_dict.items():
        for word in word_tokenize(doc.lower().strip()):  # Tokenize the text into words
            if word not in Stop:  # Exclude stopwords and punctuation
                wordList.append(word)  # Add word to list
        words_in_doc[doc_id] = len(wordList)  # Store number of words in the document
    return wordList, words_in_doc

In [236]:
def termFrequencyInDoc(vocab, doc_dict):
    tf_docs = {}
    for doc_id in doc_dict.keys():
        tf_docs[doc_id] = {}
    
    for word in vocab:
        for doc_id,doc in doc_dict.items():
            tf_docs[doc_id][word] = doc.count(word)
    return tf_docs

In [237]:
def logNorm_tf(tf_doc):
    ln_tf={}
    for doc_id, doc in tf_doc.items():
        ln_tf[doc_id] = {word: 1 + np.log(tf) if tf > 0 else 0 for word, tf in doc.items()}
    return ln_tf    

In [238]:
def wordDocFre(vocab, doc_dict):
    df = {word: 0 for word in vocab}
    
    for doc in doc_dict.values():
        words = set(word_tokenize(doc.lower().strip()))
        for word in words:
            if word in vocab:
                df[word] += 1
    
    return df

In [239]:
def inverseDocFre(vocab,doc_fre,length):
    idf= {} 
    for word in vocab:     
        idf[word] = np.log2((length+1) / doc_fre[word])
    return idf

In [240]:
def tfidf(vocab,tf,idf_scr,doc_dict):
    tf_idf_scr = {}
    for doc_id in doc_dict.keys():
        tf_idf_scr[doc_id] = {}
    for word in vocab:
        for doc_id,doc in doc_dict.items():
            tf_idf_scr[doc_id][word] = tf[doc_id][word] * idf_scr[word]
    return tf_idf_scr

In [241]:
def vectorSpaceModel(query, doc_dict,tfidf_scr):
    query_vocab = []
    for word in query.split():
        if word not in query_vocab:
            query_vocab.append(word)
    query_wc = {}
    for word in query_vocab:
        query_wc[word] = query.lower().split().count(word)
        relevance_scores = {}
    for doc_id in doc_dict.keys():
        score = 0
        for word in query_vocab:
            score += query_wc[word] * tfidf_scr[doc_id][word]
        relevance_scores[doc_id] = score
    sorted_value = OrderedDict(sorted(relevance_scores.items(), key=lambda x: x[1], reverse = True))
    top_5 = {k: sorted_value[k] for k in list(sorted_value)[:5]}
    return top_5

In [242]:
def tf_norm_words(docs, words_in_docs, tf_dict):
    tf_norm = {
        doc_id: {
            word: tf / words_in_docs[doc_id] for word, tf in tf_dict[doc_id].items()
        } for doc_id in docs.keys()
    }
    return tf_norm

In [244]:
if __name__  == "__main__":
    path = 'ACL/*.txt'
    docs = give_path(path)                        #returns a dictionary of all docs
    print('Documents Fetched:', docs.keys())
    print('Fetching files: 10')
    
    M = len(docs) 
      #number of files in dataset
    w_List, words_in_docs = wordList_removePuncs(docs)           #returns a list of tokenized words
    vocab = list(set(w_List))                     #returns a list of unique words
    print('Number of Docs:', M)
    
    
    
    print('Calculating TF-IDF scores... without normalization')
    tf_dict = termFrequencyInDoc(vocab, docs)     #returns term frequency
    df_dict = wordDocFre(vocab, docs)             #returns document frequencies
    idf_dict = inverseDocFre(vocab,df_dict,M)     #returns idf scores
    print('TF-IDF scores calculated without normalization')
    
    
    print('Calculating TF-IDF scores... with number of words in a doc normalization')
    tf_norm = tf_norm_words(docs, words_in_docs, tf_dict)
    tf_idf_norm = tfidf(vocab, tf_norm, idf_dict, docs)  #returns tf-idf socres
    print('TF-IDF scores calculated with number of words in a doc normalization')
    
    
    print('Calculating TF-IDF scores... with log normalization')
    tf_log = logNorm_tf(tf_dict)
    tf_idf_log = tfidf(vocab, tf_log, idf_dict, docs)
    print('TF-IDF scores calculated with log normalization', end='\n\n')

    queries=[
        'text',
        'LDA',
        'topic modeling',
        'Natural language Processing',
        'generative models'
    ]
    
    for query in queries:
        top_documents = vectorSpaceModel(query, docs, tf_idf)
        top_documents_norm = vectorSpaceModel(query, docs, tf_idf_norm)
        top_documents_log = vectorSpaceModel(query, docs, tf_idf_log)
        print('\n')
        print('Top 5 Documents for Query "{}":'.format(query), end='\n\n')
        print('Without Normalization:', end='\t')
        print(top_documents, end='\n\n')

Documents Fetched: dict_keys(['W09-3025.pdf.txt', 'P87-1010.pdf.txt', 'W14-5314.pdf.txt', 'J13-4002.pdf.txt', 'W98-1306.pdf.txt', 'W00-1324.pdf.txt', 'W08-2121.pdf.txt', 'P09-4008.pdf.txt', 'P14-1053.pdf.txt', 'P84-1066.pdf.txt'])
Fetching files: 10
Number of Docs: 10
Calculating TF-IDF scores... without normalization
TF-IDF scores calculated without normalization
Calculating TF-IDF scores... with number of words in a doc normalization
TF-IDF scores calculated with number of words in a doc normalization
Calculating TF-IDF scores... with log normalization
TF-IDF scores calculated with log normalization



KeyError: 'J13-4002.pdf.txt'

In [250]:
print("""Without normalisation, term frequencies are not adjusted for document length, resulting in larger raw term frequencies in lengthier documents, potentially biassing comparisons.
Longer documents may not necessarily have an advantage in terms of phrase frequency when normalised for word count.
      It accurately represents the value of terms within papers.
Log normalisation helps balance the relevance of terms in a document by reducing the impact of high term frequency.""")

Without normalisation, term frequencies are not adjusted for document length, resulting in larger raw term frequencies in lengthier documents, potentially biassing comparisons.
Longer documents may not necessarily have an advantage in terms of phrase frequency when normalised for word count.
      It accurately represents the value of terms within papers.
Log normalisation helps balance the relevance of terms in a document by reducing the impact of high term frequency.
