## Importing libraries

In [1]:
from sentenceSegmentation import SentenceSegmentation
from tokenization import Tokenization
from inflectionReduction import InflectionReduction
from stopwordRemoval import StopwordRemoval
from informationRetrieval import InformationRetrieval
from sklearn.decomposition import TruncatedSVD
from sklearn.cluster import KMeans

from sys import version_info
# import argparse
import json
import time
import matplotlib.pyplot as plt
from util import *
# from main import SearchEngine
from evaluation import Evaluation
import string

import warnings
warnings.filterwarnings("ignore")

## Data importing and pre-processing

In [2]:
brown_corpus_docs = pd.read_csv('./New Corpus/Brown_Corpus_Extracted.csv')

In [3]:
brown_corpus_docs.head()

Unnamed: 0,filename,docs
0,cd05,"Furthermore , as an encouragement to revisioni..."
1,cf37,The missionary obligation to proclaim the gosp...
2,cj50,"Unfortunately , however , and for reasons to b..."
3,cf08,"In tradition and in poetry , the marriage bed ..."
4,cl06,"Eight , nine steps above him , Roberts had pau..."


In [4]:
brown_corpus_docs['preprocessed'] = brown_corpus_docs["docs"].str.lower()

In [5]:
p1 = SentenceSegmentation()
p2 = Tokenization()
p3 = InflectionReduction()
p4 = StopwordRemoval()

In [6]:
brown_corpus_docs['preprocessed'] = brown_corpus_docs['preprocessed'].apply(lambda x : p1.punkt(x))
brown_corpus_docs['preprocessed'] = brown_corpus_docs['preprocessed'].apply(lambda x : p2.pennTreeBank(x))
brown_corpus_docs['preprocessed'] = brown_corpus_docs['preprocessed'].apply(lambda x : p3.reduce(x))
brown_corpus_docs['preprocessed'] = brown_corpus_docs['preprocessed'].apply(lambda x : p4.fromList(x))

In [7]:
# Read queries
queries_json = json.load(open( ".\cranfield\cran_queries.json", 'r'))[:]
query_ids, queries = [item["query number"] for item in queries_json], \
                        [item["query"] for item in queries_json]

# Read documents
docs_json = json.load(open(".\cranfield\cran_docs.json", 'r'))[:]
doc_ids, docs = [item["id"] for item in docs_json], \
                        [item["body"] for item in docs_json]
# Loading preprocessed queries
Preprocessed_queries = json.load(open('stopword_removed_queries.txt', 'r'))
# Loading Preprocessed docs
Preprocessed_docs = json.load(open('stopword_removed_docs.txt', 'r'))

qrels = json.load(open( "./cranfield/cran_qrels.json", 'r'))[:]

In [8]:
new_corpus = brown_corpus_docs['preprocessed'].tolist()
total_corpus = Preprocessed_docs + Preprocessed_queries + new_corpus

In [9]:
# Reference
evaluator = Evaluation()
def Evaluation_metrics(doc_IDs_ordered, query_ids, qrels, n_comp, op_folder = './',save_results = 2, verbose = 1):
    """
    save_results : 0    ===> don't save anything
                 : 1    ===> just save results
                 : > 2  ===> save plots also
    """
    precisions, recalls, fscores, MAPs, nDCGs = [], [], [], [], []
    for k in range(1,11):
        precision = evaluator.meanPrecision(
            doc_IDs_ordered, query_ids, qrels, k)
        precisions.append(precision)
        recall = evaluator.meanRecall(
            doc_IDs_ordered, query_ids, qrels, k)
        recalls.append(recall)
        fscore = evaluator.meanFscore(
            doc_IDs_ordered, query_ids, qrels, k)
        fscores.append(fscore)

        MAP = evaluator.meanAveragePrecision(
            doc_IDs_ordered, query_ids, qrels, k)
        MAPs.append(MAP)
        nDCG = evaluator.meanNDCG(
            doc_IDs_ordered, query_ids, qrels, k)
        nDCGs.append(nDCG)
        if (verbose):
            print("Precision, Recall and F-score @ " +  
                str(k) + " : " + str(precision) + ", " + str(recall) + 
                ", " + str(fscore))
            print("MAP, nDCG @ " +  
                str(k) + " : " + str(MAP) + ", " + str(nDCG))
        if (save_results > 0):
        # saving the results
            with open(op_folder+'Results/LSA_'+str(n_comp)+'.txt', 'a') as f:
                f.write(str(k) + " , " + str(precision) + ", " + str(recall) + 
                        ", " + str(fscore)+", "+str(MAP) + ", " + str(nDCG)+'\n')
            with open(op_folder+'Results/metrics_'+str(k)+'.txt', 'a') as f:
                f.write(str(n_comp) + " , " + str(precision) + ", " + str(recall) + 
                        ", " + str(fscore)+", "+str(MAP) + ", " + str(nDCG)+'\n')
            
    # Plot the metrics and save plot 
    if (save_results > 1):
        plt.figure()
        plt.plot(range(1, 11), precisions, label="Precision")
        plt.plot(range(1, 11), recalls, label="Recall")
        plt.plot(range(1, 11), fscores, label="F-Score")
        plt.plot(range(1, 11), MAPs, label="MAP")
        plt.plot(range(1, 11), nDCGs, label="nDCG")
        plt.legend()
        plt.title("Evaluation Metrics - LSA "+str(n_comp))
        plt.xlabel("k")
        plt.savefig(op_folder + "Plots/LSA_"+str(n_comp)+".png")
        


# TF-idf using Vectorizer

In [10]:
import itertools
merged_total_docs = []
for docs in total_corpus:
    merged = ' '.join(list(itertools.chain(*docs)))
    merged_total_docs += [merged]

In [12]:
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(merged_total_docs)
print("Total number of docs and queries included are {} and Vocabulary size is {}".format(X.shape[0],X.shape[1]))

feature_names = vectorizer.get_feature_names()
columns = ['doc_' + str(i) for i in range(1,X.shape[0]+1)]

tfidf_df = pd.DataFrame(X.T.todense(), index=feature_names, columns=columns)
tfidf_df.head()

Total number of docs and queries included are 2125 and Vocabulary size is 34872


Unnamed: 0,doc_1,doc_2,doc_3,doc_4,doc_5,doc_6,doc_7,doc_8,doc_9,doc_10,...,doc_2116,doc_2117,doc_2118,doc_2119,doc_2120,doc_2121,doc_2122,doc_2123,doc_2124,doc_2125
00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.107015,0.015161,0.0,0.0,0.0
0001,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
0005,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
000degree,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [31]:
tf_idf_docs = tfidf_df[columns[:len(Preprocessed_docs)]].values
vec_rep_queries = tfidf_df[columns[len(Preprocessed_docs):len(Preprocessed_docs) + len(Preprocessed_queries)]].values

In [32]:
# word_map = build_word_index(Preprocessed_docs, doc_ids)

# # TF-IDF representation
# tf_idf_docs = TF_IDF(Preprocessed_docs, doc_ids, word_map, normalize = True)
# vec_rep_queries = TF_IDF(Preprocessed_queries, query_ids, word_map, normalize = True, is_queries= True)

cosine_sim = np.matmul(tf_idf_docs.T, vec_rep_queries )
doc_IDs_ordered = (np.argsort(cosine_sim,axis=0)+1)[::-1].T.tolist()

In [33]:
Evaluation_metrics(doc_IDs_ordered, query_ids, qrels,n_comp = 0,save_results=0)

Precision, Recall and F-score @ 1 : 0.6844444444444444, 0.11682720282918488, 0.19122503049513545
MAP, nDCG @ 1 : 0.6844444444444444, 0.5185185185185185
Precision, Recall and F-score @ 2 : 0.56, 0.18175337913453635, 0.25746452014559285
MAP, nDCG @ 2 : 0.7177777777777777, 0.41077004154365504
Precision, Recall and F-score @ 3 : 0.5096296296296297, 0.23954680755481814, 0.30282208900281954
MAP, nDCG @ 3 : 0.7288888888888893, 0.3931538951309575
Precision, Recall and F-score @ 4 : 0.45555555555555555, 0.28427692701300766, 0.32434011316712025
MAP, nDCG @ 4 : 0.7204938271604941, 0.3867247107075635
Precision, Recall and F-score @ 5 : 0.4124444444444446, 0.3134559807241778, 0.32864933269170726
MAP, nDCG @ 5 : 0.7176234567901238, 0.38842838741781793
Precision, Recall and F-score @ 6 : 0.38148148148148125, 0.3410360364708999, 0.33240844115561025
MAP, nDCG @ 6 : 0.7129086419753086, 0.39511801882172287
Precision, Recall and F-score @ 7 : 0.3612698412698417, 0.3714738821172082, 0.3380591963497686
MAP,

### LSA Tuning with various components

In [34]:
components_used = [20, 50, 100, 200, 300, 400, 500, 600, 700, 800, 900, 1000]
# components_used = [1100, 1200, 1300, 1400, 1500, 1600, 1700, 1800, 1900, 2000]
# components_used = [1,10]
for n_comp in components_used:
    svd = TruncatedSVD(n_components=n_comp)
    svd.fit(tf_idf_docs.T)
    tr_docs = svd.transform(tf_idf_docs.T).T
    # tr_docs.shape
    qr_tr = svd.transform(vec_rep_queries.T).T
    # qr_tr.shape
    cosine_sim = np.matmul(tr_docs.T, qr_tr )
    doc_IDs_ordered = (np.argsort(cosine_sim,axis=0)+1)[::-1].T.tolist()
    print("\nLSA with "+str(n_comp)+" in progress\n")
    Evaluation_metrics(doc_IDs_ordered, query_ids, qrels, n_comp, op_folder='./tf_idf_matrix/')


LSA with 20 in progress

Precision, Recall and F-score @ 1 : 0.22666666666666666, 0.02905521288328305, 0.049906078568340395
MAP, nDCG @ 1 : 0.22666666666666666, 0.11555555555555551


FileNotFoundError: [Errno 2] No such file or directory: './tf_idf_matrix/Results/LSA_20.txt'

### Clustering documents to reduce search time

In [None]:
km = KMeans(n_clusters= 3, random_state=0)
km.fit(tf_idf_docs.T)
km.cluster_centers_.shape

In [None]:
cluster_doc_ids = {}
for i in range(1400):
    try :
        cluster_doc_ids[km.labels_[i]] += [i]
    except :
        cluster_doc_ids[km.labels_[i]] = [i]

In [None]:
# cluster_query = np.matmul(vec_rep_queries.T, km.cluster_centers_.T)
# cluster_query = np.argmax(cluster_query, axis = 1)
# cluster_query.shape

# normal method of retrieval
tic = time.time()
cosine_sim = []
for i in range(1400):
    cosine_sim.append(np.matmul(tf_idf_docs[:,i].T, vec_rep_queries[:,0]))
cosine_sim = np.array(cosine_sim)
doc_IDs_ordered = (np.argsort(cosine_sim,axis=0)+1)[::-1].T.tolist()
print(len(doc_IDs_ordered))
toc = time.time()
print("without clustering, Retrieval time : "+str(toc-tic))

# clustering method
tic = time.time()
cluster = np.argmax(np.matmul(vec_rep_queries[:,0], km.cluster_centers_.T))
cluster_docs = tf_idf_docs[:, cluster_doc_ids[cluster]]
# cosine_sim = np.matmul(tf_idf_docs[:, cluster_doc_ids[cluster]].T,vec_rep_queries[:,0])
cosine_sim = np.matmul(cluster_docs.T,vec_rep_queries[:,0])
doc_IDs_ordered_clus = (np.argsort(cosine_sim,axis=0))[::-1].T.tolist()
print(len(doc_IDs_ordered_clus))
doc_IDs_ordered = np.array(cluster_doc_ids[cluster])[doc_IDs_ordered_clus]+1
toc = time.time()
print("clustering method, Retrieval time : "+str(toc-tic))


In [None]:
## checking for the compensation made
doc_IDs_ordered = []
for qry_idx in range(225):
    cluster = np.argmax(np.matmul(vec_rep_queries[:,qry_idx], km.cluster_centers_.T))
    cluster_docs = tf_idf_docs[:, cluster_doc_ids[cluster]]
    # cosine_sim = np.matmul(tf_idf_docs[:, cluster_doc_ids[cluster]].T,vec_rep_queries[:,0])
    cosine_sim = np.matmul(cluster_docs.T,vec_rep_queries[:,qry_idx])
    doc_IDs_ordered_clus = (np.argsort(cosine_sim,axis=0))[::-1].T.tolist()
    # print(len(doc_IDs_ordered_clus))
    doc_IDs_ordered.append((np.array(cluster_doc_ids[cluster])[doc_IDs_ordered_clus]+1).tolist())
    


In [None]:
Evaluation_metrics(doc_IDs_ordered, query_ids, qrels,n_comp = 0,save_results=0)

### Observation
- Clustering reduced the retrieval time by 5

## Query Expansion -- distributional word similarity

In [None]:
from gensim.models import Word2Vec

In [None]:
All_sentences = []
for doc in Preprocessed_docs:
    for sent in doc:
        All_sentences.append(sent)
All_sentences

In [None]:
model = Word2Vec(min_count=1,
                window = 3,
                size = 500,
                sample = 6e-5)
t = time.time()
model.build_vocab(All_sentences, progress_per = 1000)
print('Time to build vocab: {} mins'.format(round((time.time() - t) / 60, 2)))

In [None]:
t = time.time()

model.train(All_sentences, total_examples=model.corpus_count, epochs=50, report_delay=1)

print('Time to train the model: {} mins'.format(round((time.time() - t) / 60, 2)))

In [None]:
res = np.array(model.wv.most_similar(positive=["good"]))[:,0].tolist()
res

In [None]:
def query_expansion(query, wv_model,v=1):
    """
    query : query to be expanded (a list of lists, where each sublist is a sentence)
    wv_model : word2vec trained model
    v : top v similar words taken into consideration
    """
    expanded_query = query.copy()
    for sent in query:
        
        for word in sent:
            
            try:
                expanded_query.append(np.array(wv_model.wv.most_similar(positive=[word],topn = v))[:,0].tolist())
            except:
                pass
    return expanded_query

In [None]:
# sample return of query_expansion result
query_expansion([["investigation","bad"]],model,v=1)

In [None]:
Preprocessed_queries[0]

In [None]:
expanded_queries = []
for query in Preprocessed_queries:
    expanded_queries.append(query_expansion(query,model2, v=1))

In [None]:
vec_rep_exp_queries = TF_IDF(expanded_queries, query_ids, word_map, is_queries=True)

In [None]:
cosine_sim = np.matmul(tf_idf_docs.T, vec_rep_exp_queries )
doc_IDs_ordered = (np.argsort(cosine_sim,axis=0)+1)[::-1].T.tolist()

In [None]:
Evaluation_metrics(doc_IDs_ordered, query_ids, qrels,n_comp = 0,save_results=0)

In [None]:
from nltk.corpus import brown

In [None]:
brown.sents()

In [None]:
t = time.time()
model2 = Word2Vec(brown.sents(),min_count=1,
                window = 2,
                size = 200,
                sample = 6e-5)
# model.build_vocab(, progress_per = 10000)
print('Time to build vocab: {} mins'.format(round((time.time() - t) / 60, 2)))

In [None]:
t = time.time()

model2.train(All_sentences, total_examples=model.corpus_count, epochs=50, report_delay=1)

print('Time to train the model: {} mins'.format(round((time.time() - t) / 60, 2)))

In [None]:
res = np.array(model2.wv.most_similar(positive=["good"]))[:,0].tolist()
res

In [None]:
Sentences = All_sentences+brown.sents()
Sentences

In [None]:
t = time.time()
model2 = Word2Vec(Sentences,min_count=1,
                window = 2,
                size = 300,
                sample = 6e-5)
# model.build_vocab(, progress_per = 10000)
print('Time to build vocab: {} mins'.format(round((time.time() - t) / 60, 2)))

In [None]:
t = time.time()

model2.train(All_sentences, total_examples=model.corpus_count, epochs=50, report_delay=1)

print('Time to train the model: {} mins'.format(round((time.time() - t) / 60, 2)))