In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import json
import os
from util import *
# Explicit imports stay below the star import so they cannot be shadowed by it.
import re
import time
from evaluation import Evaluation
import string
#from gensim.parsing.preprocessing import STOPWORDS
import warnings
warnings.filterwarnings("ignore")
import nltk
from nltk.corpus import stopwords
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
def _load_json(path):
    """Parse the JSON file at ``path`` and close the file handle afterwards."""
    with open(path, 'r') as f:
        return json.load(f)

# All paths use forward slashes: they resolve on every OS, whereas a literal
# backslash path only works on Windows and depends on "\c" not being an escape.

# Read queries
queries_json = _load_json("./cranfield/cran_queries.json")
query_ids = [item["query number"] for item in queries_json]
queries = [item["query"] for item in queries_json]

# Read documents (only the title of each document is used as its text)
docs_json = _load_json("./cranfield/cran_docs.json")
doc_ids = [item["id"] for item in docs_json]
docs = [item["title"] for item in docs_json]

# Read qrels (handed to the evaluation functions further down)
qrels = _load_json("./cranfield/cran_qrels.json")


In [4]:
# Drop the document fields that the rest of the notebook does not use.
unused_fields = ['author', 'bibliography', 'id', 'body']
docs_df = pd.DataFrame(docs_json).drop(columns=unused_fields)
docs_df.head()

Unnamed: 0,title
0,experimental investigation of the aerodynamics...
1,simple shear flow past a flat plate in an inco...
2,the boundary layer in simple shear flow past a...
3,approximate solutions of the incompressible la...
4,one-dimensional transient heat conduction into...


# Preprocessing

In [5]:
# Ordered (regex, replacement) pairs applied by clean_text. Order matters:
# whole-word contractions come first, then the generic suffix rules, and
# "can't"/"won't" must be rewritten before the catch-all "n't" rule.
_CONTRACTION_RULES = (
    (r"i'm", "i am"),
    (r"you'll", "you will"),
    (r"i'll", "i will"),
    (r"she'll", "she will"),
    (r"he'll", "he will"),
    (r"he's", "he is"),
    (r"she's", "she is"),
    (r"that's", "that is"),
    (r"what's", "what is"),
    (r"where's", "where is"),
    (r"there's", "there is"),
    (r"here's", "here is"),
    (r"who's", "who is"),
    (r"how's", "how is"),
    (r"\'ll", " will"),
    (r"\'ve", " have"),
    (r"\'re", " are"),
    (r"\'d", " would"),
    (r"can't", "cannot"),
    (r"won't", "will not"),
    (r"don't", "do not"),
    (r"shouldn't", "should not"),
    (r"n't", " not"),
)

def clean_text(text):
    """Lower-case ``text``, expand English contractions and keep letters only.

    Steps, in order:
      1. lower-case the input;
      2. apply every rule in ``_CONTRACTION_RULES``;
      3. replace every character outside ``a-z`` with a space;
      4. collapse each run of whitespace to a single space and strip the ends.

    Returns the cleaned string ("" for empty input).
    """
    text = text.lower()
    for pattern, replacement in _CONTRACTION_RULES:
        text = re.sub(pattern, replacement, text)
    # Anything other than a lower-case letter (digits, punctuation, ...) becomes a space.
    text = re.sub(r"[^a-z]", " ", text)
    # Collapse runs of ANY length; the previous three-space pattern left
    # doubled spaces and four-or-more-space runs only partially reduced.
    return re.sub(r"\s+", " ", text).strip()

def remove_punctuation(text):
    """Return ``text`` with ASCII punctuation, curly quotes and underscores deleted."""
    unwanted = string.punctuation + '“' + '”'+'’' + '_'
    # Mapping a code point to None makes str.translate drop that character.
    deletion_table = dict.fromkeys(map(ord, unwanted))
    return text.translate(deletion_table)

STOPWORDS = set(stopwords.words('english'))
def remove_stopwords(text):
    """Drop every whitespace-separated token of ``text`` that is in STOPWORDS.

    The surviving tokens are re-joined with single spaces.
    """
    kept = []
    for token in str(text).split():
        if token in STOPWORDS:
            continue
        kept.append(token)
    return " ".join(kept)

def lemmatize_words(text):
    """POS-tag the whitespace tokens of ``text`` and lemmatize each one.

    The first letter of each tag selects the WordNet part of speech
    (N, V, J, R); any other tag falls back to the noun lemmatizer.
    Returns the lemmas joined by single spaces.
    """
    tag_to_wordnet = {"N":wordnet.NOUN, "V":wordnet.VERB, "J":wordnet.ADJ, "R":wordnet.ADV}
    lemmatizer = WordNetLemmatizer()
    lemmas = []
    for token, tag in nltk.pos_tag(text.split()):
        wordnet_pos = tag_to_wordnet.get(tag[0], wordnet.NOUN)
        lemmas.append(lemmatizer.lemmatize(token, wordnet_pos))
    return " ".join(lemmas)

In [6]:
# One row per query, raw text in the 'query' column.
query_df = pd.DataFrame({'query': queries})

In [7]:
# Start from the lower-cased raw text ...
docs_df['preprocessed'] = docs_df['title'].str.lower()
query_df['preprocessed'] = query_df['query'].str.lower()

# ... then run the same cleaning steps, in this order, over documents and queries.
for step in (clean_text, remove_punctuation, remove_stopwords, lemmatize_words):
    docs_df['preprocessed'] = docs_df['preprocessed'].apply(step)
    query_df['preprocessed'] = query_df['preprocessed'].apply(step)

In [8]:
# Preview the first rows to check the new 'preprocessed' column against 'title'.
docs_df.head()

Unnamed: 0,title,preprocessed
0,experimental investigation of the aerodynamics...,experimental investigation aerodynamics wing s...
1,simple shear flow past a flat plate in an inco...,simple shear flow past flat plate incompressib...
2,the boundary layer in simple shear flow past a...,boundary layer simple shear flow past flat plate
3,approximate solutions of the incompressible la...,approximate solution incompressible laminar bo...
4,one-dimensional transient heat conduction into...,one dimensional transient heat conduction doub...


In [9]:
# Documents first, then queries: later cells split the TF-IDF columns at len(docs_df).
corpus = [*docs_df['preprocessed'], *query_df['preprocessed']]

# TF-IDF and Evaluation

In [10]:
evaluator = Evaluation()
def Evaluation_metrics(doc_IDs_ordered, query_ids, qrels, n_comp, op_folder = './',save_results = 2, verbose = 1, max_k = 10):
    """
    Compute mean precision, recall, F-score, MAP and nDCG at k = 1..max_k with
    the module-level ``evaluator`` and optionally print, log and plot them.

    doc_IDs_ordered : rankings, forwarded unchanged to ``evaluator``
                      (presumably one ranked ID list per query -- confirm
                      against the Evaluation class)
    query_ids, qrels: forwarded unchanged to ``evaluator``
    n_comp          : label only; used in the result file name, in the rows of
                      the per-k files and in the plot title
    op_folder       : prefix of the output paths; it is concatenated as-is, so
                      it has to end with a path separator
    save_results : 0    ===> don't save anything
                 : 1    ===> just save results
                 : >= 2 ===> save plots also
    verbose         : truthy ===> print the metrics for every k
    max_k           : largest cut-off k that is evaluated (default 10)

    Result files are opened in append mode, so repeated runs add rows.
    Returns None.
    """
    ks = range(1, max_k + 1)

    # Create the output folders up front so open()/savefig() cannot fail on a fresh checkout.
    if (save_results > 0):
        os.makedirs(op_folder + 'Results', exist_ok=True)
    if (save_results > 1):
        os.makedirs(op_folder + 'Plots', exist_ok=True)

    precisions, recalls, fscores, MAPs, nDCGs = [], [], [], [], []
    for k in ks:
        precision = evaluator.meanPrecision(
            doc_IDs_ordered, query_ids, qrels, k)
        precisions.append(precision)
        recall = evaluator.meanRecall(
            doc_IDs_ordered, query_ids, qrels, k)
        recalls.append(recall)
        fscore = evaluator.meanFscore(
            doc_IDs_ordered, query_ids, qrels, k)
        fscores.append(fscore)

        MAP = evaluator.meanAveragePrecision(
            doc_IDs_ordered, query_ids, qrels, k)
        MAPs.append(MAP)
        nDCG = evaluator.meanNDCG(
            doc_IDs_ordered, query_ids, qrels, k)
        nDCGs.append(nDCG)
        if (verbose):
            print("Precision, Recall and F-score @ " +
                str(k) + " : " + str(precision) + ", " + str(recall) +
                ", " + str(fscore))
            print("MAP, nDCG @ " +
                str(k) + " : " + str(MAP) + ", " + str(nDCG))
        if (save_results > 0):
            # The same metric columns go into both files; only the leading key differs.
            metrics_row = ", ".join(
                str(value) for value in (precision, recall, fscore, MAP, nDCG)) + '\n'
            # One file per n_comp, keyed by k ...
            with open(op_folder+'Results/LSA_'+str(n_comp)+'.txt', 'a') as f:
                f.write(str(k) + " , " + metrics_row)
            # ... and one file per k, keyed by n_comp.
            with open(op_folder+'Results/metrics_'+str(k)+'.txt', 'a') as f:
                f.write(str(n_comp) + " , " + metrics_row)

    # Plot the metrics and save plot
    if (save_results > 1):
        plt.figure()
        plt.plot(ks, precisions, label="Precision")
        plt.plot(ks, recalls, label="Recall")
        plt.plot(ks, fscores, label="F-Score")
        plt.plot(ks, MAPs, label="MAP")
        plt.plot(ks, nDCGs, label="nDCG")
        plt.legend()
        plt.title("Evaluation Metrics - LSA "+str(n_comp))
        plt.xlabel("k")
        plt.savefig(op_folder + "Plots/LSA_"+str(n_comp)+".png")

In [11]:
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(corpus)
print("Total number of docs and queries included are {} and Vocabulary size is {}".format(X.shape[0],X.shape[1]))

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# use get_feature_names_out() when available and fall back on older releases.
if hasattr(vectorizer, "get_feature_names_out"):
    feature_names = list(vectorizer.get_feature_names_out())
else:
    feature_names = vectorizer.get_feature_names()
# One 'doc_<i>' label per corpus entry (documents AND queries), numbered from 1.
columns = ['doc_' + str(i) for i in range(1,X.shape[0]+1)]

# Terms as rows, corpus entries as columns; toarray() gives a plain ndarray
# instead of the legacy np.matrix returned by todense().
tfidf_df = pd.DataFrame(X.T.toarray(), index=feature_names, columns=columns)
tfidf_df.head()

Total number of docs and queries included are 1625 and Vocabulary size is 1648


Unnamed: 0,doc_1,doc_2,doc_3,doc_4,doc_5,doc_6,doc_7,doc_8,doc_9,doc_10,...,doc_1616,doc_1617,doc_1618,doc_1619,doc_1620,doc_1621,doc_1622,doc_1623,doc_1624,doc_1625
ablate,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
ablating,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
ablation,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
ablative,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
absence,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.605634,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
# (number of terms, number of documents + queries): terms are rows, corpus entries are columns.
tfidf_df.shape

In [12]:
# Columns are ordered documents-then-queries, so split them at the document count.
n_docs = len(docs_df)
doc_columns, query_columns = columns[:n_docs], columns[n_docs:]
tf_idf_docs = tfidf_df[doc_columns].values
vec_rep_queries = tfidf_df[query_columns].values

In [13]:
# Sanity check: (vocabulary size, number of queries).
vec_rep_queries.shape

(1648, 225)

In [14]:
# TfidfVectorizer L2-normalises each vector by default, so the dot product is the cosine similarity.
cosine_sim = tf_idf_docs.T @ vec_rep_queries
# Sort each query column ascending, shift the 0-based positions to 1-based
# document numbers, then flip so the best match comes first.
ascending_ids = np.argsort(cosine_sim, axis=0) + 1
doc_IDs_ordered = np.flipud(ascending_ids).T.tolist()

In [15]:
# TF-IDF baseline: print the metrics only. With save_results=0 nothing is
# written or plotted, so the n_comp label is not used.
Evaluation_metrics(doc_IDs_ordered, query_ids, qrels,n_comp = 0,save_results=0)

Precision, Recall and F-score @ 1 : 0.5911111111111111, 0.09802920317566409, 0.16210530863472036
MAP, nDCG @ 1 : 0.5911111111111111, 0.4118518518518518
Precision, Recall and F-score @ 2 : 0.4822222222222222, 0.15735581303439025, 0.22459347213488212
MAP, nDCG @ 2 : 0.6466666666666666, 0.33368226373889265
Precision, Recall and F-score @ 3 : 0.4414814814814813, 0.20782118034681657, 0.2651499335857847
MAP, nDCG @ 3 : 0.6614814814814814, 0.32825103615919604
Precision, Recall and F-score @ 4 : 0.3877777777777778, 0.23854025406062707, 0.2759486966876725
MAP, nDCG @ 4 : 0.6591358024691358, 0.3247484320221337
Precision, Recall and F-score @ 5 : 0.35288888888888925, 0.2651030491116788, 0.2816126363966719
MAP, nDCG @ 5 : 0.6521111111111109, 0.326260758367758
Precision, Recall and F-score @ 6 : 0.317037037037037, 0.28248630120339324, 0.27785442945229755
MAP, nDCG @ 6 : 0.6453419753086421, 0.3262984976408044
Precision, Recall and F-score @ 7 : 0.2946031746031749, 0.3031342276311141, 0.2779106545963

# Using Brown Corpus

In [None]:
brown_corpus_docs = pd.read_csv('./New Corpus/Brown_Corpus_Extracted.csv')

# Same preprocessing chain as for the Cranfield documents and queries.
brown_preprocessed = brown_corpus_docs["docs"].str.lower()
for step in (clean_text, remove_punctuation, remove_stopwords, lemmatize_words):
    brown_preprocessed = brown_preprocessed.apply(step)
brown_corpus_docs['preprocessed'] = brown_preprocessed

In [None]:
new_corpus = brown_corpus_docs['preprocessed'].tolist()
# Order matters: documents, then queries, then the extra Brown texts, because
# later cells slice the TF-IDF columns by position.
total_corpus = [*docs_df['preprocessed'], *query_df['preprocessed'], *new_corpus]

In [None]:
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(total_corpus)
print("Total number of docs and queries included are {} and Vocabulary size is {}".format(X.shape[0],X.shape[1]))

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# use get_feature_names_out() when available and fall back on older releases.
if hasattr(vectorizer, "get_feature_names_out"):
    feature_names = list(vectorizer.get_feature_names_out())
else:
    feature_names = vectorizer.get_feature_names()
# One 'doc_<i>' label per entry of total_corpus, numbered from 1.
columns = ['doc_' + str(i) for i in range(1,X.shape[0]+1)]

# Terms as rows, corpus entries as columns; toarray() gives a plain ndarray
# instead of the legacy np.matrix returned by todense().
tfidf_df = pd.DataFrame(X.T.toarray(), index=feature_names, columns=columns)
tfidf_df.head()

In [None]:
# total_corpus is laid out as documents | queries | Brown texts; take the first two slices.
n_docs = len(docs_df['preprocessed'])
n_queries = len(query_df['preprocessed'])
tf_idf_docs = tfidf_df[columns[:n_docs]].values
vec_rep_queries = tfidf_df[columns[n_docs:n_docs + n_queries]].values

In [None]:
# Dot product of the TF-IDF vectors (documents x queries).
cosine_sim = tf_idf_docs.T @ vec_rep_queries
# Ascending sort per query column, shift to 1-based document numbers, best match first.
ascending_ids = np.argsort(cosine_sim, axis=0) + 1
doc_IDs_ordered = np.flipud(ascending_ids).T.tolist()
Evaluation_metrics(doc_IDs_ordered, query_ids, qrels, n_comp=0, save_results=0)

# LSA without Brown corpus

In [None]:
# Rebuild the Cranfield-only corpus: the previous section replaced the TF-IDF frame.
corpus = docs_df['preprocessed'].tolist() + query_df['preprocessed'].tolist()
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(corpus)
print("Total number of docs and queries included are {} and Vocabulary size is {}".format(X.shape[0],X.shape[1]))

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# use get_feature_names_out() when available and fall back on older releases.
if hasattr(vectorizer, "get_feature_names_out"):
    feature_names = list(vectorizer.get_feature_names_out())
else:
    feature_names = vectorizer.get_feature_names()
# One 'doc_<i>' label per corpus entry (documents AND queries), numbered from 1.
columns = ['doc_' + str(i) for i in range(1,X.shape[0]+1)]

# Terms as rows, corpus entries as columns; toarray() gives a plain ndarray
# instead of the legacy np.matrix returned by todense().
tfidf_df = pd.DataFrame(X.T.toarray(), index=feature_names, columns=columns)
#tfidf_df.head()

In [None]:
# Columns are ordered documents-then-queries, so split them at the document count.
n_docs = len(docs_df)
doc_columns, query_columns = columns[:n_docs], columns[n_docs:]
tf_idf_docs = tfidf_df[doc_columns].values
vec_rep_queries = tfidf_df[query_columns].values

In [None]:
components_used = [500]
for n_comp in components_used:
    # TruncatedSVD's default solver is randomized; a fixed seed makes the
    # ranking reproducible across kernel restarts.
    svd = TruncatedSVD(n_components=n_comp, random_state=0)
    svd.fit(tf_idf_docs.T)
    # Project documents and queries into the same latent space (n_comp x N).
    tr_docs = svd.transform(tf_idf_docs.T).T
    qr_tr = svd.transform(vec_rep_queries.T).T
    # The projections are no longer unit length, so a plain dot product is NOT
    # the cosine similarity. Normalise every column first (eps guards against
    # all-zero vectors).
    eps = np.finfo(float).eps
    tr_docs_unit = tr_docs / np.maximum(np.linalg.norm(tr_docs, axis=0, keepdims=True), eps)
    qr_tr_unit = qr_tr / np.maximum(np.linalg.norm(qr_tr, axis=0, keepdims=True), eps)
    cosine_sim = np.matmul(tr_docs_unit.T, qr_tr_unit)
    # Ascending sort per query column, shift to 1-based document numbers, best match first.
    doc_IDs_ordered = (np.argsort(cosine_sim,axis=0)+1)[::-1].T.tolist()
    #print("\nLSA with "+str(n_comp)+" in progress\n")
    #Evaluation_metrics(doc_IDs_ordered, query_ids, qrels,n_comp = 0,save_results=0)

In [None]:
# Spot-check one query: compare the ids listed for it with the current ranking.
# NOTE(review): `true_doc_Ids` and `queryPrecision` are not defined in the visible
# cells -- presumably they come from `util` or a removed cell; confirm that this
# cell survives Restart & Run All.
query_num = 3
# ids recorded for this query in true_doc_Ids
aaa = true_doc_Ids[true_doc_Ids['query_num'] == query_num]['id'].values.tolist()
# ranking of this query; rows of doc_IDs_ordered are 0-based, hence the -1
bbb = doc_IDs_ordered[query_num-1]
average_prec = 0
# NOTE(review): this is the mean of queryPrecision over k = 1..10, which looks
# different from the usual definition of average precision -- confirm the label.
for k in range(1,11):
    average_prec +=queryPrecision(bbb,aaa,k)/10
print('Average Precision : ',average_prec)

In [None]:
# ids selected from true_doc_Ids for the query above
aaa

In [None]:
# first 11 entries of the ranking for the query above
bbb[:11]

# LSA with Brown corpus

In [None]:
new_corpus = brown_corpus_docs['preprocessed'].tolist()
# Order matters: documents, then queries, then the extra Brown texts, because
# later cells slice the TF-IDF columns by position.
total_corpus = [*docs_df['preprocessed'], *query_df['preprocessed'], *new_corpus]

In [None]:
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(total_corpus)
print("Total number of docs and queries included are {} and Vocabulary size is {}".format(X.shape[0],X.shape[1]))

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# use get_feature_names_out() when available and fall back on older releases.
if hasattr(vectorizer, "get_feature_names_out"):
    feature_names = list(vectorizer.get_feature_names_out())
else:
    feature_names = vectorizer.get_feature_names()
# One 'doc_<i>' label per entry of total_corpus, numbered from 1.
columns = ['doc_' + str(i) for i in range(1,X.shape[0]+1)]

# Terms as rows, corpus entries as columns; toarray() gives a plain ndarray
# instead of the legacy np.matrix returned by todense().
tfidf_df = pd.DataFrame(X.T.toarray(), index=feature_names, columns=columns)
tfidf_df.head()

In [None]:
# total_corpus is laid out as documents | queries | Brown texts; take the first two slices.
n_docs = len(docs_df['preprocessed'])
n_queries = len(query_df['preprocessed'])
tf_idf_docs = tfidf_df[columns[:n_docs]].values
vec_rep_queries = tfidf_df[columns[n_docs:n_docs + n_queries]].values


In [None]:
components_used = [20, 50, 100, 200, 300, 400, 500, 600, 700, 800, 900, 1000]
for n_comp in components_used:
    # TruncatedSVD's default solver is randomized; a fixed seed makes the
    # sweep reproducible across kernel restarts.
    svd = TruncatedSVD(n_components=n_comp, random_state=0)
    svd.fit(tf_idf_docs.T)
    # Project documents and queries into the same latent space (n_comp x N).
    tr_docs = svd.transform(tf_idf_docs.T).T
    qr_tr = svd.transform(vec_rep_queries.T).T
    # The projections are no longer unit length, so a plain dot product is NOT
    # the cosine similarity. Normalise every column first (eps guards against
    # all-zero vectors).
    eps = np.finfo(float).eps
    tr_docs_unit = tr_docs / np.maximum(np.linalg.norm(tr_docs, axis=0, keepdims=True), eps)
    qr_tr_unit = qr_tr / np.maximum(np.linalg.norm(qr_tr, axis=0, keepdims=True), eps)
    cosine_sim = np.matmul(tr_docs_unit.T, qr_tr_unit)
    # Ascending sort per query column, shift to 1-based document numbers, best match first.
    doc_IDs_ordered = (np.argsort(cosine_sim,axis=0)+1)[::-1].T.tolist()
    print("\nLSA with "+str(n_comp)+" in progress\n")
    # Pass the real component count so any saved file or plot carries the right
    # label (it is unused while save_results=0).
    Evaluation_metrics(doc_IDs_ordered, query_ids, qrels,n_comp = n_comp,save_results=0)