In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import json
import os
from util import *
import time
from evaluation import Evaluation
import string
#from gensim.parsing.preprocessing import STOPWORDS
import warnings
warnings.filterwarnings("ignore")
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.decomposition import TruncatedSVD

In [2]:
# Read queries
queries_json = json.load(open( ".\cranfield\cran_queries.json", 'r'))[:]
query_ids, queries = [item["query number"] for item in queries_json], \
                        [item["query"] for item in queries_json]

# Read documents
docs_json = json.load(open(".\cranfield\cran_docs.json", 'r'))[:]
doc_ids, docs = [item["id"] for item in docs_json], \
                        [item["body"] for item in docs_json]

qrels = json.load(open( "./cranfield/cran_qrels.json", 'r'))[:]


In [3]:
docs_df = pd.DataFrame(docs_json).drop(['author','bibliography','id','title'],axis=1)
docs_df.head()

Unnamed: 0,body
0,experimental investigation of the aerodynamics...
1,simple shear flow past a flat plate in an inco...
2,the boundary layer in simple shear flow past a...
3,approximate solutions of the incompressible la...
4,one-dimensional transient heat conduction into...


# Preprocessing

In [4]:
def clean_text(text):
    import re
    text = text.lower()
    text = re.sub(r"i'm", "i am", text)
    text = re.sub(r"you'll", "you will", text)
    text = re.sub(r"i'll", "i will", text)
    text = re.sub(r"she'll", "she will", text)
    text = re.sub(r"he'll", "he will", text)
    text = re.sub(r"he's", "he is", text)
    text = re.sub(r"she's", "she is", text)
    text = re.sub(r"that's", "that is", text)
    text = re.sub(r"what's", "what is", text)
    text = re.sub(r"where's", "where is", text)
    text = re.sub(r"there's", "there is", text)
    text = re.sub(r"here's", "here is", text)
    text = re.sub(r"who's", "who is", text)
    text = re.sub(r"how's", "how is", text)
    text = re.sub(r"\'ll", " will", text)
    text = re.sub(r"\'ve", " have", text)
    text = re.sub(r"\'re", " are", text)
    text = re.sub(r"\'d", " would", text)
    text = re.sub(r"can't", "cannot", text)
    text = re.sub(r"won't", "will not", text)
    text = re.sub(r"don't", "do not", text)
    text = re.sub(r"shouldn't", "should not", text)
    text = re.sub(r"n't", " not", text)
    text = re.sub(r"[^a-z]", " ", text) # This removes anything other than lower case letters(very imp)
    text = re.sub(r"   ", " ", text) # Remove any extra spaces
    return text

def remove_punctuation(text):
    """custom function to remove the punctuation"""
    PUNCT_TO_REMOVE = string.punctuation + '“' + '”'+'’' + '_'
    return text.translate(str.maketrans('', '', PUNCT_TO_REMOVE))

STOPWORDS = set(stopwords.words('english'))
def remove_stopwords(text):
    """custom function to remove the stopwords"""
    return " ".join([word for word in str(text).split() if word not in STOPWORDS])

def lemmatize_words(text):
    lemmatizer = WordNetLemmatizer()
    wordnet_map = {"N":wordnet.NOUN, "V":wordnet.VERB, "J":wordnet.ADJ, "R":wordnet.ADV}
    pos_tagged_text = nltk.pos_tag(text.split())
    return " ".join([lemmatizer.lemmatize(word, wordnet_map.get(pos[0], wordnet.NOUN)) for word, pos in pos_tagged_text])

In [5]:
query_df = pd.DataFrame(queries,columns = ['query'])

In [6]:
docs_df['preprocessed'] = docs_df['body'].str.lower()
query_df['preprocessed'] = query_df['query'].str.lower()

docs_df['preprocessed'] = docs_df['preprocessed'].apply(clean_text)
query_df['preprocessed'] = query_df['preprocessed'].apply(clean_text)

docs_df['preprocessed'] = docs_df['preprocessed'].apply(lambda text: remove_punctuation(text))
query_df['preprocessed'] = query_df['preprocessed'].apply(lambda text: remove_punctuation(text))

docs_df['preprocessed'] = docs_df['preprocessed'].apply(lambda text: remove_stopwords(text))
query_df['preprocessed'] = query_df['preprocessed'].apply(lambda text: remove_stopwords(text))

docs_df['preprocessed'] = docs_df['preprocessed'].apply(lambda text: lemmatize_words(text))
query_df['preprocessed'] = query_df['preprocessed'].apply(lambda text: lemmatize_words(text))

In [7]:
docs_df.head()

Unnamed: 0,body,preprocessed
0,experimental investigation of the aerodynamics...,experimental investigation aerodynamics wing s...
1,simple shear flow past a flat plate in an inco...,simple shear flow past flat plate incompressib...
2,the boundary layer in simple shear flow past a...,boundary layer simple shear flow past flat pla...
3,approximate solutions of the incompressible la...,approximate solution incompressible laminar bo...
4,one-dimensional transient heat conduction into...,one dimensional transient heat conduction doub...


In [8]:
corpus = docs_df['preprocessed'].tolist() + query_df['preprocessed'].tolist()

# TF-IDF and Evaluation

In [9]:
evaluator = Evaluation()
def Evaluation_metrics(doc_IDs_ordered, query_ids, qrels, n_comp, op_folder = './',save_results = 2, verbose = 1):
    """
    save_results : 0    ===> don't save anything
                 : 1    ===> just save results
                 : > 2  ===> save plots also
    """
    precisions, recalls, fscores, MAPs, nDCGs = [], [], [], [], []
    for k in range(1,11):
        precision = evaluator.meanPrecision(
            doc_IDs_ordered, query_ids, qrels, k)
        precisions.append(precision)
        recall = evaluator.meanRecall(
            doc_IDs_ordered, query_ids, qrels, k)
        recalls.append(recall)
        fscore = evaluator.meanFscore(
            doc_IDs_ordered, query_ids, qrels, k)
        fscores.append(fscore)

        MAP = evaluator.meanAveragePrecision(
            doc_IDs_ordered, query_ids, qrels, k)
        MAPs.append(MAP)
        nDCG = evaluator.meanNDCG(
            doc_IDs_ordered, query_ids, qrels, k)
        nDCGs.append(nDCG)
        if (verbose):
            print("Precision, Recall and F-score @ " +  
                str(k) + " : " + str(precision) + ", " + str(recall) + 
                ", " + str(fscore))
            print("MAP, nDCG @ " +  
                str(k) + " : " + str(MAP) + ", " + str(nDCG))
        if (save_results > 0):
        # saving the results
            with open(op_folder+'Results/LSA_'+str(n_comp)+'.txt', 'a') as f:
                f.write(str(k) + " , " + str(precision) + ", " + str(recall) + 
                        ", " + str(fscore)+", "+str(MAP) + ", " + str(nDCG)+'\n')
            with open(op_folder+'Results/metrics_'+str(k)+'.txt', 'a') as f:
                f.write(str(n_comp) + " , " + str(precision) + ", " + str(recall) + 
                        ", " + str(fscore)+", "+str(MAP) + ", " + str(nDCG)+'\n')
            
    # Plot the metrics and save plot 
    if (save_results > 1):
        plt.figure()
        plt.plot(range(1, 11), precisions, label="Precision")
        plt.plot(range(1, 11), recalls, label="Recall")
        plt.plot(range(1, 11), fscores, label="F-Score")
        plt.plot(range(1, 11), MAPs, label="MAP")
        plt.plot(range(1, 11), nDCGs, label="nDCG")
        plt.legend()
        plt.title("Evaluation Metrics - LSA "+str(n_comp))
        plt.xlabel("k")
        plt.savefig(op_folder + "Plots/LSA_"+str(n_comp)+".png")

In [143]:
TfidfVectorizer?

In [145]:
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer(ngram_range = (2,2))
X = vectorizer.fit_transform(corpus)
print("Total number of docs and queries included are {} and Vocabulary size is {}".format(X.shape[0],X.shape[1]))

feature_names = vectorizer.get_feature_names()
columns = ['doc_' + str(i) for i in range(1,X.shape[0]+1)]

tfidf_df = pd.DataFrame(X.T.todense(), index=feature_names, columns=columns)
tfidf_df.head()

Total number of docs and queries included are 1625 and Vocabulary size is 74831


Unnamed: 0,doc_1,doc_2,doc_3,doc_4,doc_5,doc_6,doc_7,doc_8,doc_9,doc_10,...,doc_1616,doc_1617,doc_1618,doc_1619,doc_1620,doc_1621,doc_1622,doc_1623,doc_1624,doc_1625
ab corresponds,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
abbreviated form,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
ability convert,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
ability evaluate,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
ability structure,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [146]:
tfidf_df.shape

(74831, 1625)

In [147]:
tf_idf_docs = tfidf_df[columns[:len(docs_df)]].values
vec_rep_queries = tfidf_df[columns[len(docs_df):]].values

In [148]:
vec_rep_queries.shape

(74831, 225)

In [149]:
cosine_sim = np.matmul(tf_idf_docs.T, vec_rep_queries )
doc_IDs_ordered = (np.argsort(cosine_sim,axis=0)+1)[::-1].T.tolist()

In [65]:
def queryPrecision(query_doc_IDs_ordered, true_doc_IDs, k):
    
    precision = -1
    
    relevance = np.zeros((len(query_doc_IDs_ordered),1))
    for i in range(len(query_doc_IDs_ordered)):
        if query_doc_IDs_ordered[i] in true_doc_IDs:
            relevance[i] = 1

    precision = relevance[:k].sum()/k

    return precision

In [66]:
true_doc_Ids = pd.read_json(r'''C:\Users\Sandeep's\Desktop\NLP\NLP Project\cranfield\cran_qrels.json''')
true_doc_Ids.head()

Unnamed: 0,query_num,position,id
0,1,2,184
1,1,2,29
2,1,2,31
3,1,3,12
4,1,3,51


In [112]:
def run(number):
    corpus = docs_df['preprocessed'].tolist() + query_df['preprocessed'].tolist()
    vectorizer = TfidfVectorizer()
    X = vectorizer.fit_transform(corpus)
    #print("Total number of docs and queries included are {} and Vocabulary size is {}".format(X.shape[0],X.shape[1]))

    feature_names = vectorizer.get_feature_names()
    columns = ['doc_' + str(i) for i in range(1,X.shape[0]+1)]

    tfidf_df = pd.DataFrame(X.T.todense(), index=feature_names, columns=columns)
    
    tf_idf_docs = tfidf_df[columns[:len(docs_df)]].values
    vec_rep_queries = tfidf_df[columns[len(docs_df):]].values
    
    components_used = [500]
    for n_comp in components_used:
        svd = TruncatedSVD(n_components=n_comp)
        svd.fit(tf_idf_docs.T)
        tr_docs = svd.transform(tf_idf_docs.T).T
        qr_tr = svd.transform(vec_rep_queries.T).T
        cosine_sim = np.matmul(tr_docs.T, qr_tr )
        doc_IDs_ordered = (np.argsort(cosine_sim,axis=0)+1)[::-1].T.tolist()
    
    query_num = number
    aaa = true_doc_Ids[true_doc_Ids['query_num'] == query_num]['id'].values.tolist()
    bbb = doc_IDs_ordered[query_num-1]
    average_prec = 0
    for k in range(1,11):
        average_prec +=queryPrecision(bbb,aaa,k)/10
    print("With LSA ")
    print('Average Precision : ',average_prec)
    print("True Docs : ", aaa)
    print("Predicted Docs : ", bbb[:len(aaa)])
    
def run2(number):
    vectorizer = TfidfVectorizer()
    X = vectorizer.fit_transform(corpus)
    #print("Total number of docs and queries included are {} and Vocabulary size is {}".format(X.shape[0],X.shape[1]))

    feature_names = vectorizer.get_feature_names()
    columns = ['doc_' + str(i) for i in range(1,X.shape[0]+1)]

    tfidf_df = pd.DataFrame(X.T.todense(), index=feature_names, columns=columns)
    
    tf_idf_docs = tfidf_df[columns[:len(docs_df)]].values
    vec_rep_queries = tfidf_df[columns[len(docs_df):]].values
    
    cosine_sim = np.matmul(tf_idf_docs.T, vec_rep_queries )
    doc_IDs_ordered = (np.argsort(cosine_sim,axis=0)+1)[::-1].T.tolist()
    query_num = number
    aaa = true_doc_Ids[true_doc_Ids['query_num'] == query_num]['id'].values.tolist()
    bbb = doc_IDs_ordered[query_num-1]
    average_prec = 0
    for k in range(1,11):
        average_prec +=queryPrecision(bbb,aaa,k)/10
    print("Without LSA")
    print('Average Precision : ',average_prec)
    print("True Docs : ", aaa)
    print("Predicted Docs : ", bbb[:len(aaa)])

In [None]:
# query num is 16, 115(0.2574), 127, 202,

In [139]:
queries = pd.read_json(r'''C:\Users\Sandeep's\Desktop\NLP\NLP Project\cranfield\cran_queries.json''')
query_num = 15
print("Given Query is : ")
print(queries.iloc[query_num]['query'])
run2(query_num)
run(query_num)

Given Query is : 
can the transverse potential flow about a body of revolution be calculated efficiently by an electronic computer .
Without LSA
Average Precision :  0.3407936507936508
True Docs :  [463, 462, 497]
Predicted Docs :  [462, 1097, 761]
With LSA 
Average Precision :  0.4857936507936508
True Docs :  [463, 462, 497]
Predicted Docs :  [462, 463, 1097]


In [150]:
Evaluation_metrics(doc_IDs_ordered, query_ids, qrels,n_comp = 0,save_results=0)

Precision, Recall and F-score @ 1 : 0.5866666666666667, 0.10058501128501127, 0.16536591222865724
MAP, nDCG @ 1 : 0.5866666666666667, 0.45481481481481484
Precision, Recall and F-score @ 2 : 0.49777777777777776, 0.1599801332683899, 0.22869352394819562
MAP, nDCG @ 2 : 0.6466666666666666, 0.36084622456323123
Precision, Recall and F-score @ 3 : 0.42222222222222217, 0.20095896439894526, 0.2541865585439782
MAP, nDCG @ 3 : 0.6540740740740741, 0.33615230372499894
Precision, Recall and F-score @ 4 : 0.37555555555555553, 0.23388413002289393, 0.26747136865488386
MAP, nDCG @ 4 : 0.6491358024691358, 0.3335926426830179


KeyboardInterrupt: 

# Using Brown Corpus

In [18]:
brown_corpus_docs = pd.read_csv('./New Corpus/Brown_Corpus_Extracted.csv')
brown_corpus_docs['preprocessed'] = brown_corpus_docs["docs"].str.lower()
brown_corpus_docs['preprocessed'] = brown_corpus_docs['preprocessed'].apply(clean_text)
brown_corpus_docs['preprocessed'] = brown_corpus_docs['preprocessed'].apply(lambda text: remove_punctuation(text))
brown_corpus_docs['preprocessed'] = brown_corpus_docs['preprocessed'].apply(lambda text: remove_stopwords(text))
brown_corpus_docs['preprocessed'] = brown_corpus_docs['preprocessed'].apply(lambda text: lemmatize_words(text))

In [19]:
new_corpus = brown_corpus_docs['preprocessed'].tolist()
total_corpus =  docs_df['preprocessed'].tolist() + query_df['preprocessed'].tolist() + new_corpus

In [21]:
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(total_corpus)
print("Total number of docs and queries included are {} and Vocabulary size is {}".format(X.shape[0],X.shape[1]))

feature_names = vectorizer.get_feature_names()
columns = ['doc_' + str(i) for i in range(1,X.shape[0]+1)]

tfidf_df = pd.DataFrame(X.T.todense(), index=feature_names, columns=columns)
tfidf_df.head()

Total number of docs and queries included are 2125 and Vocabulary size is 35174


Unnamed: 0,doc_1,doc_2,doc_3,doc_4,doc_5,doc_6,doc_7,doc_8,doc_9,doc_10,...,doc_2116,doc_2117,doc_2118,doc_2119,doc_2120,doc_2121,doc_2122,doc_2123,doc_2124,doc_2125
aa,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
aaa,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
aaawww,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
aab,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
aah,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [22]:
tf_idf_docs = tfidf_df[columns[:len(docs_df['preprocessed'].tolist())]].values
vec_rep_queries = tfidf_df[columns[len(docs_df['preprocessed'].tolist()):len(docs_df['preprocessed'].tolist()) + \
                                   len(query_df['preprocessed'].tolist())]].values

In [25]:
cosine_sim = np.matmul(tf_idf_docs.T, vec_rep_queries )
doc_IDs_ordered = (np.argsort(cosine_sim,axis=0)+1)[::-1].T.tolist()
Evaluation_metrics(doc_IDs_ordered, query_ids, qrels,n_comp = 0,save_results=0)

Precision, Recall and F-score @ 1 : 0.68, 0.11469949720147925, 0.18793485516574437
MAP, nDCG @ 1 : 0.68, 0.517037037037037
Precision, Recall and F-score @ 2 : 0.5555555555555556, 0.1809491463303036, 0.25610825912266516
MAP, nDCG @ 2 : 0.72, 0.4060708358444486
Precision, Recall and F-score @ 3 : 0.5022222222222225, 0.23794555695356745, 0.2999380233125523
MAP, nDCG @ 3 : 0.7233333333333335, 0.39042744747760777
Precision, Recall and F-score @ 4 : 0.45444444444444443, 0.2840794454821928, 0.3234687634565549
MAP, nDCG @ 4 : 0.7117283950617282, 0.3840543067186023
Precision, Recall and F-score @ 5 : 0.4115555555555559, 0.3141430140974857, 0.3287286078101979
MAP, nDCG @ 5 : 0.7087839506172837, 0.3847481480060724
Precision, Recall and F-score @ 6 : 0.3785185185185182, 0.3412518145200115, 0.3309532723449875
MAP, nDCG @ 6 : 0.6998814814814813, 0.39140010703424105
Precision, Recall and F-score @ 7 : 0.35301587301587345, 0.36603255904781495, 0.331303201990575
MAP, nDCG @ 7 : 0.6890758377425041, 0.39

# LSA without brown corpus

In [71]:
corpus = docs_df['preprocessed'].tolist() + query_df['preprocessed'].tolist()
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(corpus)
print("Total number of docs and queries included are {} and Vocabulary size is {}".format(X.shape[0],X.shape[1]))

feature_names = vectorizer.get_feature_names()
columns = ['doc_' + str(i) for i in range(1,X.shape[0]+1)]

tfidf_df = pd.DataFrame(X.T.todense(), index=feature_names, columns=columns)
#tfidf_df.head()

Total number of docs and queries included are 1625 and Vocabulary size is 5604


Unnamed: 0,doc_1,doc_2,doc_3,doc_4,doc_5,doc_6,doc_7,doc_8,doc_9,doc_10,...,doc_1616,doc_1617,doc_1618,doc_1619,doc_1620,doc_1621,doc_1622,doc_1623,doc_1624,doc_1625
ab,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
abbreviated,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
ability,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
ablate,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
ablation,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [72]:
tf_idf_docs = tfidf_df[columns[:len(docs_df)]].values
vec_rep_queries = tfidf_df[columns[len(docs_df):]].values

In [73]:
components_used = [500]
for n_comp in components_used:
    svd = TruncatedSVD(n_components=n_comp)
    svd.fit(tf_idf_docs.T)
    tr_docs = svd.transform(tf_idf_docs.T).T
    qr_tr = svd.transform(vec_rep_queries.T).T
    cosine_sim = np.matmul(tr_docs.T, qr_tr )
    doc_IDs_ordered = (np.argsort(cosine_sim,axis=0)+1)[::-1].T.tolist()
    #print("\nLSA with "+str(n_comp)+" in progress\n")
    #Evaluation_metrics(doc_IDs_ordered, query_ids, qrels,n_comp = 0,save_results=0)

In [74]:
query_num = 3
aaa = true_doc_Ids[true_doc_Ids['query_num'] == query_num]['id'].values.tolist()
bbb = doc_IDs_ordered[query_num-1]
average_prec = 0
for k in range(1,11):
    average_prec +=queryPrecision(bbb,aaa,k)/10
print('Average Precision : ',average_prec)

Average Precision :  0.9688888888888888


In [75]:
aaa

[5, 6, 90, 91, 119, 144, 181, 399, 485]

In [76]:
bbb[:11]

[485, 5, 90, 91, 144, 181, 399, 6, 582, 707, 579]

# LSA with brown corpus

In [39]:
new_corpus = brown_corpus_docs['preprocessed'].tolist()
total_corpus =  docs_df['preprocessed'].tolist() + query_df['preprocessed'].tolist() + new_corpus

In [40]:
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(total_corpus)
print("Total number of docs and queries included are {} and Vocabulary size is {}".format(X.shape[0],X.shape[1]))

feature_names = vectorizer.get_feature_names()
columns = ['doc_' + str(i) for i in range(1,X.shape[0]+1)]

tfidf_df = pd.DataFrame(X.T.todense(), index=feature_names, columns=columns)
tfidf_df.head()

Total number of docs and queries included are 2125 and Vocabulary size is 35174


Unnamed: 0,doc_1,doc_2,doc_3,doc_4,doc_5,doc_6,doc_7,doc_8,doc_9,doc_10,...,doc_2116,doc_2117,doc_2118,doc_2119,doc_2120,doc_2121,doc_2122,doc_2123,doc_2124,doc_2125
aa,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
aaa,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
aaawww,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
aab,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
aah,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [41]:
tf_idf_docs = tfidf_df[columns[:len(docs_df['preprocessed'].tolist())]].values
vec_rep_queries = tfidf_df[columns[len(docs_df['preprocessed'].tolist()):len(docs_df['preprocessed'].tolist()) + \
                                   len(query_df['preprocessed'].tolist())]].values


In [42]:
components_used = [20, 50, 100, 200, 300, 400, 500, 600, 700, 800, 900, 1000]
for n_comp in components_used:
    svd = TruncatedSVD(n_components=n_comp)
    svd.fit(tf_idf_docs.T)
    tr_docs = svd.transform(tf_idf_docs.T).T
    qr_tr = svd.transform(vec_rep_queries.T).T
    cosine_sim = np.matmul(tr_docs.T, qr_tr )
    doc_IDs_ordered = (np.argsort(cosine_sim,axis=0)+1)[::-1].T.tolist()
    print("\nLSA with "+str(n_comp)+" in progress\n")
    Evaluation_metrics(doc_IDs_ordered, query_ids, qrels,n_comp = 0,save_results=0)


LSA with 20 in progress

Precision, Recall and F-score @ 1 : 0.2222222222222222, 0.029681586509656677, 0.05064216963776479
MAP, nDCG @ 1 : 0.2222222222222222, 0.12888888888888886
Precision, Recall and F-score @ 2 : 0.19555555555555557, 0.05055103577384276, 0.076498848167504
MAP, nDCG @ 2 : 0.26222222222222225, 0.10247873896931342
Precision, Recall and F-score @ 3 : 0.19407407407407404, 0.07661072916686949, 0.10347668450628146
MAP, nDCG @ 3 : 0.28592592592592597, 0.10727330922052095
Precision, Recall and F-score @ 4 : 0.18333333333333332, 0.09939878243531489, 0.1202169358967111
MAP, nDCG @ 4 : 0.29629629629629634, 0.11024132036007335
Precision, Recall and F-score @ 5 : 0.17866666666666658, 0.11881537753230213, 0.1326191564466702
MAP, nDCG @ 5 : 0.29373456790123464, 0.1137841294724282
Precision, Recall and F-score @ 6 : 0.18222222222222223, 0.1463252777036058, 0.1501491844095761
MAP, nDCG @ 6 : 0.29632345679012345, 0.12315144983791576
Precision, Recall and F-score @ 7 : 0.17079365079365

Precision, Recall and F-score @ 4 : 0.45444444444444443, 0.28227135358660943, 0.3218338386345905
MAP, nDCG @ 4 : 0.7240740740740742, 0.3869994648376415
Precision, Recall and F-score @ 5 : 0.41688888888888914, 0.31672965331157577, 0.33218082758522466
MAP, nDCG @ 5 : 0.7165740740740738, 0.387041793307716
Precision, Recall and F-score @ 6 : 0.3792592592592592, 0.34001766159958413, 0.3306328724013991
MAP, nDCG @ 6 : 0.7071320987654317, 0.39055279808287396
Precision, Recall and F-score @ 7 : 0.35873015873015923, 0.37281007905866814, 0.3373849886964982
MAP, nDCG @ 7 : 0.6918432098765427, 0.4007053951950559
Precision, Recall and F-score @ 8 : 0.33611111111111114, 0.3935193777483591, 0.3350192927171557
MAP, nDCG @ 8 : 0.680068027210884, 0.4057154441613624
Precision, Recall and F-score @ 9 : 0.3165432098765435, 0.410762457471831, 0.33072392642825954
MAP, nDCG @ 9 : 0.6684863378684806, 0.4105727012805075
Precision, Recall and F-score @ 10 : 0.2955555555555557, 0.42270200330611374, 0.322117895657

Precision, Recall and F-score @ 8 : 0.3322222222222222, 0.3887540570391787, 0.3306944887492711
MAP, nDCG @ 8 : 0.6745015873015865, 0.4030193910126372
Precision, Recall and F-score @ 9 : 0.31111111111111134, 0.4036597149252288, 0.3247102434501951
MAP, nDCG @ 9 : 0.6639905769715287, 0.40739808835530494
Precision, Recall and F-score @ 10 : 0.2951111111111114, 0.42345226936950736, 0.3219099602541888
MAP, nDCG @ 10 : 0.656919131603258, 0.41259923374088187

LSA with 1000 in progress

Precision, Recall and F-score @ 1 : 0.6844444444444444, 0.11620021870220075, 0.19016765939854866
MAP, nDCG @ 1 : 0.6844444444444444, 0.5155555555555557
Precision, Recall and F-score @ 2 : 0.56, 0.1832359745118686, 0.25902360303800903
MAP, nDCG @ 2 : 0.72, 0.4086871122531789
Precision, Recall and F-score @ 3 : 0.5051851851851853, 0.24283656224457276, 0.30456941661061226
MAP, nDCG @ 3 : 0.7255555555555557, 0.39108102752762924
Precision, Recall and F-score @ 4 : 0.45555555555555555, 0.28467246974188376, 0.324074092