In [2]:
#from google.colab import drive
#drive.mount('/content/drive')

In [3]:
!unzip /content/drive/MyDrive/Datasets/cranfield.zip

Archive:  /content/drive/MyDrive/Datasets/cranfield.zip
  inflating: cranfield/.DS_Store     
  inflating: cranfield/cran_docs.json  
  inflating: cranfield/cran_qrels.json  
  inflating: cranfield/cran_queries.json  
  inflating: cranfield/README.txt    


In [4]:
!pip install rank_bm25

Collecting rank_bm25
  Downloading https://files.pythonhosted.org/packages/16/5a/23ed3132063a0684ea66fb410260c71c4ffda3b99f8f1c021d1e245401b5/rank_bm25-0.2.1-py3-none-any.whl
Installing collected packages: rank-bm25
Successfully installed rank-bm25-0.2.1


In [5]:
#@title Python libraries used in the project
import sklearn
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.pipeline import Pipeline
import numpy as np
from nltk.tokenize import word_tokenize
import string
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
import json
from rank_bm25 import BM25Okapi
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


True

In [6]:
#@title Code for data preprocessing
def data_preprocess(doc):
  """Turn one raw text into a list of stemmed content words.

  Each token is lower-cased and stripped of punctuation; tokens that are not
  purely alphabetic afterwards, or that are English stop words, are dropped;
  the survivors are Porter-stemmed.

  Parameters
  ----------
  doc : str
    The text to tokenize.

  Returns
  -------
  list of str
    The stemmed tokens, in their original order.
  """
  raw_tokens = word_tokenize(doc)
  punctuation_table = str.maketrans('', '', string.punctuation)
  english_stop_words = set(stopwords.words('english'))
  stemmer = PorterStemmer()

  stems = []
  for raw_token in raw_tokens:
    cleaned = raw_token.lower().translate(punctuation_table)
    # Numbers, mixed alphanumerics and now-empty punctuation tokens are dropped.
    if not cleaned.isalpha():
      continue
    if cleaned in english_stop_words:
      continue
    stems.append(stemmer.stem(cleaned))
  return stems

In [7]:
#@title Code for evaluation
import math
import os

# Add your import statements here
from collections import defaultdict



class Evaluation():
    """Rank-based retrieval metrics computed against relevance judgements.

    Relevance judgements (``qrels``) are a list of dictionaries that are read
    through the keys ``query_num``, ``id`` and ``position``. Document and query
    identifiers are compared by their string form, so integer and string IDs
    can be mixed freely between rankings and judgements.
    """

    # Fallback location of the judgements; only read by queryNDCG when the
    # caller does not pass ``qrels``.
    QRELS_PATH = '/content/cranfield/cran_qrels.json'
    # Judgements loaded from QRELS_PATH, kept so the file is read at most once.
    _qrels_cache = None

    def __init__(self):
      self.qr=None #This dictionary holds the list of relevant documents for a particular query

    def build_qr(self,q_rels):
      """Populate ``self.qr``: integer query id -> list of integer relevant doc ids."""
      qr = defaultdict(list)
      for dc in q_rels:
        qr[int(dc['query_num'])].append(int(dc['id']))
      self.qr=qr

    # ------------------------------------------------------------------
    # Private helpers
    # ------------------------------------------------------------------
    @staticmethod
    def _top_k_flags(ranked_ids, relevant_ids, k):
      """Return a 1/0 flag for each of the first ``k`` ranked documents (1 = relevant)."""
      relevant = {str(doc_id) for doc_id in relevant_ids}
      # max(k, 0) keeps a negative k from being read as "all but the last |k|".
      return [1 if str(doc_id) in relevant else 0 for doc_id in ranked_ids[:max(k, 0)]]

    @staticmethod
    def _gains_for_query(query_id, qrels):
      """Map document id (as str) -> graded gain ``4 - position`` for one query.

      When a document is judged more than once for the query, the first
      judgement wins.
      """
      wanted = str(query_id)
      gains = {}
      for judgement in qrels:
        if str(judgement['query_num']) == wanted:
          gains.setdefault(str(judgement['id']), 4 - int(judgement['position']))
      return gains

    def _load_default_qrels(self):
      """Read the judgements from QRELS_PATH once and reuse them afterwards."""
      if self._qrels_cache is None:
        with open(self.QRELS_PATH, 'r') as handle:
          self._qrels_cache = json.load(handle)
      return self._qrels_cache

    def _mean_over_queries(self, metric, doc_IDs_ordered, query_ids, qrels, k, empty_value):
      """Average ``metric(ranking, query_id, relevant_ids, k)`` over paired rankings and queries.

      Rankings and query ids are paired positionally and the shorter of the
      two decides how many queries are scored. The mean is taken over the
      queries actually scored, so surplus rankings do not dilute it.
      ``empty_value`` is returned when nothing could be scored.
      """
      total = 0
      evaluated = 0
      for ranking, query_id in zip(doc_IDs_ordered, query_ids):
        relevant_ids = self.get_docid(query_id, qrels)
        total += metric(ranking, query_id, relevant_ids, k)
        evaluated += 1
      if evaluated == 0:
        return empty_value
      return total / evaluated

    # ------------------------------------------------------------------
    # Judgement look-ups
    # ------------------------------------------------------------------
    def get_docid(self,query_id,qrels):
      """Return the ``id`` values of every judgement recorded for ``query_id``."""
      wanted = str(query_id)
      return [judgement['id'] for judgement in qrels
              if str(judgement['query_num']) == wanted and 'id' in judgement]

    def get_rel_score(self, query_id,element, qrels):
      """Return the graded gain of document ``element`` for ``query_id``.

      The gain is ``4 - position``; presumably positions run 1-4, giving gains
      3..0 where higher means more relevant - confirm against the dataset.

      Raises
      ------
      ValueError
        If ``element`` has no judgement for ``query_id``.
      """
      gains = self._gains_for_query(query_id, qrels)
      key = str(element)
      if key not in gains:
        raise ValueError(str(element) + ' has no relevance judgement for query ' + str(query_id))
      return gains[key]

    # ------------------------------------------------------------------
    # Precision
    # ------------------------------------------------------------------
    def queryPrecision(self, query_doc_IDs_ordered, query_id, true_doc_IDs, k):
      """Precision@k for a single query.

      Parameters
      ----------
      query_doc_IDs_ordered : list
        Document IDs in predicted order of relevance.
      query_id : int
        The ID of the query (unused here; kept for a uniform signature).
      true_doc_IDs : list
        IDs of the documents relevant to the query (ground truth).
      k : int
        Rank cut-off.

      Returns
      -------
      float
        Relevant documents among the first k, divided by k. Returns 1 when
        k is 0.
      """
      if k == 0:
        return 1
      hits = sum(self._top_k_flags(query_doc_IDs_ordered, true_doc_IDs, k))
      # Dividing by k (not by the number retrieved) penalises short rankings.
      return hits / k

    def meanPrecision(self, doc_IDs_ordered, query_ids, qrels, k):
      """Precision@k averaged over the scored queries.

      Parameters
      ----------
      doc_IDs_ordered : list of list
        The i-th entry is the ranking produced for the i-th query.
      query_ids : list
        IDs of the queries, aligned with ``doc_IDs_ordered``.
      qrels : list of dict
        Relevance judgements (see cran_qrels.json).
      k : int
        Rank cut-off.

      Returns
      -------
      float
        The mean precision; 1 when there is nothing to score.
      """
      return self._mean_over_queries(self.queryPrecision, doc_IDs_ordered, query_ids, qrels, k, 1)

    # ------------------------------------------------------------------
    # Recall
    # ------------------------------------------------------------------
    def queryRecall(self, query_doc_IDs_ordered, query_id, true_doc_IDs, k):
      """Recall@k for a single query.

      Parameters are the same as for :meth:`queryPrecision`.

      Returns
      -------
      float
        Relevant documents among the first k, divided by the number of
        relevant documents. Returns 1 when the query has no relevant
        documents.
      """
      if len(true_doc_IDs) == 0:
        return 1
      hits = sum(self._top_k_flags(query_doc_IDs_ordered, true_doc_IDs, k))
      return hits / len(true_doc_IDs)

    def meanRecall(self, doc_IDs_ordered, query_ids, qrels, k):
      """Recall@k averaged over the scored queries.

      Parameters are the same as for :meth:`meanPrecision`.

      Returns
      -------
      float
        The mean recall; 1 when there is nothing to score.
      """
      return self._mean_over_queries(self.queryRecall, doc_IDs_ordered, query_ids, qrels, k, 1)

    # ------------------------------------------------------------------
    # F-score
    # ------------------------------------------------------------------
    def queryFscore(self, query_doc_IDs_ordered, query_id, true_doc_IDs, k):
      """F1@k (harmonic mean of precision@k and recall@k) for a single query.

      Parameters are the same as for :meth:`queryPrecision`.

      Returns
      -------
      float
        The F-score; 0 when both precision and recall are 0.
      """
      precision = self.queryPrecision(query_doc_IDs_ordered, query_id, true_doc_IDs, k)
      recall = self.queryRecall(query_doc_IDs_ordered, query_id, true_doc_IDs, k)
      # No relevant document in the top k: the harmonic mean is defined as 0.
      if precision + recall == 0:
        return 0
      return (2 * precision * recall) / (precision + recall)

    def meanFscore(self, doc_IDs_ordered, query_ids, qrels, k):
      """F1@k averaged over the scored queries.

      Parameters are the same as for :meth:`meanPrecision`.

      Returns
      -------
      float
        The mean F-score; 0.0 when there is nothing to score.
      """
      return self._mean_over_queries(self.queryFscore, doc_IDs_ordered, query_ids, qrels, k, 0.0)

    # ------------------------------------------------------------------
    # nDCG
    # ------------------------------------------------------------------
    def queryNDCG(self, query_doc_IDs_ordered, query_id, true_doc_IDs, k, qrels=None):
      """nDCG@k for a single query, using graded gains ``4 - position``.

      Parameters
      ----------
      query_doc_IDs_ordered : list
        Document IDs in predicted order of relevance.
      query_id : int
        The ID of the query; selects which judgements supply the gains.
      true_doc_IDs : list
        IDs of the documents relevant to the query (ground truth).
      k : int
        Rank cut-off.
      qrels : list of dict, optional
        Relevance judgements. When omitted they are read once from
        ``QRELS_PATH`` and cached on the instance.

      Returns
      -------
      float
        DCG@k divided by the ideal DCG@k. The ideal ordering lists the
        relevant documents by decreasing gain. Returns 0 when the ideal DCG
        is 0 (e.g. the query has no relevant documents).

      Raises
      ------
      ValueError
        If a document in ``true_doc_IDs`` has no judgement for the query.
      """
      if qrels is None:
        qrels = self._load_default_qrels()
      gains = self._gains_for_query(query_id, qrels)

      def gain_of(doc_key):
        if doc_key not in gains:
          raise ValueError(doc_key + ' has no relevance judgement for query ' + str(query_id))
        return gains[doc_key]

      cutoff = max(k, 0)
      relevant = {str(doc_id) for doc_id in true_doc_IDs}

      DCG = 0
      # enumerate gives the true rank even if an ID is repeated in the ranking.
      for rank, doc_id in enumerate(query_doc_IDs_ordered[:cutoff], start=1):
        doc_key = str(doc_id)
        if doc_key in relevant:
          DCG += gain_of(doc_key) / math.log2(rank + 1)

      # The ideal ranking puts the highest gains first; judgement order is not
      # necessarily sorted, and using it unsorted can push nDCG above 1.
      ideal_gains = sorted((gain_of(str(doc_id)) for doc_id in true_doc_IDs), reverse=True)[:cutoff]
      IDCG = 0
      for rank, gain in enumerate(ideal_gains, start=1):
        IDCG += gain / math.log2(rank + 1)

      # Nothing can be gained for this query, so avoid dividing by zero.
      if IDCG == 0:
        IDCG = 1
      return DCG / IDCG

    def meanNDCG(self, doc_IDs_ordered, query_ids, qrels, k):
      """nDCG@k averaged over the scored queries.

      Parameters are the same as for :meth:`meanPrecision`. The supplied
      ``qrels`` are used for both the relevant-document lists and the gains.

      Returns
      -------
      float
        The mean nDCG; 0.0 when there is nothing to score.
      """
      def ndcg(ranking, query_id, relevant_ids, cutoff):
        return self.queryNDCG(ranking, query_id, relevant_ids, cutoff, qrels)
      return self._mean_over_queries(ndcg, doc_IDs_ordered, query_ids, qrels, k, 0.0)

    # ------------------------------------------------------------------
    # Average precision
    # ------------------------------------------------------------------
    def queryAveragePrecision(self, query_doc_IDs_ordered, query_id, true_doc_IDs, k):
      """Average precision@k for a single query.

      This is the average of precision@i over the ranks i <= k at which a
      relevant document was retrieved. The ranking passed in is not modified.

      Parameters are the same as for :meth:`queryPrecision`.

      Returns
      -------
      float
        The average precision; 0.0 when no relevant document appears in the
        first k results.
      """
      hits = 0
      precision_sum = 0.0
      flags = self._top_k_flags(query_doc_IDs_ordered, true_doc_IDs, k)
      for rank, is_relevant in enumerate(flags, start=1):
        if is_relevant:
          hits += 1
          precision_sum += hits / rank
      if hits == 0:
        return 0.0
      return precision_sum / hits

    def meanAveragePrecision(self, doc_IDs_ordered, query_ids, q_rels, k):
      """Mean average precision@k over the scored queries.

      Rebuilds ``self.qr`` from ``q_rels`` as a side effect.

      Parameters
      ----------
      doc_IDs_ordered : list of list
        The i-th entry is the ranking produced for the i-th query.
      query_ids : list
        IDs of the queries, aligned with ``doc_IDs_ordered``.
      q_rels : list of dict
        Relevance judgements (see cran_qrels.json).
      k : int
        Rank cut-off.

      Returns
      -------
      float
        The MAP value; 0.0 when there is nothing to score.
      """
      self.build_qr(q_rels)
      total = 0.0
      evaluated = 0
      for ranking, query_id in zip(doc_IDs_ordered, query_ids):
        relevant_ids = self.qr[int(query_id)]
        total += self.queryAveragePrecision(ranking, query_id, relevant_ids, k)
        evaluated += 1
      if evaluated == 0:
        return 0.0
      return total / evaluated



In [8]:
#@title Function for old method
def train_old(k):
  """Baseline retrieval: TF-IDF vectors ranked by cosine similarity (no SVD).

  Reads the Cranfield documents, queries and relevance judgements from
  /content/cranfield, retrieves the top ``k`` documents for every query,
  writes the per-chunk scores to ``Tfidf_only_cosineSim.csv`` and prints the
  overall nDCG.

  Parameters
  ----------
  k : int
    Number of documents retrieved per query.

  Returns
  -------
  tuple
    ``(chunk_results, nDCG, MAP, recall, fscore)``. ``chunk_results`` holds
    ``[MAP, nDCG, precision]`` for each of the first 20 blocks of 10 queries;
    the other four values are computed over all queries.
  """
  #without SVD
  import json
  from sklearn.metrics.pairwise import cosine_similarity

  def _read_json(path):
    # ``with`` closes the handle instead of leaving it to the garbage collector.
    with open(path, 'r') as handle:
      return json.load(handle)

  dj = _read_json('/content/cranfield/cran_docs.json')
  docs = [item['body'] for item in dj]
  dids = [item['id'] for item in dj]
  qj = _read_json('/content/cranfield/cran_queries.json')
  qrs = [item['query'] for item in qj]
  qids = [item['query number'] for item in qj]
  qrels = _read_json('/content/cranfield/cran_qrels.json')
  evaluator = Evaluation()

  vectorizer = TfidfVectorizer(tokenizer=data_preprocess,stop_words=None,
                               use_idf=True,
                               smooth_idf=True,max_df=0.6)
  # The sparse matrices are passed straight through: densifying with
  # .todense() produces np.matrix, which current scikit-learn rejects.
  dvecs1 = vectorizer.fit_transform(docs)
  qvecs1 = vectorizer.transform(qrs)
  B = cosine_similarity(qvecs1, dvecs1)

  # One row per query. The previous hard-coded 255 rows / 225 iterations left
  # all-zero rows behind that diluted the overall means.
  n_queries = len(qids)
  pred_ids = np.zeros((n_queries, k), dtype=int)
  for i in range(n_queries):
    best_first = np.argsort(B[i])[::-1][:k]
    for j, y in enumerate(best_first):
      pred_ids[i][j] = dids[y]
  pdl = pred_ids.tolist()

  # Per-chunk scores: 20 consecutive blocks of 10 queries each.
  results1 = []
  for i in range(20):
    s = i*10
    e = s+10
    results1.append([evaluator.meanAveragePrecision(pdl[s:e],qids[s:e],qrels,k),
                     evaluator.meanNDCG(pdl[s:e],qids[s:e],qrels,k),
                     evaluator.meanPrecision(pdl[s:e],qids[s:e],qrels,k)])

  with open('Tfidf_only_cosineSim.csv','w') as rf1:
    rf1.write('MAP,NDCG,MP\n')
    for x in results1:
      rf1.write(str(x[0])+','+str(x[1])+','+str(x[2])+'\n')

  overall_ndcg = evaluator.meanNDCG(pdl,qids,qrels,k)
  print('Using old method- k: '+str(k)+' nDCG: '+str(overall_ndcg))
  return (results1,
          overall_ndcg,
          evaluator.meanAveragePrecision(pdl,qids,qrels,k),
          evaluator.meanRecall(pdl,qids,qrels,k),
          evaluator.meanFscore(pdl,qids,qrels,k))

  

In [9]:
#@title Function for new method using LSA
def train_new(k, random_state=None):
  """LSA retrieval: TF-IDF reduced by truncated SVD, ranked by cosine similarity.

  Reads the Cranfield documents (body plus title), queries and relevance
  judgements from /content/cranfield, retrieves the top ``k`` documents for
  every query, prints the overall nDCG and writes the per-chunk scores to
  ``SVD_cosineSim.csv``.

  Parameters
  ----------
  k : int
    Number of documents retrieved per query.
  random_state : int or None, optional
    Seed forwarded to the randomized SVD. The default ``None`` keeps the
    previous unseeded behaviour; pass an integer for repeatable results.

  Returns
  -------
  tuple
    ``(chunk_results, nDCG, MAP, recall, fscore)``. ``chunk_results`` holds
    ``[MAP, nDCG, precision]`` for each of the first 20 blocks of 10 queries;
    the other four values are computed over all queries.
  """
  import json
  from sklearn.metrics.pairwise import cosine_similarity

  def _read_json(path):
    # ``with`` closes the handle instead of leaving it to the garbage collector.
    with open(path, 'r') as handle:
      return json.load(handle)

  dj = _read_json('/content/cranfield/cran_docs.json')
  docs = [item['body']+' '+item['title'] for item in dj]
  dids = [item['id'] for item in dj]
  qj = _read_json('/content/cranfield/cran_queries.json')
  qrs = [item['query'] for item in qj]
  qids = [item['query number'] for item in qj]

  vectorizer = TfidfVectorizer(tokenizer=data_preprocess,stop_words=None,
                               use_idf=True,
                               smooth_idf=True,max_df=0.9)
  svd_model = TruncatedSVD(n_components=325,
                           algorithm='randomized',
                           n_iter=10,
                           random_state=random_state)
  svd_transformer = Pipeline([('tfidf', vectorizer),
                              ('svd', svd_model)])
  dvecs = svd_transformer.fit_transform(docs)
  qvecs = svd_transformer.transform(qrs)
  A = cosine_similarity(qvecs, dvecs)

  # One row per query. The previous hard-coded 255 rows / 225 iterations left
  # all-zero rows behind that diluted the overall means.
  n_queries = len(qids)
  pred_ids = np.zeros((n_queries, k), dtype=int)
  for i in range(n_queries):
    best_first = np.argsort(A[i])[::-1][:k]
    for j, y in enumerate(best_first):
      pred_ids[i][j] = dids[y]
  pdl = pred_ids.tolist()

  qrels = _read_json('/content/cranfield/cran_qrels.json')
  evaluator = Evaluation()

  # Per-chunk scores: 20 consecutive blocks of 10 queries each.
  results = []
  for i in range(20):
    s = i*10
    e = s+10
    results.append([evaluator.meanAveragePrecision(pdl[s:e],qids[s:e],qrels,k),
                    evaluator.meanNDCG(pdl[s:e],qids[s:e],qrels,k),
                    evaluator.meanPrecision(pdl[s:e],qids[s:e],qrels,k)])

  overall_ndcg = evaluator.meanNDCG(pdl,qids,qrels,k)
  print('Using new method using LSA- k: '+str(k)+' nDCG: '+str(overall_ndcg))

  with open('SVD_cosineSim.csv','w') as rf:
    rf.write('MAP,NDCG,MP\n')
    for x in results:
      rf.write(str(x[0])+','+str(x[1])+','+str(x[2])+'\n')

  return (results,
          overall_ndcg,
          evaluator.meanAveragePrecision(pdl,qids,qrels,k),
          evaluator.meanRecall(pdl,qids,qrels,k),
          evaluator.meanFscore(pdl,qids,qrels,k))


In [10]:
#@title Using both bm25 and LSA
def data_preprocess1(docs):
  """Tokenize a collection of texts for BM25.

  Every text goes through ``data_preprocess`` (lower-casing, punctuation
  stripping, alphabetic and stop-word filtering, Porter stemming). The
  previous implementation performed the same filtering but then stemmed the
  unfiltered tokens, so punctuation and stop words leaked into the output.

  Parameters
  ----------
  docs : iterable of str
    The texts to tokenize.

  Returns
  -------
  list of list of str
    One list of stemmed tokens per input text, in input order.
  """
  return [data_preprocess(every_doc) for every_doc in docs]
def get_wsim(random_state=None):
  """Build a term-term similarity matrix from an LSA model of the documents.

  Fits TF-IDF followed by a 325-component truncated SVD on the Cranfield
  documents (body plus title) and compares the terms' SVD loadings with
  cosine similarity.

  Parameters
  ----------
  random_state : int or None, optional
    Seed forwarded to the randomized SVD. The default ``None`` keeps the
    previous unseeded behaviour.

  Returns
  -------
  tuple
    ``(wsim, vectorizer)``: the square similarity matrix (one row and column
    per vocabulary term) and the fitted ``TfidfVectorizer`` whose vocabulary
    indexes it.
  """
  import json
  from sklearn.metrics.pairwise import cosine_similarity

  # ``with`` closes the handle instead of leaving it to the garbage collector.
  with open('/content/cranfield/cran_docs.json', 'r') as handle:
    dj = json.load(handle)
  docs = [item['body']+' '+item['title'] for item in dj]

  vectorizer = TfidfVectorizer(tokenizer=data_preprocess,stop_words=None,
                               use_idf=True,
                               smooth_idf=True,max_df=0.9)
  svd_model = TruncatedSVD(n_components=325,
                           algorithm='randomized',
                           n_iter=10,
                           random_state=random_state)
  svd_transformer = Pipeline([('tfidf', vectorizer),
                              ('svd', svd_model)])
  # Only the fitted components are needed; the projected documents are not.
  svd_transformer.fit(docs)

  # components_ is (n_components, n_terms); transposing gives one row per term.
  term_vectors = svd_model.components_.T
  wsim = cosine_similarity(term_vectors, term_vectors)
  return wsim,vectorizer

def train_mixed(k,k1=1,k2=1):
  """Hybrid retrieval: a weighted blend of LSA cosine similarity and BM25.

  Queries are optionally expanded with LSA-similar terms before the LSA
  projection. BM25 always scores the unexpanded queries. Both score vectors
  are min-max normalised per query and combined as
  ``alpha*LSA + (1-alpha)*BM25``.

  Parameters
  ----------
  k : int
    Number of documents retrieved per query.
  k1 : int, optional
    Number of similar terms appended for each expanded query token.
  k2 : int, optional
    Per-query token budget for expansion. The budget is decremented before
    each token is examined, so at most ``k2 - 1`` tokens are expanded and the
    default of 1 disables expansion.
    NOTE(review): confirm whether ``k2`` was meant to expand ``k2`` tokens.

  Returns
  -------
  tuple
    ``(chunk_results, nDCG, MAP, recall, fscore, dvecs, svd_transformer,
    bm25, alpha, dids, docs)``. ``chunk_results`` holds
    ``[MAP, nDCG, precision, recall, fscore]`` for each of the first 20
    blocks of 10 queries; the remaining entries are the overall means and the
    fitted artefacts needed to answer new queries.
  """
  import json
  import random
  from sklearn.metrics.pairwise import cosine_similarity

  def _read_json(path):
    # ``with`` closes the handle instead of leaving it to the garbage collector.
    with open(path, 'r') as handle:
      return json.load(handle)

  def _min_max(values, axis):
    # Rescale to [0, 1] along ``axis``. A constant vector (e.g. BM25 scores
    # when no query term matches) maps to zeros instead of NaN.
    low = values.min(axis=axis, keepdims=True)
    span = values.max(axis=axis, keepdims=True) - low
    span = np.where(span == 0, 1.0, span)
    return (values - low) / span

  dj = _read_json('/content/cranfield/cran_docs.json')
  docs = [item['body']+' '+item['title'] for item in dj]
  dids = [item['id'] for item in dj]
  qj = _read_json('/content/cranfield/cran_queries.json')
  qrs = [item['query'] for item in qj]
  qids = [item['query number'] for item in qj]
  tot_query = len(qids)

  # BM25 works on the original (unexpanded) queries.
  tokenized_corpus = data_preprocess1(docs)
  tokenized_query = data_preprocess1(qrs)
  bm25 = BM25Okapi(tokenized_corpus)

  vectorizer = TfidfVectorizer(tokenizer=data_preprocess,stop_words=None,
                               use_idf=True,
                               smooth_idf=True,max_df=0.9,ngram_range=(1,2))
  svd_model = TruncatedSVD(n_components=500,
                           algorithm='randomized',
                           n_iter=10)
  svd_transformer = Pipeline([('tfidf', vectorizer),
                              ('svd', svd_model)])
  dvecs = svd_transformer.fit_transform(docs)

  # The similarity matrix comes from its own unigram model; its vocabulary is
  # kept under a separate name so the retrieval vectorizer is not shadowed.
  wsim, wsim_vectorizer = get_wsim()
  # get_feature_names() was removed from scikit-learn; prefer the replacement
  # and convert to a list because getsyn relies on list.index.
  feature_names = getattr(wsim_vectorizer, 'get_feature_names_out', None)
  if feature_names is None:
    feature_names = wsim_vectorizer.get_feature_names
  v = list(feature_names())

  stop_words = set(stopwords.words('english'))
  for i in range(len(qrs)):
    new_qr = qrs[i]
    tkns = nltk.word_tokenize(qrs[i])
    random.shuffle(tkns)
    # The budget is reset for every query; previously it was shared, so only
    # the first query could ever be expanded.
    budget = k2
    for x in tkns:
      budget -= 1
      if budget <= 0:
        break
      if x in stop_words:
        continue
      try:
        new_qr += (' '+' '.join(getsyn(x,k1+1,v,wsim)[1:]))
      except (ValueError, IndexError):
        # Expansion is best-effort: skip a token that cannot be looked up,
        # but let programming errors surface.
        pass
    qrs[i] = new_qr

  qvecs = svd_transformer.transform(qrs)
  A = cosine_similarity(qvecs, dvecs)
  A = _min_max(A, axis=1)

  alpha = 60.0/(70+60.0)
  for i in range(tot_query):
    doc_scores = _min_max(np.asarray(bm25.get_scores(tokenized_query[i])), axis=0)
    A[i] = alpha*A[i] + (1-alpha)*doc_scores

  # One row per query. The previous hard-coded 255 rows / 225 iterations left
  # all-zero rows behind that diluted the overall means.
  pred_ids = np.zeros((tot_query, k), dtype=int)
  for i in range(tot_query):
    best_first = np.argsort(A[i])[::-1][:k]
    for j, y in enumerate(best_first):
      pred_ids[i][j] = dids[y]
  pdl = pred_ids.tolist()

  qrels = _read_json('/content/cranfield/cran_qrels.json')
  evaluator = Evaluation()

  # Per-chunk scores: 20 consecutive blocks of 10 queries each.
  results = []
  for i in range(20):
    s = i*10
    e = s+10
    results.append([evaluator.meanAveragePrecision(pdl[s:e],qids[s:e],qrels,k),
                    evaluator.meanNDCG(pdl[s:e],qids[s:e],qrels,k),
                    evaluator.meanPrecision(pdl[s:e],qids[s:e],qrels,k),
                    evaluator.meanRecall(pdl[s:e],qids[s:e],qrels,k),
                    evaluator.meanFscore(pdl[s:e],qids[s:e],qrels,k)])

  return (results,
          evaluator.meanNDCG(pdl,qids,qrels,k),
          evaluator.meanAveragePrecision(pdl,qids,qrels,k),
          evaluator.meanRecall(pdl,qids,qrels,k),
          evaluator.meanFscore(pdl,qids,qrels,k),
          dvecs,svd_transformer,bm25,alpha,dids,docs)

In [11]:
#@title Function for LSA with Query expansion
def getsyn(x,k,v,wsim):
  """Return the ``k`` vocabulary terms most similar to ``x``.

  Parameters
  ----------
  x : str
    The term to look up.
  k : int
    How many terms to return.
  v : list of str
    Vocabulary; position ``j`` names row ``j`` of ``wsim``.
  wsim : array-like
    Term-term similarity matrix.

  Returns
  -------
  list of str
    Terms ordered from most to least similar according to row ``x`` of
    ``wsim``, or an empty list when ``x`` is not in the vocabulary.
  """
  if x not in v:
    return []
  similarities = wsim[v.index(x)]
  nearest = np.argsort(similarities).T[::-1][:k]
  return [v[term_index] for term_index in nearest]

def train_new_qe(k,k1,k2, random_state=None):
  """LSA retrieval with query expansion from LSA term similarities.

  Fits TF-IDF plus a 450-component truncated SVD on the Cranfield documents
  (body plus title), appends LSA-similar terms to each query, and ranks the
  documents by cosine similarity in the reduced space.

  Parameters
  ----------
  k : int
    Number of documents retrieved per query.
  k1 : int
    Number of similar terms appended for each expanded query token.
  k2 : int
    Per-query token budget for expansion. The budget is decremented before
    each token is examined, so at most ``k2 - 1`` tokens are expanded.
    NOTE(review): confirm whether ``k2`` was meant to expand ``k2`` tokens.
  random_state : int or None, optional
    Seed forwarded to the randomized SVD. The default ``None`` keeps the
    previous unseeded behaviour.

  Returns
  -------
  tuple
    ``(chunk_results, nDCG, MAP, recall, fscore)``. ``chunk_results`` holds
    ``[MAP, nDCG, precision]`` for each of the first 20 blocks of 10 queries;
    the other four values are computed over all queries.
  """
  import json
  from sklearn.metrics.pairwise import cosine_similarity

  def _read_json(path):
    # ``with`` closes the handle instead of leaving it to the garbage collector.
    with open(path, 'r') as handle:
      return json.load(handle)

  dj = _read_json('/content/cranfield/cran_docs.json')
  docs = [item['body']+' '+item['title'] for item in dj]
  dids = [item['id'] for item in dj]
  qj = _read_json('/content/cranfield/cran_queries.json')
  qrs = [item['query'] for item in qj]
  qids = [item['query number'] for item in qj]

  vectorizer = TfidfVectorizer(tokenizer=data_preprocess,stop_words=None,
                               use_idf=True,
                               smooth_idf=True,max_df=0.6)
  svd_model = TruncatedSVD(n_components=450,
                           algorithm='randomized',
                           n_iter=10,
                           random_state=random_state)
  svd_transformer = Pipeline([('tfidf', vectorizer),
                              ('svd', svd_model)])
  dvecs = svd_transformer.fit_transform(docs)

  # components_ is (n_components, n_terms); transposing gives one row per term.
  term_vectors = svd_model.components_.T
  wsim = cosine_similarity(term_vectors, term_vectors)
  # get_feature_names() was removed from scikit-learn; prefer the replacement
  # and convert to a list because getsyn relies on list.index.
  feature_names = getattr(vectorizer, 'get_feature_names_out', None)
  if feature_names is None:
    feature_names = vectorizer.get_feature_names
  v = list(feature_names())

  stop_words = set(stopwords.words('english'))
  for i in range(len(qrs)):
    new_qr = qrs[i]
    # The budget is reset for every query; previously it was shared, so only
    # the first query could ever be expanded.
    budget = k2
    for x in nltk.word_tokenize(qrs[i]):
      budget -= 1
      if budget <= 0:
        break
      if x in stop_words:
        continue
      try:
        new_qr += (' '+' '.join(getsyn(x,k1+1,v,wsim)[1:]))
      except (ValueError, IndexError):
        # Expansion is best-effort: skip a token that cannot be looked up,
        # but let programming errors surface.
        pass
    qrs[i] = new_qr

  qvecs = svd_transformer.transform(qrs)
  A = cosine_similarity(qvecs, dvecs)

  # One row per query. The previous hard-coded 255 rows / 225 iterations left
  # all-zero rows behind that diluted the overall means.
  n_queries = len(qids)
  pred_ids = np.zeros((n_queries, k), dtype=int)
  for i in range(n_queries):
    best_first = np.argsort(A[i])[::-1][:k]
    for j, y in enumerate(best_first):
      pred_ids[i][j] = dids[y]
  pdl = pred_ids.tolist()

  qrels = _read_json('/content/cranfield/cran_qrels.json')
  evaluator = Evaluation()

  # Per-chunk scores: 20 consecutive blocks of 10 queries each.
  results = []
  for i in range(20):
    s = i*10
    e = s+10
    results.append([evaluator.meanAveragePrecision(pdl[s:e],qids[s:e],qrels,k),
                    evaluator.meanNDCG(pdl[s:e],qids[s:e],qrels,k),
                    evaluator.meanPrecision(pdl[s:e],qids[s:e],qrels,k)])

  overall_ndcg = evaluator.meanNDCG(pdl,qids,qrels,k)
  print('Using new method with query expansion- k: '+str(k)+' nDCG: '+str(overall_ndcg))
  return (results,
          overall_ndcg,
          evaluator.meanAveragePrecision(pdl,qids,qrels,k),
          evaluator.meanRecall(pdl,qids,qrels,k),
          evaluator.meanFscore(pdl,qids,qrels,k))

In [12]:
#@title BM25 code
def train_bm25(k):
  """BM25 retrieval over the Cranfield documents (body plus title).

  Retrieves the top ``k`` documents for every query with Okapi BM25, prints
  the overall nDCG and writes the per-chunk scores to ``SVD_cosineSim.csv``.

  Parameters
  ----------
  k : int
    Number of documents retrieved per query.

  Returns
  -------
  tuple
    ``(chunk_results, nDCG, MAP, recall, fscore)``. ``chunk_results`` holds
    ``[MAP, nDCG, precision]`` for each of the first 20 blocks of 10 queries;
    the other four values are computed over all queries.
  """
  import json

  def _read_json(path):
    # ``with`` closes the handle instead of leaving it to the garbage collector.
    with open(path, 'r') as handle:
      return json.load(handle)

  dj = _read_json('/content/cranfield/cran_docs.json')
  docs = [item['body']+' '+item['title'] for item in dj]
  dids = [item['id'] for item in dj]
  qj = _read_json('/content/cranfield/cran_queries.json')
  qrs = [item['query'] for item in qj]
  qids = [item['query number'] for item in qj]
  tot_query = len(qids)

  tokenized_corpus = data_preprocess1(docs)
  tokenized_query = data_preprocess1(qrs)
  bm25 = BM25Okapi(tokenized_corpus)

  pred_ids = np.zeros((tot_query,k),dtype=int)
  for i in range(tot_query):
    doc_scores = bm25.get_scores(tokenized_query[i])
    top_n = np.argsort(doc_scores)[::-1][:k]
    for index, j in enumerate(top_n):
      pred_ids[i][index] = dids[j]
  pdl = pred_ids.tolist()

  qrels = _read_json('/content/cranfield/cran_qrels.json')
  evaluator = Evaluation()

  # Per-chunk scores: 20 consecutive blocks of 10 queries each.
  results = []
  for i in range(20):
    s = i*10
    e = s+10
    results.append([evaluator.meanAveragePrecision(pdl[s:e],qids[s:e],qrels,k),
                    evaluator.meanNDCG(pdl[s:e],qids[s:e],qrels,k),
                    evaluator.meanPrecision(pdl[s:e],qids[s:e],qrels,k)])

  overall_ndcg = evaluator.meanNDCG(pdl,qids,qrels,k)
  print('Using new method with bm25- k: '+str(k)+' nDCG: '+str(overall_ndcg))

  # NOTE(review): this is the same file name train_new writes to, so running
  # both overwrites the LSA scores with the BM25 ones - confirm the intended name.
  with open('SVD_cosineSim.csv','w') as rf:
    rf.write('MAP,NDCG,MP\n')
    for x in results:
      rf.write(str(x[0])+','+str(x[1])+','+str(x[2])+'\n')

  return (results,
          overall_ndcg,
          evaluator.meanAveragePrecision(pdl,qids,qrels,k),
          evaluator.meanRecall(pdl,qids,qrels,k),
          evaluator.meanFscore(pdl,qids,qrels,k))



In [13]:
#@title Statistical hypothesis testing code
import numpy as np
from statistics import variance
from scipy import stats

def test(x,y,a):
  #null hypothesis: mean of x is less than or equal to mean of y
  #alternate hypothesis: mean of x is greater than mean of y
  n=len(x)
  m=n
  df=n+m-2
  x_ , y_ = sum(x)/n, sum(y)/n
  sx ,sy =variance(x),variance(y)
  sp = ((n-1)*sx+(m-1)*sy)/(n+m-2)
  ts = (x_ - y_)/(sp*(1/m+1/n))**0.5
  z_a=stats.t.ppf(1-a,df)
  print('x_:'+str(x_)+' y_:'+str(y_)+' sx :'+str(sx)+' sy: '+str(sy)+' sp: '+str(sp))
  print('Test statistic: ',ts)
  print('t-alpha: ',z_a)
  print('p-value: ',1-stats.t.cdf(ts,df))
  if ts>z_a:
    print("mean of x is greater than mean of y (new method is better)")
  else:
    print("No evidence for proving new method is better")

In [14]:
#@title Custom query handler
def custom_query(q,dvecs,svd,bm25,alpha,dids,docs,k,wsim,vectorizer,algo='mixed'):
  """Answer one free-text query and print the best-matching documents.

  The query is scored against every document with LSA cosine similarity and
  with BM25. Both score vectors are min-max normalised and blended as
  ``weight*LSA + (1-weight)*BM25``.

  Parameters
  ----------
  q : str
    The query text.
  dvecs : array-like
    Document vectors in the LSA space, one row per document.
  svd : object
    Fitted transformer whose ``transform`` projects raw text into that space.
  bm25 : object
    Fitted BM25 index exposing ``get_scores``.
  alpha : float
    Weight of the LSA score when ``algo`` is ``'mixed'``.
  dids : sequence
    Document identifiers, aligned with ``dvecs``.
  docs : sequence of str
    Document texts, aligned with ``dvecs``.
  k : int
    Number of documents to show; fewer are shown if fewer exist.
  wsim, vectorizer :
    Unused. They fed a query-expansion step whose budget was fixed at one
    token, which made it a no-op; kept so existing calls stay valid.
  algo : str, optional
    ``'mixed'`` blends both scores, ``'lsa'`` uses LSA only, anything else
    uses BM25 only.

  Returns
  -------
  None
    The identifiers and the first 90 characters of each document are printed.
  """
  from IPython.display import clear_output
  from sklearn.metrics.pairwise import cosine_similarity
  clear_output(wait=True)

  def _min_max(values):
    # Rescale to [0, 1]. A constant vector (e.g. BM25 scores when no query
    # term matches) maps to zeros instead of NaN.
    low = values.min()
    span = values.max() - low
    if span == 0:
      return np.zeros_like(values, dtype=float)
    return (values - low) / span

  qvecs = svd.transform([q])
  tokenized_query = data_preprocess1([q])
  lsa_scores = _min_max(np.asarray(cosine_similarity(qvecs,dvecs))[0])
  bm25_scores = _min_max(np.asarray(bm25.get_scores(tokenized_query[0])))

  if algo=='mixed':
    # Honour the caller's weight; it used to be silently replaced by 60/130.
    weight = alpha
  elif algo=='lsa':
    weight = 1
  else:
    weight = 0
  combined = weight*lsa_scores+(1-weight)*bm25_scores

  # Best score first; slicing caps k at the number of documents available.
  rel_docs = np.argsort(combined)[::-1][:k]
  print('Relevant documents:')
  for doc_index in rel_docs:print(dids[doc_index],end=' ')
  print('\n')
  print('The contents of these documents are:')
  for c, doc_index in enumerate(rel_docs, start=1):
    print(str(c)+': '+docs[doc_index][:90]+'...')
  return

In [15]:
#@title GUI code
#code for GUI
# Fit the hybrid model once and keep the artefacts the search handler needs.
res = train_mixed(2)
wsim, vectorizer = get_wsim()
# Entries 5..10 of the result are the fitted artefacts, in this order.
dvecs, svd, bm25, alpha, dids, docs = res[5:11]

import ipywidgets as widgets

# Search results are rendered into this area.
output = widgets.Output()

# Free-text query box.
TA = widgets.Textarea(value='', placeholder='Your query', description='', disabled=False)

# Ranking algorithm selector.
dwl = widgets.Dropdown(
    description='Algorithm: ',
    options=[('BM25', 1), ('BM25+LSA', 2), ('LSA', 3)],
    value=1,
)

# Number of documents to show.
dwl1 = widgets.Dropdown(
    description='k:',
    options=[('1', 1), ('2', 2), ('3', 3), ('4', 4), ('5', 5)],
    value=3,
)

button = widgets.Button(
    description='Search',
    tooltip='Click me',
    icon='check',     # FontAwesome name without the `fa-` prefix
    button_style='',  # one of 'success', 'info', 'warning', 'danger' or ''
    disabled=False,
)

def on_button_clicked(b):
    """Run the search for the current widget values and show it in ``output``.

    Parameters
    ----------
    b : object
      The clicked button (unused).
    """
    # Dropdown value -> algorithm name understood by custom_query. The labels
    # are 1 'BM25', 2 'BM25+LSA' and 3 'LSA'; values 2 and 3 used to be
    # swapped, so "BM25+LSA" ran pure LSA and "LSA" ran the blend.
    algo_by_choice = {1: 'bm25', 2: 'mixed', 3: 'lsa'}
    with output:
      q=TA.value
      k=dwl1.value
      algo=algo_by_choice.get(dwl.value, 'mixed')
      custom_query(q,dvecs,svd,bm25,alpha,dids,docs,k,wsim,vectorizer,algo)
# Register the click handler, stack the controls vertically and render them
# together with the results area.
button.on_click(on_button_clicked)
hb = widgets.VBox(children=[TA, dwl, dwl1, button])
display(hb, output)


VBox(children=(Textarea(value='', placeholder='Your query'), Dropdown(description='Algorithm: ', options=(('BM…

Output()