In [10]:
#implement an Information Retrieval (IR) system based on the vector space model, for a collection of documents
#For weighting, you can use the tf-idf weighting scheme (wij = tfij∙idfi)
#For each query, your system will produce a ranked list of documents, starting with the most similar to the query and ending with the least similar. For the query terms you can use a modified tf-idf weighting scheme wiq = (0.5 + 0.5 tfiq)∙idfi
#For the ranking, you can use the cosine similarity measure
import time
import Porter_Stemming as ps
import pandas as pd
from bs4 import BeautifulSoup
import os
import string

from sklearn.metrics.pairwise import cosine_similarity
import math
import nltk
from nltk.corpus import wordnet as wn
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import re
import numpy as np
from collections import defaultdict
from sentence_transformers import SentenceTransformer, util
from tqdm import tqdm
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
#Step 1 Preprocessing
#Input: Documents that are read one by one from the collection
#Output: Tokens to be added to the index (vocabulary)
#Get Start time 
start_time = time.time()

coll_files = [f for f in os.listdir(r"./coll/") if os.path.isfile(os.path.join(r"./coll/", f))]
stop_words = open(r"./stopwords.txt", "r").read().split()

def collect_info(coll_files, stop_words):
    files = {} 
    list_of_words = [] 
    bert_files = []
    bert_docnos = []
    vocabulary = set([]) #We use a set because we don't want to have duplicates in the vocabulary
    total_length = len(coll_files)
    count = 0
    for file in coll_files: #For each file in the collection
        count += 1
        with open(r"./coll/" + file, "r") as f: #Open the file
            soup = BeautifulSoup(f, 'lxml') 
            
            for doc in soup.find_all('doc'): #For each document in the file
                docno = doc.find_all('docno')[0].text.strip() #Get the docno

                temptxt = str(doc.find('text')).replace('<text>', ' ').replace('</text>', ' ').replace('\n', ' ').replace('\t', ' ') #Get the text, replace the tags and new lines with spaces
                temptxt = temptxt.lower()
                temptxt = temptxt.translate(str.maketrans("'", " ")) #Remove apostrophes
                temptxt = temptxt.translate(str.maketrans("", "", string.punctuation)) #Remove punctuation
                temptxt = temptxt.translate(str.maketrans(string.digits, " " * len(string.digits)))
                
                list_of_words = temptxt.split()
            
                temptxt = [word for word in list_of_words if word not in stop_words]
                
                porter = ps.PorterStemmer()
                temptxt = [porter.stem(word, 0, len(word)-1) for word in temptxt]
                
                #temptxt = list(set(list_of_words) - set(stop_words))
                
                listtxt = " ".join(temptxt)
                
                bert_files.append(listtxt)
                bert_docnos.append(docno)
                
                vocabulary.update(set(temptxt))
                files.update({docno: temptxt}) #We cannot check the length of the list of words because we don't know how many words are in the stop words list
        print ("Progress: " + str(count) + "/" + str(total_length))
    return files, vocabulary, bert_files, bert_docnos

In [3]:
pos_tag_map = {
        'NN': [ wn.NOUN ],
        'JJ': [ wn.ADJ, wn.ADJ_SAT ],
        'RB': [ wn.ADV ],
        'VB': [ wn.VERB ],
        

    }

def expand_tokens(token_tags):
    expanded_tokens = []
    for i in range(len(token_tags)):
        syns = wn.synsets(token_tags[i][0], pos_tag_map[token_tags[i][1][0:2]])
        for syn in syns:
            for l in syn.lemmas():
                expanded_tokens.append(l.name())

    # Remove duplicates
    expanded_tokens = list(set(expanded_tokens))
    refined_tokens = []
    # Only keep tokens that are similar to multiple words in the original query using wup_similarity
    for i in range(len(expanded_tokens)):
        for j in range(len(token_tags)):
            if i != j and len(wn.synsets(expanded_tokens[i])) >= 1 and len(wn.synsets(token_tags[j][0])) >= 1:
                
                if wn.synsets(expanded_tokens[i])[0].wup_similarity(wn.synsets(token_tags[j][0])[0]) > 0.8: #nice
                    # If the token has an underscore, split it and append as multiple words
                    if '_' in expanded_tokens[i]:
                        # refined_tokens += expanded_tokens[i].lower().split('_')
                        # break
                        continue
                    else:
                        refined_tokens.append(expanded_tokens[i].lower())
                        break
                    
    # Remove duplicates
    refined_tokens = list(set(refined_tokens))

    return refined_tokens

def collect_queries():
    queries = {}
    expandos = {}
    list_of_words = [] 
    bert_queries = []
    
    
    with open(r"./topics1-50.txt", "r") as f:
        soup = BeautifulSoup(f, 'lxml') 
        
        for top in soup.find_all('top'):
            num = top.find_all('num')[0].text.strip()[0:2].strip() #this is a stupid way to do it, don't follow my example 

            temptxt = str(top.find('title')).replace('<title>', ' ').replace('</title>', ' ').replace('\n', ' ') #replace the title tag with either title or top to test different query sections
            
            temptxt += (" " +str(top.find('desc')).replace('<desc>', ' ').replace('</desc>', ' ').replace('\n', ' ')) #Uncomment to do both title and description
            temptxt = temptxt.lower()
            temptxt = temptxt.translate(str.maketrans("'", " ")) #Remove apostrophes
            temptxt = temptxt.translate(str.maketrans("", "", string.punctuation)) #Remove punctuation
            temptxt = temptxt.translate(str.maketrans(string.digits, " " * len(string.digits))) #Remove digits
            # descpoint = temptxt.find('desc')
            
            # temptxt = temptxt[0:descpoint]
            list_of_words = temptxt.split()
        
            temptxt = [word for word in list_of_words if (word not in stop_words and word not in ['narr', 'desc', 'narrdesc'])] 
            
            token_tags = nltk.pos_tag(temptxt) #looks like [('word', 'POS'), ('word', 'POS'), ...]
            token_tags = [word for word in token_tags if word[1][0:2] in pos_tag_map] #remove words that don't have a pos tag in the map
            # for item in token_tags:
            #     print(wn.synsets(item[0], pos_tag_map[item[1][0:2]]))
            
            # for i in range(len(token_tags)):
            #     token_tags[i] = pos_tag_map[token_tags[i][1][0:2]][0]
            expended_tokens = expand_tokens(token_tags)
            

            
            porter = ps.PorterStemmer()
            temptxt = [porter.stem(word, 0, len(word)-1) for word in temptxt]


            listtxt = " ".join(temptxt)

            # Stem the expanded_tokens
            expanded_tokens = [porter.stem(word, 0, len(word)-1) for word in expended_tokens]

            #remove duplicates from expanded tokens
            expanded_tokens = list(set(expanded_tokens))
            
            
            bert_queries.append(listtxt)
            
            queries.update({num: temptxt}) #We cannot check the length of the list of words because we don't know how many words are in the stop words list
            expandos.update({num: expanded_tokens})

    return queries, expandos, bert_queries

In [4]:
files, vocabulary, bert_files, bert_docnos = collect_info(coll_files, stop_words)
number_of_documents = len(files)
print("vocabulary length: " , len(vocabulary))
print("files length: " , len(files))

Progress: 1/322
Progress: 2/322
Progress: 3/322
Progress: 4/322
Progress: 5/322
Progress: 6/322
Progress: 7/322
Progress: 8/322
Progress: 9/322
Progress: 10/322
Progress: 11/322
Progress: 12/322
Progress: 13/322
Progress: 14/322
Progress: 15/322
Progress: 16/322
Progress: 17/322
Progress: 18/322
Progress: 19/322
Progress: 20/322
Progress: 21/322
Progress: 22/322
Progress: 23/322
Progress: 24/322
Progress: 25/322
Progress: 26/322
Progress: 27/322
Progress: 28/322
Progress: 29/322
Progress: 30/322
Progress: 31/322
Progress: 32/322
Progress: 33/322
Progress: 34/322
Progress: 35/322
Progress: 36/322
Progress: 37/322
Progress: 38/322
Progress: 39/322
Progress: 40/322
Progress: 41/322
Progress: 42/322
Progress: 43/322
Progress: 44/322
Progress: 45/322
Progress: 46/322
Progress: 47/322
Progress: 48/322
Progress: 49/322
Progress: 50/322
Progress: 51/322
Progress: 52/322
Progress: 53/322
Progress: 54/322
Progress: 55/322
Progress: 56/322
Progress: 57/322
Progress: 58/322
Progress: 59/322
Progre

In [5]:
queries, expanded_queries, bert_queries = collect_queries()


In [6]:
def BERT_similarities(documents, queries, docnos):
    #Iterate over queries and documents, encode one query and one document at a time using BERT
    model = SentenceTransformer('bert-base-nli-mean-tokens')
    similarities = {i:{} for i in range(len(queries))}
    for i in range(len(queries)):
        for j in range(len(documents)):
            #If the document contains 2 words in the query, compute the similarity otherwise, skip it
            if any(word in documents[j] for word in queries[i]):
                #Join the query and doc into one list
                combined = [queries[i], documents[j]]
                #Encode the query and document
                embedding = model.encode(combined)
                #Compute the cosine similarity between the query and document
                similarity = cosine_similarity(embedding[0].reshape(1,-1), embedding[1].reshape(1,-1))
                #Store the similarity in the similarities dictionary
                similarities[i][docnos[j]] = similarity[0][0]
                if j % 100 == 0:
                    print("similarity prog: ", j, " of ", len(documents))
        print("similarity prog: ", i, " of ", len(queries))
    return similarities

            


In [14]:
def full_encode_similarities(documents, queries, docnos):
    combined = queries+ documents
    query_ids = [str(i) for i in range(len(queries))]
    combined_docnos = query_ids+docnos
    model = SentenceTransformer('all-MiniLM-L6-v2')
    embeddings = model.encode(combined, show_progress_bar=True)
    similarities = cosine_similarity(embeddings)
    similarities_sorted = similarities.argsort()

    #Create a dictionary of dictionaries to store the similarities with the docnos as keys for each sub dictionary
    print(combined_docnos)
    score_dict = {i:{} for i in combined_docnos}
    # for index,array in enumerate(similarities_sorted):
    #     id_2 = array[-2]
    #     score = similarities[index][id_2]
    #     score_dict[combined_docnos[index]][combined_docnos[id_2]] = score

    id_1 = []
    id_2 = []
    score = []
    for index,array in enumerate(similarities_sorted):
        print(array)
        id_1.append(combined_docnos[index])
        id_2.append(combined_docnos[array[-2]])
        score.append(similarities[index][array[-2]])

    # index_df = pd.DataFrame(score_dict)
    index_df = pd.DataFrame({'id_1' : id_1,
                          'id_2' : id_2,
                          'score' : score})
    return score_dict, index_df


    
    print("similarity prog: ", i, " of ", len(queries))

In [29]:
def STS_similarity(documents, queries, docnos):
    model = SentenceTransformer('bert-base-nli-mean-tokens')
    embeddings1 = model.encode(queries, convert_to_tensor=True)
    embeddings2 = model.encode(documents, convert_to_tensor=True)
    cosine_scores = util.cos_sim(embeddings1, embeddings2)
    #Output the pairs with their score
    return cosine_scores
    for i in range(len(docnos)):
        print("{} \t\t {} \t\t Score: {:.4f}".format(docnos[i], i, cosine_scores[i][i]))

In [23]:
def mine_paraphrases(documents, queries, docnos):
    combined = queries+ documents
    query_ids = [str(i) for i in range(len(queries))]
    combined_docnos = query_ids+docnos
    model = SentenceTransformer('all-MiniLM-L6-v2')
    paraphrases = util.paraphrase_mining(model, combined, show_progress_bar=True)
    for paraphrase in paraphrases[0:10]:
        score, i, j = paraphrase
        print("{} \t\t {} \t\t Score: {:.4f}".format(combined_docnos[i], combined_docnos[j], score))

In [15]:
simiarities, df = full_encode_similarities(bert_files[0:3], bert_queries[0:3], bert_docnos[0:3])

Batches: 100%|██████████| 1/1 [00:00<00:00,  1.69it/s]

['0', '1', '2', 'AP880212-0001', 'AP880212-0002', 'AP880212-0003']
[4 5 3 1 2 0]
[4 5 3 0 2 1]
[4 3 5 0 1 2]
[4 2 1 0 5 3]
[2 0 1 3 5 4]
[4 2 0 1 3 5]





In [30]:
beepis = mine_paraphrases(bert_files[0:200], bert_queries[0:200], bert_docnos[0:200])

Batches:   0%|          | 0/8 [00:01<?, ?it/s]


KeyboardInterrupt: 

In [31]:
borgipis = STS_similarity(bert_files[0:1000], bert_queries[0], bert_docnos)

In [39]:
borgipis
for i in range(len(bert_docnos)):
        print("{} \t\t {} \t\t Score: {:.4f}".format(bert_docnos[i], i, borgipis[i]))

TypeError: unsupported format string passed to Tensor.__format__

In [None]:
np.shape(bert_embeddings)
similarities = cosine_similarity(bert_embeddings)
similarities_sorted = similarities.argsort()

