In [23]:
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer

class LemmatizerWithPOSTagger(WordNetLemmatizer):
    """WordNet lemmatizer that accepts part-of-speech tags keyed by first letter.

    ``lemmatize`` translates a tag starting with ``J``/``V``/``N``/``R`` (the
    convention of Penn Treebank tags such as ``JJ``, ``VBD``, ``NN``, ``RB``)
    into the matching WordNet POS constant before delegating to
    ``WordNetLemmatizer.lemmatize``.
    """

    def __init__(self):
        # Chain to the parent initialiser so any state it sets up is not
        # silently skipped by this override.
        super().__init__()

    def _get_wordnet_pos(self, tag: str) -> str:
        """Map a POS tag to a WordNet POS constant by its first character.

        Args:
            tag: POS tag; only its leading character is inspected, and the
                check is case-sensitive.

        Returns:
            ``wordnet.ADJ`` for ``J…``, ``wordnet.VERB`` for ``V…``,
            ``wordnet.ADV`` for ``R…`` and ``wordnet.NOUN`` for everything
            else (including ``N…`` tags and the lowercase default ``"n"``).
        """
        if tag.startswith('J'):
            return wordnet.ADJ
        if tag.startswith('V'):
            return wordnet.VERB
        if tag.startswith('R'):
            return wordnet.ADV
        # 'N…' tags and any unrecognised tag both resolve to NOUN.
        return wordnet.NOUN

    def lemmatize(self, word: str, pos: str = "n") -> str:
        """Lemmatize ``word`` using the WordNet POS derived from ``pos``.

        Args:
            word: token to lemmatize.
            pos: POS tag as understood by ``_get_wordnet_pos``.
                NOTE(review): lowercase WordNet constants such as ``"v"`` are
                not recognised here and fall back to NOUN.

        Returns:
            Whatever the parent ``lemmatize`` returns for ``word`` and the
            mapped POS.
        """
        return super().lemmatize(word, self._get_wordnet_pos(pos))

In [66]:
import re
import string
from typing import Callable
from typing import List
import emoji
import numpy as np
from nltk import tokenize, pos_tag
from nltk.corpus import stopwords
from spellchecker import SpellChecker
from nltk.stem import PorterStemmer
from num2words import num2words
from nltk.corpus import wordnet
from dateutil import parser
import nltk
from nltk.tokenize import word_tokenize

class TextPreprocessor():
    """Token-level text normalisation pipeline.

    Every transformation method takes a list of string tokens and returns a
    NEW list; the caller's list is never modified. ``preprocess`` chains the
    transformations and joins the result back into one string.
    """

    def __init__(self, tokenizer: Callable = None) -> None:
        """Set up the tokenizer, stop-word list, stemmer and lemmatizer.

        Args:
            tokenizer: callable turning a string into a list of tokens.
                Defaults to ``nltk.tokenize.word_tokenize`` when ``None``.
        """
        self.tokenizer = tokenize.word_tokenize if tokenizer is None else tokenizer

        self.stopwords_tokens = stopwords.words('english')
        self.stemmer = PorterStemmer()
        self.lemmatizer = LemmatizerWithPOSTagger()

    def tokenize(self, text: str) -> List[str]:
        """Split ``text`` into tokens with the configured tokenizer."""
        return self.tokenizer(text)

    def to_lower(self, tokens: List[str]) -> List[str]:
        """Return the tokens lower-cased."""
        return [token.lower() for token in tokens]

    def remove_markers(self, tokens: List[str]) -> List[str]:
        """Strip the registered-trademark sign (U+00AE) from every token."""
        return [token.replace('\u00AE', '') for token in tokens]

    def remove_punctuation(self, tokens: List[str]) -> List[str]:
        """Delete every ``string.punctuation`` character from every token.

        Tokens made only of punctuation become empty strings; they are kept
        in the list (``remove_stop_words`` drops them later in the pipeline).
        """
        # Build the deletion table once instead of once per token.
        punctuation_table = str.maketrans('', '', string.punctuation)
        return [token.translate(punctuation_table) for token in tokens]

    def correct_sentence_spelling(self, tokens: List[str]) -> List[str]:
        """Replace tokens the spell checker does not know with its suggestion.

        A token is left unchanged when the checker knows it or has no
        correction to offer (``correction`` returns ``None``).

        Returns:
            A new list; ``tokens`` itself is not modified.
        """
        spell = SpellChecker()
        misspelled = spell.unknown(tokens)
        corrected_tokens = []
        for token in tokens:
            if token in misspelled:
                suggestion = spell.correction(token)
                if suggestion is not None:
                    token = suggestion
            corrected_tokens.append(token)
        return corrected_tokens

    def rplace_under_score_with_space(self, tokens: List[str]) -> List[str]:
        """Replace every underscore in every token with a space."""
        return [token.replace('_', ' ') for token in tokens]

    def remove_stop_words(self, tokens: List[str]) -> List[str]:
        """Drop stop words and tokens shorter than two characters."""
        # Set membership is O(1); the stored list would be scanned per token.
        stop_words = set(self.stopwords_tokens)
        return [
            token for token in tokens
            if len(token) > 1 and token not in stop_words
        ]

    def remove_apostrophe(self, tokens: List[str]) -> List[str]:
        """Replace every apostrophe in every token with a space."""
        return [token.replace("'", " ") for token in tokens]

    def stemming(self, tokens: List[str]) -> List[str]:
        """Stem every token with the configured stemmer."""
        return [self.stemmer.stem(token) for token in tokens]

    def normalize_appreviations(self, tokens: List[str]) -> List[str]:
        """Map each token to the first lemma name of its first WordNet synset.

        Tokens shorter than two characters, or with no synset, pass through
        unchanged. Each token is resolved independently of its neighbours, so
        repeated tokens are normalised consistently.

        Returns:
            A new list; ``tokens`` itself is not modified.
        """
        resolved_terms = {}  # token -> normalised form, one lookup per distinct token
        normalized = []
        for token in tokens:
            if token not in resolved_terms:
                resolved = token
                if len(token) >= 2:
                    synsets = wordnet.synsets(token)
                    if synsets:
                        resolved = synsets[0].lemmas()[0].name()
                resolved_terms[token] = resolved
            normalized.append(resolved_terms[token])
        return normalized

    def lemmatizing(self, tokens: List[str]) -> List[str]:
        """POS-tag the tokens and lemmatize each one with its own tag."""
        tagged_tokens = pos_tag(tokens)
        return [self.lemmatizer.lemmatize(token, pos) for token, pos in tagged_tokens]

    def preprocess(self, text: str) -> str:
        """Tokenize ``text``, run the full pipeline and re-join with spaces.

        The order of operations is deliberately left as it was so output stays
        comparable with anything produced by this pipeline before. Note that
        ``remove_apostrophe`` is effectively a no-op here, because
        ``remove_punctuation`` has already stripped ``'`` (it is part of
        ``string.punctuation``).
        """
        operations = [
            self.to_lower,
            self.remove_punctuation,
            self.remove_apostrophe,
            self.remove_stop_words,
            self.remove_markers,
            self.stemming,
            self.lemmatizing,
            self.normalize_appreviations,
            # Second lower-casing and underscore cleanup run on the output of
            # normalize_appreviations (presumably because lemma names can be
            # capitalised or underscore-joined - TODO confirm).
            self.to_lower,
            self.rplace_under_score_with_space,
        ]
        text_tokens = self.tokenize(text)
        for op in operations:
            text_tokens = op(text_tokens)

        return ' '.join(text_tokens)

In [65]:
def process_text(document:str):
    """Preprocess ``document`` with a freshly built ``TextPreprocessor``.

    Returns whatever ``TextPreprocessor.preprocess`` returns for the input.
    """
    preprocessor = TextPreprocessor()
    return preprocessor.preprocess(document)

In [67]:
import numpy as np

def vectorize(documents):
    """Embed each tokenised document as the mean of its tokens' word vectors.

    Reads the module-level ``model`` and looks tokens up in ``model.wv``.

    Args:
        documents: iterable of documents, each an iterable of tokens.

    Returns:
        A list with one 1-D numpy array per document, in input order. Tokens
        missing from ``model.wv`` are skipped; a document with no known token
        gets an all-zero vector.
    """
    word_vectors = model.wv
    # Take the fallback size from the model so the zero vector always has the
    # same dimensionality as the real embeddings.
    vector_size = word_vectors.vector_size

    documents_vectors = []
    for document in documents:
        # The membership test already guarantees the lookup succeeds, so no
        # KeyError handling is needed.
        vectors = [word_vectors[token] for token in document if token in word_vectors]
        if vectors:
            documents_vectors.append(np.asarray(vectors).mean(axis=0))
        else:
            documents_vectors.append(np.zeros(vector_size))
    return documents_vectors

In [68]:
import pymongo

from pymongo import MongoClient

# Create a client for the MongoDB server at localhost:27017.
client = MongoClient("localhost:27017")

# Select the "IR" database and its "antique" collection; `col` is the handle
# queried by get_results.
db = client["IR"]
col = db["antique"]

In [69]:
import pickle
# Load the pickled document vectors into the global `documents_vectors`.
# NOTE(review): pickle.load can execute arbitrary code - only load this file
# if it comes from a trusted source.
with open('D:/antique/word2vec/documents_vectors.pickle', 'rb') as handle:
    documents_vectors = pickle.load(handle)

KeyboardInterrupt: 

In [18]:
from gensim.models import Word2Vec

# Load the saved Word2Vec model into the global `model`; vectorize reads its
# word vectors through `model.wv`.
model=Word2Vec.load("D:/antique/word2vec/model")

In [61]:
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd

def get_results(query_fin, top_k=10, min_similarity=0.35):
    """Return the stored documents most similar to a free-text query.

    The query is preprocessed with ``process_text``, tokenised, embedded with
    ``vectorize`` and compared by cosine similarity against the module-level
    ``documents_vectors``. Row positions of the best matches are then looked
    up in the ``col`` collection through its ``index`` field.

    Args:
        query_fin: raw query text.
        top_k: maximum number of documents to return (default 10).
        min_similarity: minimum cosine similarity a document needs to be
            returned (default 0.35).

    Returns:
        A list of documents from ``col``, ordered from most to least similar.
        Empty when ``top_k`` is not positive or nothing reaches
        ``min_similarity``.
    """
    if top_k <= 0:
        return []

    query_tokens = word_tokenize(process_text(query_fin))
    query_vector = vectorize([query_tokens])[0].reshape(1, -1)
    similarities = cosine_similarity(documents_vectors, query_vector)

    # argsort is ascending, so reverse it and keep the first top_k positions.
    top_positions = similarities.argsort(axis=0)[::-1][:top_k].flatten()
    result_ids = [
        int(position)
        for position in top_positions
        if similarities[position][0] >= min_similarity
    ]

    if not result_ids:
        # Nothing passed the threshold; skip the database round-trip.
        return []

    unordered_results = list(col.find({'index': {'$in': result_ids}}))

    # The database returns matches in arbitrary order; restore similarity order.
    rank_by_index = {doc_index: rank for rank, doc_index in enumerate(result_ids)}
    return sorted(unordered_results, key=lambda doc: rank_by_index[doc['index']])

# Do not truncate cell contents, and show up to 100 rows, when pandas renders.
pd.set_option('display.max_colwidth', None)
pd.set_option('display.max_rows', 100)
# Run a sample query and put the returned documents into a DataFrame.
df=pd.DataFrame(get_results('university of harvard'))
# Left-align the text. Note: `df` is rebound to the object returned by
# `.style.set_properties(...)`, not to a DataFrame.
df = df.style.set_properties(**{'text-align': 'left'})

# Bare expression so the notebook displays the styled table.
df

Unnamed: 0,_id,index,content
0,3971923_11,399731,Go to harvard college
1,1566767_8,206419,You have to know people inside to get into Harvard.
2,3991687_0,21095,And what makes you think you'll get into Harvard or Yale? Spelling counts when you fill out the application.
3,2552035_0,5722,"Harvard College is for undergraduate education. This is the four year (typically) collegiate education.. . Harvard University is the umbrella organization for Harvard College, Harvard Law, Harvard Business, Harvard Medical, Radcliffe Institute, the Graduate School of Design, the School of Education, etc."
4,3465124_1,339538,"I don't know how ""mush"" studying at Harvard would cost."
5,3135464_1,36284,"Harvard is only one of the best university's in the world.Some of the greatest men and women in history went to Harvard.If you'd do a little checking,you'd see why Harvard has such a remarkable history."
6,2654578_0,273472,I think you are talking about the Ransom of Hector (hektor). on the Perseus Vase at harvard
7,863700_17,279545,Michael Moore is afraid to go on his show...what are you talking about? I highly doubt he is ignorant. He attended Harvard University!
8,3017886_1,73346,"If this is how you do your research, the question really is moot, nes pas?. . Hint: check with Harvard Business School."
9,124989_3,259615,It's a place where you can go to school after high school. Like Harvard. If you learn to spell well enuf.


In [60]:
import ir_datasets

# Load the "antique/test" dataset; calculate_MAP iterates its qrels and queries.
dataset = ir_datasets.load("antique/test")

def calculate_MAP(query_id):
    """Average precision over the first 10 results of one query.

    Every document that has a qrel for ``query_id`` counts as relevant,
    whatever its relevance grade. Precision is accumulated at each rank that
    holds a relevant document, and the sum is divided by the number of
    relevant documents found in those results (not by the total number of
    relevant documents).

    Returns:
        The average precision, or 0 when no relevant document was retrieved.
    """
    relevant_docs = {qrel[1] for qrel in dataset.qrels_iter() if qrel[0] == query_id}

    # Text of the first query whose id matches; no match means no results.
    query_text = next(
        (query[1] for query in dataset.queries_iter() if query[0] == query_id),
        None,
    )
    ordered_results = [] if query_text is None else get_results(query_text)

    hits = 0
    pk_sum = 0
    for rank, result in enumerate(ordered_results[:10], start=1):
        if result['_id'] in relevant_docs:
            hits += 1
            # Precision at this rank = relevant documents seen so far / rank.
            pk_sum += hits / rank

    if hits == 0:
        return 0
    return pk_sum / hits


# Distinct query ids that appear in the qrels, in first-seen order
# (a dict is used as an ordered set; the values are unused).
queries_ids = {}
for qrel in dataset.qrels_iter():
    queries_ids[qrel[0]] = ''

# Sum the per-query average precision, then average over the dataset's
# query count.
map_sum = 0
for query_id in queries_ids:
    map_sum = map_sum + calculate_MAP(query_id)

print(map_sum / dataset.queries_count())

0.8701388888888888
0.6696344474363822


In [62]:
import ir_datasets
from sklearn.metrics import precision_score

# Load the "antique/test" dataset; calculate_recall_precision iterates its
# qrels and queries.
dataset = ir_datasets.load("antique/test")

def calculate_recall_precision(query_id):
    """Print Recall@10 and Precision@10 for one query and return the recall.

    A document is relevant when it has a qrel for ``query_id`` with a
    relevance value greater than 0. Retrieved documents come from
    ``get_results`` for the query's text.

    Recall is relevant-retrieved / total relevant. Precision always divides by
    10, so result slots left empty count as misses.

    Returns:
        The recall value, or 0.0 when the query has no relevant documents.
    """
    relevant_docs = {
        qrel[1]
        for qrel in dataset.qrels_iter()
        if qrel[0] == query_id and qrel[2] > 0
    }

    retrieved_docs = []
    for query in dataset.queries_iter():
        if query[0] == query_id:
            retrieved_docs = get_results(query[1])
            break

    true_positives = sum(
        1 for result in retrieved_docs if result['_id'] in relevant_docs
    )
    # Recall is undefined without relevant documents; report 0.0 instead of
    # raising ZeroDivisionError.
    recall_at_10 = true_positives / len(relevant_docs) if relevant_docs else 0.0
    precision_at_10 = true_positives / 10
    print(f"Query ID: {query_id}, Recall@10: {recall_at_10}")
    print(f"Query ID: {query_id}, Precision@10: {precision_at_10}")

    return recall_at_10


# Distinct query ids that appear in the qrels, in first-seen order
# (a dict is used as an ordered set; the values are unused).
queries_ids = {}
for qrel in dataset.qrels_iter():
    queries_ids[qrel[0]] = ''

# Print recall and precision for every judged query.
for query_id in queries_ids:
    calculate_recall_precision(query_id)

Query ID: 1964316, Recall@10: 0.21212121212121213
Query ID: 1964316, Precision@10: 0.7
Query ID: 2418598, Recall@10: 0.10810810810810811
Query ID: 2418598, Precision@10: 0.4
Query ID: 1167882, Recall@10: 0.06666666666666667
Query ID: 1167882, Precision@10: 0.2
Query ID: 1880028, Recall@10: 0.0967741935483871
Query ID: 1880028, Precision@10: 0.3
Query ID: 2192891, Recall@10: 0.13157894736842105
Query ID: 2192891, Precision@10: 0.5
Query ID: 949154, Recall@10: 0.25
Query ID: 949154, Precision@10: 0.7
Query ID: 1844896, Recall@10: 0.16666666666666666
Query ID: 1844896, Precision@10: 0.5
Query ID: 2634143, Recall@10: 0.06666666666666667
Query ID: 2634143, Precision@10: 0.2
Query ID: 2382487, Recall@10: 0.05454545454545454
Query ID: 2382487, Precision@10: 0.3
Query ID: 229303, Recall@10: 0.1111111111111111
Query ID: 229303, Precision@10: 0.4
Query ID: 1015624, Recall@10: 0.22727272727272727
Query ID: 1015624, Precision@10: 0.5
Query ID: 2785579, Recall@10: 0.07692307692307693
Query ID: 2785

Query ID: 1850323, Recall@10: 0.07317073170731707
Query ID: 1850323, Precision@10: 0.3
Query ID: 4018891, Recall@10: 0.10714285714285714
Query ID: 4018891, Precision@10: 0.9
Query ID: 4462511, Recall@10: 0.24
Query ID: 4462511, Precision@10: 0.6
Query ID: 1663853, Recall@10: 0.08571428571428572
Query ID: 1663853, Precision@10: 0.3
Query ID: 3382736, Recall@10: 0.6666666666666666
Query ID: 3382736, Precision@10: 0.6
Query ID: 1937374, Recall@10: 0.10714285714285714
Query ID: 1937374, Precision@10: 0.3
Query ID: 1373069, Recall@10: 0.125
Query ID: 1373069, Precision@10: 0.4
Query ID: 443848, Recall@10: 0.3
Query ID: 443848, Precision@10: 0.9
Query ID: 4473331, Recall@10: 0.125
Query ID: 4473331, Precision@10: 0.4
Query ID: 3074429, Recall@10: 0.1282051282051282
Query ID: 3074429, Precision@10: 0.5
Query ID: 78762, Recall@10: 0.0
Query ID: 78762, Precision@10: 0.0
Query ID: 3269759, Recall@10: 0.11764705882352941
Query ID: 3269759, Precision@10: 0.4
Query ID: 4197214, Recall@10: 0.1219512

In [58]:
import ir_datasets

# Load the "antique/test" dataset; calculate_MRR iterates its qrels and queries.
dataset = ir_datasets.load("antique/test")

def calculate_MRR(query_id):
    """Reciprocal rank of the first relevant result for one query.

    Every document that has a qrel for ``query_id`` counts as relevant,
    whatever its relevance grade.

    Returns:
        ``1 / rank`` of the first relevant document returned by
        ``get_results`` (ranks start at 1), or 0 when none is relevant.
    """
    relevant_docs = [qrel[1] for qrel in dataset.qrels_iter() if qrel[0] == query_id]

    # Text of the first query whose id matches; no match means no results.
    query_text = next(
        (query[1] for query in dataset.queries_iter() if query[0] == query_id),
        None,
    )
    ordered_results = [] if query_text is None else get_results(query_text)

    for rank, result in enumerate(ordered_results, start=1):
        if result['_id'] in relevant_docs:
            return 1 / rank

    return 0

# Distinct query ids that appear in the qrels, in first-seen order
# (a dict is used as an ordered set; the values are unused).
queries_ids = {}
for qrel in dataset.qrels_iter():
    queries_ids.setdefault(qrel[0], '')

# Sum the per-query reciprocal ranks, then average over the dataset's
# query count.
mrr_sum = 0
for query_id in queries_ids:
    mrr_sum = mrr_sum + calculate_MRR(query_id)

print(mrr_sum / dataset.queries_count())

0.7664983766233766
