In [43]:
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer

class LemmatizerWithPOSTagger(WordNetLemmatizer):
    """WordNet lemmatizer that accepts part-of-speech tags by first letter.

    ``lemmatize`` maps the incoming tag to one of the ``wordnet`` POS
    constants (by its first, upper-case letter) and then delegates to
    ``WordNetLemmatizer.lemmatize``.
    """

    def __init__(self):
        # Run the base-class initialiser instead of silently skipping it, so
        # any state the parent sets up is never missing.
        super().__init__()

    def _get_wordnet_pos(self, tag: str) -> str:
        """Translate a POS tag into a ``wordnet`` POS constant.

        Tags starting with ``'J'``, ``'V'`` or ``'R'`` map to adjective, verb
        and adverb respectively.  Everything else (including tags starting
        with ``'N'`` and the empty string) maps to noun.  The prefix test is
        case-sensitive, so lower-case tags also fall through to noun.
        """
        if tag.startswith('J'):
            return wordnet.ADJ
        if tag.startswith('V'):
            return wordnet.VERB
        if tag.startswith('R'):
            return wordnet.ADV
        # 'N*' tags and any unrecognised tag are lemmatized as nouns.
        return wordnet.NOUN

    def lemmatize(self, word: str, pos: str = "n") -> str:
        """Lemmatize *word* using the WordNet POS derived from *pos*.

        Args:
            word: Token to lemmatize.
            pos: POS tag for the token; translated by ``_get_wordnet_pos``.
                The default ``"n"`` resolves to noun.

        Returns:
            Whatever ``WordNetLemmatizer.lemmatize`` returns for the word and
            the translated POS.
        """
        return super().lemmatize(word, self._get_wordnet_pos(pos))

In [46]:
import re
import string
from typing import Callable
from typing import List
from nltk.tokenize import word_tokenize
import emoji
import numpy as np
from nltk import tokenize, pos_tag
from nltk.corpus import stopwords
from spellchecker import SpellChecker
from nltk.stem import PorterStemmer
from num2words import num2words
from nltk.corpus import wordnet
from dateutil import parser
import nltk

class TextPreprocessor():
    """Token-level text cleaning pipeline.

    Each step takes a list of string tokens and returns a new list; the
    caller's list is never modified.  ``preprocess`` tokenizes a raw string,
    runs a fixed sequence of steps and joins the result back into one
    space-separated string.
    """

    def __init__(self, tokenizer: Callable = None) -> None:
        """Create the helpers used by the individual steps.

        Args:
            tokenizer: Callable turning a string into a list of tokens.
                When ``None``, ``nltk.tokenize.word_tokenize`` is used.
        """
        self.tokenizer = tokenize.word_tokenize if tokenizer is None else tokenizer

        self.stopwords_tokens = stopwords.words('english')
        self.stemmer = PorterStemmer()
        # NOTE(review): NormalizerDates is not defined in the visible part of
        # this file and none of the steps below use it -- confirm it is
        # defined elsewhere before instantiating this class.
        self.dateNormalizer = NormalizerDates()
        self.lemmatizer = LemmatizerWithPOSTagger()

    def tokenize(self, text: str) -> List[str]:
        """Split *text* into tokens with the configured tokenizer."""
        return self.tokenizer(text)

    def to_lower(self, tokens: List[str]) -> List[str]:
        """Return the tokens lower-cased."""
        return [token.lower() for token in tokens]

    def remove_markers(self, tokens: List[str]) -> List[str]:
        """Strip the registered-trademark sign (U+00AE) from every token."""
        return [token.replace('\u00AE', '') for token in tokens]

    def remove_punctuation(self, tokens: List[str]) -> List[str]:
        """Delete every ``string.punctuation`` character from every token.

        Tokens made only of punctuation become empty strings; they are not
        dropped by this step.
        """
        # Build the translation table once rather than once per token.
        strip_table = str.maketrans('', '', string.punctuation)
        return [token.translate(strip_table) for token in tokens]

    def correct_sentence_spelling(self, tokens: List[str]) -> List[str]:
        """Replace tokens unknown to ``SpellChecker`` with its correction.

        A token is kept unchanged when the checker offers no correction
        (``None``).  The input list is left untouched.
        """
        spell = SpellChecker()
        misspelled = spell.unknown(tokens)
        corrected_tokens = []
        for token in tokens:
            if token in misspelled:
                suggestion = spell.correction(token)
                if suggestion is not None:
                    token = suggestion
            corrected_tokens.append(token)
        return corrected_tokens

    def rplace_under_score_with_space(self, tokens: List[str]) -> List[str]:
        """Replace every underscore inside a token with a space."""
        return [token.replace('_', ' ') for token in tokens]

    def remove_stop_words(self, tokens: List[str]) -> List[str]:
        """Drop stop words and tokens of length 0 or 1."""
        # Set membership is O(1); build it once per call so later changes to
        # ``self.stopwords_tokens`` are still honoured.
        stop_words = set(self.stopwords_tokens)
        return [
            token for token in tokens
            if len(token) > 1 and token not in stop_words
        ]

    def remove_apostrophe(self, tokens: List[str]) -> List[str]:
        """Replace every apostrophe inside a token with a space."""
        return [token.replace("'", " ") for token in tokens]

    def stemming(self, tokens: List[str]) -> List[str]:
        """Apply the Porter stemmer to every token."""
        return [self.stemmer.stem(token) for token in tokens]

    def normalize_appreviations(self, tokens: List[str]) -> List[str]:
        """Replace tokens by the first lemma name of their first WordNet synset.

        Only tokens of length >= 2 that have at least one synset are
        considered.  The input list is left untouched; a new list is returned.

        NOTE(review): for every distinct token only its FIRST occurrence is
        replaced (the original loop stopped after one hit).  That looks
        unintentional, but it is preserved here so the output stays identical
        to text processed earlier -- confirm before changing it.
        """
        resolved_terms = {}
        looked_up = set()
        for token in tokens:
            if len(token) < 2 or token in looked_up:
                continue
            # One WordNet lookup per distinct token is enough.
            looked_up.add(token)
            synsets = wordnet.synsets(token)
            if synsets:
                resolved_terms[token] = synsets[0].lemmas()[0].name()

        normalized = list(tokens)
        for abbreviation, resolved_term in resolved_terms.items():
            # Search the list as modified so far, exactly like the original
            # sequential in-place replacement did.
            try:
                position = normalized.index(abbreviation)
            except ValueError:
                continue
            normalized[position] = resolved_term

        return normalized

    def lemmatizing(self, tokens: List[str]) -> List[str]:
        """POS-tag the tokens with ``pos_tag`` and lemmatize each one."""
        tagged_tokens = pos_tag(tokens)
        return [self.lemmatizer.lemmatize(token, pos) for token, pos in tagged_tokens]

    def preprocess(self, text: str) -> str:
        """Tokenize *text*, run every cleaning step and re-join the tokens.

        The steps run in the order listed below.

        Returns:
            The processed tokens joined with single spaces.
        """
        # NOTE(review): the order is kept exactly as it was, presumably so
        # queries are processed like previously indexed documents -- TODO
        # confirm.  Observations on this order:
        #   * remove_apostrophe runs after remove_punctuation, which already
        #     deleted "'" (it is part of string.punctuation);
        #   * tokens emptied by remove_punctuation are discarded by the
        #     length filter in remove_stop_words;
        #   * stemming runs before lemmatizing, so the tagger and lemmatizer
        #     receive stems.
        operations = [
            self.to_lower,
            self.remove_punctuation,
            self.remove_apostrophe,
            self.remove_stop_words,
            self.remove_markers,
            self.stemming,
            self.lemmatizing,
            self.normalize_appreviations,
            self.to_lower,
            self.rplace_under_score_with_space,
        ]
        text_tokens = self.tokenize(text)
        for op in operations:
            text_tokens = op(text_tokens)

        return ' '.join(text_tokens)

In [45]:
def process_text(document: str) -> str:
    """Run *document* through a newly constructed ``TextPreprocessor``.

    Returns the string produced by ``TextPreprocessor.preprocess``.
    """
    preprocessor = TextPreprocessor()
    cleaned_text = preprocessor.preprocess(document)
    return cleaned_text

In [52]:
import numpy as np

def vectorize(documents, keyed_vectors=None):
    """Turn tokenised documents into mean word-embedding vectors.

    Args:
        documents: Iterable of documents, each an iterable of tokens.
        keyed_vectors: Embedding store supporting ``token in store``,
            ``store[token]`` and a ``vector_size`` attribute.  Defaults to
            the module-level ``model.wv``.

    Returns:
        A list with one 1-D array per document: the mean of the vectors of
        the tokens found in the store, or an all-zero vector of length
        ``vector_size`` when none of the tokens is known (or the document is
        empty).
    """
    if keyed_vectors is None:
        keyed_vectors = model.wv

    documents_vectors = []
    for document in documents:
        # Unknown tokens are skipped; the membership test makes a KeyError
        # on lookup impossible, so no fallback branch is needed.
        vectors = [keyed_vectors[token] for token in document if token in keyed_vectors]
        if vectors:
            documents_vectors.append(np.asarray(vectors).mean(axis=0))
        else:
            # Size the fallback from the store so it always matches the
            # dimensionality of the averaged vectors.
            documents_vectors.append(np.zeros(keyed_vectors.vector_size))
    return documents_vectors

In [48]:
import pickle
# Load the document vectors that `get_results` compares each query against.
# NOTE(security): pickle.load can execute arbitrary code embedded in the
# file -- only load pickles that come from a trusted source.
with open('D:/wikir/word2vec/documents_vectors.pickle', 'rb') as handle:
    documents_vectors = pickle.load(handle)

In [49]:
import pymongo

from pymongo import MongoClient

# Connect to the MongoDB server on localhost, port 27017.
client = MongoClient("localhost:27017")

# `col` (collection "wikir" of database "IR") is what `get_results` queries.
db = client["IR"]
col = db["wikir"]

In [50]:
from gensim.models import Word2Vec

# Load the saved Word2Vec model; `vectorize` reads its `wv` word vectors.
model=Word2Vec.load("D:/wikir/word2vec/model")

In [53]:
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd

def get_results(query_fin, top_k=10, min_similarity=0.35):
    """Return the stored documents most similar to a free-text query.

    The query is preprocessed with ``process_text``, tokenised, embedded with
    ``vectorize`` and compared to every row of ``documents_vectors`` by
    cosine similarity.  Row positions are used as the ``index`` field when
    fetching documents from ``col``.

    Args:
        query_fin: Raw query string.
        top_k: Maximum number of documents to return (default 10).
        min_similarity: Candidates scoring below this cosine similarity are
            discarded (default 0.35).

    Returns:
        A list of documents from ``col``, most similar first.  Empty when
        ``top_k`` is not positive or no candidate reaches the threshold.
    """
    if top_k <= 0:
        # ``[-0:]`` would select every row instead of none.
        return []

    query_tokens = word_tokenize(process_text(query_fin))
    query_vector = vectorize([query_tokens])[0].reshape(1, -1)
    similarities = cosine_similarity(documents_vectors, query_vector)

    # argsort is ascending: keep the last ``top_k`` rows, best first.
    best_first = similarities.argsort(axis=0)[-top_k:][::-1].flatten()
    result_ids = [
        int(row) for row in best_first
        if similarities[row][0] >= min_similarity
    ]
    if not result_ids:
        # Nothing to fetch; avoid a pointless database round trip.
        return []

    unordered_results = list(col.find({'index': {'$in': result_ids}}))

    # The database returns matches in its own order; restore similarity order.
    rank_of = {doc_index: rank for rank, doc_index in enumerate(result_ids)}
    return sorted(unordered_results, key=lambda doc: rank_of[doc['index']])

# Display settings: do not truncate cell text, show up to 100 rows.
pd.set_option('display.max_colwidth', None)
pd.set_option('display.max_rows', 100)
# Example run of the search for the query 'america'.
df=pd.DataFrame(get_results('america'))
# NOTE: from here on `df` is the object returned by
# `Styler.set_properties` (a pandas Styler), not a DataFrame.
df = df.style.set_properties(**{'text-align': 'left'})

df

Unnamed: 0,_id,index,content
0,1742906,145236,for his contributions to american railroad management jeffery is listed by the smithsonian institutions john h white jr as one of america s most noteworthy railroaders
1,1511320,13199,it operates several chains of retail brands in the consumer durables sector specializing in furniture audio video appliances and electronics in over 1 000 stores in central america the caribbean south america and the united states employing over 15 000 associates the unicomer group was founded in 2000 unicomer group owns large brands such as la cura ao central america and the dominican republic almacenes tropigas in central america gollo in costa rica artefacta in ecuador and electro facil in paraguay in the caribbean region unicomer group operates through its retail brand courts other brands in the region are lucky dollar omni amc unicon among others unicomer usa and courts caribbean located in the united states as of april 15 2015 unicomer group acquired brands intellectual property and contracts of existing radioshack franchisees throughout central america south america and the caribbean holding a promise to expand presence of this chain in these regions as new operations and with existing franchisees since the year 2000 unicomer group has grown from operating 4 store chains in central america to operating more than 30 brands in 26 countries in central america south america the caribbean islands and usa the central american retail chain
2,1820054,112357,startech com services a worldwide market with operations throughout the united states canada europe latin america and taiwan the company headquarters is located in london ontario canada with distribution centers in the united states canada the united kingdom and singapore startech com was founded in 1985 in london ontario canada by paul seed and ken kalopsis the company s first products to enter the it market were anti glare screens for crt computer monitors and keyboard dust covers although startech com has been active in the canadian and united states it markets since the company s beginning it was not until 2004 that startech com decided to focus on becoming a more globalized company with the opening of a branch in northampton uk in 2010 the company further expanded their uk operation with the appointment of a business manager uk country manager and national account manager the same year both the uk and usa warehouses were relocated to better accommodate demand for products by 2012 startech com was selling products in many european markets including france spain italy benelux as well as mexico in 2019 startech com was selling in 20 countries worldwide and has plans for further expansion the
3,775669,219988,the central and southern portions of the continent are represented by the united states mexico and numerous smaller states primarily in central america and in the caribbean the continent is delimited on the southeast by most geographers at the dari n watershed along the colombia panama border placing all of panama within north america alternatively a less common view would end north america at the man made panama canal islands generally associated with north america include greenland the world s largest island and archipelagos and islands in the caribbean the terminology of the americas is complex but anglo america can describe canada and the u s while latin america comprises mexico and the countries of central america and the caribbean as well as the entire continent of south america natural features of north america include the northern portion of the american cordillera represented by the geologically new rocky mountains in the west and the considerably older appalachian mountains to the east the north hosts an abundance of glacial lakes formed during the last glacial period including the great lakes north america s major continental divide is the great divide which runs north and south down through rocky mountains the major
4,1146970,241090,coach america consisted of all former coach usa operations except for the midwestern united states new york new jersey pennsylvania and new england along with lakefront lines in ohio acquired separately for the nine years of its existence coach america was based in dallas texas the properties that became coach america were previously owned by scotland based stagecoach group as coach usa s western south central and southeastern divisions coach america was formed in 2003 when after stagecoach group evaluated its coach usa business it decided to retain mostly its scheduled and local transit services in the northeast and north central region and put the rest of the company up for sale the south central and west divisions of coach usa were sold to kohlberg co llc with these companies continuing to use the coach usa name for a time but eventually changing to coach america the southeast division was sold to a separate buyer lincolnshire management and became american coach lines in 2006 coach america purchased american coach lines from lincolnshire in november of that same year kohlberg sold coach america to another private equity firm fenway partners coach america acquired the ohio carrier lakefront lines in 2008 in
5,225121,127741,the canadian province of quebec is the centre of the community and is the point of origin of most of french america it also includes communities in all provinces of canada especially in new brunswick where francophones are roughly one third of the population saint pierre and miquelon haiti saint martin saint barth lemy saint lucia martinique and guadeloupe in the caribbean french guiana overseas region of france in south america also there are minorities of french speakers in part of the united states new england louisiana florida dominica grenada and trinidad and tobago the ordre des francophones d am rique is a decoration given in the name of the community to its members it can also be described as the francophonie of the americas because french is a romance language french america is sometimes considered to be part of latin america but this term more often refers to hispanic america and portuguese america or simply the americas south of the united states this is a list of countries administrative divisions and french possessions in the americas having the french language as an official language the data of each place are based in the 2012 2013 census
6,1645290,341617,from 2000 through 2013 he was professor of music at the university of southampton subsequently having become entirely disenchanted with the uk s tertiary educational system he took early retirement and was awarded the title of emeritus professor of music though he later relinquished this post from 1987 to 2000 he was professor of music and sometime research dean of the faculty of humanities at keele university between 1984 and 1987 nicholls was keasbey fellow in american studies at selwyn college cambridge in 1998 he spent an extended semester at the college of william and mary in virginia usa as visiting professor of music during his academic career nicholls gave many presentations both refereed and guest in north america france germany mexico australia taiwan and the united kingdom he is a former editor of the journal american music 2000 2005 until 2000 nicholls was also active as a composer and his works were performed and broadcast in the united kingdom europe america australia and south africa nicholls was a pupil at st benedict s primary school in small heath birmingham and subsequently king edward vi camp hill school for boys in king s heath he then read music at st
7,1777377,212325,tilyard went on to produce battleground with ayz waraich the 80 s style cult action film was released internationally in 2011 and is scheduled for release in north america in 2012 official site www dimeworth com
8,1200775,188504,the show runs for two hours on sundays from 9 to 11 a m pacific time dhuyvetter typically broadcasts from her studio in encinitas california but often takes the show on the road she has broadcast live from remote sites in asia africa europe south america and north america traveltalkradio focuses on travel and tourism and began airing in 2001 each week host dhuyvetter speaks with experts from around the world in the travel industry as well as authors of travel books and publishers of travel magazines the program introduces audiences to regional national and international travel topics with the goal of facilitating communications between tour operators wholesales travel agents and consumers dhuyvetter is quoted as saying that traveltalkradio was one of the first internet radio shows traveltalkradio is affiliated with traveltalkmedia which encompasses the web television and radio aspects of the company the show is distributed by conventional radio stations in the united states united kingdom africa and china including on china national radio via satellite traveltalkradio is distributed in the western hemisphere south america and the pacific rim traveltalkradio and traveltalkmedia fall under the parent company celestialink llc
9,2159177,248401,in the western hemisphere this took the form of organizing against the expansion of american commercial influence in the developing nations of central and south america as well as the caribbean basin including especially mexico the dominican republic cuba and nicaragua in the united states itself the anti imperialist department of the well funded workers communist party of america was charles shipman 1895 1989 a draft resisting american expatriate to mexico who as jes s ram rez had been a delegate representing that country at the 2nd world congress of the comintern in addition to latin american concerns shipman s department had also propagandized against american commercial and military involvement in other parts of the globe including particularly the philippines and china in april 1925 shipman was dispatched to mexico as the representative of the workers party to the 3rd congress of the communist party of mexico it was at this time that a new international organization was launched the all america anti imperialist league an organization which would eventually include national sections throughout latin america the term all america in the organizational moniker was not intended to relate specifically to the united states but rather to the fact that


In [55]:
# Parse the qrels file into {query_id: {doc_id: relevance}}.
# Every non-blank line must hold four whitespace-separated fields; the
# second field is ignored.
qrels = {}
with open('D:/wikir/test/qrels', 'r', encoding='utf-8') as f:
    for line in f:
        fields = line.split()
        if not fields:
            # Tolerate blank lines (e.g. a trailing newline at end of file)
            # instead of failing on the 4-way unpack below.
            continue
        query_id, _, doc_id, relevance = fields
        qrels.setdefault(query_id, {})[doc_id] = int(relevance)

# Create a dataset object from the qrels dictionary
from collections import defaultdict
dataset = defaultdict(dict)
for query_id, doc_dict in qrels.items():
    for doc_id, relevance in doc_dict.items():
        dataset[query_id][doc_id] = relevance

In [56]:
import csv

def get_query(query_id, queries_path='D:/wikir/test/queries.csv'):
    """Look up the text of a query in the queries CSV file.

    Args:
        query_id: Id to search for.  It is compared as-is with the first
            column, which ``csv.reader`` yields as a string.
        queries_path: Path of the comma-separated file.  Its first row is
            treated as a header and skipped.

    Returns:
        The second column of the first row whose first column equals
        ``query_id``, or ``None`` when no row matches.
    """
    with open(queries_path, newline='', encoding='utf-8') as csvfile:
        reader = csv.reader(csvfile, delimiter=',')
        next(reader, None)  # skip the header; tolerate an empty file
        for row in reader:
            # Blank lines come back as []; ignore rows too short to hold
            # both an id and a text.
            if len(row) >= 2 and row[0] == query_id:
                return row[1]
    return None
    
    
def calculate_MAP(query_id, k=10):
    """Average precision over the top-``k`` search results of one query.

    The documents listed for ``query_id`` in ``dataset`` are treated as
    relevant (whatever their relevance value).  The query text is fetched
    with ``get_query`` and searched with ``get_results``; results are matched
    to the judgements through their ``'_id'`` field.

    NOTE(review): the sum of precision@rank over relevant hits is divided by
    the number of relevant documents *retrieved* in the top ``k``, not by the
    total number of relevant documents as in the textbook definition.  The
    original behaviour is kept so reported numbers stay comparable.

    Args:
        query_id: Key of the query in ``dataset``.
        k: Rank cutoff (default 10).  ``get_results`` is called with its
            defaults, so a cutoff above the number of results it returns has
            no further effect.

    Returns:
        The average precision, or ``0`` when the query is not in ``dataset``
        or no relevant document is retrieved.
    """
    judged = dataset.get(query_id)
    if judged is None:
        # Unknown query: no search is run and nothing can be relevant.
        return 0
    relevant_docs = set(judged)

    ordered_results = get_results(get_query(query_id))

    pk_sum = 0
    hits = 0
    for rank, result in enumerate(ordered_results[:k], start=1):
        if result['_id'] in relevant_docs:
            # Running count of relevant hits == relevant docs in the top
            # ``rank`` results, so hits / rank is precision@rank.
            hits += 1
            pk_sum += hits / rank

    return 0 if hits == 0 else pk_sum / hits




# One entry per judged query; the values are unused placeholders.
queries_ids = dict.fromkeys(dataset, '')

# Accumulate the per-query average precision, then print the mean.
map_sum = 0
for query_id in queries_ids:
    map_sum = map_sum + calculate_MAP(query_id)

print(map_sum/len(dataset))

0.5147344104308391


In [38]:
def calculate_recall_precision(query_id):
    """Print recall@10 and precision@10 for one query and return the recall.

    The documents listed for ``query_id`` in ``dataset`` are treated as
    relevant.  The query text is fetched with ``get_query`` and searched with
    ``get_results``; results are matched through their ``'_id'`` field.

    Args:
        query_id: Key of the query in ``dataset``.

    Returns:
        Relevant documents retrieved divided by the number of relevant
        documents, or ``0.0`` when the query has no judged documents.
    """
    relevant_docs = set(dataset.get(query_id, ()))

    # Only run the search for queries that actually have judgements.
    retrieved_docs = get_results(get_query(query_id)) if query_id in dataset else []

    true_positives = sum(
        1 for result in retrieved_docs if result['_id'] in relevant_docs
    )

    # Guard against dividing by zero for a query without judged documents.
    recall_at_10 = true_positives / len(relevant_docs) if relevant_docs else 0.0
    # Precision@10 divides by the cutoff even when fewer results came back.
    precision_at_10 = true_positives / 10
    print(f"Query ID: {query_id}, Recall@10: {recall_at_10}")
    print(f"Query ID: {query_id}, Precision@10: {precision_at_10}")

    return recall_at_10


# One entry per judged query; the values are unused placeholders.
queries_ids = dict.fromkeys(dataset, '')

# NOTE(review): `mrr_sum` is initialised here but never used in this cell.
mrr_sum = 0
# Print recall@10 / precision@10 for every judged query.
for query_id in queries_ids:
    calculate_recall_precision(query_id)

Query ID: 158491, Recall@10: 0.1111111111111111
Query ID: 158491, Precision@10: 0.1
Query ID: 5728, Recall@10: 0.13333333333333333
Query ID: 5728, Precision@10: 0.2
Query ID: 13554, Recall@10: 0.0
Query ID: 13554, Precision@10: 0.0
Query ID: 32674, Recall@10: 0.8571428571428571
Query ID: 32674, Precision@10: 0.6
Query ID: 406391, Recall@10: 0.125
Query ID: 406391, Precision@10: 0.1
Query ID: 5115, Recall@10: 0.125
Query ID: 5115, Precision@10: 0.1
Query ID: 15469, Recall@10: 0.08333333333333333
Query ID: 15469, Precision@10: 0.2
Query ID: 62953, Recall@10: 0.3333333333333333
Query ID: 62953, Precision@10: 0.2
Query ID: 152444, Recall@10: 0.1111111111111111
Query ID: 152444, Precision@10: 0.1
Query ID: 104086, Recall@10: 0.2
Query ID: 104086, Precision@10: 0.2
Query ID: 145194, Recall@10: 0.16666666666666666
Query ID: 145194, Precision@10: 0.1
Query ID: 73752, Recall@10: 0.0
Query ID: 73752, Precision@10: 0.0
Query ID: 1368508, Recall@10: 0.12195121951219512
Query ID: 1368508, Precision

In [25]:
def calculate_MRR(query_id):
    """Reciprocal rank of the first relevant search result for one query.

    The documents listed for ``query_id`` in ``dataset`` are treated as
    relevant; results come from ``get_results(get_query(query_id))`` and are
    matched through their ``'_id'`` field.

    Returns:
        ``1 / rank`` of the first relevant result (rank starts at 1), or
        ``0`` when the query is not in ``dataset`` or nothing relevant was
        retrieved.
    """
    if query_id not in dataset:
        # Unknown query: no search is run.
        return 0

    relevant_docs = list(dataset[query_id])
    ordered_results = get_results(get_query(query_id))

    reciprocal_ranks = (
        1 / position
        for position, result in enumerate(ordered_results, start=1)
        if result['_id'] in relevant_docs
    )
    # The generator is lazy, so only the first relevant hit is evaluated.
    return next(reciprocal_ranks, 0)

# One entry per judged query; the values are unused placeholders.
queries_ids = dict.fromkeys(dataset, '')

# Accumulate the per-query reciprocal rank, then print the mean.
mrr_sum = 0
for query_id in queries_ids:
    mrr_sum = mrr_sum + calculate_MRR(query_id)

print(mrr_sum / len(dataset))

0.585309523809524
