In [1]:
from hazm import *
import re

def remove_extras(word):
    to_remove = ['=', '<ref>', '</ref>', '<small>', '<font>', '</font>' , '<span>', '</span>', '</small>', '>', '<' , '"', "'", '&nbsp;', '*', ';']
    
    removed = word
    for i in to_remove:
        removed = removed.replace(i, '')
    return removed


def my_split(word):
    return re.split('\||-|_|\/|', word)

def get_words(raw_text):
    prepared_text = raw_text
    
    #Nomralize
    normalizer = Normalizer()
    prepared_text = normalizer.normalize(prepared_text)
    
    #Tokenize
    prepared_text = word_tokenize(prepared_text)
    
    #Remove Punctuations
    punctuations = ['.', '!', '،', '؛', '}', '{', ']', '[', '=', '*', '+', ':', '"', "'"]
    
    prepared_text = [remove_extras(i) for i in prepared_text if i not in punctuations]
    
    return prepared_text

def prepare_text(raw_text):
    prepared_text = []
    
    raw_text = re.split('\||-|_|', raw_text)
    i = 0
    for t in raw_text:
            
        prepared_text +=  get_words(t)
    
    #Stemming
    stemmer = Stemmer()
    prepared_text = [stemmer.stem(i) for i in prepared_text]
    return prepared_text



 

#print('Enter text:')
#raw_text = input()
#print(prepare_text(raw_text))


In [2]:
import xml.etree.ElementTree as ET


un_prepared_words = set()
def add_document_to_index(doc, index):
    doc_title = doc['title']
    doc_id = doc['id']
    doc_text = doc['text']
   
    for i in range(len(doc['title'])):
        #print(dictionary_index)
        if doc_title[i] in index.keys():
           
            if doc_id in index[doc_title[i]].keys():
                if 'title' in index[doc_title[i]][doc_id]:
                    
                    index[doc_title[i]][doc_id]['title'].append(i + 1)
                else:
                    index[doc_title[i]][doc_id]['title'] = [i + 1]
            else:
                index[doc_title[i]][doc_id] = dict()
                index[doc_title[i]][doc_id]['title'] = [i + 1]
        else:
            index[doc_title[i]] = dict()
            index[doc_title[i]][doc_id] = dict()
            index[doc_title[i]][doc_id]['title'] = [i + 1]
    for i in range(len(doc['text'])):
        #print(dictionary_index)
        if doc_text[i] in index.keys():
           
            if doc_id in index[doc_text[i]].keys():
                if 'text' in index[doc_text[i]][doc_id]:
                    
                    index[doc_text[i]][doc_id]['text'].append(i + 1)
                else:
                    index[doc_text[i]][doc_id]['text'] = [i + 1]
            else:
                index[doc_text[i]][doc_id] = dict()
                index[doc_text[i]][doc_id]['text'] = [i + 1]
        else:
            index[doc_text[i]] = dict()
            index[doc_text[i]][doc_id] = dict()
            index[doc_text[i]][doc_id]['text'] = [i + 1]
        
    return index

def construct_positional_indexes(docs_path):

    tree = ET.parse(docs_path  + '/'+ 'Persian.xml')
    root = tree.getroot() 
    
    data = [] 
    
    for child in root.iter('{http://www.mediawiki.org/xml/export-0.10/}page'):
        title_text = next(child.iter('{http://www.mediawiki.org/xml/export-0.10/}title')).text
        splited = re.split('\||-|_|', title_text)
        for txt in splited:
            for w in get_words(txt):
                un_prepared_words.add(w)
        text_text = next(child.iter('{http://www.mediawiki.org/xml/export-0.10/}text')).text
        splited = re.split('\||-|_|', text_text)
        for txt in splited:
            for w in get_words(txt):
                un_prepared_words.add(w)
        
        
        new_data = {'id': 0, 'title': '', 'text': ''}
        new_data['title'] = prepare_text(title_text)
        new_data['id'] = int(next(child.iter('{http://www.mediawiki.org/xml/export-0.10/}id')).text)
        
        new_data['text'] = prepare_text(text_text)
        
        data.append(new_data)
        
    index = dict()
    for doc in data:
        index = add_document_to_index(doc, index)
    return index, data

index, db = construct_positional_indexes('data')



In [155]:
def get_posting_list(word):
    
    word = prepare_text(word)
    
    if word[0] not in index:
        print('we do not have this word in index!')
        return
        
    posting_list = index[word[0]]
   
    return posting_list

get_posting_list('انگلولاتین')
get_posting_list('جایاانکول')


{6881: {'text': [500]}}

In [3]:


def construct_bigram_index(words):
    bigram_index = dict()
    for word in words:
        indicated_word = '$' + word + '$'
        for i in range(len(indicated_word) - 1):
            bigram = indicated_word[i : i + 2]
            if bigram in bigram_index.keys():
                bigram_index[bigram].append(word)
                
            else:
                bigram_index[bigram] = [word]
     
    for b in bigram_index:
        bigram_index[b].sort()
    return bigram_index


def get_words_with_bigram(bigram):
    words = bigram_index[bigram]
    return words


bigram_index = construct_bigram_index(un_prepared_words)
get_words_with_bigram('اا')



    

In [157]:

def get_document(docs_path, doc_num):
    url = '{http://www.mediawiki.org/xml/export-0.10/}'
    tree = ET.parse(docs_path  + '/'+ 'Persian.xml')
    root = tree.getroot()  
    matchedUrl = "./"+ url  + "page/[" + url + "id='" + str(doc_num) + "']"
    for child in root.findall(matchedUrl):  
        
        #print([elem.tag for elem in child.iter()])
        un_prepared = set()
        title_text = next(child.iter('{http://www.mediawiki.org/xml/export-0.10/}title')).text
        splited = re.split('\||-|_|', title_text)
        for txt in splited:
            for w in get_words(txt):
                un_prepared.add(w)
        text_text = next(child.iter('{http://www.mediawiki.org/xml/export-0.10/}text')).text
        splited = re.split('\||-|_|', text_text)
        for txt in splited:
            for w in get_words(txt):
                un_prepared.add(w)
        
        new_data = {'id': 0, 'title': '', 'text': ''}
        new_data['title'] = prepare_text(next(child.iter('{http://www.mediawiki.org/xml/export-0.10/}title')).text)
        
        new_data['id'] = int(next(child.iter('{http://www.mediawiki.org/xml/export-0.10/}id')).text)
        new_data['text'] = prepare_text(next(child.iter('{http://www.mediawiki.org/xml/export-0.10/}text')).text)
        
        
    return new_data, un_prepared
def add_document_to_indexes(docs_path, doc_num):
    
    new_data, un_prepared = get_document(docs_path, doc_num)
    if not new_data in db:
          
        new_index = add_document_to_index(new_data, index)
        
        db.append(new_data)
    else:
        print('we have this data in index now!')
        return index
    return new_index
#index = add_document_to_indexes('data', 4589) 
index = add_document_to_indexes('data', 6881) 
#جایاانکول

we have this data in index now!


In [159]:
def find_all_words(text, title):
    all_words = set()
    for word in text:
        all_words.add(word)
    for word in title:
        all_words.add(word)
   
    return all_words
    

def delete_from_bigram(words):
    
    for word in words:
        pw = prepare_text(word)
        if len(pw) > 0 and pw[0] not in index:
           
            indicated_word = '$' + word + '$'
            for i in range(len(indicated_word) - 1):
                bigram = indicated_word[i : i + 2]
                
                bigram_index[bigram].remove(word)

def delete_document_from_indexes(docs_path, doc_num):
    
    new_data, unprepared = get_document(docs_path, doc_num)
    if not new_data in db:
        print('we do not have this data in index now!')
        return
    else:
        
        
        
        all_words = find_all_words(new_data['text'], new_data['title'])        
        for i in all_words:
            del(index[i][new_data['id']])
            if len(index[i].keys()) == 0:
                del(index[i]) 
                
        delete_from_bigram(unprepared)
        
        db.remove(new_data)
        

#delete_document_from_indexes('data', 4589)
delete_document_from_indexes('data', 6881)

#print(get_posting_list('انگولاتین'))


In [158]:
import pickle
def save_index(destination):
    #with open(destination ,"w") as f:
        #f.write(str(index))
    pickle_out = open(destination , 'wb')
    pickle.dump(index, pickle_out)
    pickle_out.close()
    

save_index('storage/index_backup')


In [161]:
import pickle
def load_index(source):
    pickle_in = open(source, 'rb')
    index = pickle.load(pickle_in)
    return index
    
index = load_index('storage/index_backup')


In [162]:
def edit_distance(a, b):
    dp = [[0 for i in range(len(a) + 1)] for j in range(len(b) + 1)]
    
    for i in range(1, len(a) + 1):
        dp[0][i] = i
    for j in range(1, len(b) + 1):
        dp[j][0] = j
    for j in range(1, len(b) + 1):
        for i in range(1, len(a) + 1):
            matching_score = 0
            if a[i - 1] != b[j - 1]:
                matching_score = 1
            dp[j][i] = min(dp[j - 1][i] + 1, dp[j][i - 1] + 1, dp[j - 1][i - 1] + matching_score)
   
    #print(a, b, dp[-1][-1])
    return dp[-1][-1]

def edit_distance_select(query, jacard_selected):
    current = 0
    current_score = edit_distance(query, jacard_selected[0])
    
    for i in range(1, len(jacard_selected)):
        new_score = edit_distance(query, jacard_selected[i])
        if new_score < current_score:
            current_score = new_score
            current = i
    return jacard_selected[current]

def find_minim(pointers, bigrams):
    current_words = [[bigrams[i][pointers[i]], i] for i in range(len(pointers)) if pointers[i] < len(bigrams[i]) ]
   
    minim = [i[1] for i in current_words if i[0] == min(current_words)[0]]
    
    #print(min(current_words), current_words)
    return minim
    
    
def calculate_jacard(query, word, common_bigrams):
    jacard_distance = common_bigrams / (len(query) + len(word) + 2 - common_bigrams)
    return jacard_distance

def jacard_selected(query):
    threshhold = 0.42
    
    selected_words = []
    bigrams = []
    indicated_word = '$' + query + '$'
    for i in range(len(indicated_word) - 1):
        bigrams.append(bigram_index[indicated_word[i : i + 2]])
    pointers = [0 for i in range(len(bigrams))]

    while True:
        minim = find_minim(pointers, bigrams)
        
       # print(pointers[minim[0]], minim)
     
        jacard_distance = calculate_jacard(query, bigrams[minim[0]][pointers[minim[0]]], len(minim))
        
        if jacard_distance > threshhold:
        
            
            selected_words.append(bigrams[minim[0]][pointers[minim[0]]])
        for index in minim:
            pointers[index] += 1
        num = 0
        for i in range(len(pointers)):
            if pointers[i] >= len(bigrams[i]):
                num += 1
        
        if num == len(bigrams):
            break
    return selected_words
    


def correct_query(query):
    
    
    words = get_words(query)
   
    jacard_select = []
    for i in range(len(words)):
        if words[i] not in un_prepared_words:
            
            jacard_select = jacard_selected(words[i])
            
            replaced_word = edit_distance_select(words[i], jacard_select)
            words[i] = replaced_word
    
    
    #correct_query = "سلام حالا پرسمان درست شد"
  
    correct_query = ' '.join(words)
    return correct_query
#پرسمان = 0.2727272727272727
correct_query("شلام حالا برسهان درسک شد")
#correct_query('خاورمینا')



'آلام حالا برسلان ارسک شد'

In [5]:
import math

corpus_length = len(index.keys())
def get_tf(index, word, doc_id):
    
    tfs = {'title': 0, 'text': 0}
    if doc_id not in index[word]:
        return tfs
    else:
        
        for key in index[word][doc_id].keys():
            tfs[key] = len(index[word][doc_id][key])
    #print(tfs)
    return tfs

def get_df(index, word):
    dfs = {'title': 0, 'text': 0}
    
    if word not in index.keys():
        return dfs
    
    else:  
        for doc_id in index[word]:
            
            for key in index[word][doc_id]:
                dfs[key] += len(index[word][doc_id][key])
                
    
    return dfs

def get_ltf(tf):
    logarithmic_tf = dict(tf)
    for key in logarithmic_tf.keys():
        if logarithmic_tf[key] != 0:
            logarithmic_tf[key] = 1 + math.log(logarithmic_tf[key], 10)
    
    
    return logarithmic_tf
    
def get_idf(df):
    #division by zero!
    idf = dict(df)
    for key in idf.keys():
        if idf[key] != 0:
            idf[key] =  math.log(corpus_length / idf[key], 10)
    
    
    return idf
def normalizer(vector):
    
    norm_two = 0
    for i in vector:
        norm_two += i**2
    if norm_two == 0:
        return vector
    norm_two = math.sqrt(norm_two)
    normalized = [(i / norm_two) for i in vector]
    
    return normalized

def calculate_total_score(tf, idf, weight):
    #print(tf, idf)
    score = 0
    score += tf['title'] * weight * idf['title']
    score += tf['text'] * idf['text']
    return score


def get_qoutation_indexes(query):
    qoutaton_indexes = [i for i in range(len(query)) if query[i] == '"']
    return qoutaton_indexes
    
def get_qouted_and_unqouted(query):
    qoutation_indexes = get_qoutation_indexes(query)
    
    qouted = [] 
    unqouted = []
    for i in range(len(qoutation_indexes) - 1):
        if i % 2 == 0:
            qouted.append(query[qoutation_indexes[i] + 1: qoutation_indexes[i + 1] + 1])
        else:
            unqouted.append(query[qoutation_indexes[i] + 1: qoutation_indexes[i + 1]])
    if qoutation_indexes[0] != 0:
      
        unqouted.append(query[:qoutation_indexes[0]])
    if qoutation_indexes[-1] != len(query) - 1:
        unqouted.append(query[qoutation_indexes[-1]: ])
    return qouted, unqouted

def is_qouted(doc, qouted, search='text'):#search equals title or text
    places = []
    for couples in qouted:
        prepared_couples = prepare_text(couples)
        for word in prepared_couples:
            if doc['id'] not in index[word].keys():
                return False
            if search not in index[word][doc['id']].keys():
                return False
            
          
            places.append(index[word][doc['id']][search])
        for i in places[0]:
            flag = True
            for j in range(1, len(places)):
                if i + j not in places[j]:
                    flag = False
                    break
            if flag:
                return True 
    return False
    
def find_qouted_docs(db, qouted, search='all'):#finds docs that contain qouted parts
    qouted_docs = []
    if search == 'all':
        for doc in db:
            if is_qouted(doc, qouted, 'text') or is_qouted(doc, qouted, 'title'):
                qouted_docs.append(doc)
    else:
        for doc in db:
            if is_qouted(doc, qouted, search):
                qouted_docs.append(doc)
    return qouted_docs

def get_selected_docs(query, search='all'):
    selected_docs = list(db)
    
    if search == 'all':
        if '"'  in query:
            qouted, unqouted = get_qouted_and_unqouted(query)
          
            selected_docs = find_qouted_docs(db, qouted, 'all')
    else:
        if '"'  in query:
            qouted, unqouted = get_qouted_and_unqouted(query)
            selected_docs = find_qouted_docs(db, qouted, search)
    return selected_docs

def check_word_in_dictionary(word):
    if word in index:
        return True

def search(query, method="ltn-lnn", weight=2):
 
    selected_docs = get_selected_docs(query, 'all')
    query_by_word = prepare_text(query)
    w = [0 for i in range(len(query_by_word))]
    
    scores = []
    
    relevant_docs = []
    for doc in selected_docs:
        
        scores = []
        for q in query_by_word:
           
            if q not in index:
                score = 0
               
            
            else:
                tfs = get_tf(index, q, doc['id'])    
                ltf = get_ltf(tfs)
            

                dfs = get_df(index, q)
                idf = get_idf(dfs)
            
          


            scores.append(calculate_total_score(ltf, idf, weight))
        
        if method == 'ltc-lnc':
            scores = normalizer(scores)
        scores = sum(scores)   
        relevant_docs.append([scores,doc['id']])
        relevant_docs.sort(reverse=True)
        relevant_docs = relevant_docs[:15]
        
    return relevant_docs
#print(normalizer([1.3,2,3]))
#search('نظرخواهی انجام شده توسط دانشگاه "شهر نیویورک"', "ltc-lnc", 3)


#search('سیاره های بزرگ "منظومه شمسی"', 'ltc-ln', 2)
#search('کشورهای دارای نفت در خاورزمین', 'ltc-lnc', 2)
#print(index['نیویورک'])

                                  
                            

In [164]:
def search_in_part(query, doc, search, method):
    
    
    query_by_word = prepare_text(query)
    w = [0 for i in range(len(query_by_word))]
    scores = []
    relevant_docs = []
    
        
    scores = []
    for q in query_by_word:
           
        if q not in index:
            scores.append(ltf * idf)
        else:
            tfs = get_tf(index, q, doc['id'])    
            ltf = get_ltf(tfs)[search]


            dfs = get_df(index, q)
            idf = get_idf(dfs)[search]

            scores.append(ltf * idf)
        
    if method == 'ltc-lnc':
        scores = normalizer(scores)
    scores = sum(scores)   
    return scores
        

def detailed_search(title_query, text_query, method="ltn-lnn"):
    selected_docs_title = get_selected_docs(title_query, 'title')
    selected_docs_text = get_selected_docs(text_query, 'text') 
    selected_docs = [i for i in selected_docs_title if i in selected_docs_text]
    
    score = 0
    relevant_docs = []
    for doc in selected_docs:
        score = search_in_part(title_query, doc, 'title', method)
        score += search_in_part(text_query, doc, 'text', method)
        
        relevant_docs.append([score,doc['id']])
        relevant_docs.sort(reverse=True)
        relevant_docs = relevant_docs[:15]
    
    return relevant_docs

detailed_search('عجایب هفت‌گانه', 'چشمگیرترین بناهای تاریخی جهان', "ltc-lnc")

[[3.394651788255002, 3854],
 [1.9887278320066466, 3938],
 [1.9859230291050913, 6752],
 [1.9849580423318987, 3120],
 [1.9788123049694293, 6917],
 [1.9765801313596059, 7143],
 [1.9762403887145932, 3260],
 [1.9750469063412226, 5192],
 [1.9725214974634648, 5967],
 [1.9715655489522068, 6949],
 [1.96775977887812, 4401],
 [1.966045725654069, 4094],
 [1.9650869486204239, 7100],
 [1.9613542630191763, 5309],
 [1.958951499177989, 3667]]

In [165]:
import math
import glob



def get_reterived_and_relevant(query_id, method):
    with open('./data/queries/%s.txt'%(query_id,), encoding='utf-8') as query_file:
        query = query_file.readlines()
        
        if len(query) == 1:
            #print(query[0])
            if not '"' in query[0]:
                query[0] = correct_query(query[0])
          
            #print(query[0])
            reterived = search(query[0], method, weight=2 ) #[score, doc_id]
            reterived_ids = [i[1] for i in reterived]
            
            
        else:
            reterived = detailed_search(query[0], query[1], method)
            reterived_ids = [i[1] for i in reterived]
    with open('./data/relevance/%s.txt'%(query_id,)) as relevance_file:
        relevant = list(map(int, relevance_file.read().split(',')))
    return reterived_ids, relevant
def calculate_precision(reterived, relevant):
    tp = len([i for i in reterived if i in relevant])
    precision = tp / len(reterived)
    return precision

def calculate_recall(reterived, relevant):
    tp = len([i for i in reterived if i in relevant])
    recall = tp / len(relevant)
    return recall
    

def R_Precision(query_id='all', method='ltn-lnn'):
    
    
    result = 0
    if query_id == 'all':
        num_of_queries = 20
        for i in range(1, num_of_queries + 1):
            reterived, relevant = get_reterived_and_relevant(i, method)
            result += calculate_precision(reterived, relevant)
            #print(calculate_precision(reterived, relevant))
            
        result = result / num_of_queries
    else:
        reterived, relevant = get_reterived_and_relevant(query_id)
        result = calculate_precision(reterived, relevant)
        
        
                  
    print('Precision is: ' + str(result))
    return result

def F_measure(query_id='all', method='ltn-lnn'):
    result = 0
    recall = 0
    precision = 0
    if query_id == 'all':
        num_of_queries = 20
        for i in range(1, num_of_queries + 1):
            reterived, relevant = get_reterived_and_relevant(i, method)
            precision += calculate_precision(reterived, relevant)
            recall += calculate_recall(reterived, relevant)
        recall = recall / num_of_queries
        precision = precision / num_of_queries
    else:
        reterived, relevant = get_reterived_and_relevant(query_id)
        precision = calculate_precision(reterived, relevant)
        recall = calculate_recall(reterived, relevant)
    
    #setting alpha = 0.5, beta = 1
    beta = 1
    if (recall + precision) != 0: 
        result = (2 * precision * recall) / (recall + precision)
    else:
        result = 0
    print('F_measure is: ' + str(result))
    return result

def calculate_map(reterived, relevant):
    mapp = 0
    tp = len([i for i in reterived if i in relevant])
    for i in range(len(reterived)):
        if reterived[i] in relevant:
            mapp += calculate_precision(reterived[:i + 1], relevant)
    if tp == 0:
        return 0
    mapp = mapp / tp
    return mapp
    
def MAP(query_id='all', method='ltn-lnn'):
    
    result = 0
    if query_id == 'all':
        num_of_queries = 20
        for i in range(1, num_of_queries + 1):
            reterived, relevant = get_reterived_and_relevant(i, method)
            result += calculate_map(reterived, relevant)
        result = result / num_of_queries
    else:
        reterived, relevant = get_reterived_and_relevant(query_id)
        result = calculate_map(reterived, relevant)
                  
    print('MAP is: ' + str(result))
    return result

def calculate_dcg(relevant, relevance_score):
    dcg = 0
    
    for i in range(len(relevance_score)):
        dcg += relevance_score[i] / (math.log(i + 2, 2))
    return dcg

def NDCG(query_id='all', method='ltn-lnn'):
    
    #base = 2, relevant:1 , non-relevant:0
    result = 0
    if query_id == 'all':
        num_of_queries = 20
        for i in range(1, num_of_queries + 1):
            reterived, relevant = get_reterived_and_relevant(i, method)
            
            #k = length relevant, r_i > 0 for all docs in relevant
            k = min(len(relevant), len(reterived))
            
            ideal = calculate_dcg(relevant[:k], [1 for i in range(k)])
            
            
            dcg = calculate_dcg(reterived[:k], [1 if i in relevant else 0 for i in reterived][:k])
            result += (dcg / ideal)
        result = result / num_of_queries
    else:
        reterived, relevant = get_reterived_and_relevant(query_id)
        ideal = calculate_dcg(relevant, [1 for i in range(len(relevant))])
        #k = length relevant, r_i > 0 for all docs in relevant
        k = len(relevant)
        dcg = calculate_dcg(reterived, [1 if i in relevant else 0 for i in reterived][:k])
        result = dcg / ideal
        
        
                  
    print('NDCG is: ' + str(result))
    return result

#shomare 18 --> خاورمینا

R_Precision('all', 'ltc-lnc')
F_measure('all', 'ltc-lnc')
MAP('all', 'ltc-lnc')
NDCG('all', 'ltc-lnc')


Precision is: 0.6533333333333332
F_measure is: 0.6934880384965016
MAP is: 0.8360511316523846
NDCG is: 0.7714660920937972


0.7714660920937972