In [3]:
from Phase_1.src.Utils.SimplePositionalIndex import SimplePositionalIndex
from Phase_2.src.config import Config
from math import log

class DevelopedPositionalIndex(SimplePositionalIndex):
    """Positional index extended with tf-idf weights and optional champion lists.

    On construction every posting in ``self.positional_index_structure`` gets a
    ``'tf idf'`` entry, and the same weights are mirrored per document in
    ``self.document_term_tfidf_dictionary`` (``{doc_url: {word: tf_idf}}``).
    When the ``'champions_list'`` configuration flag is truthy,
    ``self.champions_list`` (``{word: [doc_url, ...]}``) is built as well;
    otherwise that attribute is not created.

    NOTE(review): ``self.Documents``, ``self.positional_index_structure`` and
    ``self.config`` are not assigned in this class; presumably they are set by
    ``SimplePositionalIndex.__init__`` -- confirm against the base class.
    """

    def __init__(self, configurations):
        """Build the base index, then add tf-idf weights and champion lists.

        :param configurations: configuration object exposing ``get_config(key)``.
        """
        super().__init__(configurations)
        self.total_number_of_documents = len(self.Documents)
        self.document_term_tfidf_dictionary = {}
        self.build_updated_positional_index()
        if configurations.get_config('champions_list'):
            self.champions_list = self.build_champions_list()

    def build_updated_positional_index(self):
        """Store the tf-idf weight on every posting and in the per-document map."""
        for word, entry in self.positional_index_structure.items():
            postings = entry['indexes']
            if not postings:
                # Nothing to weight; also avoids dividing by a zero document frequency.
                continue
            # idf depends only on the word, so compute it once per word.
            idf = self.get_idf_value(word)
            for doc_url, posting in postings.items():
                tf_idf = self.get_tf_value(word, doc_url) * idf
                posting['tf idf'] = tf_idf
                self.document_term_tfidf_dictionary.setdefault(doc_url, {})[word] = tf_idf

    def get_tf_value(self, word, url):
        """Return the log-scaled term frequency ``1 + ln(count)`` of ``word`` in ``url``.

        ``count`` is read from the posting's ``'number of occurrences in document'``
        entry; a ``KeyError`` is raised if the word or the url is not indexed.
        """
        posting = self.positional_index_structure[word]['indexes'][url]
        return 1 + log(posting['number of occurrences in document'])

    def get_idf_value(self, word):
        """Return ``ln(N / df)``: N is the corpus size, df the number of postings of ``word``."""
        document_frequency = len(self.positional_index_structure[word]['indexes'])
        return log(self.total_number_of_documents / document_frequency)

    def build_champions_list(self):
        """Return ``{word: [doc_url, ...]}`` holding each word's highest-tf documents.

        For every word the documents are ordered by descending tf (ties keep
        their posting order) and truncated to the ``'champions_list_size'``
        configuration value.
        """
        size = self.config.get_config('champions_list_size')
        champions_list = {}
        for word, entry in self.positional_index_structure.items():
            tf_by_url = {url: self.get_tf_value(word, url) for url in entry['indexes']}
            # Sort the URLs by their tf value. Iterating the dict yields bare URL
            # strings, so the key must look the score up rather than index into
            # the string (which would compare the URL's second character).
            champions_list[word] = sorted(tf_by_url, key=tf_by_url.get, reverse=True)[:size]
        return champions_list


In [4]:
from Phase_1.src.Utils.StopWord import Document
from Phase_1.src.Utils.utilities import read_file

# Read the corpus; the result is unpacked into URLs, titles and contents.
config = Config()
docs_url, docs_title, docs_content = read_file()

# Wrap each (url, title, content) triple in a Document and register the list.
config.set_config(
    'documents',
    [Document(*fields) for fields in zip(docs_url, docs_title, docs_content)],
)

# Constructing the index also computes the tf-idf weight of every posting.
pos_index = DevelopedPositionalIndex(config)

# Print the tf-idf weight stored on every (word, document) posting.
for word in pos_index.positional_index_structure:
    for doc_url, dictionary in pos_index.positional_index_structure[word]['indexes'].items():
        print(dictionary['tf idf'])

0.0018046104479097456
0.0018046104479097456
0.0018046104479097456
0.0018046104479097456
0.0018046104479097456
0.0018046104479097456
0.0018046104479097456
0.0018046104479097456
0.0018046104479097456
0.0018046104479097456
0.0018046104479097456
0.0018046104479097456
0.0018046104479097456
0.0018046104479097456
0.0018046104479097456
0.0018046104479097456
0.0018046104479097456
0.0018046104479097456
0.0018046104479097456
0.0018046104479097456
0.0018046104479097456
0.0018046104479097456
0.0018046104479097456
0.0018046104479097456
0.0018046104479097456
0.0018046104479097456
0.0018046104479097456
0.0018046104479097456
0.0018046104479097456
0.0018046104479097456
0.0018046104479097456
0.0018046104479097456
0.0018046104479097456
0.0018046104479097456
0.0018046104479097456
0.0018046104479097456
0.0018046104479097456
0.0018046104479097456
0.0018046104479097456
0.0018046104479097456
0.0018046104479097456
0.0018046104479097456
0.0018046104479097456
0.0018046104479097456
0.0018046104479097456
0.00180461

In [16]:
from collections import Counter
from math import log

from Phase_1.src.Utils.utilities import preprocess_pipeline


class QueryHandler:
    """Ranks documents against a free-text query by tf-idf cosine similarity.

    ``positional_index`` must expose ``positional_index_structure``,
    ``get_idf_value(term)`` and ``document_term_tfidf_dictionary``; when the
    ``'champions_list'`` configuration flag is truthy it must also expose
    ``champions_list``. ``config`` must expose ``get_config(key)``.

    NOTE(review): the query vector built for a document contains only the query
    terms that document matches, so the query-side norm ignores unmatched
    terms. Scores are therefore not a cosine against the full query; confirm
    whether that is intended before changing it, since it affects ranking.
    """

    def __init__(self, positional_index, config):
        self.positional_index = positional_index
        self.config = config

    def answer_query(self, query):
        """Return ``{doc_url: score}`` for the best matches, highest score first.

        The query is normalised with ``preprocess_pipeline`` and split on
        whitespace. At most ``'documents_to_show'`` (configuration value)
        entries are returned.
        """
        query = preprocess_pipeline(query)
        terms = query.split()
        if self.config.get_config('champions_list'):
            vector_values = self.tf_idf_calculate_champions(terms)
        else:
            vector_values = self.tf_idf_calculate_normal(terms)
        scores = self.calculate_scores(vector_values)
        ranked = sorted(scores.items(), key=lambda item: item[1], reverse=True)
        return dict(ranked[:self.config.get_config('documents_to_show')])

    def tf_idf_calculate_normal(self, terms):
        """Build per-document query vectors using every posting of each term.

        :param terms: list of preprocessed query terms (repeats allowed).
        :return: ``{doc_url: {term: query_weight}}``.
        """
        structure = self.positional_index.positional_index_structure
        return self._build_query_vectors(terms, lambda term: structure[term]['indexes'].keys())

    def tf_idf_calculate_champions(self, terms):
        """Build per-document query vectors using only each term's champion list.

        :param terms: list of preprocessed query terms (repeats allowed).
        :return: ``{doc_url: {term: query_weight}}``.
        """
        return self._build_query_vectors(terms, lambda term: self.positional_index.champions_list[term])

    def _build_query_vectors(self, terms, candidates_for):
        """Shared body of the two ``tf_idf_calculate_*`` methods.

        ``candidates_for(term)`` returns the document URLs to consider for an
        indexed term. The query weight of a term is
        ``(1 + ln(count in query)) * idf(term)``.
        """
        structure = self.positional_index.positional_index_structure
        vector_values = {}
        # Counter keeps first-occurrence order, so documents and terms are
        # inserted in the same order as a pass over the raw term list.
        for term, count in Counter(terms).items():
            if term not in structure:
                continue
            candidates = candidates_for(term)
            if not candidates:
                # No documents to weight; skip without computing the idf.
                continue
            weight = (1 + log(count)) * self.positional_index.get_idf_value(term)
            for doc_url in candidates:
                vector_values.setdefault(doc_url, {})[term] = weight
        return vector_values

    @staticmethod
    def cosine_similarity(v1, v2):
        """Return the cosine of the angle between two sparse ``{term: weight}`` vectors.

        Returns ``0.0`` when either vector is empty or all-zero (for example a
        query whose only term has idf 0) instead of dividing by zero.
        """
        dot_product = sum(weight * v2[term] for term, weight in v1.items() if term in v2)
        magnitude_v1 = sum(weight ** 2 for weight in v1.values())
        magnitude_v2 = sum(weight ** 2 for weight in v2.values())
        if not magnitude_v1 or not magnitude_v2:
            return 0.0
        return dot_product / (magnitude_v1 ** 0.5) / (magnitude_v2 ** 0.5)

    def calculate_scores(self, vector_values):
        """Score every candidate document against its stored tf-idf vector.

        :param vector_values: ``{doc_url: {term: query_weight}}``.
        :return: ``{doc_url: cosine_similarity}`` in the same key order.
        """
        return {
            doc_url: self.cosine_similarity(
                query_vector,
                self.positional_index.document_term_tfidf_dictionary[doc_url],
            )
            for doc_url, query_vector in vector_values.items()
        }


In [17]:
query_handler = QueryHandler(pos_index, config)
query_handler.answer_query('انقلاب اسلامی ایران')

{'نادری:\xa0اتاق\xa0بازرگانی\xa0ایران\xa0و\xa0برزیل\xa0به منظور\xa0گسترش\xa0روابط\xa0اقتصادی\xa0دو\xa0کشور افتتاح شد': 0.40824828545833003}