In [17]:

from math import log

from Phase_1.src.Utils.SimplePositionalIndex import SimplePositionalIndex
from Phase_1.src.Utils.utilities import read_file

docs_url, docs_title, docs_content = read_file()


class DevelopedPositionalIndex(SimplePositionalIndex):
    def __init__(self, documents_url, documents_title, documents_content):
        super().__init__(documents_url, documents_title, documents_content, False)
        self.total_number_of_documents = len(documents_url)
        self.document_term_tfidf_dictionary = {}
        self.build_updated_positional_index()

    def build_updated_positional_index(self):
        for WORD in self.positional_index_structure.keys():
            number_of_unique_occurrences = len(self.positional_index_structure[WORD]['indexes'])
            for TERM_URL, DICTIONARY in self.positional_index_structure[WORD]['indexes'].items():
                DICTIONARY['tf idf'] = self.get_tf_value(WORD, TERM_URL) * self.get_idf_value(WORD)
                if TERM_URL not in self.document_term_tfidf_dictionary.keys():
                    self.document_term_tfidf_dictionary[TERM_URL] = {WORD: DICTIONARY['tf idf']}
                else:
                    self.document_term_tfidf_dictionary[TERM_URL][WORD] = DICTIONARY['tf idf']

    def get_tf_value(self, word, url):
        return 1 + log(self.positional_index_structure[word]['indexes'][url]['number of occurrences in document'])

    def get_idf_value(self, word):
        return log(self.total_number_of_documents / len(self.positional_index_structure[word]['indexes']))

pos_index = DevelopedPositionalIndex(documents_url=docs_url, documents_title=docs_title, documents_content=docs_content)


In [18]:
for word in pos_index.positional_index_structure.keys():
    for doc_url, dictionary in pos_index.positional_index_structure[word]['indexes'].items():
        print(dictionary['tf idf'])

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



8.716207971151853
8.716207971151853
8.310742863043687
8.310742863043687
8.310742863043687
14.071310846921108
8.310742863043687
8.310742863043687
15.93142314600803
15.93142314600803
9.409355151711798
8.716207971151853
8.716207971151853
9.409355151711798
9.409355151711798
9.409355151711798
9.409355151711798
15.93142314600803
8.716207971151853
8.716207971151853
8.023060790591908
8.023060790591908
8.023060790591908
8.023060790591908
9.409355151711798
9.409355151711798
7.799917239277697
7.799917239277697
7.799917239277697
7.799917239277697
7.799917239277697
15.93142314600803
9.409355151711798
9.409355151711798
9.409355151711798
15.93142314600803
9.409355151711798
9.409355151711798
8.716207971151853
14.757822951529882
9.409355151711798
9.409355151711798
7.799917239277697
7.799917239277697
7.799917239277697
7.799917239277697
7.799917239277697
9.409355151711798
9.409355151711798
8.716207971151853
8.716207971151853
7.799917239277697
7.799917239277697
7.799917239277697
7.799917239277697
7.799917

In [34]:
from collections import Counter
from math import log

from Phase_1.src.Utils.utilities import preprocess_pipeline


class QueryHandler:
    def __init__(self, positional_index):
        self.positional_index = positional_index

    def answer_query(self, query):
        query = preprocess_pipeline(query)
        terms = query.split()
        vector_values = self.tf_idf_calculate(terms)
        scores = self.calculate_scores(vector_values)
        return dict(sorted(scores.items(), key=lambda item: item[1], reverse=True))

    def tf_idf_calculate(self, terms):
        vector_values = {}
        tf_values = Counter(terms)
        for term in terms:
            positional_index_structure = self.positional_index.positional_index_structure
            if term in positional_index_structure.keys():
                for DOC_URL in positional_index_structure[term]['indexes'].keys():
                    if DOC_URL not in vector_values.keys():
                        vector_values[DOC_URL] = {}
                    vector_values[DOC_URL][term] = \
                        (1 + log(tf_values[term])) * self.positional_index.get_idf_value(term)
        return vector_values

    @staticmethod
    def cosine_similarity(v1, v2):
        dot_product = 0
        for term in v1.keys():
            if term in v2.keys():
                dot_product += v1[term] * v2[term]
        magnitude_v1 = 0
        for term in v1.keys():
            magnitude_v1 += v1[term] ** 2
        magnitude_v2 = 0
        for term in v2.keys():
            magnitude_v2 += v2[term] ** 2
        return dot_product / (magnitude_v1 ** 0.5) / (magnitude_v2 ** 0.5)

    def calculate_scores(self, vector_values):
        scores = {}
        for DOC_URL, TERM_SCORES in vector_values.items():
            scores[DOC_URL] = self.cosine_similarity(vector_values[DOC_URL],
                                                     self.positional_index.document_term_tfidf_dictionary[DOC_URL])
        return scores


In [36]:
query_handler = QueryHandler(pos_index)

query_handler.answer_query('انقلاب اسلامی ایران')

{'https://www.farsnews.ir/news/14001126000163/وزیر-کشور-دستاوردها-و-پیشرفت\u200cهای-انقلاب-برای-عموم-تبیین-شود': 0.1994221263699708,
 'https://www.farsnews.ir/news/14001110000669/مردم-ما-در-دهه-فجر-نشان-می\u200cدهند-پای-آرمان\u200cهای-خود-ایستاده\u200cاند': 0.19521242890553808,
 'https://www.farsnews.ir/news/14001223000745/زمین-مجموعه-انقلاب-مهیای-تمرینات-استقلال-شد': 0.18915647797307608,
 'https://www.farsnews.ir/news/14001118000270/فرماندهان-ارشد-نیروهای-مسلح-با-آرمان\u200cهای-امام-خمینی-ره-تجدید-میثاق': 0.1765947077093096,
 'https://www.farsnews.ir/news/14001113000442/انقلاب-عدالت-محوری-فساد-ستیزی|-مبارزه-با-فساد-و-عدالت-باید-به\u200cصورت': 0.17530286250628874,
 'https://www.farsnews.ir/news/14000921000688/وحیدی-فرزندان-شهدا-از-ستون\u200cهای-اصلی-انقلاب-هستند': 0.17226894878391824,
 'https://www.farsnews.ir/news/14001119000716/انقلاب-اسلامی-عزت-عقلانیت-تجلی-وحدت-ملی|-دل\u200cبریدگان-از-انقلاب': 0.17221523070030711,
 'https://www.farsnews.ir/news/14000921000552/با-حکم-رئیس\u200cجمهور