In [12]:
import pandas as pd
import itertools
from collections import defaultdict

In [42]:
def read_words_base_forms(path="base_forms.txt", encoding="utf-8"):
    """Load the word -> base-forms mapping from a ``;``-separated text file.

    Every usable line starts with ``base;word``; any further ``;``-separated
    fields on the line are ignored. A word that appears under several bases
    keeps all of them, in file order.

    Args:
        path: File to read. Defaults to ``base_forms.txt`` in the working
            directory.
        encoding: Text encoding used to decode the file. Given explicitly so
            that non-ASCII words decode the same way on every platform.

    Returns:
        A plain ``dict`` mapping each word to the list of its base forms.
    """
    words_base_forms = defaultdict(list)
    with open(path, "r", encoding=encoding) as fstream:
        for line in fstream:
            # Drop the line terminator first; otherwise a two-field line would
            # store the word with a trailing newline and never match a token.
            fields = line.rstrip("\r\n").split(";")
            if len(fields) < 2:
                # Blank or malformed line: nothing to record.
                continue
            base, word = fields[0], fields[1]
            words_base_forms[word].append(base)
    return dict(words_base_forms)

    
def read_quotes(path="tokenized_quotes.txt", encoding="utf-8"):
    """Read the quotes file and return its lines with surrounding whitespace removed.

    Args:
        path: File to read. Defaults to ``tokenized_quotes.txt`` in the
            working directory.
        encoding: Text encoding used to decode the file. Given explicitly so
            that non-ASCII text decodes the same way on every platform.

    Returns:
        A list with one stripped string per line of the file, in file order
        (blank lines are kept as empty strings).
    """
    with open(path, "r", encoding=encoding) as fstream:
        # Iterate the file lazily instead of materialising readlines() first.
        return [line.strip() for line in fstream]


def read_trigrams(path="trigrams.txt", encoding="utf-8"):
    """Read the trigrams file and return its lines with surrounding whitespace removed.

    Args:
        path: File to read. Defaults to ``trigrams.txt`` in the working
            directory.
        encoding: Text encoding used to decode the file. Given explicitly so
            that non-ASCII text decodes the same way on every platform.

    Returns:
        A list with one stripped string per line of the file, in file order
        (blank lines are kept as empty strings).
    """
    with open(path, "r", encoding=encoding) as file_stream:
        # Iterate the file lazily instead of materialising readlines() first.
        return [line.strip() for line in file_stream]
    
# Sanity check: print whether a known trigram is present in the loaded list.
print(any(trigram == "bolała mnie głowa" for trigram in read_trigrams()))

True


In [33]:
class QuotesIndex(object):
    """Inverted index from base forms to quotes, answering all-words queries.

    Each quote is split on single spaces; every token is replaced by its base
    forms from ``words_base_forms`` (or kept as-is when it has none), and the
    quote's position is recorded under each resulting base form. A query
    matches a quote when, for every query word, the quote shares at least one
    base form with that word.

    Attributes are set in ``__init__``:
        words_base_forms: mapping ``word -> list of base forms``.
        quotes: sequence of quote strings; index positions refer to it.
        index_of_quotes: ``defaultdict(set)`` mapping base form -> set of
            positions in ``quotes``.
    """

    def __init__(self, words_base_forms, quotes):
        """Store the inputs and build the inverted index eagerly."""
        self.words_base_forms = words_base_forms
        self.quotes = quotes
        self.index_of_quotes = self._generate_index_of_quotes()

    def _generate_base_forms(self, sentence):
        """Return the set of base forms of all space-separated tokens in ``sentence``.

        Tokens without an entry in ``words_base_forms`` stand for themselves.
        Empty tokens (produced by leading, trailing or repeated spaces) are
        skipped so they never become an index key or a query constraint.
        """
        base_forms = set()
        for word in sentence.split(' '):
            if not word:
                continue
            if word in self.words_base_forms:
                base_forms.update(self.words_base_forms[word])
            else:
                base_forms.add(word)
        return base_forms

    def _generate_index_of_quotes(self):
        """Build the mapping base form -> set of positions of quotes containing it."""
        index_of_quotes = defaultdict(set)
        for index, quote in enumerate(self.quotes):
            for base_form in self._generate_base_forms(quote):
                index_of_quotes[base_form].add(index)
        return index_of_quotes

    def _generate_matching_quotes_indexes(self, query):
        """Return the set of quote positions matching every word of ``query``.

        For each query word the candidate set is the union of the index
        entries of its base forms; the result is the intersection of those
        candidate sets. A query with no non-empty words matches nothing.

        The sets are combined directly rather than by building and evaluating
        a source string, so words containing quotes or other special
        characters are handled as plain data.
        """
        matching = None
        for word in query.split(" "):
            if not word:
                continue
            candidates = set()
            for base_form in self._generate_base_forms(word):
                # .get() avoids inserting empty entries into the defaultdict
                # for base forms that occur in no quote.
                candidates |= self.index_of_quotes.get(base_form, set())
            matching = candidates if matching is None else matching & candidates
            if not matching:
                # The intersection can only shrink; stop once it is empty.
                break
        return matching if matching is not None else set()

    def query_index(self, query):
        """Return the quotes matching ``query``, in their original order.

        Args:
            query: Space-separated words; a quote must match all of them.

        Returns:
            A list of the matching quote strings, ordered by their position
            in ``quotes``.
        """
        matching_indexes = self._generate_matching_quotes_indexes(query)
        return [self.quotes[index] for index in sorted(matching_indexes)]


# Load both inputs from disk, then build the inverted index over the quotes.
quotes = read_quotes()
words_base_forms = read_words_base_forms()
quotes_index = QuotesIndex(
    words_base_forms=words_base_forms,
    quotes=quotes,
)

In [None]:
# Example query: list the quotes that match every word of the phrase.
quotes_index.query_index("będą to pieniądze")

In [44]:
# Look up the base forms recorded for the word "mnie".
words_base_forms["mnie"]

['ja', 'miąć']