In [1]:
from collections import defaultdict
import itertools
import numpy as np

In [3]:
def read_words_base_forms(path="base_forms.txt", encoding="utf-8"):
    """Load the word -> base-forms dictionary from a ``;``-separated file.

    Every usable line starts with ``base;word``; any further ``;``-separated
    fields on the line are ignored.

    Args:
        path: File to read. Defaults to ``base_forms.txt`` in the working
            directory, which is what the no-argument call has always used.
        encoding: Text encoding used to decode the file.

    Returns:
        A plain ``dict`` mapping each word to the list of its base forms, in
        file order (repeated bases are kept).
    """
    words_base_forms = defaultdict(list)
    with open(path, "r", encoding=encoding) as fstream:
        # Iterate the stream directly so the whole file is never held in memory.
        for line in fstream:
            # Drop the line terminator first: on a two-field line it would
            # otherwise stay glued to the word and produce keys like "ma\n".
            fields = line.rstrip("\r\n").split(";")
            if len(fields) < 2:
                # Blank or malformed line: there is no (base, word) pair here.
                continue
            base, word = fields[0], fields[1]
            words_base_forms[word].append(base)
    return dict(words_base_forms)


def sample_two_grams(sample_probability, path="2grams.txt", encoding="utf-8"):
    """Randomly sample 2-grams from a space-separated n-gram file.

    For every line one number is drawn with ``np.random.rand()``; the line is
    kept unless that number is greater than ``sample_probability``. From a
    kept line the first space-separated token is skipped (presumably a
    frequency count; confirm against the data file) and the next two tokens
    form the 2-gram.

    Args:
        sample_probability: Threshold in [0, 1] controlling how many lines are kept.
        path: File to read. Defaults to ``2grams.txt`` in the working directory.
        encoding: Text encoding used to decode the file.

    Returns:
        List of tuples of words, in file order. A tuple is shorter than two
        elements when its line has fewer than three tokens.
    """
    two_grams = []
    # The context manager closes the handle even if iteration fails part-way.
    with open(path, "r", encoding=encoding) as fstream:
        for line in fstream:
            if np.random.rand() > sample_probability:
                continue
            two_grams.append(tuple(line.strip().split(' ')[1:3]))
    return two_grams
    

def sample_three_grams(sample_probability, path="3grams.txt", encoding="utf-8"):
    """Randomly sample 3-grams from a space-separated n-gram file.

    For every line one number is drawn with ``np.random.rand()``; the line is
    kept unless that number is greater than ``sample_probability``. From a
    kept line the first space-separated token is skipped (presumably a
    frequency count; confirm against the data file) and the next three tokens
    form the 3-gram.

    Args:
        sample_probability: Threshold in [0, 1] controlling how many lines are kept.
        path: File to read. Defaults to ``3grams.txt`` in the working directory.
        encoding: Text encoding used to decode the file.

    Returns:
        List of tuples of words, in file order. A tuple is shorter than three
        elements when its line has fewer than four tokens.
    """
    three_grams = []
    # The context manager closes the handle even if iteration fails part-way.
    with open(path, "r", encoding=encoding) as fstream:
        for line in fstream:
            if np.random.rand() > sample_probability:
                continue
            three_grams.append(tuple(line.strip().split(' ')[1:4]))
    return three_grams

In [4]:
class DisjointSet(object):
    """Node of a union-find forest with path compression and union by rank.

    Attributes:
        vertex: Value stored in this node.
        parent: Parent node in the forest; a root is its own parent.
        rank: Rank used to decide which root is attached under the other
            when two sets are merged (only read on roots).
    """

    def __init__(self, vertex):
        self.vertex = vertex
        self.parent = self
        self.rank = 0

    @staticmethod
    def find_representative(disjoint_set):
        """Return the root of the set containing ``disjoint_set``.

        Every node visited on the way is re-pointed directly at the root
        (path compression). The walk is iterative, so a long parent chain
        cannot exhaust the interpreter's recursion limit.
        """
        root = disjoint_set
        while root.parent is not root:
            root = root.parent
        # Second pass: hang every node on the walked path directly off the root.
        node = disjoint_set
        while node is not root:
            next_node = node.parent
            node.parent = root
            node = next_node
        return root

    @staticmethod
    def union_sets(disjoint_set1, disjoint_set2):
        """Merge the sets containing the two given nodes (union by rank)."""
        repr1 = DisjointSet.find_representative(disjoint_set1)
        repr2 = DisjointSet.find_representative(disjoint_set2)
        if repr1 is repr2:
            # Already in the same set: without this guard the equal-rank
            # branch below would bump the root's rank although nothing merged.
            return
        if repr1.rank > repr2.rank:
            repr2.parent = repr1
        elif repr1.rank < repr2.rank:
            repr1.parent = repr2
        else:
            repr1.parent = repr2
            repr2.rank += 1

In [5]:
def union_base_forms(joined_base_forms, base_forms_disjoint_sets):
    """Merge the disjoint sets of base forms that appear together in a group.

    Args:
        joined_base_forms: Iterable of non-empty sequences of base forms; all
            members of one sequence end up in the same set.
        base_forms_disjoint_sets: Mapping base form -> its DisjointSet node.
            The nodes are modified in place.
    """
    for group in joined_base_forms:
        # The first member acts as the anchor every other member is merged into.
        anchor = base_forms_disjoint_sets[group[0]]
        for other_form in group[1:]:
            DisjointSet.union_sets(anchor, base_forms_disjoint_sets[other_form])


def reverse_base_forms_disjoint_sets(base_forms_disjoint_sets):
    """Group base forms by the representative of the set they belong to.

    Args:
        base_forms_disjoint_sets: Mapping base form -> its DisjointSet node.

    Returns:
        dict mapping each representative node to the list of base forms whose
        node resolves to it.
    """
    members_by_root = {}
    for base_form, node in base_forms_disjoint_sets.items():
        root = DisjointSet.find_representative(node)
        members_by_root.setdefault(root, []).append(base_form)
    return members_by_root


def create_base_forms_superbase_forms(words_base_forms):
    """Map every base form to the label of its connected group ("superbase form").

    Base forms listed for the same word are merged into one group; the group
    label is its members sorted and joined with ``|``.

    Args:
        words_base_forms: Mapping word -> list of base forms.

    Returns:
        dict mapping each base form to its group label.
    """
    distinct_base_forms = set(itertools.chain.from_iterable(words_base_forms.values()))
    nodes_by_base_form = {form: DisjointSet(form) for form in distinct_base_forms}
    union_base_forms(words_base_forms.values(), nodes_by_base_form)

    superbase_by_base_form = {}
    for members in reverse_base_forms_disjoint_sets(nodes_by_base_form).values():
        label = '|'.join(sorted(members))
        # Every member of the group shares the same label.
        superbase_by_base_form.update(dict.fromkeys(members, label))
    return superbase_by_base_form

In [40]:
def analyze_base_forms_in_common_superbase_form(words_base_forms, word_ngrams):
    """Print pairs of n-grams whose words at some position share no base form.

    Positions are scanned in order; within one position the pairs are visited
    as (0, 1), (0, 2), ..., (1, 2), ... One line is printed per hit, so a pair
    that differs at several positions is reported once per such position.

    Args:
        words_base_forms: Mapping word -> iterable of base forms. It must
            contain every word that occurs in ``word_ngrams``.
        word_ngrams: Sequence of word tuples. The number of positions scanned
            is taken from the first tuple. An empty sequence prints nothing.
    """
    if len(word_ngrams) == 0:
        # Nothing to compare; also avoids indexing word_ngrams[0] below.
        return
    # Both the joined text and the base-form sets are loop-invariant with
    # respect to the pair loop, so they are built once instead of per pair.
    joined_ngrams = [' '.join(ngram) for ngram in word_ngrams]
    for word_index in range(len(word_ngrams[0])):
        base_form_sets = [
            set(words_base_forms[ngram[word_index]]) for ngram in word_ngrams
        ]
        for i, j in itertools.combinations(range(len(word_ngrams)), 2):
            if base_form_sets[i].isdisjoint(base_form_sets[j]):
                print(f"Found ngrams: '{joined_ngrams[i]}' & '{joined_ngrams[j]}'")
    

def analyze_base_forms_in_superbase_forms(word_ngrams):
    """Group n-grams by superbase forms and analyze each group.

    N-grams containing a word that is missing from the base-forms dictionary
    are skipped. Every remaining n-gram is keyed by the space-joined superbase
    forms of its words (looked up through each word's first listed base form),
    and each group of n-grams sharing a key is handed to
    ``analyze_base_forms_in_common_superbase_form``.

    Args:
        word_ngrams: Iterable of word tuples.
    """
    words_base_forms = read_words_base_forms()
    base_forms_superbase_forms = create_base_forms_superbase_forms(words_base_forms)

    ngrams_by_key = {}
    for ngram in word_ngrams:
        if not all(word in words_base_forms for word in ngram):
            continue
        key = ' '.join(
            base_forms_superbase_forms[words_base_forms[word][0]] for word in ngram
        )
        ngrams_by_key.setdefault(key, []).append(ngram)

    for grouped_ngrams in ngrams_by_key.values():
        analyze_base_forms_in_common_superbase_form(words_base_forms, grouped_ngrams)

In [22]:
# Draw a random subset of the 2-gram and 3-gram files: each line is kept
# unless a fresh np.random.rand() draw exceeds 0.01.
# NOTE(review): no random seed is set in the visible cells, so the sample
# differs between runs.
two_grams = sample_two_grams(sample_probability=0.01)
three_grams = sample_three_grams(sample_probability=0.01)

In [41]:
# Group the sampled 2-grams and 3-grams by their superbase forms and print the
# pairs within a group whose words at some position share no base form.
analyze_base_forms_in_superbase_forms(two_grams + three_grams)

Found ngrams: 'ojciec ich' & 'ocean jego'
Found ngrams: 'działa w' & 'dzielony we'
Found ngrams: 'ile to' & 'ile ty'
Found ngrams: 'jak ona' & 'jakimi ich'
Found ngrams: 'jak one' & 'jakimi ich'
Found ngrams: 'mieszczą się' & 'ma sobie'
Found ngrams: 'mieszczą się' & 'mieni się'
Found ngrams: 'mieszczą się' & 'mieliście się'
Found ngrams: 'mieszczą się' & 'miałeś sobie'
Found ngrams: 'ma sobie' & 'mieni się'
Found ngrams: 'ma sobie' & 'mieszczeniu się'
Found ngrams: 'ma sobie' & 'mieściłaś się'
Found ngrams: 'mieni się' & 'mieliście się'
Found ngrams: 'mieni się' & 'miałeś sobie'
Found ngrams: 'mieni się' & 'mieszczeniu się'
Found ngrams: 'mieni się' & 'mieściłaś się'
Found ngrams: 'mieliście się' & 'mieszczeniu się'
Found ngrams: 'mieliście się' & 'mieściłaś się'
Found ngrams: 'miałeś sobie' & 'mieszczeniu się'
Found ngrams: 'miałeś sobie' & 'mieściłaś się'
Found ngrams: 'których jest' & 'któremu bardzo'
Found ngrams: 'któremu bardzo' & 'którą będziesz'
Found ngrams: 'któremu bardzo' 