In [53]:
import pandas as pd
import itertools
import numpy as np
import re
from collections import defaultdict, Counter

In [109]:
def clean_word(word):
    regex_match = re.search("([a-zA-Z]+)", word)
    if regex_match is None:
        return None
    return regex_match.group(1)


def read_one_grams():
    with open("1grams.txt", "r") as file_stream:
        one_grams = [clean_word(x.strip().split(" ")[1]) for x in file_stream.readlines()]
        return [x for x in one_grams if x is not None and len(x) > 0]
    

def read_words_base_forms():
    words_base_forms = defaultdict(lambda: [])
    with open("base_forms.txt", "r") as fstream:
        bases_words = map(lambda x: x.split(";")[0:2], fstream.readlines())
        for base, word in bases_words:
            words_base_forms[word].append(base)
    return dict(words_base_forms)


def fetch_correction_words(word):
    cleaned_words = [clean_word(x) for x in word.split(".")]
    if len(cleaned_words) == 1:
        return []
    return [word for word in cleaned_words if word is not None and len(word) > 0]


def read_two_grams_with_correction_words():
    with open("2grams.txt", "r") as file_stream:
        dirty_two_grams = [tuple(x.strip().split(" ")[1:3]) for x in file_stream.readlines()]
        cleaned_two_grams = []
        correction_words = []
        for two_gram in dirty_two_grams:
            cleaned_word1, cleaned_word2 = clean_word(two_gram[0]), clean_word(two_gram[1])
            if cleaned_word1 is None or cleaned_word2 is None:
                continue
            if len(cleaned_word1) == 0 or len(cleaned_word2) == 0:
                continue
            cleaned_two_grams.append((cleaned_word1, cleaned_word2))
            correction_words.extend(fetch_correction_words(two_gram[0]))
            correction_words.extend(fetch_correction_words(two_gram[1]))
    return cleaned_two_grams, correction_words

In [3]:
one_grams_counter = Counter(read_one_grams())

In [4]:
two_grams, correction_words = read_two_grams_with_correction_words()
two_grams_counter = Counter(two_grams)

In [112]:
words_base_forms = read_words_base_forms()
words = set(words_base_forms.keys())
base_forms = set(itertools.chain(*words_base_forms.values()))

In [113]:
def find_merged_words():
    merged_words = []
    for one_gram in set(one_grams_counter.keys()).union(correction_words):
        for split_index in range(1, len(one_gram)):
            word1, word2 = one_gram[:split_index], one_gram[split_index:]
            if word1 not in words or word2 not in words:
                continue
            one_gram_count = one_grams_counter[one_gram]
            if len(word1) <= 3 or len(word2) <= 3:
                continue
            if two_grams_counter[(word1, word2)] >= one_gram_count * 0.5 and one_gram_count > 0:
                merged_words.append((word1, word2))
                continue
            count1, count2 = one_grams_counter[word1], one_grams_counter[word2]
            if min(count1, count2) <= 20:
                continue
            count_diff = abs(count1 - count2)         
            if count_diff <= min(count1, count2) * 0.3 and sum([count1, count2]) * 0.01 >= one_gram_count:
                merged_words.append((word1, word2))                
    return merged_words

In [114]:
merged_words = find_merged_words()
len(merged_words)

7246

In [117]:
EXPECTED_MERGED_WORDS = [
    ("wielkiego", "pomorska"),
    ("socjologii", "uniwersytetu"),
    ("otwarta", "pracownia"),
    ("przez", "grzechy"),
]
[x in merged_words for x in EXPECTED_MERGED_WORDS]

[True, True, True, True]

In [118]:
UNEXPECTED_MERGED_WORDS = [
    ("anty", "systemową"),
    ("super", "tygrysa"),
    ("wewnątrz", "oddziałowego"),
    ("wschodnio", "karpackiego"),
]
[x not in merged_words for x in UNEXPECTED_MERGED_WORDS]

[True, True, True, True]

In [119]:
one_grams_counter['agro'], one_grams_counter['grzechy'], two_grams_counter[('glaxo', 'smith')], one_grams_counter['przezgrzechy']

(146, 39, 3, 2)

In [120]:
"agro" in words_base_forms.keys()

False

In [131]:
[merged_words[i] for i in np.random.choice(len(merged_words), 10, replace=False)]

[('mama', 'tata'),
 ('ponad', 'stuosobowy'),
 ('spam', 'killer'),
 ('biuro', 'literackie'),
 ('propagandowy', 'gest'),
 ('pycha', 'tego'),
 ('nike', 'owcy'),
 ('jest', 'patrze'),
 ('kamer', 'gier'),
 ('prze', 'trwaj')]

In [146]:
def find_split_words():
    split_words = []
    for word1, word2 in two_grams_counter.keys():
        merged_word = word1 + word2
        one_gram_count = one_grams_counter[merged_word]
        if len(word1) <= 3 or len(word2) <= 3:
            continue
        if two_grams_counter[(word1, word2)] < one_gram_count * 0.4 and one_gram_count > 0:
            split_words.append(merged_word)
    return split_words

In [147]:
split_words = find_split_words()
len(split_words)

5500

In [148]:
[split_words[i] for i in np.random.choice(len(split_words), 10, replace=False)]

['egotycznie',
 'freiburg',
 'zakupowych',
 'nowoczesne',
 'powodowi',
 'cudzoziemskiej',
 'produkowano',
 'sprawiedliwo',
 'wysokospecjalistyczna',
 'hitlerowskie']