In [1]:
import nltk.tokenize
import string
import itertools
import unicodedata
from collections import defaultdict, namedtuple
import re
import numpy as np
import scipy.spatial.distance
import sys

In [2]:
sys.path.append("article_finder")
import articles
import embeddings

## Reading embeddings and base forms

In [3]:
def read_words_embeddings():
    with open("article_finder/data/wiki_embeddings_300.txt") as file_stream:
        lines = [x.strip() for x in file_stream.readlines()]
        return {x.split()[0].lower(): np.fromstring(" ".join(x.split()[1:]), dtype=float, sep=" ") for x in lines[1:]}
    

words_embeddings = read_words_embeddings()

In [4]:
words_base_forms = embeddings.read_words_base_forms(path="article_finder/data/base_forms.txt")

## Synonyms

In [60]:
def create_regex_synonyms_candidates(wiki_articles):
    synonims_candidates = []
    for wiki_article in wiki_articles:
        for regex_match in re.findall("„([a-z]+)” , „([a-z]+)”", wiki_article.content):
            synonims_candidates.append((regex_match[0], regex_match[1]))
    return synonims_candidates


def create_regex_synonyms_extended_candidates(wiki_articles):
    synonims_candidates = []
    for wiki_article in wiki_articles:
        for regex_match in re.findall("`` ([a-z]+) '' , `` ([a-z]+) ''", wiki_article.content):
            synonims_candidates.append((regex_match[0], regex_match[1]))
    return synonims_candidates

In [56]:
def is_one_word_synonym(text1, text2):
    return len(text1.split(" ")) == 1 and len(text2.split(" ")) == 1


def get_close_base_forms(word1, word2):
    if len(word1) <= 2 or len(word2) <= 2:
        return None
    base_forms1 = words_base_forms[word1]
    base_forms2 = words_base_forms[word2]
    for base_form1 in base_forms1:
        for base_form2 in base_forms2:
            if base_form1 == base_form2:
                break            
            if base_form1 not in words_embeddings or base_form2 not in words_embeddings:
                continue
            embeddings1 = words_embeddings[base_form1]
            embeddings2 = words_embeddings[base_form2]
            if scipy.spatial.distance.cosine(embeddings1, embeddings2) <= 0.75:
                return (base_form1, base_form2)
    return None

In [57]:
wiki_articles = articles.read_wiki_articles(path="article_finder/data/wiki_slice.txt")
synonym_candidates = create_regex_synonyms_candidates(wiki_articles)
one_word_synonym_candidates = [x for x in synonym_candidates if is_one_word_synonym(*x)]
len(one_word_synonym_candidates)

108

In [58]:
close_synonyms = [get_close_base_forms(*x) for x in one_word_synonym_candidates]
filtered_synonyms = {x for x in close_synonyms if x is not None}
len(filtered_synonyms)

50

In [59]:
filtered_synonyms

{('aspiracja', 'zdrowie'),
 ('bas', 'sopran'),
 ('boski', 'cudowny'),
 ('brat', 'kuzyn'),
 ('człowiek', 'lud'),
 ('człowiek', 'osoba'),
 ('developer', 'koder'),
 ('drugi', 'inny'),
 ('dziwny', 'obcy'),
 ('entuzjazm', 'pragnienie'),
 ('europarlamentarzysta', 'eurodeputowany'),
 ('inny', 'dziwny'),
 ('inteligentny', 'twardy'),
 ('krytycyzm', 'transcendentalizm'),
 ('labirynt', 'piwiarnia'),
 ('lalka', 'marionetka'),
 ('miska', 'miednica'),
 ('mocny', 'silny'),
 ('modyfikacja', 'zmiana'),
 ('niwa', 'niwka'),
 ('obcy', 'nieznajomy'),
 ('pakt', 'przymierze'),
 ('podawanie', 'przekazywać'),
 ('pompka', 'mostek'),
 ('powtarzać', 'badanie'),
 ('prawdziwy', 'prawowity'),
 ('prawo', 'ustawa'),
 ('prosty', 'powszechny'),
 ('przedmurze', 'bastion'),
 ('przy', 'obok'),
 ('przybysz', 'nowicjusz'),
 ('przyroda', 'nauka'),
 ('psyche', 'dusza'),
 ('rak', 'krab'),
 ('rozpuszczać', 'rozdzielać'),
 ('silny', 'mocny'),
 ('skoczek', 'tancerz'),
 ('somatyczny', 'duchowy'),
 ('staranie', 'zamiar'),
 ('szacune