In [21]:
from nltk.corpus import wordnet, stopwords, genesis
import numpy as np
from itertools import combinations
from simple_colors import *

genesis_ic = None
stop_words = None
try:
    _ = stopwords.words('english')
    _ = wordnet.synsets('dog')
except LookupError as e:
    print(e)
    import nltk
    nltk.download('stopwords')
    nltk.download('wordnet')
    nltk.download('omw-1.4')
    nltk.download('genesis')
finally:
    stop_words = set(stopwords.words('english'))
    genesis_ic = wordnet.ic(genesis, False, 0.0)

In [22]:
sentence = "There were numerous horses on the racecourse in Paris"
splitted_sentence = sentence.split()
meaningful_words = [word for word in splitted_sentence if word.lower() not in stop_words]
print(meaningful_words)

['numerous', 'horses', 'racecourse', 'Paris']


In [23]:
positions = list(np.cumsum(list(map(len, splitted_sentence))))
for idx in range(len(positions)):
    positions[idx] += idx
words_idx = []
for left_pos, right_pos in zip([0] + positions[:-1], positions):
    words_idx.append((left_pos, right_pos))
print(words_idx)

[(0, 5), (5, 10), (10, 19), (19, 26), (26, 29), (29, 33), (33, 44), (44, 47), (47, 53)]


In [24]:
searched_words_idx = [words_idx[splitted_sentence.index(word)] for word in meaningful_words]
words = [sentence[max(left+1, 0):right] for left, right in searched_words_idx]
print(words)

['numerous', 'horses', 'racecourse', 'Paris']


In [25]:
words_synsets = [set(wordnet.synsets(word)) for word in words]
print(words_synsets)

[{Synset('numerous.s.01')}, {Synset('knight.n.02'), Synset('horse.v.01'), Synset('horse.n.02'), Synset('cavalry.n.01'), Synset('horse.n.01'), Synset('sawhorse.n.01')}, {Synset('racetrack.n.01')}, {Synset('paris.n.03'), Synset('paris.n.02'), Synset('paris.n.04'), Synset('paris.n.01')}]


In [26]:
def convert_to_noun_synsets(word_synsets):
    result_synsets = set()
    for synset in word_synsets:
        if synset.pos() == 'n':
            result_synsets.add(synset)
        else:
            possible_words = convert(synset, 'n')
            for p_word, p_odds in possible_words:
                if p_word in meaningful_words:
                    continue
                result_synsets |= set(wordnet.synsets(p_word, pos=wordnet.NOUN))
    return result_synsets

In [27]:
# Just to make it a bit more readable
WN_NOUN = 'n'
WN_VERB = 'v'
WN_ADJECTIVE = 'a'
WN_ADJECTIVE_SATELLITE = 's'
WN_ADVERB = 'r'


def convert(synset, to_pos):
    """ Transform words given from/to POS tags """

    from_pos = synset.pos()
    # Get all lemmas of the word (consider 'a'and 's' equivalent)
    lemmas = []
    for l in synset.lemmas():
        if synset.name().split('.')[1] == from_pos or from_pos in (WN_ADJECTIVE, WN_ADJECTIVE_SATELLITE) and synset.name().split('.')[1] in (WN_ADJECTIVE, WN_ADJECTIVE_SATELLITE):
            lemmas += [l]

    # Get related forms
    derivationally_related_forms = [(l, l.derivationally_related_forms()) for l in lemmas]

    # filter only the desired pos (consider 'a' and 's' equivalent)
    related_noun_lemmas = []

    for drf in derivationally_related_forms:
        for l in drf[1]:
            if l.synset().name().split('.')[1] == to_pos or to_pos in (WN_ADJECTIVE, WN_ADJECTIVE_SATELLITE) and l.synset().name().split('.')[1] in (WN_ADJECTIVE, WN_ADJECTIVE_SATELLITE):
                related_noun_lemmas += [l]

    # Extract the words from the lemmas
    words = [l.name() for l in related_noun_lemmas]
    len_words = len(words)

    # Build the result in the form of a list containing tuples (word, probability)
    result = [(w, float(words.count(w)) / len_words) for w in set(words)]
    result.sort(key=lambda w:-w[1])

    # return all the possibilities sorted by probability
    return result

In [28]:
new_synsets = [convert_to_noun_synsets(word_syn) for word_syn in words_synsets]
for synsets in new_synsets:
    for synset in synsets:
        print(synset.name(), synset.definition())

numerousness.n.01 a large number
knight.n.02 a chessman shaped to resemble the head of a horse; can move two squares horizontally and one vertically (or vice versa)
sawhorse.n.01 a framework for holding wood that is being sawed
cavalry.n.01 troops trained to fight on horseback
horse.n.02 a padded gymnastic apparatus on legs
horse.n.01 solid-hoofed herbivorous quadruped domesticated since prehistoric times
racetrack.n.01 a course over which races are run
paris.n.03 (Greek mythology) the prince of Troy who abducted Helen from her husband Menelaus and provoked the Trojan War
paris.n.02 sometimes placed in subfamily Trilliaceae
paris.n.04 a town in northeastern Texas
paris.n.01 the capital and largest city of France; and international center of culture and commerce


In [29]:
def find_best_composition(synsets: list, currently_checked_synsets: list, index: int):
    a = [float('inf'), None]
    if index >= len(synsets):
        return calculate_distance(currently_checked_synsets)
    for word in synsets[index]:
        similarity, best_synsets = find_best_composition(synsets, [word] + currently_checked_synsets, index + 1)
        if similarity < a[0]:
            a = (similarity, best_synsets)
    return a

def calculate_distance(synsets: list, similarity_function = wordnet.lch_similarity):
    possible_combinations = combinations(synsets, 2)
    similarity = list(map(lambda x: similarity_function(*x, genesis_ic), possible_combinations))
    similarity = sum(similarity)
    return similarity, synsets

In [30]:
odds, best_synsets = find_best_composition(new_synsets, list(), 0)

In [31]:
def color(word):
    if word in meaningful_words:
        return "\033[1m" + red(word) + "\033[0m"
    else:
        return word

In [32]:
print(' '.join([color(word) for word in splitted_sentence]))

for w, s in zip(meaningful_words, best_synsets[::-1]):
    print(f"{w}: {s.definition()}")

There were [1m[31mnumerous[0m[0m [1m[31mhorses[0m[0m on the [1m[31mracecourse[0m[0m in [1m[31mParis[0m[0m
numerous: a large number
horses: solid-hoofed herbivorous quadruped domesticated since prehistoric times
racecourse: a course over which races are run
Paris: (Greek mythology) the prince of Troy who abducted Helen from her husband Menelaus and provoked the Trojan War


In [33]:
from nltk.wsd import lesk

In [34]:
for w in meaningful_words:
    print(lesk(meaningful_words, w).definition())

amounting to a large indefinite number
provide with a horse or horses
a course over which races are run
a town in northeastern Texas
