In [6]:
import openai
import time
from dotenv import load_dotenv
import os
import xml.etree.ElementTree as ET
import nltk
import datetime
import random
from nltk.corpus import wordnet as wn
from estnltk.wordnet import Wordnet as EstWordnet
import estnltk as et
nltk.download('wordnet')
estwn = EstWordnet()

[nltk_data] Downloading package wordnet to /Users/erudi/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [7]:
load_dotenv()

openai.api_type = os.getenv("API_TYPE")
openai.api_key = os.getenv("API_KEY")
openai.api_base = os.getenv("API_BASE")
openai.api_version = os.getenv("API_VERSION")

In [8]:
WORD_ID = 1


def get_initial_prompt_xml(is_est=False):
    return f"""You are a highly skilled AI trained in language comprehension and WordNet generation. You will be given a word and you have to give all defenitions of the word and give an example.
            {"You will be given the word in Estonian. Meaning and example must be in Estonian." if is_est else ""}
            The output must contain only XML formatted answer. The XML must look like this:
            <definitions>
                <definition>
                    <word>[Given word]</word>
                    <type>[adjectives/adverbs/conjunctions/determiners/nouns/prepositions/pronouns/verbs]</type>
                    <meaning>[Meaning of the word]</meaning>
                    <example>[An example sentece with given word]</example>
                </definition>
            </definitions>"""


def get_initial_prompt(is_est=False):
    return f"""You are a highly skilled AI trained in language comprehension and WordNet generation. 
            You will be given a word and you must give all defenitions of the word.
            The output must contain only plain text and must contain given word, word type, meaing of the word and and example separeted by a new line. An example of the output:
            Word: [Given word]
            Type: [adjectives/adverbs/conjunctions/determiners/nouns/prepositions/pronouns/verbs]
            Meaning: [Meaning of the word]
            Example: [An example sentece with given word]
            Do not include any other information."""
    # return f"""You are a highly skilled AI trained in language comprehension and WordNet generation.
    #         You will be given a word and you have to give all defenitions that are in Python NLTK library's WordNet. DO NOT INCLUDE people and places.
    #         {"You will be given the word in Estonian. Meaning and example must be in Estonian." if is_est else ""}
    #         The output must contain only plain text and must contain given word, word type, meaing of the word and and example separeted by a new line. An example of the output:
    #         [Given word]
    #         [adjectives/adverbs/conjunctions/determiners/nouns/prepositions/pronouns/verbs]
    #         [Meaning of the word]
    #         [An example sentece with given word]
    #         Do not include any other information in the output even "Meaning" and "Example" words."""


def get_initial_prompt_est():
    return f""" Sa oled kõrgelt kvalifitseeritud keele mõistmise ja WordNeti genereerimise AI.
                Sulle antakse sõna ja sa pead andma kõik selle sõna definitsioonid.
                Väljund peab sisaldama ainult tavalist teksti ja peab sisaldama, uue reaga eraldatud: antud sõna, sõna tüüpi, sõna tähendust ja näide. Väljundi näide:
                Sõna: [Antud sõna]
                Tüüp: [omadussõna/abiverb/sidesõna/määrsõna/asesõna/nimisõna/palind/tegusõna]
                Tähendus: [Sõna tähendus]
                Näide: [Näide lause antud sõnaga]
                Ära lisa väljundisse muud informatsiooni."""


def get_prompt_str_xml(word):
    return f"""Now you will be given the following fields: id, word, type, meaning and example. 
    You will have to give exact {word}s {"that are in WordNet in Python NLTK library" if False else ""}.
    The output must contain only XML. Here is an example of what XML must look like:

            <{word}s>
                <{word}>[{word} of word 1]</{word}>
                <{word}>[{word} of word 2]</{word}>
                <{word}>[{word} of word 3]</{word}>
                ...
            </{word}s>

    The XML is just an example, there can be more or less {word}s for each word. If there are no {word}s, just leave the {word}s tag empty.
    """


def get_prompt_str(word):
    return f"""Now you will be given the following fields: word, type, meaning and example.
    You will have to give exact {word}s{" that are in WordNet in Python NLTK library" if False else ""}. 
    The output must contain only plain text. Here is an example of what the text must look like:
            [First {word}]
            [Second {word}]
            [Third {word}]
            ...
    If there are no {word}s, just leave the output empty.
    """


def get_prompt_str_est(word):
    return f"""Nüüd antakse sulle järgmised väljad: sõna, tüüp, tähendus ja näide.
    Sa pead andma täpsed {word}id, mis on Pythoni EstNLTK WordNetis.
    Väljund peab sisaldama ainult tavalist teksti. Siin on näide, kuidas tekst peab välja nägema:
            [Esimene {word}]
            [Teine {word}]
            [Kolmas {word}]
            ...
    Kui sõnu pole, jäta väljund tühjaks.
    """


relations = ['synonym', 'hyponym', 'meronym', 'antonym']
relations_est = ['sünonüüm', 'hüponüüm', 'meronüüm', 'antonüüm']

In [9]:
total_price = 0
def openai_api_calculate_cost(usage, model="gpt-4-1106-preview"): # https://community.openai.com/t/how-to-calculate-the-cost-of-a-specific-request-made-to-the-web-api-and-its-reply-in-tokens/270878/15
    global total_price
    pricing = {
        'gpt-3.5-turbo-1106': {
            'prompt': 0.001,
            'completion': 0.002,
        },
        'gpt-4-1106-preview': {
            'prompt': 0.01,
            'completion': 0.03,
        },
        'gpt-4': {
            'prompt': 0.03,
            'completion': 0.06,
        }
    }

    try:
        model_pricing = pricing[model]
    except KeyError:
        raise ValueError("Invalid model specified")

    prompt_cost = usage.prompt_tokens * model_pricing['prompt'] / 1000
    completion_cost = usage.completion_tokens * \
        model_pricing['completion'] / 1000

    total_cost = prompt_cost + completion_cost
    # round to 6 decimals
    total_cost = round(total_cost, 6)

    # print(
    #     f"\nTokens used:  {usage.prompt_tokens:,} prompt + {usage.completion_tokens:,} completion = {usage.total_tokens:,} tokens")
    # print(f"Total cost for {model}: ${total_cost:.4f}\n")
    total_price += total_cost
    return total_cost


def is_person(words):
    words_checked = [part[0].isupper()
                     for word in words for part in word.split('_')]
    return all(words_checked)


def remove_short_words(words):
    return [word for word in words if len(word) > 2]


# https://stackoverflow.com/questions/53416780/how-to-convert-token-list-into-wordnet-lemma-list-using-nltk
def convert_to_lemma(sentence):
    lemmatizer = nltk.WordNetLemmatizer()
    text = [lemmatizer.lemmatize(word) for word in sentence]
    lemmas = []
    for token in text:
        try:
            lemmas += [synset.lemmas()[0].name()
                       for synset in wn.synsets(token)]
        except:
            lemmas += [token]
    return set(lemmas)


def convert_to_lemma_est(sentence_ls):
    sentence = ' '.join(sentence_ls)
    text = et.Text(sentence)
    lemmas_layer = text.tag_layer().morph_analysis.lemma
    lemmas = [word for lemmas_list in lemmas_layer for word in lemmas_list]
    return set(lemmas)


def find_wordnet_synset(word, definition):
    overlap = 0
    most_overlap_synset = None
    parsed_def = remove_short_words(definition.lower().split())
    parsed_def_lemma = convert_to_lemma(parsed_def)
    for i, synset in enumerate(wn.synsets(word)):
        if is_person(synset.lemma_names()):
            continue
        actual_def = remove_short_words(synset.definition().lower().split())
        actual_def_lemma = convert_to_lemma(actual_def)
        overlap_temp = len(set(actual_def).intersection(set(parsed_def)))
        overlap_temp_lemma = len(
            actual_def_lemma.intersection(parsed_def_lemma))
        overlap_temp += overlap_temp_lemma
        if overlap_temp > overlap:
            overlap = overlap_temp
            most_overlap_synset = synset
    return most_overlap_synset


def get_word_synset(synset, syn_type):
    match syn_type:
        case 'synonym':
            return synset.lemmas()
        case 'hyponym':
            return synset.hyponyms()
        case 'meronym':
            return synset.part_meronyms()
        case 'antonym':
            return synset.lemmas()[0].antonyms()
        case 'hypernym':
            return synset.hypernyms()
        case 'holonyms':
            return synset.member_holonyms()
        case 'pertainyms':
            return synset.pertainyms()
        case _:
            raise ValueError(f"Unknown syn_type: {syn_type}")


def find_wordnet_synset_est(word, definition):
    overlap = -1
    most_overlap_synset = None
    parsed_def = remove_short_words(definition.lower().split())
    parsed_def_lemma = convert_to_lemma_est(parsed_def)
    for i, synset in enumerate(estwn[word]):
        actual_def = remove_short_words(synset.definition.lower().split())
        actual_def_lemma = convert_to_lemma_est(actual_def)
        overlap_temp = len(set(actual_def).intersection(set(parsed_def)))
        overlap_temp_lemma = len(
            actual_def_lemma.intersection(parsed_def_lemma))
        overlap_temp += overlap_temp_lemma
        if overlap_temp > overlap:
            overlap = overlap_temp
            most_overlap_synset = synset
    return most_overlap_synset


def get_word_synset_est(synset, syn_type):
    match syn_type:
        case 'synonym':
            return synset.lemmas
        case 'hyponym':
            return synset.hyponyms
        case 'meronym':
            return synset.meronyms
        case 'antonym':
            return synset.get_related_synset('antonym')
        case 'hypernym':
            return synset.hypernyms
        case 'holonyms':
            return synset.holonyms
        case _:
            raise ValueError(f"Unknown syn_type: {syn_type}")

In [10]:
def gen_from_prompt(msg, tags):
    completion = None
    try:
        completion = openai.ChatCompletion.create(deployment_id="gec", model="gpt-4-1106-preview", messages=msg)
        answer = completion["choices"][0]["message"]["content"]
        # print(completion)
        # if check_XML_validity(answer) and check_tags_XML(answer, tags):
        #     break

    except openai.error.ServiceUnavailableError:
        pass
        # Happens sometimes, just asking again usually helps

    except openai.error.APIError:
        pass
        # Haven't looked, why does it happen, asking again helps usually
    except KeyError as e:
        if completion["choices"][0]["finish_reason"] == "content_filter":
            pass
            # Some filter, happens even when nothing is wrong with the input, asking again might help

    except openai.error.InvalidRequestError:
        pass
        # Aslo something related to input text

    except openai.error.RateLimitError:
        time.sleep(3)
        # The error message said, that it's better to wait three seconds and try again
    if completion is not None:
        openai_api_calculate_cost(completion["usage"])
    return completion["choices"][0]["message"]

def check_XML_validity(xml_str):
    try:
        ET.fromstring(xml_str)
        return True
    except ET.ParseError:
        return False
    
def check_tags_XML(xml_str, tags):
    try:
        for k, v in tags.items():
            for elem in ET.fromstring(xml_str).iter(k):
                for tag in v:
                    if elem.find(tag) is None:
                        return False
        return True
    except ET.ParseError:
        return False


In [8]:
def main_xml(prompt="crane", is_est=False):
    global WORD_ID

    messages = [
                {"role": "system", "content":  get_initial_prompt_xml(is_est)},
                {"role": "user", "content": prompt},
            ]
    
    answer = gen_from_prompt(messages, {'definition' : ['word', 'type', 'meaning', 'example']})
    
    messages.append(dict(answer))
    
    # messages.append({"role": "system", "content": WORDNET_PROMPT})
    out_synsets = []
    # out_synsets.append(answer["content"])
    try:
        for elem in ET.fromstring(answer['content']).findall('definition'):
            prompt = f"""ID: {WORD_ID},
                Word: {elem.find("word").text},
                Type: {elem.find("type").text},
                Meaning: {elem.find("meaning").text},
                Example: {elem.find("example").text}"""
                
            out_str = f"""<synset id="{WORD_ID}" word="{elem.find("word").text}" type="{elem.find("type").text}">
                <generated>
                <meaning>{elem.find("meaning").text}</meaning>
                <example>{elem.find("example").text}</example>
                """
            for relation in relations:
                temp_list = messages.copy()
                temp_list.append({"role": "system", "content": get_prompt_str_xml(relation)})
                
                temp_list.append({"role": "user", "content": prompt})
                answer = gen_from_prompt(temp_list, {'synset' :['meaning', 'example', 'synonyms']})
                out_str += answer["content"]
                # out_synsets.append(answer["content"])
            if is_est:
                wn_synset = find_wordnet_synset_est(elem.find("word").text, elem.find("meaning").text)
            else:
                wn_synset = find_wordnet_synset(elem.find("word").text, elem.find("meaning").text)
            
            out_str += f"""
            </generated>
            <actual>
            <wn_name>{wn_synset.name if is_est else wn_synset.name()}</wn_name>
            <meaning>{wn_synset.definition if is_est else wn_synset.definition()}</meaning>"""
            for relation in relations:
                try:
                    synset = get_word_synset_est(wn_synset, relation) if is_est else get_word_synset(wn_synset, relation)
                except:
                    continue
                out_str += f"""
                <{relation}s>
                {synset}
                </{relation}s>"""
            out_str += f"""
            </actual>
            </synset>
            """
            # out_str += f"""
            # </generated>
            # </synset>"""
            out_synsets.append(out_str)
            WORD_ID += 1
    except Exception as e:
        print("Error:", e)
        print("DEBUG: answer:", elem.find("word").text, elem.find("meaning").text)
        return answer
    return out_synsets

In [138]:
# input_file = 'test.txt'
input_file = 'test_est.txt'
cur_time = datetime.datetime.now()
if input_file is not None:
    with open(input_file, 'r') as in_fp, open(f'{cur_time}_broken.xml', 'w') as broken_fp:
        root = ET.Element('synsets')
        for line in in_fp.readlines():
            print(line.strip())
            synsets = main_xml(prompt=line.strip(), is_est=True)
            for synset in synsets:
                try:
                    root.append(ET.fromstring(synset))
                    # ET.ElementTree(root).write(f'{cur_time}_output.xml')
                except ET.ParseError:
                    broken_fp.write('BROKEN WORD: ' + line.strip() + "\n")
                    if synset is not None:
                        broken_fp.write(synset + "\n")
        ET.ElementTree(root).write(f'{cur_time}_output.xml', encoding="UTF-8")
    

keel
tee
lill
pall
tihane


In [19]:
def main(input_file='test.txt', is_est=False, cur_time=None):
    WORD_ID = 1
    if cur_time is None:
        cur_time = datetime.datetime.now()
    with open(input_file, 'r') as in_fp, open(f'{cur_time}_broken.xml', 'w') as broken_fp:
        root = ET.Element('synsets')
        for line in in_fp.readlines():
            print(line.strip())
            messages = [
                # {"role": "system", "content":  get_initial_prompt(is_est)},
                {"role": "system", "content":  get_initial_prompt_est()},
                {"role": "user", "content": line.strip()},
            ]
            check = False
            for _ in range(3):
                answer = gen_from_prompt(messages, None)
                if answer is not None and 'content' in answer:
                    answer_ls = [el.split(':')[-1].strip() for el in answer['content'].split('\n') if len(el) > 0]
                    if len(answer_ls) % 4 == 0:
                        check = True
                        break
            if not check:
                broken_fp.write('BROKEN WORD: ' + line.strip() + "\n")
                continue
            print("GEN: meanings, size:", len(answer_ls)//4, answer_ls)
            messages.append(dict(answer))
            # answer_ls = answer['content'].split('\n')
            for i in range(len(answer_ls)//4):
                print(f"GEN: {i+1}th word: {answer_ls[i*4]}")
                xml_str = f"""<synset id="{WORD_ID}" word="{answer_ls[i*4]}" type="{answer_ls[i*4+1]}">
                    <generated>
                    <meaning>{answer_ls[i*4+2]}</meaning>
                    <example>{answer_ls[i*4+3]}</example>
                    """
                gen_rel_dict = {}
                for relation in relations:
                    temp_list = messages.copy()
                    # temp_list.append({"role": "system", "content": get_prompt_str(relation)})
                    temp_list.append({"role": "system", "content": get_prompt_str_est(relation)})
                    prompt = f"""Word: {answer_ls[i*4]},
                        Type: {answer_ls[i*4+1]},
                        Meaning: {answer_ls[i*4+2]},
                        Example: {answer_ls[i*4+3]}"""
                    temp_list.append({"role": "user", "content": prompt})
                    check = False
                    for _ in range(3):
                        rel_answer = gen_from_prompt(temp_list, None)
                        if rel_answer is not None and 'content' in  rel_answer:
                            check = True
                            break
                    if not check:
                        gen_rel_dict[relation] = []
                        continue
                    rel_answer_ls = rel_answer['content'].split('\n')
                    rel_answer_ls = [rel.strip().lower().replace(' ', '_') for rel in rel_answer_ls]
                    gen_rel_dict[relation] = rel_answer_ls
                    xml_str += f"""<{relation}s>{rel_answer_ls}</{relation}s>"""
                xml_str += "</generated>"
                if is_est:
                    wn_synset = find_wordnet_synset_est(answer_ls[i*4], answer_ls[i*4+2])
                else:
                    wn_synset = find_wordnet_synset(answer_ls[i*4], answer_ls[i*4+2])
                actual_rel_dict = dict.fromkeys(relations, [])
                if wn_synset is None:
                    xml_str += f"""
                    <actual>NONE</actual>
                    <stats>
                    """
                else:
                    xml_str += f"""
                    <actual>
                    <wn_name>{wn_synset.name if is_est else wn_synset.name()}</wn_name>
                    <meaning>{wn_synset.definition if is_est else wn_synset.definition()}</meaning>"""
                    for relation in relations:
                        try:
                            synset = get_word_synset_est(wn_synset, relation) if is_est else get_word_synset(wn_synset, relation)
                        except:
                            synset = []
                        if is_est:
                            if relation == 'synonym':
                                synset = [s.lower() for s in synset]
                            else:
                                synset = [s.name.lower().split('.')[0] for s in synset]
                        else:
                            synset = [s.name().lower().split('.')[0] for s in synset]
                        actual_rel_dict[relation] = synset
                        
                        xml_str += f"""
                        <{relation}s>
                        {synset}
                        </{relation}s>"""
                    xml_str += f"""
                    </actual>
                    <stats>
                    """
                total_gen = 0
                total_actual = 0
                total_overlapping = 0
                total_over_generated = 0
                total_under_generated = 0
                for relation in relations:
                    gen_rel_set = set(gen_rel_dict[relation])
                    actual_rel_set = set(actual_rel_dict[relation])
                    cur_total_gen = len(gen_rel_dict[relation])
                    cur_total_actual = len(actual_rel_dict[relation])
                    cur_total_overlapping = len(gen_rel_set.intersection(actual_rel_set))
                    cur_total_over_generated = len(gen_rel_set.difference(actual_rel_set))
                    cur_total_under_generated = len(actual_rel_set.difference(gen_rel_set))
                    total_actual += cur_total_actual
                    total_gen += cur_total_gen
                    total_overlapping += cur_total_overlapping
                    total_over_generated += cur_total_over_generated
                    total_under_generated += cur_total_under_generated
                    xml_str += f"""
                    <{relation}>
                    <generated_size>{cur_total_gen}</generated_size>
                    <actual_size>{cur_total_actual}</actual_size>
                    <overlapping>{cur_total_overlapping}</overlapping>
                    <over_generated>{cur_total_over_generated}</over_generated>
                    <under_generated>{cur_total_under_generated}</under_generated>
                    </{relation}>"""  
                xml_str += f"""
                <total>
                <generated_size>{total_gen}</generated_size>
                <actual_size>{total_actual}</actual_size>
                <overlapping>{total_overlapping}</overlapping>
                <over_generated>{total_over_generated}</over_generated>
                <under_generated>{total_under_generated}</under_generated>
                </total>
                """   
                xml_str += f"""
                </stats>
                </synset>
                """
                try:
                    root.append(ET.fromstring(xml_str))
                except ET.ParseError:
                    broken_fp.write('BROKEN WORD: ' + line.strip() + "\n")
                    broken_fp.write(xml_str + "\n")
                WORD_ID += 1
            
            ET.ElementTree(root).write(f'{cur_time}_output.xml', encoding="UTF-8")
        return root

In [20]:
is_test = False
cur_time = datetime.datetime.now()
file_name = f'{cur_time}_random_words.txt'
if not is_test:
    rand_lines_nr = 5
    with open('words.txt') as fp:
    # with open('lemmad.txt') as fp:
        rand_lines = random.sample(list(fp), rand_lines_nr)
    with open(file_name, 'w') as fp:
        fp.writelines(rand_lines)
r = main(input_file=file_name, cur_time=cur_time, is_est=False)

diseased

Tokens used:  243 prompt + 70 completion = 313 tokens
Total cost for gpt-4-1106-preview: $0.0045

GEN: meanings, size: 1 ['diseased', 'omadussõna', 'Haigustest mõjutatud või haige; mittetervislik', 'Arstid uurisid diseased organit, et mõista haiguse ulatust.']
GEN: 1th word: diseased

Tokens used:  521 prompt + 27 completion = 548 tokens
Total cost for gpt-4-1106-preview: $0.0060


Tokens used:  529 prompt + 40 completion = 569 tokens
Total cost for gpt-4-1106-preview: $0.0065


Tokens used:  525 prompt + 0 completion = 525 tokens
Total cost for gpt-4-1106-preview: $0.0053


Tokens used:  525 prompt + 7 completion = 532 tokens
Total cost for gpt-4-1106-preview: $0.0055

hefa

Tokens used:  242 prompt + 56 completion = 298 tokens
Total cost for gpt-4-1106-preview: $0.0041

GEN: meanings, size: 1 ['hefa', 'nimisõna', 'heebrea kirjanduses esinev naisnime vorm.', 'Ta nimetas oma tütart nimega Hefa.']
GEN: 1th word: hefa

Tokens used:  492 prompt + 0 completion = 492 tokens
Total 

In [21]:
print(total_price)

0.46566


In [57]:
def test_accuracy(input_file):
    with open(input_file, 'r') as fp:
        xml_file = ET.parse(fp)
        root = xml_file.getroot()
        # Get generated and actual synsets and compare relations by calculating the overlap
        for synset in root.findall('synset'):
            generated = synset.find('generated')
            actual = synset.find('actual')
            print(f"Word: {synset.get('word')}, Type: {synset.get('type')}")
            print(
                f"Generated: {generated.find('meaning').text}, Actual: {actual.find('meaning').text}")
            for relation in relations:
                gen_rel = generated.find(f"{relation}s")
                act_rel = actual.find(f"{relation}s")
                if gen_rel is None and act_rel is None:
                    continue
                if gen_rel is None or act_rel is None:
                    print(
                        f"Relation: {relation}, Gen: {gen_rel}, Act: {act_rel}")
                    continue
                gen_rel = set(gen_rel.text.split())
                act_rel = set(act_rel.text.split())
                print(f"Relation: {relation}, Gen: {gen_rel}, Act: {act_rel}")
                print(
                    f"Overlap: {len(gen_rel.intersection(act_rel))}, Gen: {len(gen_rel)}, Act: {len(act_rel)}")


def count_total_stats(input_file):
    with open(input_file, 'r') as fp:
        xml_file = ET.parse(fp)
        root = xml_file.getroot()
        total_gen = 0
        total_actual = 0
        total_overlapping = 0
        total_over_generated = 0
        total_under_generated = 0
        for synset in root.findall('synset'):
            stats = synset.find('stats')
            total_stats = stats.find('total')
            total_gen += int(total_stats.find('generated_size').text)
            total_actual += int(total_stats.find('actual_size').text)
            total_overlapping += int(total_stats.find('overlapping').text)
            total_over_generated += int(total_stats.find('over_generated').text)
            total_under_generated += int(
                total_stats.find('under_generated').text)
        print(f"Total: Gen: {total_gen}, Act: {total_actual}, Overlap: {total_overlapping}, Over Gen: {total_over_generated}, Under Gen: {total_under_generated}")

In [58]:
count_total_stats(f'{cur_time}_output.xml')

Total: Gen: 346, Act: 27, Overlap: 9, Over Gen: 337, Under Gen: 18


In [10]:
def get_word_synset(synset, syn_type):
    match syn_type:
        case 'synonym':
            return synset.lemmas()
        case 'hyponym':
            return synset.hyponyms()
        case 'meronym':
            return synset.part_meronyms()
        case 'antonym':
            return synset.lemmas()[0].antonyms()
        case 'hypernym':
            return synset.hypernyms()
        case 'holonyms':
            return synset.member_holonyms()
        case 'pertainyms':
            return synset.pertainyms()
        case _:
            raise ValueError(f"Unknown syn_type: {syn_type}")

def get_word_synset_est(synset, syn_type):
    match syn_type:
        case 'synonym':
            return synset.lemmas
        case 'hyponym':
            return synset.hyponyms
        case 'meronym':
            return synset.meronyms
        case 'antonym':
            return synset.get_related_synset('antonym')
        case 'hypernym':
            return synset.hypernyms
        case 'holonyms':
            return synset.holonyms
        case _:
            raise ValueError(f"Unknown syn_type: {syn_type}")

In [21]:
for i in estwn['kapsas']:
    print(i)
    print(i.name)
    print(i.definition)
    print(i.lemmas)
    print(i.hyponyms)
    print()

Synset('kapsas.n.01')
kapsas.n.01
piltl kulunud, narmendav, katkine raamat, kaustik, vihik vms (EKSS)
['kapsas']
["Synset('raamatukapsas.n.01')"]

Synset('kapsapea.n.01')
kapsapea.n.01
saadus kapsa taimest
['kapsapea', 'kapsas']
["Synset('punane kapsas.n.01')", "Synset('riivkapsas.n.01')"]

Synset('kapsas.n.03')
kapsas.n.03
ristõieline köögivilja- ja söödakultuur (hrl. Brassica oleracea) (EKSS)
['kapsas', 'kapsataim']
["Synset('lillkapsas.n.02')", "Synset('asparkapsas.n.01')", "Synset('peakapsas.n.02')", "Synset('lehtkapsas.n.01')", "Synset('brüsseli kapsas.n.02')", "Synset('käharkapsas.n.01')", "Synset('hiina kapsas.n.01')", "Synset('koolrabi.n.01')"]



In [41]:
temp = estwn['kapsas']
temp[2].get_related_synset('synonym')

[]

In [26]:
wn.synsets('current')

[Synset('current.n.01'),
 Synset('current.n.02'),
 Synset('stream.n.02'),
 Synset('current.a.01')]

In [44]:
wn.synsets('current')[1].lemma_names()

['current', 'stream']

In [131]:
print(find_wordnet_synset_est('keel', 'Inimene kasutab suuõõnes asuvat keelt kõnelemiseks ja toidu maitse tundmiseks.'))

Synset('keel.n.01')


In [117]:
estwn['lill']

["Synset('lill.n.01')"]

In [10]:
test = find_wordnet_synset('crane', 'A large machine for moving heavy objects by suspending them from a beam.')

Synset('crane.n.04')

In [26]:
test.lemma_names()

['crane']

In [8]:
for i, s in enumerate(wn.synsets('cat')):
    # if is_person(s.lemma_names()):
    #     continue
    print(i)
    print(s)
    print(s.definition())
    print(s.lemma_names())
    print(s.name())
    print(s.lemmas())
    
    

0
Synset('cat.n.01')
feline mammal usually having thick soft fur and no ability to roar: domestic cats; wildcats
['cat', 'true_cat']
cat.n.01
[Lemma('cat.n.01.cat'), Lemma('cat.n.01.true_cat')]
1
Synset('guy.n.01')
an informal term for a youth or man
['guy', 'cat', 'hombre', 'bozo']
guy.n.01
[Lemma('guy.n.01.guy'), Lemma('guy.n.01.cat'), Lemma('guy.n.01.hombre'), Lemma('guy.n.01.bozo')]
2
Synset('cat.n.03')
a spiteful woman gossip
['cat']
cat.n.03
[Lemma('cat.n.03.cat')]
3
Synset('kat.n.01')
the leaves of the shrub Catha edulis which are chewed like tobacco or used to make tea; has the effect of a euphoric stimulant
['kat', 'khat', 'qat', 'quat', 'cat', 'Arabian_tea', 'African_tea']
kat.n.01
[Lemma('kat.n.01.kat'), Lemma('kat.n.01.khat'), Lemma('kat.n.01.qat'), Lemma('kat.n.01.quat'), Lemma('kat.n.01.cat'), Lemma('kat.n.01.Arabian_tea'), Lemma('kat.n.01.African_tea')]
4
Synset('cat-o'-nine-tails.n.01')
a whip with nine knotted cords
["cat-o'-nine-tails", 'cat']
cat-o'-nine-tails.n.01
[

In [112]:
wn.synonyms('crane')

[['Crane', 'Stephen_Crane'],
 ['Crane', 'Harold_Hart_Crane', 'Hart_Crane'],
 ['Crane', 'Grus'],
 [],
 [],
 ['stretch_out']]