In [143]:
import openai
import time
from dotenv import load_dotenv
import os
import xml.etree.ElementTree as ET
import nltk
from nltk.corpus import wordnet as wn
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\rudi\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [144]:
# Load variables from the .env file (python-dotenv) into the process
# environment, then copy the four connection settings onto the openai module.
# os.getenv returns None for a variable that is not set.
load_dotenv()

(
    openai.api_type,
    openai.api_key,
    openai.api_base,
    openai.api_version,
) = [os.getenv(name) for name in ("API_TYPE", "API_KEY", "API_BASE", "API_VERSION")]

In [129]:
# Next id to embed in a synset request; main() increments it after every
# definition it processes.
WORD_ID = 1

# System prompt for the first request: asks for every definition of a word as
# a <definitions> XML document.
INIT_PROMPT = """You are a WordNet generator. At first, you will be given a word and you have to give all definitions of the word and give an example.
The output must contain only the XML formatted answer. The XML must look like this:
<definitions>
    <definition>
        <word>[Given word]</word>
        <type>[adjectives/adverbs/conjunctions/determiners/nouns/prepositions/pronouns/verbs]</type>
        <meaning>[Meaning of the word]</meaning>
        <example>[An example sentence with the given word]</example>
    </definition>
</definitions>"""

# System prompt for the follow-up requests: asks for the synonyms of a single
# definition as a <synsets> XML document. The attribute values in the example
# are quoted so that the example itself is well-formed XML; replies are
# validated with an XML parser, which rejects unquoted attributes.
WORDNET_PROMPT = """Now you will be given the following fields: id, word, type, meaning and example. 
You will have to give WordNet for a given word. Currently, give only synonyms. 
The output contains only XML. Here is an example of what XML must look like:
<synsets>
    <synset id="[given id]" word="[given word]" type="[type]">
        <meaning>[given meaning]</meaning>
        <example>[given example]</example>
        <synonyms>
            <synonym>[synonym of word 1]</synonym>
            <synonym>[synonym of word 2]</synonym>
            <synonym>[synonym of word 3]</synonym>
            ...
        </synonyms>
    </synset>
</synsets>
The XML is just an example, there can be more or less synonyms for each word.
"""

In [130]:
def gen_from_prompt(msg, tags, max_attempts=3, rate_limit_delay=3):
    """Request a chat completion, retrying until the reply is acceptable XML.

    Args:
        msg: List of chat messages, passed through as ``messages``.
        tags: Tag specification forwarded to ``check_tags_XML``.
        max_attempts: How many times to call the API before giving up.
        rate_limit_delay: Seconds to sleep after a rate-limit error.

    Returns:
        The ``message`` object of the first reply whose content passes both
        ``check_XML_validity`` and ``check_tags_XML``. If no reply passes, the
        most recent reply that had content is returned unvalidated
        (best effort), so callers should still be prepared for bad XML.

    Raises:
        RuntimeError: If no attempt produced a reply with content, e.g. every
            call raised an API error or every reply was content-filtered.
    """
    last_message = None
    last_error = None

    for _ in range(max_attempts):
        try:
            completion = openai.ChatCompletion.create(deployment_id="gec", model="gpt-4-1106-preview", messages=msg)
            message = completion["choices"][0]["message"]
            answer = message["content"]
            # Only remember replies that actually carry content; a reply
            # without it is useless to the caller.
            last_message = message
            if check_XML_validity(answer) and check_tags_XML(answer, tags):
                return message

        except openai.error.RateLimitError as err:
            # The error message said that it is better to wait a few seconds
            # and try again.
            last_error = err
            time.sleep(rate_limit_delay)

        except (
            openai.error.ServiceUnavailableError,
            openai.error.APIError,
            openai.error.InvalidRequestError,
        ) as err:
            # Happens sometimes (service hiccups, something about the input
            # text); just asking again usually helps.
            last_error = err

        except KeyError as err:
            # The reply lacked an expected field. Seen when the content filter
            # triggers, which happens even when nothing is wrong with the
            # input; asking again might help.
            last_error = err

    if last_message is None:
        raise RuntimeError(
            f"No usable completion after {max_attempts} attempt(s)"
        ) from last_error
    return last_message

def check_XML_validity(xml_str):
    """Return True if ``xml_str`` parses as a well-formed XML document.

    Args:
        xml_str: Candidate XML text. ``None`` or any other non-text value is
            treated as invalid rather than raising.

    Returns:
        bool: True when parsing succeeds, False otherwise.
    """
    try:
        ET.fromstring(xml_str)
    except (ET.ParseError, TypeError):
        # ParseError: malformed or empty document.
        # TypeError: the value is not text at all (e.g. a reply with no content).
        return False
    return True
    
def check_tags_XML(xml_str, tags):
    """Check that selected elements contain the child tags they must have.

    Args:
        xml_str: XML document text; it must be parseable.
        tags: Mapping of an element tag to the list of child tags required
            inside it, e.g. ``{'definition': ['word', 'type']}``.

    Returns:
        False as soon as an element matching a key lacks one of its required
        direct children, True otherwise. Keys are looked up among the direct
        children of the root element only, so a key that matches nothing
        (including the root's own tag) imposes no constraint.
    """
    for parent_tag, required_children in tags.items():
        root = ET.fromstring(xml_str)
        for node in root.findall(parent_tag):
            if any(node.find(child) is None for child in required_children):
                return False
    return True


In [140]:
def main(prompt="crane"):
    """Generate definitions and synonym sets for a word and save them.

    First asks for all definitions of ``prompt`` (INIT_PROMPT), then sends one
    follow-up request per returned <definition> (WORDNET_PROMPT) and writes
    each reply to ``output.xml``, one reply after another. Each reply is
    expected to be its own <synsets> document, so the file as a whole can
    contain several root elements.

    Args:
        prompt: The word to look up.

    Side effects:
        Overwrites ``output.xml`` in the working directory, prints each
        definition element, and increments the module-level ``WORD_ID`` once
        per definition.
    """
    global WORD_ID

    messages = [
        {"role": "system", "content": INIT_PROMPT},
        {"role": "user", "content": prompt},
    ]

    # check_tags_XML looks up its keys among the children of the root, so the
    # key must be the repeated child element (<definition>), not the root.
    definitions_msg = gen_from_prompt(messages, {'definition': ['word', 'type', 'meaning', 'example']})

    messages.append(dict(definitions_msg))
    messages.append({"role": "system", "content": WORDNET_PROMPT})

    # Parse before opening the file so that an unparseable reply does not
    # leave behind a truncated output.xml.
    definitions = ET.fromstring(definitions_msg['content']).findall('definition')

    # Explicit UTF-8: the platform default encoding may be unable to encode
    # characters that appear in the generated text.
    with open("output.xml", "w", encoding="utf-8") as fp:
        for elem in definitions:
            print(elem)
            request = messages.copy()
            synset_prompt = f"""ID: {WORD_ID},
            Word: {elem.find("word").text},
            Type: {elem.find("type").text},
            Meaning: {elem.find("meaning").text},
            Example: {elem.find("example").text}"""
            request.append({"role": "user", "content": synset_prompt})
            # Same rule as above: validate each <synset> child of <synsets>.
            synset_msg = gen_from_prompt(request, {'synset': ['meaning', 'example', 'synonyms']})
            fp.write(synset_msg["content"] + "\n")
            WORD_ID += 1

In [141]:
# Run the pipeline for the default word ("crane"); the replies are written to
# output.xml and each parsed <definition> element is printed below.
main()

<Element 'definition' at 0x0000014B9C3385E0>
<Element 'definition' at 0x0000014B9C338400>
<Element 'definition' at 0x0000014B9C33A110>


In [109]:
# Synsets that NLTK's WordNet corpus has for "crane" (presumably a baseline to
# compare the generated output against).
wn.synsets('crane')

[Synset('crane.n.01'),
 Synset('crane.n.02'),
 Synset('grus.n.01'),
 Synset('crane.n.04'),
 Synset('crane.n.05'),
 Synset('crane.v.01')]

In [124]:
# For every WordNet synset of "crane", print its position, the synset itself,
# its definition and its lemma names, one item per line.
for i, s in enumerate(wn.synsets('crane')):
    print(i, s, s.definition(), s.lemma_names(), sep="\n")

0
Synset('crane.n.01')
United States writer (1871-1900)
['Crane', 'Stephen_Crane']
1
Synset('crane.n.02')
United States poet (1899-1932)
['Crane', 'Hart_Crane', 'Harold_Hart_Crane']
2
Synset('grus.n.01')
a small constellation in the southern hemisphere near Phoenix
['Grus', 'Crane']
3
Synset('crane.n.04')
lifts and moves heavy objects; lifting tackle is suspended from a pivoted boom that rotates around a vertical axis
['crane']
4
Synset('crane.n.05')
large long-necked wading bird of marshes and plains in many parts of the world
['crane']
5
Synset('crane.v.01')
stretch (the neck) so as to see better
['crane', 'stretch_out']


In [112]:
# WordNet synonyms of "crane": one list per synset, in the same order as
# wn.synsets('crane'); an empty list means that synset has no other lemma.
wn.synonyms('crane')

[['Crane', 'Stephen_Crane'],
 ['Crane', 'Harold_Hart_Crane', 'Hart_Crane'],
 ['Crane', 'Grus'],
 [],
 [],
 ['stretch_out']]