## 01-VAI Pre-processing
In this notebook, we filter the dictionary down to the VAI verbs and pre-process their definitions.

In [None]:
!pip install groq # groq package to connect to LLM API on groq.com

In [None]:
import pandas as pd
import nltk
from nltk.tag import pos_tag
from nltk.tokenize import word_tokenize
# parse the json string
import json
import re

In [None]:
# Load the source dictionary CSV from the shared data folder.
DATA_PATH = "../../../data/"
filename = f"{DATA_PATH}ob_en_dict.csv"
dict_df = pd.read_csv(filepath_or_buffer=filename)
dict_df.head()

In [None]:
# Keep only the entries whose `type` contains the VAI tag.
filter_tag = "vai"
# na=False: a row with a missing `type` counts as "no match" instead of putting
# NaN into the mask (a mask containing NaN cannot be used for boolean indexing).
is_vai = dict_df["type"].str.contains(filter_tag, na=False)
# drop=True renumbers the rows 0..n-1 without first creating an "index" column.
df = dict_df[is_vai].reset_index(drop=True)
print(len(df))
df.head()

In [None]:
# Snapshot the filtered VAI entries as a list of JSON records.
output_filename = f"{DATA_PATH}vai_dict.json"
df.to_json(output_filename, orient="records")

print("Completed")

### Using LLM to process meanings
```
Example: a definition of a word is "seek {{object}} or it; go look for {{object}} or it; search for {{object}} or it". If there are multiple meanings, split them into multiple templates in JSON format: {"verbs": ["seek", "look for", "search for"], "templates":["seek {{object}} or it", "go look for {{object}} or it", "search for {{object}} or it"]}.
Output JSON format only, no explanation or discussion. Now rephrase a new definition "reach for, feel for {{object}}".
```

In [None]:
# dictionary preview: show the definition text of one entry, picked by position
i = 200
print(df["definition"].iloc[i])

In [None]:
import sys  
sys.path.insert(1, '../') # LLM_api.py is in the parent folder

### Groq API key 
Please go to Groq.com, sign up and get an API key, then put it into `src/01_data_preprocessing/env/credentials.json` in the following format:

{"GROQ_API_KEY":"your_api_key"
}

In [None]:
# Helper functions from the local LLM_api module (found through the sys.path entry added earlier).
import LLM_api
from LLM_api import hello, get_api_key, connect, send_request

# NOTE(review): presumably a quick check that the import worked — confirm in LLM_api.py.
hello("API")

In [None]:
# Fetch the API key through the LLM_api helper.
api_key = get_api_key()
# Print only the key's length, so the key itself is not written to the cell output.
print(len(api_key))

In [None]:
# Keep whatever connect() returns; it is passed as the second argument of every send_request call below.
llm = connect(api_key)

In [None]:
# Smoke test: send one simple question through the API.
# NOTE(review): the leading "JSON" in the prompt is presumably there to satisfy a
# JSON-output mode inside send_request — confirm against LLM_api.py.
s = "JSON What is the biggest city of New York state?"
send_request(s, llm)

In [None]:
# Definition used to try the prompt out in the following cells.
word_def = 's/he or it (animate) is carried, taken out (by someone), "they" carry, take h/ or it (animate)out'

# Few-shot prompt: instructions followed by worked examples (d = definition,
# output = expected answer). Every `output` example is valid JSON with exactly
# the keys "verbs" and "templates", because that is the shape the answer is
# validated against after json.loads. The earlier examples mixed in a
# "definition" key, single-quoted strings, unbalanced brackets and a non-infinitive
# "is", which taught the model answers that the validation then rejects.
# The active-voice half of a passive definition is written with "(someone)"
# in all examples so the convention is the same throughout.
prompt_template = """A given definition example: d =  "s/he or it (animate) is warmed at the fire (by someone), "they" warm h/ or it at the fire". Analyze the definition d. What is subject and object? Rewrite definition by replacing subject by literal `{{subject}}`.  Replace verbs to infinitive form (e.g. wants -> want, is -> be, gets -> get).Answer in form {"verbs":[], "templates":[]}. Split the definition for each main verb. Extract the main verbs only, if the sentence is in passive voice, the main verb is "be". The answer for definition d should be in JSON format 
output = {"verbs":["be", "warm"], "templates":["{{subject}} be warmed at the fire (by someone)", "(someone) warm {{object-intransitive}} at the fire"]}.
Do not invent new verbs. Keep the new defnitions literally close as the original defitnition. Keep things in brackets as literal, e.g. (something) or (by someone). 

Another example:
d = "it (animate) is folded and sewn, is hemmed",
output = {"verbs":["be"], "templates":["{{subject}} be folded and sewn", "{{subject}} be hemmed"]}.

Another example:
d =  "s/he smudges, censes things"
output = {"verbs":["smudge", "cense"], "templates":["{{subject}} smudge things", "{{subject}} cense things"]}.

Another example:
d = "s/he smudges, censes h/ self"
output = {"verbs":["smudge", "cense"], "templates":["{{subject}} smudge oneself", "{{subject}} cense oneself"]}.

Another example:
d = "s/he gets a blister or blisters on h/ foot"
output = {"verbs":["get"], "templates":["{{subject}} get a blister or blisters on one's foot"]}.

Another example:
d = "s/he warms (something) at the fire"
output = {"verbs":["warm"], "templates":["{{subject}} warm (something) at the fire"]}.

Another example:
d = 'it (animate) is dyed, colored (by someone), "they" dye, color it (animate)'
output = {"verbs":["be", "dye", "color"], "templates":["{{subject}} be dyed, colored (by someone)", "(someone) dye {{object-intransitive}}", "(someone) color {{object-intransitive}}"]}.

Another example:
d = 's/he is attached by being tied on (by someone), "they" attach h/ by tying'
output = {"verbs":["be", "attach"], "templates":["{{subject}} be attached by being tied on (by someone)", "(someone) attach {{object-intransitive}} by tying"]}.

Another example:
d = 's/he is put in to soak, "they" put it in to soak'
output = {"verbs":["be", "put"], "templates":["{{subject}} be put in to soak", "(someone) put {{object-intransitive}} in to soak"]}.

Now process a new definition
"""
# Display the full prompt as it is sent for `word_def`.
f'{prompt_template}: "{word_def}"'

In [None]:
# Show the definition that the next request sends.
print(word_def)

In [None]:
# Send the few-shot prompt followed by the quoted definition and show the raw answer.
result = send_request(prompt_template + ' "' + word_def + '"', llm)
print(result)

In [None]:
# Reciprocal wording ("each other").
word_def = "they wake each other up"
result = send_request(prompt_template + ' "' + word_def + '"', llm)
print(result)

In [None]:
# Definition with two alternative subjects ("s/he, it (animate)").
word_def = "s/he, it (animate) has snow blown down off h/, it"
result = send_request(prompt_template + ' "' + word_def + '"', llm)
print(result)

In [None]:
# Passive definition with two participles; it is also one of the worked examples inside the prompt.
word_def = "it (animate) is folded and sewn, is hemmed"
result = send_request(prompt_template + ' "' + word_def + '"', llm)
print(result)

In [None]:
# Definition with a bracketed gloss and a second, ";"-separated sense.
word_def = "s/he makes a biskitenaagan (a birch bark sap bucket); [BL] s/he makes a birch bark basket"
result = send_request(prompt_template + ' "' + word_def + '"', llm)
print(result)

In [None]:
# Short passive definition.
word_def = "s/he is tied loosely"
result = send_request(prompt_template + ' "' + word_def + '"', llm)
print(result)

In [None]:
# Reflexive definition ("h/ self"); it is also one of the worked examples inside the prompt.
word_def = "s/he smudges, censes h/ self"
result = send_request(prompt_template + ' "' + word_def + '"', llm)
print(result)

In [None]:
# Passive definition with an active '"they"' half; same text as the first worked example in the prompt.
word_def = 's/he or it (animate) is warmed at the fire (by someone), "they" warm h/ or it at the fire'
result = send_request(prompt_template + ' "' + word_def + '"', llm)
print(result)

In [None]:
# Passive definition with two participles and an active '"they"' half.
word_def = 'it (animate) is dyed, colored (by someone), "they" dye, color it (animate)'
result = send_request(prompt_template + ' "' + word_def + '"', llm)
print(result)

In [None]:
# The dyed/colored passive definition, sent as its own request.
word_def = 'it (animate) is dyed, colored (by someone), "they" dye, color it (animate)'
result = send_request(prompt_template + ' "' + word_def + '"', llm)
print(result)

In [None]:
# Passive definition with a "by ..." manner phrase and an active '"they"' half.
word_def = 's/he is attached by being tied on (by someone), "they" attach h/ by tying'
result = send_request(prompt_template + ' "' + word_def + '"', llm)
print(result)

In [None]:
# Passive definition whose active half uses "it" as the object.
word_def = 's/he is put in to soak, "they" put it in to soak'
result = send_request(prompt_template + ' "' + word_def + '"', llm)
print(result)

In [None]:
def check_json_format(json_obj):
    """
    Check whether a parsed LLM answer has the shape {"verbs": [...], "templates": [...]}.

    The object is valid when all of the following hold:
      * it is a dict with exactly the two keys "verbs" and "templates";
      * both values are non-empty lists;
      * every template is a string whose {{...}} slots are limited to
        {{subject}} and {{object-intransitive}}.

    The reason for a rejection is printed.

    Parameters
    ----------
    json_obj : object
        Value to validate, typically the result of json.loads.

    Returns
    -------
    bool
        True when the object is valid, False otherwise.
    """
    if not isinstance(json_obj, dict):
        print("Wrong data type, expecting json dict object")
        return False

    if len(json_obj.keys()) != 2:
        print("Wrong keys, expecting 2 keys")
        return False

    if set(json_obj.keys()) != {"verbs", "templates"}:
        print(set(json_obj.keys()))
        print("Wrong keys items, expecting verbs and templates")
        return False

    verbs = json_obj["verbs"]
    if not isinstance(verbs, list) or len(verbs) == 0:
        print("Wrong verbs, expecting at least 1 verb")
        return False

    templates = json_obj["templates"]
    if not isinstance(templates, list) or len(templates) == 0:
        print("Wrong templates, expecting at least 1 template")
        return False

    vai_slots = {"{{subject}}", "{{object-intransitive}}"}  # slots allowed for VAI verbs
    # Raw string, so "\w" reaches the regex engine without an invalid-escape warning.
    # The "|" inside the character class is a literal pipe; it is kept so that such
    # slots are still detected and then rejected.
    slot_pattern = r"({{[\w|-]+}})"

    for template in templates:
        # The answer comes from an LLM, so a template may be null or a number;
        # reject it here instead of letting re.findall raise TypeError.
        if not isinstance(template, str):
            print("Wrong template, expecting a string =", template)
            return False

        # look for invalid slots such as {{object}}, {{distance}}, etc
        slots = re.findall(slot_pattern, template)
        if len(set(slots).difference(vai_slots)) > 0:
            print("Wrong slots in template =", slots)
            return False

    # passed all conditions
    return True

assert check_json_format(dict()) == False
assert check_json_format({"verbs":["verb"], "templates":["template 1", "template 2"]}) == True
assert check_json_format({"verbs":[], "templates":["template 1", "template 2"]}) == False
assert check_json_format({"verbs":["verb1", "verb2"], "templates":["template 1", "template 2"]}) == True
assert check_json_format({"verbs":["verb1"], "templates":[], "something else":[]}) == False
assert check_json_format({"verbs":["verb1"], "templates":[], "POS":[]}) == False
assert check_json_format({"verbs":["verb1"], "templates":["{{subject}} see {{object}}"]}) == False
assert check_json_format({"verbs":["verb1"], "templates":["{{subject}} is hungry"]}) == True
assert check_json_format({"verbs":["verb1"], "templates":["{{subject}} buy it for {{object-reflective}}"]}) == False
assert check_json_format({"verbs":["verb1"], "templates":["{{subject}} buy it for oneself"]}) == True
assert check_json_format({"verbs":["verb1"], "templates":["they warm {{object-intransitive}} at the fire"]}) == True
assert check_json_format({"verbs":["verb1"], "templates":[None]}) == False


print("Passed")

In [None]:
def str2json(s):
    """
    Extract the JSON object from a raw LLM answer and validate its shape.

    Parameters
    ----------
    s : str
        Raw answer text. Text before the first "{" and after the last "}" is ignored.

    Returns
    -------
    dict or str
        The parsed dict when it passes check_json_format, otherwise the empty
        string "" (callers in this notebook test the result with `== ""`).
    """
    json_str = ""
    try:
        # Newlines are removed so the {...} pattern (no DOTALL) can span a multi-line answer.
        # This runs inside the try so that a non-string answer (e.g. None) is
        # reported as a parse failure instead of raising AttributeError.
        s = s.strip().replace("\n", "")
        # extract {...} using regex (greedy: first "{" to last "}")
        json_str = re.findall(r"{.*}", s)[0]
        result = json.loads(json_str)

        if check_json_format(result):
            print("JSON format check OK")
        else:
            print(f"Wrong JSON format. Item = \n{result}")
            return ""
    except Exception:
        # Exception rather than a bare except: KeyboardInterrupt / SystemExit
        # must still be able to stop a long processing run.
        print("Error parsing json =", json_str)
        return ""

    return result
    
# Parse the most recent raw answer (`result`) and show its parts.
ex = str2json(result)
print(ex)
# str2json returns "" when parsing or validation failed; indexing "" with a
# key raises TypeError, so only unpack a successful parse.
if ex != "":
    print(ex['verbs'])
    print(ex['templates'])

In [None]:
# try a more complex example
s = "s/he is heard running along in snow"
# str2json returns "" on failure; `or {}` with .get() then yields None
# instead of raising TypeError on ""['templates'].
(str2json(send_request(f'{prompt_template}: "{s}"', llm)) or {}).get('templates')

In [None]:
# Add an "llm_templates" column filled with ""; the processing loop treats "" as "not processed yet".
df["llm_templates"] = [""] * len(df)
df.head()

In [None]:
def llm_func(text):
    """Send `text` to the LLM after the few-shot prompt and return str2json's result for the answer."""
    return str2json(send_request(f'{prompt_template}: "{text}"', llm))

In [None]:
# Run every definition through the LLM and store the parsed answer in "llm_templates".
# Rows that already hold a result are skipped, so the cell can be re-run to resume.
n = len(df)
print("Len df =", n)
max_row = n # set to n for full set

error_count = 0

for i in range(max_row):
    print(f"Processing row {i+1} / {max_row}, {(i+1)*100/max_row:.0f} %")
    if df.iloc[i]["llm_templates"] != "":
        print("Already processed. Skipping...")
        print("-----------------------------")
        continue

    definition = df.iloc[i]["definition"]
    # The text actually sent to the LLM: trimmed and lower-cased.
    llm_input = definition.strip().lower()
    parsed_item = llm_func(llm_input)

    if parsed_item == "":
        # llm_func returns "" when the answer could not be parsed or validated
        print("Error parsing result")
        error_count += 1
        print("Error count so far =", error_count)

    print("\tDefinition =", definition)
    # Show the normalized text that was sent, not the raw definition again.
    print("\tInput =", llm_input)
    print("\tParsed =", parsed_item)
    print("-----------------------------")
    # A failed row is stored as "", which keeps it eligible for a later retry.
    df.at[i, "llm_templates"] = parsed_item

print("Completed")
print("Error count =", error_count)

In [None]:

# debug
s = "s/he gets a blister or blisters on h/ foot"
# str2json returns "" on failure; `or {}` with .get() then yields None
# instead of raising TypeError on ""['templates'].
(str2json(send_request(f'{prompt_template}: "{s}"', llm)) or {}).get('templates')

In [None]:
# Report every row whose stored answer is "" or has no 'templates' key.
count = 0
for i in range(len(df)):
    entry = df.iloc[i]
    stored = entry["llm_templates"]
    has_templates = stored != "" and 'templates' in stored.keys()
    if has_templates:
        continue

    count += 1
    print("Id =", i)
    print("Definition =", entry["definition"])
    print("LLM parsed text =", stored)
    print("-----------------")

print("Total count =", count)

In [None]:
# debug: run one hand-picked definition through llm_func (request + parsing + validation)
s = "make it rise with heat; pop it (wild rice)"
tmp = llm_func(s)
print(tmp)

In [None]:
# Second pass: only rows whose "llm_templates" is still "" are sent again.
error_count = 0

for i in range(max_row):
    current = df.iloc[i]
    if current["llm_templates"] == "":
        parsed_item = llm_func(current["definition"].strip().lower())

        if parsed_item == "":
            # the answer could not be parsed or validated this time either
            print("Error parsing result")
            error_count += 1
            print("Error count so far =", error_count)

        print("\tDefinition =", current["definition"])
        print("\tParsed =", parsed_item)
        print("-----------------------------")
        df.at[i, "llm_templates"] = parsed_item

print("Completed")
print("Error count =", error_count)

In [None]:
# double check how many empty output: list the rows whose llm_templates is still ""
df.query("llm_templates == ''")

In [None]:
# Persist the table with the LLM results twice: as CSV, then as JSON records.
output_filename = f"{DATA_PATH}vai_dict.csv"
df.to_csv(output_filename, index=False)

output_filename = f"{DATA_PATH}vai_dict.json"
df.to_json(output_filename, orient="records")

print("Completed")