## 01-VTA Pre-processing
In this notebook, we filter the VTA verbs out of the dictionary and do some pre-processing on them.

In [None]:
!pip install groq # groq package to connect to LLM API on groq.com

In [None]:
import ast
# parse the json string
import json
import re

import nltk
import pandas as pd
from nltk.tag import pos_tag
from nltk.tokenize import word_tokenize

In [None]:
### First, we load the dictionary
# DATA_PATH is reused by later cells when writing the outputs.
DATA_PATH = "../../../data/"
filename = f"{DATA_PATH}ob_en_dict.csv"
dict_df = pd.read_csv(filename)
dict_df.head()

In [None]:
# Keep only the entries whose `type` value contains the filter tag.
filter_tag = "vta"
# na=False treats a missing `type` as "no match" instead of producing a
# non-boolean mask that cannot be used for filtering.
is_tagged = dict_df["type"].str.contains(filter_tag, na=False)
# drop=True discards the old index directly instead of materialising it as an
# "index" column and dropping that column afterwards.
df = dict_df.loc[is_tagged].reset_index(drop=True)
print(len(df))
df.head()

### Using LLM to process meanings


In [None]:
# dictionary preview
# Show the definition stored at one row position of the filtered dictionary.
i = 200
preview_row = df.iloc[i]
print(preview_row["definition"])

In [None]:
# Extend the module search path so that the `import LLM_api` below can resolve.
import sys  
sys.path.insert(1, '../') # LLM_api.py is in the parent folder

### Groq API key 
Please go to Groq.com to sign up and get an API key, then put it into `src/01_data_preprocessing/env/credentials.json` in the following format:

{"GROQ_API_KEY":"your_api_key"
}

In [None]:
# Project-local helpers used by the cells below to talk to the LLM.
import LLM_api
from LLM_api import hello, get_api_key, connect, send_request

# NOTE(review): presumably a smoke test that the module imported correctly — confirm in LLM_api.py
hello("API")

In [None]:
# NOTE(review): get_api_key presumably reads the credentials file described above — confirm in LLM_api.py
api_key = get_api_key()
# Only the length is printed, so the key itself does not end up in the cell output.
print(len(api_key))

In [None]:
# `llm` is the handle passed to every send_request call in this notebook.
llm = connect(api_key)

In [None]:
# Send one trivial question before building the real prompt.
# NOTE(review): the leading "JSON" presumably satisfies a JSON-output requirement of send_request — confirm in LLM_api.py
s = "JSON What is the biggest city of New York state?"
send_request(s, llm)

In [None]:
# Few-shot prompt sent ahead of every definition.
# - This is a plain (non-f) string, so "{{subject}}" / "{{object}}" reach the model
#   literally, with both braces.
# - Every "Output = ..." line is valid JSON (double quotes), so the examples show the
#   model exactly the format the answer is later parsed from.
# - The definition to process is appended after the last line by the caller.
prompt_template = """A given definition example: d =  "smudge, cense h/; smoke h/ (for preservation)".
Analyze the definition d. What is subject and object? Rewrite definition by replacing subject and object by literal `{{subject}}` and `{{object}}`.
Replace verbs to infinitive form (e.g. wants -> want, is -> be, gets -> get).
Answer in form {"verbs":[], "templates":[]}. Split the definition for each main verb.
Note the words like "something" or "(it)", don't parse them as "{{object}}", keep them as literal.
Translate "h/ or it" to "{{object}}".
Extract the main verbs only, if the sentence is in passive voice, the main verb is "be". The answer for definition d should be in JSON format
output = {"verbs": ["smudge", "cense", "smoke"], "templates": ["{{subject}} smudge {{object}}", "{{subject}} cense {{object}}", "{{subject}} smoke {{object}} (for preservation)"]}.
Do not invent new verbs. Keep the new definitions literally close as the original definition. Keep things in brackets as literal, e.g. (it), (something) or (by someone).

Below are more examples:

Definition = pull h/ aboard
Output = {"verbs": ["pull"], "templates": ["{{subject}} pull {{object}} aboard"]}
--------------------
Definition = fix, repair (it) for h/
Output = {"verbs": ["fix", "repair"], "templates": ["{{subject}} fix (it) for {{object}}", "{{subject}} repair (it) for {{object}}"]}
--------------------
Definition = throw h/ aboard
Output = {"verbs": ["throw"], "templates": ["{{subject}} throw {{object}} aboard"]}
--------------------
Definition = cool h/ with water
Output = {"verbs": ["cool"], "templates": ["{{subject}} cool {{object}} with water"]}
--------------------
Definition = cook it (animate)
Output = {"verbs": ["cook"], "templates": ["{{subject}} cook {{object}} (animate)"]}
--------------------
Definition = throw (it) here to h/
Output = {"verbs": ["throw"], "templates": ["{{subject}} throw (it) here to {{object}}"]}
--------------------
Definition = cut it (animate; sheet-like) short
Output = {"verbs": ["cut"], "templates": ["{{subject}} cut {{object}} (animate; sheet-like) short"]}
--------------------
Definition = cut it (animate) so wide
Output = {"verbs": ["cut"], "templates": ["{{subject}} cut {{object}} (animate) so wide"]}
--------------------
Definition = staunch h/ bleeding
Output = {"verbs": ["staunch"], "templates": ["{{subject}} staunch {{object}} bleeding"]}
--------------------
Definition = ride mounted on top of h/; sit astride h/
Output = {"verbs": ["ride", "sit"], "templates": ["{{subject}} ride mounted on top of {{object}}", "{{subject}} sit astride {{object}}"]}
--------------------
Definition = warm something (liquid) up for h/
Output = {"verbs": ["warm"], "templates": ["{{subject}} warm something (liquid) up for {{object}}"]}
--------------------
Definition = warm something for h/ at the fire
Output = {"verbs": ["warm"], "templates": ["{{subject}} warm something for {{object}} at the fire"]}
--------------------
Definition = warm h/ foot or feet
Output = {"verbs": ["warm"], "templates": ["{{subject}} warm {{object-possessive}} foot or feet"]}
--------------------
Definition = catch up to h/ following h/ tracks or trail
Output = {"verbs": ["catch"], "templates": ["{{subject}} catch up to {{object}} following {{object-possessive}} tracks or trail"]}
--------------------
Definition = dye, color h/ or it (animate)
Output = {"verbs": ["dye", "color"], "templates": ["{{subject}} dye {{object}} (animate)", "{{subject}} color {{object}} (animate)"]}
--------------------
Definition = dye, color (it) for h/
Output = {"verbs": ["dye", "color"], "templates": ["{{subject}} dye (it) for {{object}}", "{{subject}} color (it) for {{object}}"]}
--------------------

Now process a new definition
"""


In [None]:
# Single definition used to try the prompt by hand; the commented-out line is an alternative input.
# word_def = "warm it (animate; mineral), heat it (animate; mineral) up"
word_def = "dye, color h/ (animate)"

In [None]:
# Show the definition that gets appended to the prompt in the next cells.
print(word_def)

In [None]:
# Preview the complete prompt text: the template, then ': ' and the quoted definition.
print(f'{prompt_template}: "{word_def}"')

In [None]:
# Send the template followed by ': "<definition>"' — the same layout the other
# send_request calls in this notebook build — so this trial exercises the real prompt.
result = send_request(f'{prompt_template}: "{word_def}"', llm)
print(result)

In [None]:
def check_json_format(json_obj):
    """
        Check if the Json object format is valid ({"verbs" : [], "templates" : []})

        The object is valid when it is a dict with exactly the keys "verbs" and
        "templates", both holding non-empty lists, and every template is a string
        whose `{{...}}` slots are all among `{{subject}}`, `{{object}}` and
        `{{object-possessive}}`.

        The reason for the first failed check is printed.

        Parameters:
            json_obj: the parsed answer to validate (any type is accepted)

        Returns:
            True if every check passes, False otherwise
    """
    if not isinstance(json_obj, dict):
        print("Wrong data type, expecting json dict object")
        return False

    if len(json_obj.keys()) != 2:
        print("Wrong keys, expecting 2 keys")
        return False

    if set(json_obj.keys()) != {"verbs", "templates"}:
        print(set(json_obj.keys()))
        print("Wrong keys items, expecting verbs and templates")
        return False

    verbs = json_obj["verbs"]
    if not isinstance(verbs, list) or len(verbs) == 0:
        print("Wrong verbs, expecting at least 1 verb")
        return False

    templates = json_obj["templates"]
    if not isinstance(templates, list) or len(templates) == 0:
        print("Wrong templates, expecting at least 1 template")
        return False

    valid_slots = {"{{subject}}", "{{object}}", "{{object-possessive}}"} # slots for VTA verbs
    # Match anything wrapped in double braces (not only word characters), so that
    # malformed slots such as "{{the object}}" are detected instead of ignored.
    slot_pattern = re.compile(r"\{\{.*?\}\}")

    for template in templates:
        if not isinstance(template, str):
            print("Wrong template, expecting a string =", template)
            return False

        # look for invalid slots such as {{distance}}, {{adjective}}, etc
        slots = slot_pattern.findall(template)
        if set(slots) - valid_slots:
            print("Wrong slots in template =", slots)
            return False

    # passed all condition
    return True

# Sanity checks for check_json_format: each entry is (candidate object, expected verdict).
format_cases = [
    (dict(), False),
    ({"verbs":["verb"], "templates":["template 1", "template 2"]}, True),
    ({"verbs":[], "templates":["template 1", "template 2"]}, False),
    ({"verbs":["verb1", "verb2"], "templates":["template 1", "template 2"]}, True),
    ({"verbs":["verb1"], "templates":[], "something else":[]}, False),
    ({"verbs":["verb1"], "templates":[], "POS":[]}, False),
    ({"verbs":["verb1"], "templates":["{{subject}} see {{object}}"]}, True),
    ({"verbs":["verb1"], "templates":["{{subject}} is hungry"]}, True),
    ({"verbs":["verb1"], "templates":["{{subject}} buy it for {{object}}"]}, True),
    ({"verbs":["verb1"], "templates":["{{subject}} buy it for {{adjective}}"]}, False),
    ({"verbs":["verb1"], "templates":["they warm {{object-possessive}} feet at the fire"]}, True),
]

for candidate, expected in format_cases:
    assert check_json_format(candidate) == expected


print("Passed")

In [None]:
def str2json(s):
    """
        convert string to json format and check if the format is valid

        The first `{...}` span of the text is extracted and parsed as JSON. If
        that fails, the span is parsed as a Python literal instead, which accepts
        single-quoted answers such as {'verbs': [...], 'templates': [...]}.
        The parsed object is then validated with check_json_format.

        Parameters:
            s: raw text (e.g. the LLM answer)

        Returns:
            the parsed dict when parsing and validation succeed, otherwise the
            empty string "" (the failure value callers compare against)
    """
    if not isinstance(s, str):
        # e.g. None: there is no text to parse
        print("Error parsing json =", s)
        return ""

    # clean and remove \n, so that the regex below can span a multi-line answer
    flattened = s.strip().replace("\n", "")
    json_str = ""
    try:
        # extract {...} using regex (greedy: first "{" up to the last "}")
        match = re.search(r"\{.*\}", flattened)
        if match is None:
            raise ValueError("no {...} object found in the text")
        json_str = match.group(0)

        try:
            result = json.loads(json_str)
        except json.JSONDecodeError:
            # literal_eval only evaluates literals, so it is safe on untrusted text
            result = ast.literal_eval(json_str)

        if check_json_format(result):
            print("JSON format check OK")
        else:
            print(f"Wrong JSON format. Item = \n{result}")
            return ""
    except Exception as err:
        # Best effort: any failure yields "", but KeyboardInterrupt / SystemExit
        # are no longer swallowed as they were by a bare `except:`.
        print("Error parsing json =", json_str)
        print("Reason =", repr(err))
        return ""

    return result
    
# Parse the raw answer received for `word_def`.
ex = str2json(result)
print(ex)
# str2json returns "" when parsing or validation failed; indexing "" with a key
# would raise a TypeError, so the fields are only shown for a successful parse.
if ex != "":
    print(ex['verbs'])
    print(ex['templates'])

In [None]:
# try a more complex example
# NOTE(review): str2json returns "" on a parsing/format failure, in which case the
# ['templates'] lookup below raises a TypeError.
s = "attach it (animate) (using something); sew it (animate) on"
str2json(send_request(f'{prompt_template}: "{s}"', llm))['templates']

In [None]:
# create place holder column
# Every row starts as "": the processing loops further down treat "" as
# "not processed yet" and store each row's parsed result in this column.
df["llm_templates"] = df["definition"].apply(lambda x: "")
df.head()

In [None]:
def llm_func(text):
    """Append `text` to the prompt template, send it to the LLM and return str2json's result."""
    llm_prompt = f'{prompt_template}: "{text}"'
    raw_answer = send_request(llm_prompt, llm)
    return str2json(raw_answer)

### Sampling data for few-shot training

In [None]:
# Draw a fixed (seeded) sample of 10 entries and list their fields.
tmp = df.sample(n=10, random_state=42)
for i in range(len(tmp)):
    item = tmp.iloc[i]
    for label, column in (("Lemma", "lemma"), ("Definition", "definition"), ("Type", "type")):
        print(f"{label} =", item[column])
    print("-"*20)

In [None]:
# Run the prompt on each sampled entry and show the input next to the parsed output.
print("Running on the sampled set, used for few-shot learning...")
for i in range(len(tmp)):
    item = tmp.iloc[i]
    definition = item["definition"]
    print(f"Lemma = {item['lemma']}")
    print(f"Definition = {definition}")
    print(f"Type = {item['type']}")

    # The LLM receives the definition stripped and lower-cased.
    parsed_item = llm_func(definition.strip().lower())

    print(f"Output = {parsed_item}")
    print("-"*20)

##### We will manually adjust the templates and put them back into the prompt
When all outputs for the sampled set look good, we will proceed to the actual dictionary.

### Actual processing on the dictionary

In [None]:
# Process every not-yet-processed row of `df` with the LLM and store the parsed
# result in the "llm_templates" column. Rows whose value is not "" are skipped,
# so the cell can be re-run to resume an interrupted pass.
n = len(df)
print("Len df =", n)
max_row = n # set to n for full set

error_count = 0
start_id = 20 if max_row < n else 0 # for debug purpose
# The checkpoint path does not change, so it is built once outside the loop.
output_filename = DATA_PATH + "vta_dict_checkpoint.csv"

for i in range(start_id, max_row):
    # save check-point data
    if (i+1) % 50 == 0:
        print("Writing checkpoint...")
        df.to_csv(output_filename, index=False)
        print("OK")

    print(f"Processing row {i+1} / {max_row}, {(i+1)*100/max_row:.0f} %")
    if df.iloc[i]["llm_templates"] != "":
        print("Already processed. Skipping...")
        print("-----------------------------")
        continue

    definition = df.iloc[i]["definition"]
    # The text actually sent to the LLM: stripped and lower-cased.
    llm_input = definition.strip().lower()
    parsed_item = llm_func(llm_input)

    if parsed_item == "":
        # error parsing? The row keeps "" and is picked up again by a later pass.
        print("Error parsing result")
        error_count += 1
        print("Error count so far =", error_count)

    print("\tDefinition =", definition)
    # Show the cleaned text rather than repeating the raw definition.
    print("\tInput =", llm_input)
    print("\tParsed =", parsed_item)
    print("-----------------------------")
    # NOTE(review): .at addresses by label while .iloc above addresses by position;
    # this assumes df still has its default 0..n-1 index.
    df.at[i, "llm_templates"] = parsed_item

print("Completed")
print("Error count =", error_count)

In [None]:
# List every row whose stored result is "" or lacks a 'templates' entry, then report the total.
count = 0
for i in range(len(df)):
    row = df.iloc[i]
    parsed = row["llm_templates"]
    if parsed == "" or 'templates' not in parsed.keys():
        count += 1
        print("Id =", i)
        print("Definition =", row["definition"])
        print("LLM parsed text =", parsed)
        print("-----------------")

print("Total count =", count)

In [None]:
# Second pass: send only the rows that still hold "" to the LLM again.
error_count = 0

for i in range(max_row):
    row = df.iloc[i]
    if row["llm_templates"] == "":
        parsed_item = llm_func(row["definition"].strip().lower())

        if parsed_item == "":
            # still not parseable
            print("Error parsing result")
            error_count += 1
            print("Error count so far =", error_count)

        print("\tDefinition =", row["definition"])
        print("\tParsed =", parsed_item)
        print("-----------------------------")
        df.at[i, "llm_templates"] = parsed_item

print("Completed")
print("Error count =", error_count)

In [None]:
# double check how many empty output
# Rows still holding "" have no parsed result stored.
df.query("llm_templates == ''")

In [None]:
# Persist the processed dictionary twice: as CSV and as a JSON list of records.
output_filename = f"{DATA_PATH}vta_dict.csv"
df.to_csv(output_filename, index=False)

output_filename = f"{DATA_PATH}vta_dict.json"
df.to_json(output_filename, orient="records")


print("Completed")