## 01-VTI Pre-processing
In this notebook, we filter the dictionary down to the VTI verbs and pre-process their definitions.

In [None]:
!pip install groq # groq package to connect to LLM API on groq.com

In [None]:
import pandas as pd
import nltk
from nltk.tag import pos_tag
from nltk.tokenize import word_tokenize
# parse the json string
import json
import re

In [None]:
### Load the dictionary CSV into a DataFrame and preview it
DATA_PATH = "../../../data/"
filename = f"{DATA_PATH}ob_en_dict.csv"
dict_df = pd.read_csv(filename)
dict_df.head()

In [None]:
# Keep only the dictionary entries whose "type" contains the VTI tag.
filter_tag = "vti"
# regex=False: the tag is matched as a literal substring.
# na=False: rows with a missing type are excluded instead of raising an error.
is_vti = dict_df["type"].str.contains(filter_tag, regex=False, na=False)
# reset_index(drop=True) renumbers the rows 0..n-1 without first creating an
# "index" column; later cells address rows by these positions.
df = dict_df[is_vti].reset_index(drop=True)
print(len(df))
df.head()

In [None]:
# Count and preview the definitions that contain the shorthand "s/he"
# (the term is passed to str.contains, which treats it as a regex by default)
term = "s/he"
temp = df.query("definition.str.contains(@term)")
print(len(temp))
temp.head()

In [None]:
# replace shorthands with their full forms (e.g. s/he -> he/she, h/ -> him/her)
def func(text):
    """Expand the pronoun shorthands used in the dictionary definitions.

    "s/he" becomes "he/she" and "h/" becomes "him/her". A shorthand is
    recognised at the start of the text or after any character that is not
    a word character or "/", so "(h/)" and a leading "h/" are expanded
    while "his/hers" is left untouched.

    Args:
        text (str): the original definition.

    Returns:
        str: the definition with the shorthands expanded.
    """
    # (?<![\w/]) : not preceded by a letter/digit/underscore or a slash
    text = re.sub(r"(?<![\w/])s/he\b", "he/she", text)
    return re.sub(r"(?<![\w/])h/", "him/her", text)

print(func("mishear h/"))
print(func("use something of his/hers, use (it) for h/"))

In [None]:
# Store the shorthand-expanded definitions in a new column
df["def_normalized"] = df["definition"].map(func)
df

In [None]:
# Use a regex to collapse a parenthesised annotation that follows the pronoun
# "it", e.g. "it (animate; mineral)" -> "it".
# (`re` is already imported in the imports cell at the top of the notebook.)
s = "warm it (animate; mineral), heat it (animate; mineral)"
# Raw string: "\(" inside a normal string literal is an invalid escape sequence.
# \b: only the standalone word "it" is matched, so a verb that merely ends in
# "it" keeps its parenthesised object (e.g. "split (it)" stays unchanged).
r_str = r"\bit \([^)]*\)"
re.sub(r_str, "it", s)

In [None]:
# Strip the parenthesised annotations after "it" across the whole column
def _drop_it_annotation(text):
    """Apply the r_str substitution to a single definition."""
    return re.sub(r_str, "it", text)

df["def_normalized"] = df["def_normalized"].map(_drop_it_annotation)
print("Completed")
df

In [None]:
# Show the original definitions that contain the phrase "or it"
df.query("definition.str.contains('or it')")

In [None]:
# Drop the redundant "or it" alternative that follows a pronoun
def _drop_or_it(text):
    """Apply the "or it" rewrites to one definition, in a fixed order."""
    for old, new in (
        ("him/her or it", "him/her"),
        ("him/heror it", "him/her"),  # special case, typo in original definition
        ("he/she or it", "he/she"),
        ("by it or it", "by it"),     # special case
    ):
        text = text.replace(old, new)
    return text

df["def_normalized"] = df["def_normalized"].apply(_drop_or_it)

df.query("def_normalized.str.contains('or it')") # should return close to zero rows

In [None]:
# "he/she" marks an object that acts as the subject of a clause, e.g. in
# "like hearing him/her, like how he/she sounds"; it becomes {{object1}}.
def obj1_f(text):
    """Return text with every "he/she" replaced by "{{object1}}"."""
    return text.replace("he/she", "{{object1}}")

print(obj1_f("like hearing him/her, like how he/she sounds"))
print(obj1_f("hello world")) # should still return "hello world"

In [None]:
# Start the template column from the normalized definitions
df["def_template"] = df["def_normalized"].map(obj1_f)
df.head()

In [None]:
# Preview the normalized definitions that contain the object pronoun "him/her"
# (the expanded form of the "h/" shorthand)
term = "him/her"
df.query("def_normalized.str.contains(@term)").head()

In [None]:
# The object pronoun "him/her" (expanded from h/) becomes the {{object}} slot
def obj_f(text):
    """Return text with every "him/her" replaced by "{{object}}"."""
    return text.replace("him/her", "{{object}}")

print(obj_f("like hearing him/her, like how he/she sounds"))
print(obj_f("hello world")) # should still return "hello world"

In [None]:
# Replace "him/her" in the template column
df["def_template"] = df["def_template"].map(obj_f)
df.head()

In [None]:
# replace object 'it' with {{object}}, if the definition **doesn't** already contain a {{...}} slot
def obj_it_f(text):
    """Turn the pronoun "it" into the {{object}} slot.

    Text that already contains a "{{" slot is returned unchanged. Otherwise:
      * "(it)" (the bracketed form) becomes "{{object}}",
      * "one (animate)" (special case) becomes "{{object}}",
      * the standalone word "it" becomes "{{object}}".

    Only the whole word "it" is replaced, so words that merely start with
    "it" ("its", "item", "itself") are left alone.

    Args:
        text (str): the definition template.

    Returns:
        str: the template with "it" replaced by the object slot.
    """
    if "{{" in text:
        return text
    # The bracketed form is handled first so its parentheses are removed too.
    text = text.replace("(it)", "{{object}}")
    text = text.replace("one (animate)", "{{object}}")
    return re.sub(r"\bit\b", "{{object}}", text)

print(obj_it_f("warm it (animate; mineral)")) # should replace 'it' -> '{{object}}'
print(obj_it_f("warm it (animate; mineral) by {{object}}")) # should NOT replace 'it' 
print(obj_it_f("hello world")) # should still return "hello world"

In [None]:
# Replace the pronoun "it" in the template column
df["def_template"] = df["def_template"].map(obj_it_f)
df.head()

In [None]:
# replace object 'them' with {{object}}, if the definition doesn't already contain a {{...}} slot
def obj_them_f(text):
    """Turn the pronoun "them" into the {{object}} slot.

    Text that already contains a "{{" slot is returned unchanged. Only the
    whole word "them" is replaced, so "themselves" or "anthem" are left alone.

    Args:
        text (str): the definition template.

    Returns:
        str: the template with "them" replaced by the object slot.
    """
    if "{{" in text:
        return text
    return re.sub(r"\bthem\b", "{{object}}", text)

print(obj_them_f("assemble them in a group")) # should replace 'them' -> {{object}}
print(obj_them_f("assemble them in a group by {{object}}")) # should NOT replace 'them'
print(obj_them_f("hello world")) # should still return "hello world"

In [None]:
# Replace the pronoun "them" in the template column
df["def_template"] = df["def_template"].map(obj_them_f)
df.head()

In [None]:
# Check whether any row still has no {{...}} slot in its template
has_slot = df["def_template"].str.contains('{{')
temp_df = df[~has_slot]
print(len(temp_df)) # should very close to zero
if not temp_df.empty:
    print(temp_df.iloc[0])
temp_df 

In [None]:
# Preview the templates that contain ";", i.e. several definitions in one entry
df.query("def_template.str.contains(';')").head()

In [None]:
# Break one entry into multiple definitions by splitting on ";"
# (note that the pieces after the first keep their leading space)
s = "retie {{object}}; rewrap {{object}}"
s.split(";")

In [None]:
# Split every ";"-separated entry into a list of individual templates.
# Each piece is stripped so "a; b" yields ["a", "b"] rather than ["a", " b"],
# and empty pieces (e.g. produced by a trailing ";") are dropped.
df["templates"] = df["def_template"].apply(
    lambda text: [part.strip() for part in text.split(";") if part.strip()]
)
df

In [None]:
# Save the pre-processed VTI entries as a CSV file (without the row index)
output_filename = f"{DATA_PATH}vti_dict.csv"
df.to_csv(output_filename, index=False)

print("Completed")

### Using LLM to process meanings
```
Example: a definition of a word is "seek {{object}} or it; go look for {{object}} or it; search for {{object}} or it", if there are multiple meanings, split into multiple template in JSON format: {"verbs": ["seek", "look for", "search for"], "templates":["seek {{object}} or it", "go look for {{object}} or it", "search for {{object}} or it"]}. 
Output JSON format only, no explanation or discussion. Now rephrase a new definition "reach for, feel for {{object}}".
```

In [None]:
import sys  
sys.path.insert(1, '../') # LLM_api.py is in the parent folder

### Groq API key 
Please go to Groq.com to sign up and get an API key, then put it into `src/01_data_preprocessing/env/credentials.json` in the following format:

```json
{"GROQ_API_KEY":"your_api_key"
}
```

In [None]:
# Helpers from the local LLM_api module (made importable by the sys.path entry added above)
import LLM_api
from LLM_api import hello, get_api_key, connect, send_request

# NOTE(review): presumably a smoke test that the module was imported correctly — confirm in LLM_api.py
hello("API")

In [None]:
# NOTE(review): presumably reads the Groq key from the credentials file described above — confirm in LLM_api.py
api_key = get_api_key()
# Print only the length, so the key itself never appears in the cell output
print(len(api_key))

In [None]:
# Create the LLM handle that is passed to send_request in the cells below
llm = connect(api_key)

In [None]:
# Quick end-to-end check of the connection with an unrelated question
# NOTE(review): the leading "JSON" presumably satisfies a JSON-output requirement of the API — confirm
s = "JSON What is the biggest city of New York state?"
send_request(s, llm)

In [None]:
# Definition used to try out the prompt
word_def = 'reach for, feel for {{object}}'
# The commented-out assignment below appears to be an alternative single-line wording of the prompt, kept for reference.
# prompt_template = 'Example: a definition of a word is "seek {{object}} or it; go look for {{object}} or it; search for {{object}} or it", if there are multiple meanings, split into multiple template in JSON format: {"verbs": ["seek", "look for", "search for"], "templates":["seek {{object}} or it", "go look for {{object}} or it", "search for {{object}} or it"]}. Output JSON format only, no explanation or discussion, skip subject, prepositions, examples, location, etc. Only keep object and verb. Keep "(it)" as "(it)". Now rephrase a new definition '
# One-shot prompt: a worked example, the expected JSON shape, then the instructions.
# The definition to process is appended after this text by the cells below.
prompt_template = """Example: a definition of a word is "seek {{object}} or it; go look for {{object}} or it; search for {{object}} or it", if there are multiple meanings, split into multiple template in JSON format: 
{"verbs": ["seek", "look for", "search for"], "templates":["seek {{object}} or it", "go look for {{object}} or it", "search for {{object}} or it"]}. 
Output JSON format only, no explanation or discussion, skip subject, prepositions, examples, location, etc. Only keep object and verb. Include verbs that appear in the original definition only. Do not invent new verbs. 
Keep the meaning exactly as from the original definition. Now process a new definition
"""
# Display the full request text for the sample definition
f'{prompt_template}: "{word_def}"'

In [None]:
# Show the definition that is about to be sent to the LLM
print(word_def)

In [None]:
# Send the sample definition to the LLM and show the raw reply
request_text = f'{prompt_template} "{word_def}"'
result = send_request(request_text, llm)
print(result)

In [None]:
# Another example: two ";"-separated meanings that use "it" as the object
word_def = "smudge, cense it; smoke it (for preservation)"
request_text = f'{prompt_template} "{word_def}"'
result = send_request(request_text, llm)
print(result)

In [None]:
# Another example: several verbs sharing one object, plus a parenthesised note
word_def = "wish for, hope for, want, desire {{object}} (something difficult to get)"
request_text = f'{prompt_template} "{word_def}"'
result = send_request(request_text, llm)
print(result)

In [None]:
# Another example: a single verb followed by extra detail after the object
word_def = "see {{object}} at a certain distance"
request_text = f'{prompt_template} "{word_def}"'
result = send_request(request_text, llm)
print(result)

In [None]:
# Another example: a single verb followed by two prepositional phrases
word_def = "carry {{object}} from a certain place on the shoulder"
request_text = f'{prompt_template} "{word_def}"'
result = send_request(request_text, llm)
print(result)

In [None]:
# Test finding {{something}} slots in a template
s = "{{subject}} want to do something with {{object}} at a {{distance}}"
# Raw string so "\w" is not an invalid string escape; the braces are escaped
# so they are unambiguously literal characters rather than quantifier syntax.
r = r"(\{\{\w+\}\})"

slots = re.findall(r, s)
print(slots)
print(set(slots).difference({"{{object}}"})) # remove {{object}} from template

In [None]:
def check_json_format(json_obj):
    """
        Check whether a parsed LLM reply has the expected shape
        {"verbs": [...], "templates": [...]}.

        The object is valid when:
          * it is a dict with exactly the keys "verbs" and "templates",
          * both values are non-empty lists,
          * every template is a string whose only {{...}} slot, if any,
            is {{object}}.

        A short reason is printed for the first failed check.

        Args:
            json_obj: the object to validate (normally the output of json.loads).

        Returns:
            bool: True if every check passes, False otherwise.
    """
    if not isinstance(json_obj, dict):
        print("Wrong data type, expecting json dict object")
        return False

    if len(json_obj) != 2:
        print("Wrong keys, expecting 2 keys")
        return False

    if set(json_obj) != {"verbs", "templates"}:
        print(set(json_obj))
        print("Wrong keys items, expecting verbs and templates")
        return False

    verbs = json_obj["verbs"]
    if not isinstance(verbs, list) or len(verbs) == 0:
        print("Wrong verbs, expecting at least 1 verb")
        return False

    templates = json_obj["templates"]
    if not isinstance(templates, list) or len(templates) == 0:
        print("Wrong templates, expecting at least 1 template")
        return False

    # matches any {{name}} slot, e.g. {{object}}, {{subject}}, {{distance}}
    slot_pattern = re.compile(r"\{\{\w+\}\}")
    for template in templates:
        # a non-string template (e.g. a number or nested list) cannot be searched
        if not isinstance(template, str):
            print("Wrong template, expecting a string but got =", template)
            return False
        # look for invalid slots such as {{subject}}, {{distance}}, etc
        slots = slot_pattern.findall(template)
        if set(slots) - {"{{object}}"}: # there are more slots than {{object}}
            print("Wrong slots in template =", slots)
            return False

    # passed all conditions
    return True

assert check_json_format(dict()) == False
assert check_json_format({"verbs":["verb"], "templates":["template 1", "template 2"]}) == True
assert check_json_format({"verbs":[], "templates":["template 1", "template 2"]}) == False
assert check_json_format({"verbs":["verb1", "verb2"], "templates":["template 1", "template 2"]}) == True
assert check_json_format({"verbs":["verb1"], "templates":[], "something else":[]}) == False
assert check_json_format({"verbs":["verb1"], "templates":[], "POS":[]}) == False
assert check_json_format({"verbs":["verb1"], "templates":["{{subject}} see {{object}}"]}) == False
assert check_json_format({"verbs":["verb1"], "templates":[1]}) == False


print("Passed")

In [None]:


def str2json(s):
    """
        Extract the JSON object embedded in an LLM reply and validate it.

        Newlines are removed, the text from the first "{" to the last "}" is
        parsed with json.loads, and the result is checked with
        check_json_format.

        Args:
            s: the raw reply text. Anything that is not a string (e.g. None)
               is treated as a failed reply.

        Returns:
            The parsed dict when parsing and validation succeed; the empty
            string "" otherwise (callers compare the result with "").
    """
    if not isinstance(s, str):
        # a non-string reply cannot be parsed; report it instead of crashing
        print("Error parsing json =", s)
        return ""

    # clean and remove \n
    s = s.strip().replace("\n", "")
    # extract {...} using regex (greedy: first "{" up to the last "}")
    match = re.search(r"\{.*\}", s)
    json_str = match.group(0) if match else ""

    try:
        result = json.loads(json_str)
        is_valid = check_json_format(result)
    except (ValueError, TypeError):
        # ValueError covers json.JSONDecodeError (including the empty string
        # when no {...} was found). Only these are caught, so KeyboardInterrupt
        # and programming errors still propagate.
        print("Error parsing json =", json_str)
        return ""

    if not is_valid:
        print(f"Wrong JSON format. Item = \n{result}")
        return ""

    print("JSON format check OK")
    return result
    
# Parse the last raw LLM reply obtained above
ex = str2json(result)
print(ex)
# str2json returns "" when the reply could not be parsed or validated;
# indexing that string with a key would raise a TypeError.
if ex != "":
    print(ex['verbs'])
    print(ex['templates'])

In [None]:
# try a more complex example
s = "set {{object}} back, put {{object}} back; replace {{object}}"
parsed = str2json(send_request(f'{prompt_template}: "{s}"', llm))
# str2json returns "" on failure, which cannot be indexed by key;
# in that case the empty string itself is displayed.
parsed['templates'] if parsed != "" else parsed

In [None]:
# Placeholder column: "" marks a row that has not been processed by the LLM yet
df["llm_templates"] = [""] * len(df)
df.head()

In [None]:
def llm_func(text):
    """Send one definition to the LLM and return the parsed, validated reply.

    The request is prompt_template followed by the quoted definition; the
    reply goes through str2json, so the result is what str2json returns.
    """
    request_text = f'{prompt_template}: "{text}"'
    reply = send_request(request_text, llm)
    return str2json(reply)

In [None]:
# Send every not-yet-processed definition template to the LLM and store the
# parsed reply in "llm_templates". Rows that already hold a result are skipped,
# so the cell can be re-run after an interruption.
n = len(df)
print("Len df =", n)
max_row = n # set to n for full set

error_count = 0

for i in range(max_row):
    row = df.iloc[i]  # values of row i as they are before this iteration's update
    print(f"Processing row {i+1} / {max_row}, {(i+1)*100/max_row:.0f} %")

    already_processed = row["llm_templates"] != ""
    if already_processed:
        print("Already processed. Skipping...")
        print("-----------------------------")
        continue

    llm_input = row["def_template"].strip().lower()
    parsed_item = llm_func(llm_input)

    # "" is the failure marker returned by the parser
    if parsed_item == "":
        print("Error parsing result")
        error_count += 1
        print("Error count so far =", error_count)

    print("\tDefinition =", row["definition"])
    print("\tInput =", row["def_template"])
    print("\tParsed =", parsed_item)
    print("-----------------------------")
    df.at[i, "llm_templates"] = parsed_item

print("Completed")
print("Error count =", error_count)

In [None]:
# Report every row whose LLM output is missing ("") or lacks a "templates" key
count = 0
for i in range(len(df)):
    row = df.iloc[i]
    llm_item = row["llm_templates"]
    if llm_item == "" or 'templates' not in llm_item.keys():
        count += 1
        print("Id =", i)
        print("Definition =", row["definition"])
        print("Definition normalized =", row["def_normalized"])
        print("Definition extracted by code =", row["templates"])
        print("Definition template (used as input for LLM) =", row["def_template"])
        print("LLM parsed text =", llm_item)
        print("-----------------")

print("Total count =", count)

In [None]:
# Retry only the rows whose "llm_templates" is still "" (failed or not yet processed)
error_count = 0

for i in range(max_row):
    row = df.iloc[i]  # values of row i as they are before this iteration's update
    if row["llm_templates"] != "":
        continue

    llm_input = row["def_template"].strip().lower()
    parsed_item = llm_func(llm_input)

    # "" is the failure marker returned by the parser
    if parsed_item == "":
        print("Error parsing result")
        error_count += 1
        print("Error count so far =", error_count)

    print("\tDefinition =", row["definition"])
    print("\tParsed =", parsed_item)
    print("-----------------------------")
    df.at[i, "llm_templates"] = parsed_item

print("Completed")
print("Error count =", error_count)

In [None]:
# Save the VTI entries, now including the LLM templates, as CSV and as JSON
output_filename = f"{DATA_PATH}vti_dict.csv"
df.to_csv(output_filename, index=False)

# JSON copy: one record (object) per row
output_filename = f"{DATA_PATH}vti_dict.json"
df.to_json(output_filename, orient="records")


print("Completed")