Note: The script uses Berkeley Neural Parser to parse the generated instructions, and visualize the results using Plotly.

Please make sure to install benepar following their documentation [here](https://github.com/nikitakit/self-attentive-parser#installation).

In [1]:
import benepar, spacy
nlp = spacy.load('en_core_web_md')
doc = nlp("The time for action is now. It's never too late to do something.")

if spacy.__version__.startswith('2'):
    nlp.add_pipe(benepar.BeneparComponent("benepar_en3"))
else:
    nlp.add_pipe("benepar", config={"model": "benepar_en3"})

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def find_root_verb_and_its_dobj(tree_root):
    # first check if the current node and its children satisfy the condition
    if tree_root.pos_ == "VERB":
        for child in tree_root.children:
            if child.dep_ == "dobj" and child.pos_ == "NOUN":
                return tree_root.lemma_, child.lemma_
        return tree_root.lemma_, None
    # if not, check its children
    for child in tree_root.children:
        return find_root_verb_and_its_dobj(child)
    # if no children satisfy the condition, return None
    return None, None

def find_root_verb_and_its_dobj_in_string(s):
    doc = nlp(s)
    first_sent = list(doc.sents)[0]
    return find_root_verb_and_its_dobj(first_sent.root)

find_root_verb_and_its_dobj_in_string("Write me a story about education.")

You're using a T5TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


('write', 'story')

In [3]:
import pandas as pd
import json
import tqdm

generated_data_path = "instruction_io/data_io/starcoder_generations/machine_generated_instructions.jsonl" # Replace this with your own data path
machine_generated_tasks = []
with open(generated_data_path, "r") as fin:
    for line in fin:
        machine_generated_tasks.append(json.loads(line))
instructions = set([task["instruction"] for task in machine_generated_tasks])
print("The total amount of generated instructions is "+str(len(instructions)))

The total amount of generated instructions is 5003


In [4]:
raw_phrases = []
for instruction in tqdm.tqdm(instructions):
    try:
        verb, noun = find_root_verb_and_its_dobj_in_string(instruction)
        raw_phrases.append({
            "verb": verb,
            "noun": noun,
            "instruction": instruction
        })
    except Exception as e:
        print(e)
        print(instruction)

100%|██████████| 5003/5003 [04:15<00:00, 19.58it/s]


In [5]:
len(raw_phrases)

5003

In [6]:
raw_phrases = pd.DataFrame(raw_phrases)
phrases = pd.DataFrame(raw_phrases).dropna()
phrases[["verb", "noun"]].groupby(["verb", "noun"]).size().sort_values(ascending=False)

verb    noun       
write   program        452
        script         307
create  function       293
write   function       204
        code           198
                      ... 
find    salesperson      1
        rectangle        1
        rating           1
        product          1
write   way              1
Length: 423, dtype: int64

In [7]:
top_verbs = phrases[["verb"]].groupby(["verb"]).size().nlargest(20).reset_index()

df = phrases[phrases["verb"].isin(top_verbs["verb"].tolist())]
# df = df[~df["noun"].isin(["I", "what"])]
# df = phrases
# df[~df["verb"].isin(top_verbs["verb"].tolist())]["verb"] = "other"
# df[~df["verb"].isin(top_verbs["verb"].tolist())]["noun"] = "other"
df = df.groupby(["verb", "noun"]).size().reset_index().rename(columns={0: "count"}).sort_values(by=["count"], ascending=False)
# df = df[df["count"] > 10]
df = df.groupby("verb").apply(lambda x: x.sort_values("count", ascending=False).head(4)).reset_index(drop=True)
df

Unnamed: 0,verb,noun,count
0,calculate,area,4
1,calculate,sum,2
2,calculate,volume,2
3,calculate,circumference,2
4,complete,segment,6
...,...,...,...
75,show,seller,1
76,write,program,452
77,write,script,307
78,write,function,204


In [9]:

import plotly.graph_objects as go
import plotly.express as px

# df["blank"] = "ROOT"
# df = phrases.groupby(["verb", "noun"]).size().sort_values(ascending=False).head(5).reset_index().rename(columns={0: "count"})

df = df[df["count"] > 30]
fig = px.sunburst(df, path=['verb', 'noun'], values='count')
# fig.update_layout(uniformtext=dict(minsize=10, mode='hide'))
fig.update_layout(
    margin=dict(l=0, r=0, t=0, b=0),
    font_family="Times New Roman",
)
fig.show()
#fig.write_html("verb_noun.html")
fig.write_image("figure.png")