<a href="https://colab.research.google.com/github/lennertjansen/nlp2/blob/master/project1/datageneration.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Mount notebook to drive

In [0]:
from google.colab import drive
drive.mount('/content/drive/')

Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).


In [0]:
%cd drive/My\ Drive/nlp2/code
!ls

[Errno 2] No such file or directory: 'drive/My Drive/nlp2/code'
/content/drive/.shortcut-targets-by-id/1QXwx2THcNLhcOFb0uOdUp12fL_lxsei0/nlp2/code
'Colab Tutorial'   OpenNMT-py


In [0]:
import numpy as np

In [26]:
"""
 To generate our dataset, we need a few things:
 1. Grammar rules (So a PCFG) with all grammar rules
 2. Lexical rules (a vocabulary)
 3. A generation mechanism, that probabilistically samples how to fill in each instance, and that can stop. Should also be able to give metadata to each datapoint 
 4. Potentially a function that checks if the grammar is correct (could also be done by hand)
 5. A translation function, that converts each generated sample into its correct interpretation. This needs interpretation rules, and a lookup table
 Notes:
 - Use nested dict for lookup table
 """

'\n To generate our dataset, we need a few things:\n 1. Grammar rules (So a PCFG) with all grammar rules\n 2. Lexical rules (a vocabulary)\n 3. A generation mechanism, that probabilistically samples how to fill in each instance, and that can stop. Should also be able to give metadata to each datapoint \n 4. Potentially a function that checks if the grammar is correct (could also be done by hand)\n 5. A translation function, that converts each generated sample into its correct interpretation. This needs interpretation rules, and a lookup table\n Notes:\n - Use nested dict for lookup table\n '

In [0]:
class PCFG_rule:
  def __init__(self, lhs, rhs, prob):
    self.lhs = lhs
    self.rhs = rhs
    self.prob = prob
  
  def __str__(self):
    if isinstance(self.rhs, list):
      return ("({}) {} --> {}".format( self.prob, ' '.join(self.lhs), ' '.join(self.rhs) ))
    else:
      return ("({}) {} --> {}".format( self.prob, ' '.join(self.lhs), self.rhs ))

class PCFG:
  def __init__(self):
    self.rules = {}

  def add(self, lhs, rhs, prob):
    rule = PCFG_rule(lhs, rhs, prob)
    if isinstance(lhs,list):
      lhs = tuple(lhs) # Tuple is hashable
    if lhs in self.rules.keys():
      self.rules[lhs].append(rule)
    else:
      self.rules[lhs] = [rule]

  def select(self, lhs):
    lhs = tuple(lhs)
    return self.rules[lhs]

  def sample(self, lhs):
    # print(lhs, len(lhs))
    assert not ((isinstance(lhs, list) or isinstance(lhs, tuple)) and len(lhs)>1), "Only single words or non-terminals can be filled in."
    if isinstance(lhs, list):
      lhs = tuple(lhs)
    # print("Sampling for ", lhs)
    if not self.rules.get(lhs):
      return lhs, True
    rules = self.rules.get(lhs)
    # print(rules)
    probs = [rule.prob for rule in rules]
    # print("probabilities: ", probs)
    number_of_options = len(rules)

    choice_index = np.random.choice(a = number_of_options, p = probs)
    # print("Selected option: ", rules[choice_index].rhs)
    return rules[choice_index].rhs, False

  def generate(self, sequence):
    # print("Generating for starting sequence ", sequence)
    finished = False
    while not finished:
      finished_list = [False]*len(sequence)
      new_sequence = []
      for i, lhs in enumerate(sequence):
        rhs, finished_list[i] = ruleset.sample(lhs)
        # disambiguate between non-terminal and lexical rules:
        if isinstance(rhs, list): # Non-terminal rules
          new_sequence.extend(rhs)
        elif isinstance(rhs, str): # Lexical rules
          new_sequence.append(rhs)
      sequence = new_sequence
      # print("New sequence: ", sequence)
      if all(finished_list):
        finished = True
    return sequence
      


  def __str__(self):
    string = ''
    for key in self.rules.keys():
      for item in self.rules[key]:
        # print(item)
        string += item.__str__()
        # print(item.__str__())
        string += '\n'
    return string

In [64]:
# 1: grammar rules. We will encode this in an array? dict? tuple? 
# dict - tuple: prob (where tuple - [lhs , rhs] for lhs --> rhs)
# Better option: python array, with [lhs, rhs, prob] for each cfg rule
# Where lhs is a list of each character in lhs, and rhs same for rhs
# Actually, best option is to use a dict. Since we will be generating, the lhs should be the key.
# So: dict of {lhs: [[rhs1, prob1], [rhs2, prob2]]}
# rule = PCFG_rule()
ruleset = PCFG()

# Non-terminal rules:

lhs = 'S' # valid sequence is an object: S --> O
rhs = ['(', 'A', 'O', ')']
prob = 1
ruleset.add(lhs, rhs, prob)
# ( S A S with S )
lhs = 'O' # O --> A O
rhs = ['(', 'A', 'O', ')']
prob = 0.15
ruleset.add(lhs, rhs, prob)

lhs = 'O' # O --> A T O
rhs = ['(', 'A', 'T', 'O', ')']
prob = 0.05
ruleset.add(lhs, rhs, prob)

lhs = 'O' # O --> N O
rhs = ['(', 'N', 'O', ')']
prob = 0.15
ruleset.add(lhs, rhs, prob)

# lhs = 'A' # A --> A N

lhs = 'O' # O --> O & O
rhs = ['(', 'O', '&', 'O', ')']
prob = 0.2
ruleset.add(lhs, rhs, prob)



lhs = 'T' # T --> W H
rhs = ['(', 'W', 'H', ')']
prob = 1
ruleset.add(lhs, rhs, prob)

# lhs = ['A', 'O'] # A O --> A O & O
# rhs = ['A', 'O', '&', 'O']
# prob = 0.4
# ruleset.add(lhs, rhs, prob)


print('All non-terminal rules: \n', ruleset)

# Problem:
# cut tomato and cut steak should be a valid S
# S --> A O & A O 
# These should also be possibly nested in another action, so
# A O --> A A O & A O
# This reduces to O --> A O & A O, which reduces to O --> O & O 
# Above is true since O --> A O
# O --> O & O should not be possible (why not?)
# This causes line 47 to be ambiguous - unless we use a 'then' word in addition to and
# 'then' would indicate the end of the scope of an action, where 'and' would include the next object

All non-terminal rules: 
 (1) S --> ( A O )
(0.15) O --> ( A O )
(0.05) O --> ( A T O )
(0.15) O --> ( N O )
(0.2) O --> ( O & O )
(1) T --> ( W H )



In [65]:
# All lexical rules:

# Object rules:
lhs = 'O'
objects = ['tomato', 'steak', 'onion', 'potato', 'chicken', 'pork']
obj_probs = [0.1, 0.1, 0.1, 0.05, 0.05, 0.05] # We don't like chicken and pork
# Note: will the eventual model be largely carnivorous or herbivorous?
for rhs, prob in zip(objects, obj_probs):
  ruleset.add(lhs, rhs, prob)
# print(ruleset)

# Action rules:
lhs = 'A'
actions = ['cut', 'fry', 'grill', 'clean', 'boil']
act_probs = [0.3, 0.2, 0.2, 0.2, 0.1]
for rhs, prob in zip(actions, act_probs):
  ruleset.add(lhs, rhs, prob)

# Number rules:
lhs = 'N'
numbers = ['one', 'two', 'three']
num_probs = [0.5, 0.4, 0.1]
for rhs, prob in zip(numbers, num_probs):
  ruleset.add(lhs, rhs, prob)

# Connective rules:
lhs = '&'
connectives = ['and', 'after', 'then']
conn_probs = [0.4, 0.2, 0.4]
for rhs, prob in zip(connectives, conn_probs):
  ruleset.add(lhs, rhs, prob)

# With rule:
lhs = 'W'
w = ['with']
w_prob = 1
ruleset.add(lhs, w, w_prob)

# Tool rules:
# lhs = ['H']
# tools = ['vegknife', 'meatknife', 'fryingpan', 'grill', 'boil', 'season']
# conn_probs = [0., 0.2, 0.4]
# for rhs, prob in zip(connectives, conn_probs):
#   ruleset.add(lhs, rhs, prob)
# How do we make sure these only appear in the correct context?
# Probably easiest to filter the wrong ones out?
# Or use context based rules for generation. Or even substitute actions as 'action with tool'
# strings? Probably context based rules is best, based on intuition.
# Probably we should implement the tools in a different function than in the PCFG

print(ruleset)

(1) S --> ( A O )
(0.15) O --> ( A O )
(0.05) O --> ( A T O )
(0.15) O --> ( N O )
(0.2) O --> ( O & O )
(0.1) O --> tomato
(0.1) O --> steak
(0.1) O --> onion
(0.05) O --> potato
(0.05) O --> chicken
(0.05) O --> pork
(1) T --> ( W H )
(0.3) A --> cut
(0.2) A --> fry
(0.2) A --> grill
(0.2) A --> clean
(0.1) A --> boil
(0.5) N --> one
(0.4) N --> two
(0.1) N --> three
(0.4) & --> and
(0.2) & --> after
(0.4) & --> then
(1) W --> with



In [340]:
# Now, part two: generation
# We will generate grammatically correct sequences, where we only leave the tools not filled in.
# We always start from an S, and then randomly pick a way to fill it in, recursively
sequences = []
for i in range(20):
  sequence = ['S']
  sequence = ruleset.generate(sequence)
  sequences.append(sequence)
  print("Sequence {}: {}".format(i,sequence))

""" Observations 8/04:
Sequence 0: ['cut', 'three', 'onion']
Sequence 1: ['fry', 'fry', 'with', 'H', 'clean', 'five', 'pork', 'then', 'tomato']
Sequence 2: ['cut', 'steak']
Sequence 3: ['clean', 'tomato', 'and', 'clean', 'four', 'tomato']
Sequence 4: ['fry', 'pork', 'and', 'steak', 'and', 'clean', 'one', 'one', 'cut', 'tomato', 'and', 'fry', 'pork']
Sequence 5: ['grill', 'steak', 'and', 'two', 'tomato', 'and', 'onion', 'after', 'chicken']
Sequence 6: ['clean', 'pork']
Sequence 7: ['fry', 'potato']
Sequence 8: ['cut', 'potato']
Sequence 9: ['cut', 'potato']
Sequence 10: ['cut', 'fry', 'fry', 'pork']
Sequence 11: ['fry', 'potato']
Sequence 12: ['grill', 'two', 'potato', 'then', 'tomato']
Sequence 13: ['clean', 'potato']
Sequence 14: ['grill', 'five', 'one', 'tomato']
Sequence 15: ['fry', 'steak', 'then', 'pork']
Sequence 16: ['grill', 'fry', 'fry', 'potato', 'and', 'pork', 'then', 'fry', 'with', 'H', 'potato', 'then', 'steak']
Sequence 17: ['cut', 'steak']
Sequence 18: ['boil', 'grill', 'pork', 'then', 'tomato']
Sequence 19: ['fry', 'tomato', 'and', 'cut', 'one', 'potato', 'then', 'onion']

We see sometimes a double number. This is kind of weird for human language, but should be OK in our task.
Also we see single objects, like sequence 18. This should be interpreted as calling the action (boil grill) on both objects
"""

Sequence 0: ['(', 'clean', 'pork', ')']
Sequence 1: ['(', 'clean', '(', 'cut', '(', 'with', 'H', ')', 'tomato', ')', ')']
Sequence 2: ['(', 'clean', '(', 'grill', '(', 'cut', 'chicken', ')', ')', ')']
Sequence 3: ['(', 'boil', 'pork', ')']
Sequence 4: ['(', 'fry', '(', 'one', '(', 'onion', 'after', '(', 'one', 'onion', ')', ')', ')', ')']
Sequence 5: ['(', 'fry', '(', 'boil', '(', 'one', '(', 'clean', '(', 'boil', '(', 'with', 'H', ')', 'chicken', ')', ')', ')', ')', ')']
Sequence 6: ['(', 'fry', 'pork', ')']
Sequence 7: ['(', 'fry', 'tomato', ')']
Sequence 8: ['(', 'clean', '(', 'one', '(', 'three', '(', 'fry', '(', 'fry', '(', 'with', 'H', ')', '(', 'cut', '(', 'steak', 'then', 'onion', ')', ')', ')', ')', ')', ')', ')']
Sequence 9: ['(', 'grill', '(', 'pork', 'then', '(', 'two', '(', 'cut', '(', 'one', '(', 'cut', 'pork', ')', ')', ')', ')', ')', ')']
Sequence 10: ['(', 'fry', 'steak', ')']
Sequence 11: ['(', 'cut', '(', 'boil', 'chicken', ')', ')']
Sequence 12: ['(', 'clean', '(', 

" Observations 8/04:\nSequence 0: ['cut', 'three', 'onion']\nSequence 1: ['fry', 'fry', 'with', 'H', 'clean', 'five', 'pork', 'then', 'tomato']\nSequence 2: ['cut', 'steak']\nSequence 3: ['clean', 'tomato', 'and', 'clean', 'four', 'tomato']\nSequence 4: ['fry', 'pork', 'and', 'steak', 'and', 'clean', 'one', 'one', 'cut', 'tomato', 'and', 'fry', 'pork']\nSequence 5: ['grill', 'steak', 'and', 'two', 'tomato', 'and', 'onion', 'after', 'chicken']\nSequence 6: ['clean', 'pork']\nSequence 7: ['fry', 'potato']\nSequence 8: ['cut', 'potato']\nSequence 9: ['cut', 'potato']\nSequence 10: ['cut', 'fry', 'fry', 'pork']\nSequence 11: ['fry', 'potato']\nSequence 12: ['grill', 'two', 'potato', 'then', 'tomato']\nSequence 13: ['clean', 'potato']\nSequence 14: ['grill', 'five', 'one', 'tomato']\nSequence 15: ['fry', 'steak', 'then', 'pork']\nSequence 16: ['grill', 'fry', 'fry', 'potato', 'and', 'pork', 'then', 'fry', 'with', 'H', 'potato', 'then', 'steak']\nSequence 17: ['cut', 'steak']\nSequence 18: [

In [341]:
# In this cell, we define the lookup table for the tools pertaining to (object, action) pairs. We do this through nested dicts
tools = {
  'cut': {
        'onion': 'veg-knife',
        'tomato': 'veg-knife',
        'steak': 'meat-knife',
        'potato': 'veg-knife',
        'chicken': 'chicken-knife',
        'pork': 'meat-knife'
    },
  'fry': {
      'onion': 'frying-pan',
      'tomato': 'frying-pan',
      'steak': 'frying-pan',
      'potato': 'frying-pan',
      'chicken': 'frying-pan',
      'pork': 'frying-pan'
  },
  'grill': {
      'onion': 'griddle',
      'tomato': 'griddle',
      'steak': 'skillet',
      'potato': 'skillet',
      'chicken': 'griddle',
      'pork': 'skillet'
  },
  'clean': {
      'onion': 'peeler',
      'tomato': 'water',
      'steak': 'fillet-knife',
      'potato': 'peeler',
      'chicken': 'fillet-knife',
      'pork': 'fillet-knife'
  },
  'boil': {
      'onion': 'soup-pot',
      'tomato': 'soup-pot',
      'steak': 'stew-pot',
      'potato': 'stew-pot',
      'chicken': 'soup-pot',
      'pork': 'stew-pot'
  },
  'season': {
      'onion': 'salt',
      'tomato': 'pepper',
      'steak': 'rosemary',
      'potato': 'salt',
      'chicken': 'pepper',
      'pork': 'pepper'
  },
}

tool_list = list(dict.fromkeys([value for action in tools.values() for value in action.values() ] ) )
print(tool_list)

['veg-knife', 'meat-knife', 'chicken-knife', 'frying-pan', 'griddle', 'skillet', 'peeler', 'water', 'fillet-knife', 'soup-pot', 'stew-pot', 'salt', 'pepper', 'rosemary']


In [342]:
""" Next, we need to fill in the right tools. This uses some context. 
 We should loop through each sentence, and wherever there is an open H, fill it in according to the action and next object
 Problem: some actions have tools assigned but operate on multiple objects
 --> actions with tools should operate only on single objects.
 Solution: remove everything until nesting ends for those sentences - so until the & before next action
 - This should actually be encoded in the context free grammar. However, this requires two terms in the lhs, so we circumvent that by filtering like this
 """

action_found = False
H_found = False
with_found = False
remove = False
with_index = 0
remove_ending = 0
# print(sequences)
# for seq_index, seq in enumerate(sequences):
#   print(seq, seq_index)
#   for step in range(seq.count('H')): # Do this for each H in the sequence
#     # We loop through all sequences. Seq is now a list of words, with sometimes an H.
#     for token_index, token in enumerate(seq):
#       if token in actions:
#         action = token
#         action_found = True
#         continue
#       if not ((action_found and (token == 'with')) or with_found):
#         # print("With not found")
#         action_found = False
#       elif token == 'with':
#         with_found = True
#         with_index = token_index
#         # print("With found!", action_found, with_found)
#         continue
#       # if (action_found and with_found):
#       #   print("token after with found and action found: ", token)
#       if action_found and with_found and (token == 'H'):
#         # print("H found")
#         H_index = token_index
#         H_found = True
#         continue
#       if action_found and H_found and (token in objects):
#         print("Found token after H and action in sequence: \n", seq)
#         obj = token
#         print("object: ", obj)
#         print("Action: ", action) # these can be used to look up what's needed. Then we remove the tokens up to the next action.
#         tool = tools[action][obj]
#         print("Tool:", tool)
#         seq[H_index] = tool
#         remove = True
#         remove_start = token_index+1
#         rest_of_sequence = seq[remove_start:]
#         print(seq[remove_start:])
#         remove_end = [index-1 for index, token in enumerate(rest_of_sequence) if (token in actions and rest_of_sequence[index-1] in connectives) ]
#         if len(remove_end)>0:
#           remove_ending = remove_end
#         print(remove_ending)
#         # print("a: ", seq[:remove_start])
#         # print("b: ", seq[remove_start+remove_ending:])
#         sequences[seq_index] = seq[:remove_start].extend(seq[remove_start+remove_ending:])

#         action_found = False
#         H_found = False
#         print(sequences[seq_index])

# OK let's try this again. 
# 1.We need to find the H in the sequence.
# 2. Then we need to find the action that preceeds it
# 3. Then we need to find the object that follows it
# 4. Then we need to remove everything up until the next '& action'
for seq_index, seq in enumerate(sequences):
  if 'H' in seq:
    for step in range(seq.count('H')):
      print(seq, seq_index)
      # print("Found H")
      H_index = seq.index('H') # This is the index of the first H in the sequence
      
      # print([(token in actions) for token in seq[::-1]])
      # print("Action check sequence: ", seq[H_index::-1])
      
      action_index = H_index - [(token in actions) for token in seq[H_index::-1]].index(True) # This should find the last action before the H. Instead, it finds the last action
      # Create a boolean list of whether something is an action, then select 
      # We know that before an action, there is always an opening bracket. So if we find the closing bracket for it, we have the end of the exclusion)
      # print(H_index, action_index)
      
      object_index = H_index + [(token in objects) for token in seq[H_index:]].index(True) # Finds first object after H
      # For exclusion, we need to find the first pair of (& A) after the just found object
      
      # print("Next action sequence: ", seq[object_index:])
      
      next_action = [(token in actions and seq[object_index+token_index-1] in connectives) for token_index, token in enumerate(seq[object_index:])]
      bracket_status = 0
      exclusion_end = len(seq)-1
      for token_index, token in enumerate(seq[action_index:]):
        if token == '(':
          bracket_status += 1
        elif token == ')':
          bracket_status -= 1
        
        if bracket_status < 0:
          exclusion_end = action_index + token_index
          break

      # print("Found action: ", [(token in actions) for token_index, token in enumerate(seq[object_index:])])
      # print("Found connective: ",[(seq[object_index+token_index-1] in connectives) for token_index, token in enumerate(seq[object_index:])])
      # print(next_action)
      # Exclusion starts directly after the object, if there is a connective. It ends at the first 'new' closing bracket (which closes the scope of the action)
      # if any(next_action):
      #   exclusion_end = object_index + next_action.index(True) - 2
      #   print("end of exclusion: ", seq[exclusion_end])
      # else:
      # The line above finds the action, so we need to remove everything between 2 places before that and the object.
      
      # print("H index {}, action index {}, object index {}, exclusion end {}".format(H_index, action_index, object_index, exclusion_end))
      
      # First, we fill in H:
      obj = seq[object_index]
      action = seq[action_index]
      tool = tools[action][obj]
      seq[H_index] = tool
      new_sequence = seq[0:object_index+1]
      new_sequence.extend(seq[exclusion_end+1:])
      # Make sure brackets are balanced:
      num_opening = new_sequence.count('(')
      num_closing = new_sequence.count(')')
      if num_closing<num_opening:
        new_sequence.extend([')']*(num_opening-num_closing))
      
      # print("New sequence pt 1: ", seq[0:object_index+1])
      # print("New sequence pt 2: ", seq[exclusion_end+1:])
      # print("New sequence: ", new_sequence) # Gives None!
      sequences[seq_index] = new_sequence
    
[print("Sequence {}: {}".format(i,sequence)) for i, sequence in enumerate(sequences)]

['(', 'clean', '(', 'cut', '(', 'with', 'H', ')', 'tomato', ')', ')'] 1
Found H
H index 6, action index 3, object index 8, exclusion end 9
['(', 'fry', '(', 'boil', '(', 'one', '(', 'clean', '(', 'boil', '(', 'with', 'H', ')', 'chicken', ')', ')', ')', ')', ')'] 5
Found H
H index 12, action index 9, object index 14, exclusion end 15
['(', 'clean', '(', 'one', '(', 'three', '(', 'fry', '(', 'fry', '(', 'with', 'H', ')', '(', 'cut', '(', 'steak', 'then', 'onion', ')', ')', ')', ')', ')', ')', ')'] 8
Found H
H index 12, action index 9, object index 17, exclusion end 22
['(', 'clean', '(', '(', 'steak', 'then', '(', '(', 'cut', '(', 'onion', 'then', 'onion', ')', ')', 'then', 'tomato', ')', ')', 'after', '(', 'cut', '(', '(', 'clean', '(', 'with', 'H', ')', 'steak', ')', 'then', 'onion', ')', ')', ')', ')'] 12
Found H
H index 27, action index 24, object index 29, exclusion end 30
['(', 'clean', '(', 'cut', '(', 'with', 'H', ')', 'steak', ')', ')'] 19
Found H
H index 6, action index 3, obje

[None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None]

In [343]:
import copy
# Filter out the brackets:
input_sequences = copy.deepcopy(sequences)
for seq_index, seq in enumerate(input_sequences):
  original_length = len(seq)
  for token_index, token in enumerate(seq[::-1]):
    # print(token)
    # print(token_index)
    if (token == '(') or (token == ')'):
      removal_index = original_length - 1 - token_index
      # print("Removal index: ", removal_index, len(input_sequences[seq_index]))
      del input_sequences[seq_index][removal_index]
[print("Sequence {}: {}".format(i,sequence)) for i, sequence in enumerate(input_sequences)]
# [print("Sequence {}: {}".format(i,sequence)) for i, sequence in enumerate(sequences)] # Make sure original list not overwritten

Sequence 0: ['clean', 'pork']
Sequence 1: ['clean', 'cut', 'with', 'veg-knife', 'tomato']
Sequence 2: ['clean', 'grill', 'cut', 'chicken']
Sequence 3: ['boil', 'pork']
Sequence 4: ['fry', 'one', 'onion', 'after', 'one', 'onion']
Sequence 5: ['fry', 'boil', 'one', 'clean', 'boil', 'with', 'soup-pot', 'chicken']
Sequence 6: ['fry', 'pork']
Sequence 7: ['fry', 'tomato']
Sequence 8: ['clean', 'one', 'three', 'fry', 'fry', 'with', 'frying-pan', 'cut', 'steak']
Sequence 9: ['grill', 'pork', 'then', 'two', 'cut', 'one', 'cut', 'pork']
Sequence 10: ['fry', 'steak']
Sequence 11: ['cut', 'boil', 'chicken']
Sequence 12: ['clean', 'steak', 'then', 'cut', 'onion', 'then', 'onion', 'then', 'tomato', 'after', 'cut', 'clean', 'with', 'fillet-knife', 'steak', 'then', 'onion']
Sequence 13: ['boil', 'fry', 'one', 'steak']
Sequence 14: ['cut', 'onion']
Sequence 15: ['boil', 'onion']
Sequence 16: ['fry', 'pork']
Sequence 17: ['grill', 'one', 'tomato']
Sequence 18: ['cut', 'chicken']
Sequence 19: ['clean', 

[None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None]

In [344]:
dicti = {'a': 2, 'b': 3}
print(dicti['a'])
b = [0, 1, 2, 3, 4]
b[1:4] = ['hallo']
print(b)

2
[0, 'hallo', 4]


In [0]:
seqs = copy.deepcopy(sequences)


In [351]:
# Let's try this again. We use 'sequences', which has brackets for each PCFG rule. We interpret each rule independently first, 
# then we combine everything that is nested (bottom up parsing)
import time
interpretations = []
number_dict = {'one': 1, 'two': 2, 'three': 3}
new_sequences = []
previous_first_token = ''
previous_last_opening_bracket_index = -1

for seq_index, seq in enumerate(seqs): # TODO: Remove slicing

  old_sequence = []
  sequence = seq
  new_sequence = seq
  print('\n\n')
  while new_sequence != old_sequence:
    new_scope = []
    # time.sleep(1)
    old_sequence = copy.deepcopy(new_sequence)
    # Now, just update the sequence every time, and loop over it
    opening_brackets = [token == '(' for token in new_sequence]
    closing_brackets = [token == ')' for token in new_sequence]

    print("Seq: ", new_sequence)
    # print("Opening brackets: ", opening_brackets)
    # print("Closing brackets: ", closing_brackets)
    # Following is for working from the right:
    if not any(opening_brackets):
      # print("No more opening brackets..")
      break
    last_opening_bracket_index = len(opening_brackets) - 1 - opening_brackets[::-1].index(True) # Check if this is not previous minus 1, because then we're looking at te same word again
    # print("Last opening bracket: ", last_opening_bracket_index)
    # Corresponding closing bracket is first closing bracket after it:
    closing_bracket_index = last_opening_bracket_index + closing_brackets[last_opening_bracket_index:].index(True)

    scope = new_sequence[last_opening_bracket_index+1:closing_bracket_index]
    first_token = scope[0]
    # scope = ['(', 'fry', '(', 'steak', 'after', 'chicken', ')']
    print("Scope: ", scope)


    con_list = [token in connectives for token in scope]
    act_list = [token in actions for token in scope]
    obj_list = [token in objects for token in scope]
    num_list = [token in numbers for token in scope]
    new_scope = scope


    if last_opening_bracket_index == previous_last_opening_bracket_index: # This prevents words that show up at the start of a scope for the second time from being treated as the acting word again (scope creep ;)
      new_sequence[last_opening_bracket_index:closing_bracket_index+1] = new_scope

    # if any(con_list):
    if scope[0] in connectives:
      print("found con")
      and_list = [token == 'and' for token in scope]
      after_list = [token == 'after' for token in scope]
      if any(after_list):
        print("Found after: ", scope)
        after_index = after_list.index(True)
        new_scope = scope[:after_index] + ['then'] + scope[after_index+1:]
        print("Neues scope: ", new_scope)
      if any(and_list):
        print("Found and: ", scope)
        and_index = and_list.index(True)
        new_scope = scope[:and_index] + ['then'] + scope[and_index+1:]
        print("Neues scope2: ", new_scope)
    # if any(num_list): # Actions will be disbanded in the scope of the action, but numbers can also encapsulate actions
    if scope[0] in numbers:
      print('found num')
      # Select everything between the number and the end of the scope (=objects)
      # Find value for the number
      # Replace the complete scope by objects then objects, N times
      number_index = num_list.index(True)
      number = scope[number_index]
      # print("Number: ", number)
      repeat_scope = scope[number_index+1:]
      # print("Repeat scope: ", repeat_scope)
      factor = number_dict[number]
      # print("Factor: ", factor)
      # print("Added words: ", (['then'] + repeat_scope)*2)
      new_scope = repeat_scope
      if factor>1:
        new_scope = new_scope + (['then'] + repeat_scope)*(factor-1)
      # print("Updated numbered scope: ", new_scope)
    # if any(act_list):
    if scope[0] in actions:
      print('found act')
      # Here, we insert the action word in front of each object, if there are multiple objects
      # TODO: When all modifications are done, we insert the tools
      # print("Action scope: ", scope)
      action = scope[act_list.index(True)]
      # print("Action: ", action)
      # print("Object list: ", obj_list)
      orig_length = len(obj_list)
      first_object_index = obj_list.index(True)
      new_scope = scope
      for token_index, boolean in enumerate(obj_list[:first_object_index:-1]):
        real_index = orig_length - 1 - token_index
        if boolean:
          # print("A: ", (scope[real_index-1] in tool_list))
          # print("B: ", (scope[real_index-1] in actions))
          # print("Scope before stall: ", scope)
          conn_index = real_index - [token in connectives for token in scope[real_index::-1]].index(True) # This should find the last connective before the object
          # Then we insert the action after the connective (except for the first one)!
          #
          # if not ( (scope[real_index-1] in tool_list) or (scope[real_index-1] == action)):
          # print("Scope until object: ", scope[:real_index])
          new_scope[conn_index+1:conn_index+1] = [action] # We should insert action before the last action before each object
            # And this insertion should be done for each object, except the first one. -- It essentially should be done
            # after the & before the object. So we should find each &!
          print("Inserted action: ", new_scope)
    # else:
    #   print("Did nothing..")
    #   new_scope = scope
    # if (new_scope[0] == '(' and new_scope[-1] == ')'):
    #   # scope = new_scope[1:-2]
    #   print("Scope B: ", scope)
    # else:
    #   # print('scope A: ', scope)
    #   # scope = new_scope
    #   print()
    print("Scope before new sequence: ", new_scope)
    # time.sleep(1)
    new_sequence[last_opening_bracket_index:closing_bracket_index+1] = new_scope
    # previous_first_token = scope[0]
    previous_last_opening_bracket_index = last_opening_bracket_index
    # print("new sequence: ", new_sequence)
    # print("stop: ", new_sequence == old_sequence)

  # Now, we set up a loop to insert the tools
  old_sequence = []
  
  print("Time to insert tools..")


  while new_sequence != old_sequence:
    # inverted_action_index = [token in actions for token in new_sequence]
    old_sequence = copy.deepcopy(new_sequence)
    object_list = [token in objects for token in new_sequence]
    original_length = len(object_list)
    for inv_action_index, action in enumerate(new_sequence[::-1]):


      # print("Actio/n: ", action)
      # print("Actions: ", actions)


      if action in actions: 


        # print("found action: ", action)
        
        real_index = original_length - 1 - inv_action_index
        if new_sequence[real_index+1] != 'with': #this is true if tool has not been inserted
          #select next object in sequence
          # object_list = [token in objects for token in new_sequence]
          next_object_index = real_index + object_list[real_index:].index(True)
          next_object = old_sequence[next_object_index] # This is the object belonging to the action and object pair
        
          # print(action, next_object, next_object_index, old_sequence, object_list, real_index)
        
          tool = tools[action][next_object]
          # print("Need to insert ", tool)
          # problem - indexes do not align between object list and new sequence
          new_sequence[real_index+1:real_index+1] = ['with', tool]
          print("Tool inserted: ", new_sequence)

  interpretations.append(new_sequence)

  #TODO: Store new sequences

  # # Following is for working from the left:
  # first_opening_bracket 
  # break




Seq:  ['(', 'clean', 'pork', ')']
Scope:  ['clean', 'pork']
found act
Scope before new sequence:  ['clean', 'pork']
Seq:  ['clean', 'pork']
Time to insert tools..
Tool inserted:  ['clean', 'with', 'fillet-knife', 'pork']



Seq:  ['(', 'clean', '(', 'cut', '(', 'with', 'veg-knife', ')', 'tomato', ')', ')']
Scope:  ['with', 'veg-knife']
Scope before new sequence:  ['with', 'veg-knife']
Seq:  ['(', 'clean', '(', 'cut', 'with', 'veg-knife', 'tomato', ')', ')']
Scope:  ['cut', 'with', 'veg-knife', 'tomato']
found act
Scope before new sequence:  ['cut', 'with', 'veg-knife', 'tomato']
Seq:  ['(', 'clean', 'cut', 'with', 'veg-knife', 'tomato', ')']
Scope:  ['clean', 'cut', 'with', 'veg-knife', 'tomato']
found act
Scope before new sequence:  ['clean', 'cut', 'with', 'veg-knife', 'tomato']
Seq:  ['clean', 'cut', 'with', 'veg-knife', 'tomato']
Time to insert tools..
Tool inserted:  ['clean', 'with', 'water', 'cut', 'with', 'veg-knife', 'tomato']



Seq:  ['(', 'clean', '(', 'grill', '(', 'cut

In [0]:
# import copy

# interpretations = []
# number_dict = {'one': 1, 'two': 2, 'three': 3}
# new_sequences = []

# old_action_scope = []

# for seq_index, seq in enumerate(sequences):
#   # print(seq)
#   old_sequence = []
#   new_sequence = []
#   # There are two things to be done for interpretation.
#   # 1. Insert 'with tool' for each action.
#   # 2. Split up action to multiple actions if it works on multiple objects
#   # Let's start with two
#   # We have three cases:
#   # 1 - Multiple objects are linked by a connective (&)
#   # 2 - Objects are multiplied by a number (N)
#   # 3 - An action only has a single object it operates on
#   # Approach: isolate the scope of each action. Then add interpretation to new sequence, sequentially
#   # First action scope ends before second action in the sequence (sequence always starts with action)
#   # Second action scope is from second action up until third action, etc.
#   # Last action scope is from last action up until end of sequence.
#   # Problem: actions can be nested..
#   # What if we look from right to left? This should work, since the last action word
#   # before the object indicates the action working on the object
#   # Then we can simply create a list of objects for this scope, taking number and order (after, and) into account

#   # Alternative idea: insert brackets into sentence to decompose (potentially during generation?)
#   action_list = [token in actions for token in seq[::-1]]
#   # Note: we are not extracting nested actions. 
#   # TODO: Workaround: make sure that actions (with multipliers) nested in an action are taken into account
#   # this means that the 'endpoints' of action scopes should be remembered. This can be done by making a list of these endpoints, and updating this list
#   # every time an interpretation of an action scope is inserted into the new sequence
#   connective_list = [token in connectives for token in seq[::-1]]
#   # action_edges = [token in actions for token_index, token in enumerate(seq) if seq[token_index-1] in connectives]
#   action_edges = [token in actions and seq[token_index-1] in connectives for token_index, token in enumerate(seq)]
#   print("seq: ", seq)
#   print("action edges: ", action_edges)
#   while old_sequence != sequence:
#     old_sequence = sequence
#     if any(action_list):
#       inv_action_index = action_list.index(True)
#     else:
#       continue #TODO: What do we do if no actions?
#     action_scope_start = len(seq)-inv_action_index-1
#     # print("Action list: ", action_list)
    
#     if any(action_edges[action_scope_start:]):
#       end_index = action_edges[action_scope_start:].index(True) # The action sequence should end where the next connective + action is
#     else:
#       end_index = len(seq)
#     # connective_list = [token in connectives for token in seq[::-1]] # we want to find the next connective



#     print("Action scope start: ", action_scope_start)
#     action_scope = seq[action_scope_start:end_index] # We should remove this from the sequence, and add the new action_scope to a new sequence, iteratively
#     print("seq: ", seq)
#     print("Action scope: ", action_scope)
#     print("Action list: ", action_list)
#     action_list[inv_action_index] = False
#     # print("Action list: ", action_list)
#     # print("Action index: ", action_scope_start)
#     object_list = [token in objects for token in action_scope]
#     number_list = [token in numbers for token in action_scope]
#     con_list = [token in connectives for token in action_scope]
    
#     # So we are looking at each action now. We should just add the action after each 'then'. So first process the numbers and connectives, then add the action after each 'then'
#     main_action = action_scope[0]
#     if any(number_list): # This should be first one
#       old_action_scope = []
#       while old_action_scope != action_scope and any(number_list):
#         old_action_scope = action_scope
#         # print("Found a number!: ", action_scope)
#         # Select the number and the object after it. Numbers can also be nested, so we need
#         # to select the number and the first object after it as a scope.
#         # Then find all numbers in that scope, and multiply them. 
#         # cut two fry tomato and potato --> cut two (fry tomato and potato) --> cut fry tomato and cut fry potato and cut fry tomato and cut fry tomato

#         # So we need to select 
#         if any(number_list):
#           first_number_index = number_list.index(True)
#           # Select object it belongs to, is always the first object to its right
#           object_index = first_number_index + object_list[first_number_index:].index(True)
#           last_number_index = object_index - 1 - number_list[object_index:first_number_index:-1].index(True)
#           print("First number index: ", first_number_index)
#           print("Last number index: ", last_number_index) # CHECK!
#           all_numbers = action_scope[first_number_index:last_number_index]
#           print("All numbers: ", all_numbers)
#           factor = 1
#           for number in all_numbers:
#             factor *= number_dict[number]
#           # print("factor: ", factor)
#           new_section = [action_scope[object_index]] + ['then', action_scope[object_index]]*(factor-1)
#           insert_section = ['then', main_action, action_scope[object_index]]*(factor-1)
#           action_scope[object_index+1:object_index+1] = insert_section
#           del action_scope[first_number_index] 
#           number_list[first_number_index:last_number_index] = [False]*(last_number_index-first_number_index+1)
#           action_edges[object_index+1:object_index+1] = [False]*len(insert_section) 
#         # print("New action scope: ", action_scope)
#         # print("New section: ", new_section)
#       # if sum(object_list) == 1:
#       #   continue
#       #   # print("only a single object in this scope!")
#       #   obj_index = object_list.index(True)
#       #   if action_scope[obj_index-1] not in tool_list:
#       #     # print("We need to insert the tool!") # This could be done with a function
#       #     obj = action_scope[obj_index]
#       #     action = action_scope[0]
#       #     tool = tools[action][obj]
#       #     action_scope[1:1] = ['with', tool]
#       #     print("New action scope: ", action_scope)
#     if any(con_list): # This should be generalized for all numbers of objects, and then be final step
#       old_action_scope = []
#       while old_action_scope != action_scope:
#         old_action_scope = action_scope
#         # print("more than one object in scope!")
#         # Find an and/then/after, then insert the action before it, and insert the tool. 
#         # If there is an 'after', we should also reverse the order. This is most easily done before inserting the tool
#         # con_list = [token in connectives for token in action_scope]
#         # con_indices = [i for i, token in enumerate(action_scope) if (token in connectives)]
#         # con_indices = [token if (token in connectives) for token in action_scope ] 
#         # print("con indices: ",con_indices) # continue with a check whether token is 'after' or not, in which case we reverse the objects.
#         # 'after' has a local effect: "A after B and D" = "B then A then D"
#         and_list = [token == 'and' for token in action_scope]
#         after_list = [token == 'after' for token in action_scope]
#         while any(after_list):
#         # if any(after_list): # If there is an after in the action_scope, select everything between the previous connective or action as rhs, 
#         # everything between 'after' and the next connective as lhs, and replace it with "lhs then rhs"
#           # connective_index_before

#           after_index = after_list.index(True)
#           if any(con_list[after_index-1::-1]):
#             connective_index_before = after_index - 1 - con_list[after_index-1::-1].index(True)
#           else:
#             connective_index_before = 0
#           if any(con_list[after_index+1:]):
#             connective_index_later = after_index + 1 +  con_list[after_index+1:].index(True)
#           else:
#             connective_index_later = len(action_scope)
#           # print(connective_index_before)
#           # print(connective_index_later)
#           lhss = action_scope[connective_index_before+1:after_index]
#           rhss = action_scope[after_index+1:connective_index_later]
#           # print("lhss: ", lhss, "\n rhss: ", rhss )
#           new_section = rhss + ['then'] + lhss
#           # print("Swapped sections around after to:" ,new_section)
#           action_scope[connective_index_before+1:connective_index_later] = new_section
#           after_list = [token == 'after' for token in action_scope]
#           # print("After list after op: ", after_list)
#         while any(and_list):
#           and_index = and_list.index(True)
#           action_scope[and_index] = 'then'
#           and_list[and_index] = False

#         # print("Action scope before insertion: ", action_scope)
#     new_object_list = [token in objects for token in action_scope]# We need to create a list of all objects. Then one by one add the tool, if necessary. Then cross it off of the list. We add the tools
#     # starting at the rightmost object. Then as we go along, the action_scope sequence will get longer, but the objects that we still need to consider will stay at the same index
#     original_length = len(new_object_list)
#     # Loop over whatever is below
#     old_scope = []
#     # print("Action scope A: ", action_scope)
#     while old_scope != action_scope:
#       old_scope = copy.copy(action_scope)
#       if any(new_object_list):
#         obj_index = original_length - 1 - new_object_list[::-1].index(True)
#       else:
#         # print("Breaking : ", action_scope)
#         break
#       print("Trying to fill in tool..")
#       print("Object index: ", obj_index)
#       # if action_scope[obj_index-1] not in tool_list:
#       #   # print("Actually filling in")
#       #   # So we insert:
#       #   # The action
#       #   # The 'with tool'
#       #   # Both before the object
#       #   print("We need to insert the tool!") # This could be done with a function
#       #   print("Object: ", action_scope[obj_index])
#       #   # print("Object index: ", obj_index)
#       #   obj = action_scope[obj_index]
#       #   action = action_scope[0]
#       #   tool = tools[action][obj]
#       #   if action_scope[obj_index-1] not in actions: # Check if action actually in front of object
#       #     action_scope[obj_index:obj_index] = [action, 'with', tool]
#       #   else:
#       #     action_scope[obj_index:obj_index] = ['with', tool]
#       #   # print("New action scope: ", action_scope)
#       #   new_object_list[obj_index] = False
#       if action_scope[obj_index-2] != 'with':
#         print("Weird one: ", action_scope)
#         # So we insert:
#         # The action
#         # The 'with tool'
#         # Both before the object
#         # print("We need to insert the tool!") # This could be done with a function
#         print("Action scope C: ", action_scope)
#         print("Object: ", action_scope[obj_index])
#         print("Action: ", action)
#         # print("Object index: ", obj_index)
#         obj = action_scope[obj_index]
#         action = action_scope[0]
#         tool = tools[action][obj]
#         if action_scope[obj_index-1] not in actions: # Action obviously not in front of object.
#           action_scope[obj_index:obj_index] = [action, 'with', tool]
#         else:
#           action_scope[obj_index:obj_index] = ['with', tool]
#         print("New action scope: ", action_scope)
#       new_object_list[obj_index] = False
          
#         # obj_index = original_length - 1 - new_object_list[::-1].index(True)
#         # print("Object index: ", obj_index)

#         # Now, we insert the action before each of the objects, and insert the with tool
#         # print("New action scope:" ,action_scope)


#     if new_sequence != old_sequence:
#       new_sequence[0:0] = action_scope 
#       sequence = new_sequence
#     # print("new_sequence: ", new_sequence)
#     # print("Final action scope: ", action_scope)
#   new_sequences.append(sequence)
#   print("Final sequence: ", sequence)



#   # Find last action index, and then remove the scope before it from seq and store in action_scope
#   # First item in seq/action list should always be an action
#   # To insert multiple values into middle of list: https://stackoverflow.com/questions/3748063/what-is-the-syntax-to-insert-one-list-into-another-list-in-python
#   # if any(action_list):
#   #   action_index = action_list.index(True)
#   # else:
#   #   action_index = 
#   # print(action_index)

#   # TODO: Write function to save original sequences and interpretations, plus metadata
#   # TODO: Filtering etc

In [0]:
# action_scope = ['cut', 'chicken', 'and', 'two', 'two', 'tomato', 'after', 'two', 'potato', 'then', 'beef'] 
# print("more than one object in scope!")
# # Find an and/then/after, then insert the action before it, and insert the tool. 
# # If there is an 'after', we should also reverse the order. This is most easily done before inserting the tool
# con_list = [token in connectives for token in action_scope]
# con_indices = [i for i, token in enumerate(action_scope) if (token in connectives)]
# # con_indices = [token if (token in connectives) for token in action_scope ] 
# print("con indices: ",con_indices) # continue with a check whether token is 'after' or not, in which case we reverse the objects.
# # 'after' has a local effect: "A after B and D" = "B then A then D"
# after_list = [token == 'after' for token in action_scope]
# number_list = [token in numbers for token in action_scope]
# object_list = [token in objects for token in action_scope]
# number_dict = {'one': 1, 'two': 2, 'three': 3}
# if any(number_list):
#   # Select the number and the object after it. Numbers can also be nested, so we need
#   # to select the number and the first object after it as a scope.
#   # Then find all numbers in that scope, and multiply them. 
#   first_number_index = number_list.index(True)
#   # Select object it belongs to, is always the first object to its right
#   object_index = first_number_index + object_list[first_number_index:].index(True)
#   all_numbers = action_scope[first_number_index:object_index]
#   print("All numbers: ", all_numbers)
#   factor = 1
#   for number in all_numbers:
#     factor *= number_dict[number]
#   print("factor: ", factor)
#   new_section = [action_scope[object_index]] + ['then', action_scope[object_index]]*(factor-1)
#   print("New section: ", new_section)

In [0]:
# Store in data object
class Datapoint:
  def __init__(self, input_sequence, original_sequence, interpretation):
    self.input_sequence = input_sequence
    self.original_sequence = original_sequence
    self.interpretation = interpretation
    self.interpretation_length = len(interpretation)
    self.input_length = len(input_sequence)
    self.nesting = sum([token == '(' for token in original_sequence])# Number of opening brackets

    # Count depth of deepest nesting by counting largest number of consecutive opening brackets 
    max_depth = 0
    depth = 0
    for token_index, token in enumerate(original_sequence):
      if token == '(' and original_sequence[token_index-1] == '(':
        depth +=1
      else:
        max_depth = max(max_depth, depth)
        depth = 0 # if not consecutive, reset counting
    
    self.depth = max_depth


class Dataset:
  def __init__(self):
    self.datapoints = []
    self.input_sequences = []
    self.original_sequences = []
    self.interpretations = []
    self.interpretation_lengths = []
    self.input_lengths = []
    self.nestings = []
    self.depths = []

  def add_datapoint(self, input_sequence, original_sequence, interpretation):
    current_datapoint = Datapoint(input_sequence, original_sequence, interpretation)
    self.datapoints.append(current_datapoint)
    self.input_sequences.append(input_sequence)
    self.original_sequences.append(original_sequence)
    self.interpretations.append( interpretation )
    self.input_lengths.append( current_datapoint.input_length )
    self.nestings.append( current_datapoint.nesting )
    self.depths.append( current_datapoint.depth )



  def reinitialize(self, dataset):
    # This function can be used to extract the data from an old dataset format, if one of the internal functions has been changed
    for datapoint in dataset.datapoints:
      self.add_datapoint(datapoint.input_sequence, datapoint.original_sequence, datapoint.interpretation)
  


In [0]:
our_dataset = Dataset()
print(len(interpretations))
for sequence_index in range(len(sequences)):
  our_dataset.add_datapoint(input_sequences[sequence_index], sequences[sequence_index], interpretations[sequence_index])
  print(sequences[sequence_index])
  print(interpretations[sequence_index])
  print(input_sequences[sequence_index])