<a href="https://colab.research.google.com/github/lennertjansen/nlp2/blob/master/project1/datageneration.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Mount notebook to drive

In [0]:
from google.colab import drive
drive.mount('/content/drive/')

Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).


In [0]:
%cd drive/My\ Drive/nlp2/code
!ls

[Errno 2] No such file or directory: 'drive/My Drive/nlp2/code'
/content/drive/.shortcut-targets-by-id/1QXwx2THcNLhcOFb0uOdUp12fL_lxsei0/nlp2/code
'Colab Tutorial'   OpenNMT-py


In [0]:
import numpy as np

In [1]:
"""
 To generate our dataset, we need a few things:
 1. Grammar rules (So a PCFG) with all grammar rules
 2. Lexical rules (a vocabulary)
 3. A generation mechanism, that probabilistically samples how to fill in each instance, and that can stop. Should also be able to give metadata to each datapoint 
 4. Potentially a function that checks if the grammar is correct (could also be done by hand)
 5. A translation function, that converts each generated sample into its correct interpretation. This needs interpretation rules, and a lookup table
 Notes:
 - Use nested dict for lookup table
 """

'\n To generate our dataset, we need a few things:\n 1. Grammar rules (So a PCFG) with all grammar rules\n 2. Lexical rules (a vocabulary)\n 3. A generation mechanism, that probabilistically samples how to fill in each instance, and that can stop. Should also be able to give metadata to each datapoint \n 4. Potentially a function that checks if the grammar is correct (could also be done by hand)\n 5. A translation function, that converts each generated sample into its correct interpretation. This needs interpretation rules, and a lookup table\n Notes:\n - Use nested dict for lookup table\n '

In [0]:
class PCFG_rule:
  def __init__(self, lhs, rhs, prob):
    self.lhs = lhs
    self.rhs = rhs
    self.prob = prob
  
  def __str__(self):
    if isinstance(self.rhs, list):
      return ("({}) {} --> {}".format( self.prob, ' '.join(self.lhs), ' '.join(self.rhs) ))
    else:
      return ("({}) {} --> {}".format( self.prob, ' '.join(self.lhs), self.rhs ))

class PCFG:
  def __init__(self):
    self.rules = {}

  def add(self, lhs, rhs, prob):
    rule = PCFG_rule(lhs, rhs, prob)
    if isinstance(lhs,list):
      lhs = tuple(lhs) # Tuple is hashable
    if lhs in self.rules.keys():
      self.rules[lhs].append(rule)
    else:
      self.rules[lhs] = [rule]

  def select(self, lhs):
    lhs = tuple(lhs)
    return self.rules[lhs]

  def sample(self, lhs):
    # print(lhs, len(lhs))
    assert not ((isinstance(lhs, list) or isinstance(lhs, tuple)) and len(lhs)>1), "Only single words or non-terminals can be filled in."
    if isinstance(lhs, list):
      lhs = tuple(lhs)
    # print("Sampling for ", lhs)
    if not self.rules.get(lhs):
      return lhs, True
    rules = self.rules.get(lhs)
    # print(rules)
    probs = [rule.prob for rule in rules]
    # print("probabilities: ", probs)
    number_of_options = len(rules)

    choice_index = np.random.choice(a = number_of_options, p = probs)
    # print("Selected option: ", rules[choice_index].rhs)
    return rules[choice_index].rhs, False

  def generate(self, sequence):
    # print("Generating for starting sequence ", sequence)
    finished = False
    while not finished:
      finished_list = [False]*len(sequence)
      new_sequence = []
      for i, lhs in enumerate(sequence):
        rhs, finished_list[i] = ruleset.sample(lhs)
        if isinstance(rhs, list):
          new_sequence.extend(rhs)
        elif isinstance(rhs, str):
          new_sequence.append(rhs)
      sequence = new_sequence
      # print("New sequence: ", sequence)
      if all(finished_list):
        finished = True
    return sequence
      


  def __str__(self):
    string = ''
    for key in self.rules.keys():
      for item in self.rules[key]:
        # print(item)
        string += item.__str__()
        # print(item.__str__())
        string += '\n'
    return string

In [4]:
# 1: grammar rules. We will encode this in an array? dict? tuple? 
# dict - tuple: prob (where tuple - [lhs , rhs] for lhs --> rhs)
# Better option: python array, with [lhs, rhs, prob] for each cfg rule
# Where lhs is a list of each character in lhs, and rhs same for rhs
# Actually, best option is to use a dict. Since we will be generating, the lhs should be the key.
# So: dict of {lhs: [[rhs1, prob1], [rhs2, prob2]]}
# rule = PCFG_rule()
ruleset = PCFG()

# Non-terminal rules:

lhs = 'S' # valid sequence is an object: S --> O
rhs = ['A', 'O']
prob = 1
ruleset.add(lhs, rhs, prob)

lhs = 'O' # O --> A O
rhs = ['A', 'O']
prob = 0.15
ruleset.add(lhs, rhs, prob)

lhs = 'O' # O --> A T O
rhs = ['A', 'T', 'O']
prob = 0.05
ruleset.add(lhs, rhs, prob)

lhs = 'O' # O --> N O
rhs = ['N', 'O']
prob = 0.15
ruleset.add(lhs, rhs, prob)

# lhs = 'A' # A --> A N

lhs = 'O' # O --> O & O
rhs = ['O', '&', 'O']
prob = 0.2
ruleset.add(lhs, rhs, prob)



lhs = 'T' # T --> W H
rhs = ['W', 'H']
prob = 1
ruleset.add(lhs, rhs, prob)

# lhs = ['A', 'O'] # A O --> A O & O
# rhs = ['A', 'O', '&', 'O']
# prob = 0.4
# ruleset.add(lhs, rhs, prob)


print('All non-terminal rules: \n', ruleset)

# Problem:
# cut tomato and cut steak should be a valid S
# S --> A O & A O 
# These should also be possibly nested in another action, so
# A O --> A A O & A O
# This reduces to O --> A O & A O, which reduces to O --> O & O 
# Above is true since O --> A O
# O --> O & O should not be possible (why not?)
# This causes line 47 to be ambiguous - unless we use a 'then' word in addition to and
# 'then' would indicate the end of the scope of an action, where 'and' would include the next object

All non-terminal rules: 
 (1) S --> A O
(0.15) O --> A O
(0.05) O --> A T O
(0.15) O --> N O
(0.2) O --> O & O
(1) T --> W H



In [5]:
# All lexical rules:

# Object rules:
lhs = 'O'
objects = ['tomato', 'steak', 'onion', 'potato', 'chicken', 'pork']
obj_probs = [0.1, 0.1, 0.1, 0.05, 0.05, 0.05] # We don't like chicken and pork
# Note: will the eventual model be largely carnivorous or herbivorous?
for rhs, prob in zip(objects, obj_probs):
  ruleset.add(lhs, rhs, prob)
# print(ruleset)

# Action rules:
lhs = 'A'
actions = ['cut', 'fry', 'grill', 'clean', 'boil']
act_probs = [0.3, 0.2, 0.2, 0.2, 0.1]
for rhs, prob in zip(actions, act_probs):
  ruleset.add(lhs, rhs, prob)

# Number rules:
lhs = 'N'
numbers = ['one', 'two', 'three', 'four', 'five']
num_probs = [0.4, 0.3, 0.1, 0.1, 0.1]
for rhs, prob in zip(numbers, num_probs):
  ruleset.add(lhs, rhs, prob)

# Connective rules:
lhs = '&'
connectives = ['and', 'after', 'then']
conn_probs = [0.4, 0.2, 0.4]
for rhs, prob in zip(connectives, conn_probs):
  ruleset.add(lhs, rhs, prob)

# With rule:
lhs = 'W'
w = ['with']
w_prob = 1
ruleset.add(lhs, w, w_prob)

# Tool rules:
# lhs = ['H']
# tools = ['vegknife', 'meatknife', 'fryingpan', 'grill', 'boil', 'season']
# conn_probs = [0., 0.2, 0.4]
# for rhs, prob in zip(connectives, conn_probs):
#   ruleset.add(lhs, rhs, prob)
# How do we make sure these only appear in the correct context?
# Probably easiest to filter the wrong ones out?
# Or use context based rules for generation. Or even substitute actions as 'action with tool'
# strings? Probably context based rules is best, based on intuition.
# Probably we should implement the tools in a different function than in the PCFG

print(ruleset)

(1) S --> A O
(0.15) O --> A O
(0.05) O --> A T O
(0.15) O --> N O
(0.2) O --> O & O
(0.1) O --> tomato
(0.1) O --> steak
(0.1) O --> onion
(0.05) O --> potato
(0.05) O --> chicken
(0.05) O --> pork
(1) T --> W H
(0.3) A --> cut
(0.2) A --> fry
(0.2) A --> grill
(0.2) A --> clean
(0.1) A --> boil
(0.4) N --> one
(0.3) N --> two
(0.1) N --> three
(0.1) N --> four
(0.1) N --> five
(0.4) & --> and
(0.2) & --> after
(0.4) & --> then
(1) W --> with



In [36]:
# Now, part two: generation
# We will generate grammatically correct sequences, where we only leave the tools not filled in.
# We always start from an S, and then randomly pick a way to fill it in, recursively
sequences = []
for i in range(20):
  sequence = ['S']
  sequence = ruleset.generate(sequence)
  sequences.append(sequence)
  print("Sequence {}: {}".format(i,sequence))

""" Observations 8/04:
Sequence 0: ['cut', 'three', 'onion']
Sequence 1: ['fry', 'fry', 'with', 'H', 'clean', 'five', 'pork', 'then', 'tomato']
Sequence 2: ['cut', 'steak']
Sequence 3: ['clean', 'tomato', 'and', 'clean', 'four', 'tomato']
Sequence 4: ['fry', 'pork', 'and', 'steak', 'and', 'clean', 'one', 'one', 'cut', 'tomato', 'and', 'fry', 'pork']
Sequence 5: ['grill', 'steak', 'and', 'two', 'tomato', 'and', 'onion', 'after', 'chicken']
Sequence 6: ['clean', 'pork']
Sequence 7: ['fry', 'potato']
Sequence 8: ['cut', 'potato']
Sequence 9: ['cut', 'potato']
Sequence 10: ['cut', 'fry', 'fry', 'pork']
Sequence 11: ['fry', 'potato']
Sequence 12: ['grill', 'two', 'potato', 'then', 'tomato']
Sequence 13: ['clean', 'potato']
Sequence 14: ['grill', 'five', 'one', 'tomato']
Sequence 15: ['fry', 'steak', 'then', 'pork']
Sequence 16: ['grill', 'fry', 'fry', 'potato', 'and', 'pork', 'then', 'fry', 'with', 'H', 'potato', 'then', 'steak']
Sequence 17: ['cut', 'steak']
Sequence 18: ['boil', 'grill', 'pork', 'then', 'tomato']
Sequence 19: ['fry', 'tomato', 'and', 'cut', 'one', 'potato', 'then', 'onion']

We see sometimes a double number. This is kind of weird for human language, but should be OK in our task.
Also we see single objects, like sequence 18. This should be interpreted as calling the action (boil grill) on both objects
"""

Sequence 0: ['boil', 'five', 'two', 'fry', 'steak', 'and', 'chicken']
Sequence 1: ['clean', 'two', 'cut', 'tomato']
Sequence 2: ['fry', 'steak']
Sequence 3: ['cut', 'pork']
Sequence 4: ['fry', 'tomato']
Sequence 5: ['grill', 'fry', 'tomato', 'and', 'fry', 'onion', 'then', 'tomato', 'then', 'onion']
Sequence 6: ['clean', 'steak']
Sequence 7: ['cut', 'onion']
Sequence 8: ['fry', 'pork']
Sequence 9: ['cut', 'one', 'potato', 'after', 'steak']
Sequence 10: ['fry', 'potato']
Sequence 11: ['clean', 'pork']
Sequence 12: ['grill', 'cut', 'clean', 'fry', 'pork']
Sequence 13: ['grill', 'potato', 'then', 'onion', 'and', 'steak']
Sequence 14: ['cut', 'two', 'boil', 'with', 'H', 'two', 'fry', 'with', 'H', 'clean', 'onion', 'and', 'steak']
Sequence 15: ['clean', 'fry', 'with', 'H', 'boil', 'with', 'H', 'potato']
Sequence 16: ['grill', 'tomato']
Sequence 17: ['fry', 'one', 'clean', 'potato', 'after', 'five', 'onion']
Sequence 18: ['grill', 'onion']
Sequence 19: ['fry', 'two', 'tomato', 'then', 'steak'

" Observations 8/04:\nSequence 0: ['cut', 'three', 'onion']\nSequence 1: ['fry', 'fry', 'with', 'H', 'clean', 'five', 'pork', 'then', 'tomato']\nSequence 2: ['cut', 'steak']\nSequence 3: ['clean', 'tomato', 'and', 'clean', 'four', 'tomato']\nSequence 4: ['fry', 'pork', 'and', 'steak', 'and', 'clean', 'one', 'one', 'cut', 'tomato', 'and', 'fry', 'pork']\nSequence 5: ['grill', 'steak', 'and', 'two', 'tomato', 'and', 'onion', 'after', 'chicken']\nSequence 6: ['clean', 'pork']\nSequence 7: ['fry', 'potato']\nSequence 8: ['cut', 'potato']\nSequence 9: ['cut', 'potato']\nSequence 10: ['cut', 'fry', 'fry', 'pork']\nSequence 11: ['fry', 'potato']\nSequence 12: ['grill', 'two', 'potato', 'then', 'tomato']\nSequence 13: ['clean', 'potato']\nSequence 14: ['grill', 'five', 'one', 'tomato']\nSequence 15: ['fry', 'steak', 'then', 'pork']\nSequence 16: ['grill', 'fry', 'fry', 'potato', 'and', 'pork', 'then', 'fry', 'with', 'H', 'potato', 'then', 'steak']\nSequence 17: ['cut', 'steak']\nSequence 18: [

In [0]:
# In this cell, we define the lookup table for the tools pertaining to (object, action) pairs. We do this through nested dicts
tools = {
  'cut': {
        'onion': 'veg-knife',
        'tomato': 'veg-knife',
        'steak': 'meat-knife',
        'potato': 'veg-knife',
        'chicken': 'chicken-knife',
        'pork': 'meat-knife'
    },
  'fry': {
      'onion': 'frying-pan',
      'tomato': 'frying-pan',
      'steak': 'frying-pan',
      'potato': 'frying-pan',
      'chicken': 'frying-pan',
      'pork': 'frying-pan'
  },
  'grill': {
      'onion': 'griddle',
      'tomato': 'griddle',
      'steak': 'skillet',
      'potato': 'skillet',
      'chicken': 'griddle',
      'pork': 'skillet'
  },
  'clean': {
      'onion': 'peeler',
      'tomato': 'water',
      'steak': 'fillet-knife',
      'potato': 'peeler',
      'chicken': 'fillet-knife',
      'pork': 'fillet-knife'
  },
  'boil': {
      'onion': 'soup-pot',
      'tomato': 'soup-pot',
      'steak': 'stew-pot',
      'potato': 'stew-pot',
      'chicken': 'soup-pot',
      'pork': 'stew-pot'
  },
  'season': {
      'onion': 'salt',
      'tomato': 'pepper',
      'steak': 'rosemary',
      'potato': 'salt',
      'chicken': 'pepper',
      'pork': 'pepper'
  },
}

In [38]:
""" Next, we need to fill in the right tools. This uses some context. 
 We should loop through each sentence, and wherever there is an open H, fill it in according to the action and next object
 Problem: some actions have tools assigned but operate on multiple objects
 --> actions with tools should operate only on single objects.
 Solution: remove everything until nesting ends for those sentences - so until the & before next action
 - This should actually be encoded in the context free grammar. However, this requires two terms in the lhs, so we circumvent that by filtering like this
 """

action_found = False
H_found = False
with_found = False
remove = False
with_index = 0
remove_ending = 0
# print(sequences)
# for seq_index, seq in enumerate(sequences):
#   print(seq, seq_index)
#   for step in range(seq.count('H')): # Do this for each H in the sequence
#     # We loop through all sequences. Seq is now a list of words, with sometimes an H.
#     for token_index, token in enumerate(seq):
#       if token in actions:
#         action = token
#         action_found = True
#         continue
#       if not ((action_found and (token == 'with')) or with_found):
#         # print("With not found")
#         action_found = False
#       elif token == 'with':
#         with_found = True
#         with_index = token_index
#         # print("With found!", action_found, with_found)
#         continue
#       # if (action_found and with_found):
#       #   print("token after with found and action found: ", token)
#       if action_found and with_found and (token == 'H'):
#         # print("H found")
#         H_index = token_index
#         H_found = True
#         continue
#       if action_found and H_found and (token in objects):
#         print("Found token after H and action in sequence: \n", seq)
#         obj = token
#         print("object: ", obj)
#         print("Action: ", action) # these can be used to look up what's needed. Then we remove the tokens up to the next action.
#         tool = tools[action][obj]
#         print("Tool:", tool)
#         seq[H_index] = tool
#         remove = True
#         remove_start = token_index+1
#         rest_of_sequence = seq[remove_start:]
#         print(seq[remove_start:])
#         remove_end = [index-1 for index, token in enumerate(rest_of_sequence) if (token in actions and rest_of_sequence[index-1] in connectives) ]
#         if len(remove_end)>0:
#           remove_ending = remove_end
#         print(remove_ending)
#         # print("a: ", seq[:remove_start])
#         # print("b: ", seq[remove_start+remove_ending:])
#         sequences[seq_index] = seq[:remove_start].extend(seq[remove_start+remove_ending:])

#         action_found = False
#         H_found = False
#         print(sequences[seq_index])

# OK let's try this again. 
# 1.We need to find the H in the sequence.
# 2. Then we need to find the action that preceeds it
# 3. Then we need to find the object that follows it
# 4. Then we need to remove everything up until the next '& action'
for seq_index, seq in enumerate(sequences):
  print(seq, seq_index)
  if 'H' in seq:
    print("Found H")
    H_index = seq.index('H') # This is the index of the first H in the sequence
    # print([(token in actions) for token in seq[::-1]])
    action_index = len(seq) - 1 - [(token in actions) for token in seq[::-1]].index(True) # Create a boolean list of whether something is an action, then select 
    print(H_index, action_index)
    object_index = H_index + [(token in objects) for token in seq[H_index:]].index(True) # Finds first object after H
    # For exclusion, we need to find the first pair of (& A) after the just found object
    next_action = [(token in connectives and seq[object_index+token_index] in actions) for token, token_index in enumerate(seq[object_index:])]
    # Exclusion end should be end of sequence if there is no next action
    if any(next_action):
      exclusion_end = object_index + next_action.index(True)
    else:
      exclusion_end = len(seq)-1
    # The line above finds the &, so we need to remove everything between that and the object.
    print("H index {}, action index {}, object index {}, exclusion end {}".format(H_index, action_index, object_index, exclusion_end))

    # First, we fill in H:
    obj = seq[object_index]
    action = seq[action_index]
    tool = tools[action][obj]
    seq[H_index] = tool
    new_sequence = seq[0:object_index+1].extend(seq[exclusion_end:])
    # sequences[seq_index] = new_sequence
    # print("New se")
    print("New sequence: ", new_sequence) # Gives None!
    

['boil', 'five', 'two', 'fry', 'steak', 'and', 'chicken'] 0
['clean', 'two', 'cut', 'tomato'] 1
['fry', 'steak'] 2
['cut', 'pork'] 3
['fry', 'tomato'] 4
['grill', 'fry', 'tomato', 'and', 'fry', 'onion', 'then', 'tomato', 'then', 'onion'] 5
['clean', 'steak'] 6
['cut', 'onion'] 7
['fry', 'pork'] 8
['cut', 'one', 'potato', 'after', 'steak'] 9
['fry', 'potato'] 10
['clean', 'pork'] 11
['grill', 'cut', 'clean', 'fry', 'pork'] 12
['grill', 'potato', 'then', 'onion', 'and', 'steak'] 13
['cut', 'two', 'boil', 'with', 'H', 'two', 'fry', 'with', 'H', 'clean', 'onion', 'and', 'steak'] 14
Found H
4 9
H index 4, action index 9, object index 10, exclusion end 12
New sequence:  None
['clean', 'fry', 'with', 'H', 'boil', 'with', 'H', 'potato'] 15
Found H
3 4
H index 3, action index 4, object index 7, exclusion end 7
New sequence:  None
['grill', 'tomato'] 16
['fry', 'one', 'clean', 'potato', 'after', 'five', 'onion'] 17
['grill', 'onion'] 18
['fry', 'two', 'tomato', 'then', 'steak', 'and', 'pork', 'a

In [9]:
dicti = {'a': 2, 'b': 3}
print(dicti['a'])

2


In [30]:
a = [2, 3, 4]
print(a.index(5))

ValueError: ignored