<a href="https://colab.research.google.com/github/lennertjansen/nlp2/blob/master/project1/datageneration.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Mount Google Drive in the notebook

In [7]:
# Mount Google Drive into the Colab runtime at /content/drive/.
from google.colab import drive
drive.mount('/content/drive/')

Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).


In [10]:
# Change into the project's code folder (path is relative to the current
# working directory) and list its contents.
%cd drive/My\ Drive/nlp2/code
!ls

/content/drive/.shortcut-targets-by-id/1QXwx2THcNLhcOFb0uOdUp12fL_lxsei0/nlp2/code
'Colab Tutorial'


In [0]:
import numpy as np

In [0]:
"""
 To generate our dataset, we need a few things:
 1. Grammar rules (i.e. a PCFG) covering every non-terminal expansion
 2. Lexical rules (a vocabulary)
 3. A generation mechanism, that probabilistically samples how to fill in each instance, and that can stop. Should also be able to give metadata to each datapoint 
 4. Potentially a function that checks if the grammar is correct (could also be done by hand)
 5. A translation function, that converts each generated sample into its correct interpretation. This needs interpretation rules, and a lookup table
 Notes:
 - Use nested dict for lookup table
 """

In [0]:
class PCFG_rule:
  """A single weighted production ``lhs --> rhs`` of a PCFG.

  The constructor stores its arguments unchanged:
    lhs:  left-hand side; one symbol (str) or a list/tuple of symbols.
    rhs:  right-hand side; one symbol (str) or a list/tuple of symbols.
    prob: probability (weight) attached to the production.
  """

  def __init__(self, lhs, rhs, prob):
    self.lhs = lhs
    self.rhs = rhs
    self.prob = prob

  @staticmethod
  def _side_to_str(side):
    """Render one side of a rule.

    Sequences of symbols are joined with single spaces. A plain string is a
    single symbol and is returned whole, so a multi-character symbol such as
    'NP' is not split into 'N P'.
    """
    if isinstance(side, (list, tuple)):
      return ' '.join(side)
    return side

  def __str__(self):
    """Return the rule formatted as ``(prob) lhs --> rhs``."""
    return "({}) {} --> {}".format(
        self.prob, self._side_to_str(self.lhs), self._side_to_str(self.rhs))

class PCFG:
  """Probabilistic context-free grammar.

  Rules are kept in ``self.rules``, a dict that maps a left-hand side to the
  list of rules sharing it. A left-hand side given as a list is stored under
  the equivalent tuple (lists are not hashable); any other left-hand side
  (typically a str) is used as the key unchanged.
  """

  def __init__(self):
    # lhs key -> list of rule objects exposing ``.rhs`` and ``.prob``.
    self.rules = {}

  @staticmethod
  def _key(lhs):
    """Return the dict key used for ``lhs`` (lists become tuples)."""
    return tuple(lhs) if isinstance(lhs, list) else lhs

  def add(self, lhs, rhs, prob):
    """Register the production ``lhs --> rhs`` with probability ``prob``."""
    rule = PCFG_rule(lhs, rhs, prob)
    self.rules.setdefault(self._key(lhs), []).append(rule)

  def select(self, lhs):
    """Return the list of rules registered for ``lhs``.

    The lookup uses the same key normalisation as ``add``, so a rule added
    under the string 'S' is found by ``select('S')``. If that key is absent,
    the previous behaviour (lookup under ``tuple(lhs)``) is used as a
    fallback.

    Raises:
      KeyError: if no rule is registered under either key.
    """
    key = self._key(lhs)
    if key in self.rules:
      return self.rules[key]
    return self.rules[tuple(lhs)]

  def sample(self, lhs):
    """Randomly pick one expansion for the single symbol ``lhs``.

    Returns:
      ``(rhs, False)`` with the right-hand side of the chosen rule, or
      ``(lhs, True)`` when no rule exists for ``lhs`` (it is a terminal).
      A list ``lhs`` is returned as a tuple in the terminal case.

    Rule probabilities are treated as relative weights and normalised before
    sampling, so a rule set whose probabilities do not sum to exactly 1 is
    still sampled proportionally instead of making numpy raise.

    Raises:
      AssertionError: if ``lhs`` is a list/tuple holding more than one symbol.
      ValueError: if the probabilities do not add up to a positive number.
    """
    assert not (isinstance(lhs, (list, tuple)) and len(lhs) > 1), "Only single words or non-terminals can be filled in."
    key = self._key(lhs)
    rules = self.rules.get(key)
    if not rules:
      return key, True

    weights = np.asarray([rule.prob for rule in rules], dtype=float)
    total = weights.sum()
    if not total > 0:  # also catches NaN
      raise ValueError(
          "Rule probabilities for {!r} must sum to a positive number.".format(key))
    choice_index = np.random.choice(a=len(rules), p=weights / total)
    return rules[choice_index].rhs, False

  def generate(self, sequence):
    """Expand ``sequence`` until every symbol in it is a terminal.

    Each pass replaces every symbol by a sampled expansion; the loop stops
    after the first pass in which no symbol had a rule. The input list is
    not modified; a new list of symbols is returned.
    """
    finished = False
    while not finished:
      finished = True
      new_sequence = []
      for symbol in sequence:
        # Use this grammar's own rules, not a module-level instance.
        rhs, is_terminal = self.sample(symbol)
        finished = finished and is_terminal
        if isinstance(rhs, (list, tuple)):
          # Tuples come back from sample() for list-typed terminals.
          new_sequence.extend(rhs)
        elif isinstance(rhs, str):
          new_sequence.append(rhs)
      sequence = new_sequence
    return sequence

  def __str__(self):
    """Return all rules, one per line, grouped by left-hand side."""
    return ''.join(
        str(rule) + '\n' for rules in self.rules.values() for rule in rules)

In [133]:
# 1: grammar rules. We will encode this in an array? dict? tuple? 
# dict - tuple: prob (where tuple - [lhs , rhs] for lhs --> rhs)
# Better option: python array, with [lhs, rhs, prob] for each cfg rule
# Where lhs is a list of each character in lhs, and rhs same for rhs
# Actually, best option is to use a dict. Since we will be generating, the lhs should be the key.
# So: dict of {lhs: [[rhs1, prob1], [rhs2, prob2]]}
# rule = PCFG_rule()
ruleset = PCFG()

# Non-terminal productions as (lhs, rhs, prob) triples, registered in the
# order listed. Alternatives that are currently switched off stay as comments.
non_terminal_rules = [
  ('S', ['O'], 1),               # S --> O  (a valid sequence is an object)
  ('O', ['A', 'O'], 0.25),       # O --> A O
  ('O', ['A', 'T', 'O'], 0.1),   # O --> A T O
  # ('O', ['N', 'O'], 0.05),     # O --> N O  (disabled)
  # A --> A N                      (idea only: no rhs/prob was ever registered)
  ('O', ['O', '&', 'O'], 0.1),   # O --> O & O
  ('T', ['W', 'H'], 1),          # T --> W H
  # (['A', 'O'], ['A', 'O', '&', 'O'], 0.4),  # A O --> A O & O  (disabled)
]

for lhs, rhs, prob in non_terminal_rules:
  ruleset.add(lhs, rhs, prob)

print('All non-terminal rules: \n', ruleset)

# Problem:
# cut tomato and cut steak should be a valid S
# S --> A O & A O 
# These should also be possibly nested in another action, so
# A O --> A A O & A O
# This reduces to O --> A O & A O, which reduces to O --> O & O 
# Above is true since O --> A O
# O --> O & O should not be possible (why not?)
# This causes line 47 to be ambiguous - unless we use a 'then' word in addition to and
# 'then' would indicate the end of the scope of an action, where 'and' would include the next object

All non-terminal rules: 
 (1) S --> O
(0.25) O --> A O
(0.1) O --> A T O
(0.05) O --> N O
(0.1) O --> O & O
(1) T --> W H



In [134]:
# Lexical (terminal) rules: each vocabulary list is paired with a list of
# probabilities of the same length.

# Objects. Chicken and pork are deliberately made less likely.
# Note: will the eventual model be largely carnivorous or herbivorous?
objects = ['tomato', 'steak', 'onion', 'potato', 'chicken', 'pork']
obj_probs = [0.1, 0.1, 0.1, 0.1, 0.05, 0.05]

# Actions.
actions = ['cut', 'fry', 'grill', 'clean', 'boil']
act_probs = [0.3, 0.2, 0.2, 0.2, 0.1]

# Numbers.
numbers = ['one', 'two', 'three', 'four', 'five']
num_probs = [0.4, 0.3, 0.1, 0.1, 0.1]

# Connectives.
connectives = ['and', 'after', 'then']
conn_probs = [0.4, 0.2, 0.4]

# Register one rule `lhs --> word` per vocabulary entry, in the order
# objects, actions, numbers, connectives.
for lhs, words, weights in (
    ('O', objects, obj_probs),
    ('A', actions, act_probs),
    ('N', numbers, num_probs),
    ('&', connectives, conn_probs),
):
  for rhs, prob in zip(words, weights):
    ruleset.add(lhs, rhs, prob)

# The single "with" rule: W --> with
lhs = 'W'
w = ['with']
w_prob = 1
ruleset.add(lhs, w, w_prob)

# Tool rules (not implemented; draft kept for reference):
# lhs = ['H']
# tools = ['vegknife', 'meatknife', 'fryingpan', 'grill', 'boil', 'season']
# conn_probs = [0., 0.2, 0.4]
# for rhs, prob in zip(connectives, conn_probs):
#   ruleset.add(lhs, rhs, prob)
# Open question: how do we make sure tools only appear in the correct context?
# Options: filter out the wrong ones afterwards, use context-based rules for
# generation, or substitute actions by 'action with tool' strings.
# Context-based rules look best by intuition, and the tools should probably be
# implemented in a separate function rather than inside the PCFG.

print(ruleset)

(1) S --> O
(0.25) O --> A O
(0.1) O --> A T O
(0.05) O --> N O
(0.1) O --> O & O
(0.1) O --> tomato
(0.1) O --> steak
(0.1) O --> onion
(0.1) O --> potato
(0.05) O --> chicken
(0.05) O --> pork
(1) T --> W H
(0.3) A --> cut
(0.2) A --> fry
(0.2) A --> grill
(0.2) A --> clean
(0.1) A --> boil
(0.4) N --> one
(0.3) N --> two
(0.1) N --> three
(0.1) N --> four
(0.1) N --> five
(0.4) & --> and
(0.2) & --> after
(0.4) & --> then
(1) W --> with



In [135]:
# Part two: generation.
# Every sample starts from the start symbol S and is expanded by the grammar
# until no symbol can be rewritten; the tool symbol H has no rule, so it is
# left unfilled in the output.
for i in range(20):
  sequence = ruleset.generate(['S'])
  print("Sequence {}: {}".format(i, sequence))

Sequence 0: ['onion', 'then', 'onion']
Sequence 1: ['onion']
Sequence 2: ['two', 'fry', 'boil', 'five', 'boil', 'steak']
Sequence 3: ['cut', 'with', 'H', 'clean', 'cut', 'chicken', 'and', 'pork']
Sequence 4: ['potato']
Sequence 5: ['steak', 'and', 'steak']
Sequence 6: ['onion', 'and', 'cut', 'one', 'tomato']
Sequence 7: ['cut', 'cut', 'pork']
Sequence 8: ['potato']
Sequence 9: ['tomato']
Sequence 10: ['steak']
Sequence 11: ['clean', 'with', 'H', 'clean', 'onion']
Sequence 12: ['tomato']
Sequence 13: ['cut', 'pork', 'and', 'two', 'tomato']
Sequence 14: ['tomato']
Sequence 15: ['tomato']
Sequence 16: ['potato', 'after', 'steak']
Sequence 17: ['potato']
Sequence 18: ['fry', 'with', 'H', 'potato']
Sequence 19: ['five', 'tomato']
