# This notebook contains all the functions and manually created sentences used to generate (automatically or semi-automatically) the test sets used in *Challenging-SRL*

In [30]:
import numpy as np
import json
import random
from checklist.editor import Editor
from checklist.perturb import Perturb

## Predicate identification test set

In [52]:
# Output folder: every generated sentence file is stored here.
path = "./Data/Predicate_identification/"

### Generate Data for contraction VERB identification.

Semi-manual creation of the contraction test data.  
Some initial sentences containing predicates that are subject to this phenomenon are written by the author, and CheckList contracts or expands each sentence.  
The list is not exhaustive and can always be expanded.

In [217]:
def create_contractions(data):
    """Pair every sentence with its contracted (or expanded) counterpart.

    The sentences are handed to CheckList's ``Perturb.perturb`` together with
    the ``Perturb.contractions`` perturbation, and the resulting groups are
    wrapped in a one-key dictionary.

    :data is a list of sentences
    Returns a Dict in the shape {'contracted_couples':[[s1,s2]]}
    """
    perturbed = Perturb.perturb(data, Perturb.contractions)
    result = dict(contracted_couples=perturbed.data)
    return result

In [218]:
# Hand-written seed sentences; each one contains a sequence that can be
# contracted ("would" -> "'d") or expanded ("it's" -> "it is").
data = [
    "it's a wonderfull day",
    "There's some pesto left",
    # NOTE(review): this sentence has no counterpart in the recorded output of
    # create_contractions(data) - presumably the "shoud" typo prevents a match; confirm.
    "I shoud have tried that as well",
    "They will leave the house",
    "That would be creazy",
    "we are here",
    "Mark had not see that coming",
    "She will be a great candidate",
    "Mary is not a nurse.",
    "He's gone already",
    "I would like some tea",
    "I would say something to them",
    "I could not eat some food now",
    "We've decided to change house",
    "I must not lose my temper",
]

In [240]:
contracted_sentences = create_contractions(data)
# Display the size of the returned dict followed by its content.
# len() counts the dict's keys here (a single 'contracted_couples' key), not the sentence pairs.
(len(contracted_sentences), contracted_sentences)

(1,
 {'contracted_couples': [["it's a wonderfull day", 'it is a wonderfull day'],
   ["There's some pesto left", 'There is some pesto left'],
   ['They will leave the house', "They'll leave the house"],
   ['That would be creazy', "That'd be creazy"],
   ['we are here', "we're here"],
   ['Mark had not see that coming', "Mark hadn't see that coming"],
   ['She will be a great candidate', "She'll be a great candidate"],
   ['Mary is not a nurse.', "Mary isn't a nurse."],
   ["He's gone already", 'He is gone already'],
   ['I would like some tea', "I'd like some tea"],
   ['I would say something to them', "I'd say something to them"],
   ['I could not eat some food now', "I couldn't eat some food now"],
   ["We've decided to change house", 'We have decided to change house'],
   ['I must not lose my temper', "I mustn't lose my temper"]]})

In [220]:
# Save the contraction pairs as JSON in the output folder.
with open(path + "contracted_sentences.json", "w") as json_file:
    json.dump(contracted_sentences, json_file)

### Predicate irregular inflections
Semi-automatic creation of sentences using a list of irregularly inflected verbs.  
The list was found online / written by the authors. Example sentences are constructed with the large language model RoBERTa integrated in the CheckList library.

In [235]:
def create_inflected_sentences(irregular_inflections, nsamples=100):
    """Generate example sentences for irregular verb forms with a CheckList template.

    :irregular_inflections is a list of the irregular verb forms to be used.
    :nsamples is the number of sentences sampled from the template
     (defaults to 100, the value that used to be hard-coded).
    Returns a Dict in the shape {label:sentence}, where label is the inflected verb form.
    Only one sentence is kept per verb form, so the result has at most one
    entry per distinct verb.
    """
    editor = Editor()
    # Register the verb forms as a lexicon so that the template can refer to them as {irr_verb}.
    editor.add_lexicon('irr_verb', irregular_inflections, remove_duplicates=True)

    # Sample `nsamples` sentences; the label of each sentence is the irregular verb picked for it.
    # {first_name} is a special token that is replaced by a random first name from the lexicon.
    # {mask} is a special token that is replaced by a word suggested by the language model.
    ret = editor.template('{first_name} {irr_verb} {a:mask} {mask}.', nsamples=nsamples, labels='{irr_verb}')

    # Build the {label: sentence} dictionary.
    # Careful: the same verb is usually sampled several times, and dict() keeps only the
    # last sentence seen for each label, so the result is much smaller than `nsamples`.
    inflected_sentences = dict(zip(ret.labels, ret.data))

    return inflected_sentences

In [232]:
# Verb forms fed to create_inflected_sentences as the 'irr_verb' lexicon.
irregular_inflections = [
    "Beheld", "Dwelt", "Flung", "Broadcast", "Clung", "Dared",
    "Fitted", "Forgave", "Grinded", "Hanged", "Knelt", "Laid",
    "Led", "Leant", "Molten", "Mistook", "Proved", "Rose",
    "Sawn", "Sought", "Sewed", "Shaven", "Slit", "Snuck",
    "Span", "Spoiled", "Spring", "Stuck", "Strode", "Struck",
    "Swung", "Torn", "Undertook", "Vext", "Wet", "Wrote",
]

In [241]:
inflected_sentences = create_inflected_sentences(irregular_inflections)
# Display how many verb forms received a sentence, followed by the sentences themselves.
(len(inflected_sentences), inflected_sentences)

(35,
 {'Leant': 'Christopher Leant a H __.',
  'Knelt': 'Elizabeth Knelt an Old Woman.',
  'Snuck': 'Katherine Snuck an Old Swan.',
  'Rose': 'Al Rose a L Key.',
  'Forgave': 'Al Forgave a New Thing.',
  'Span': 'Joan Span a New Order.',
  'Strode': 'Lauren Strode a L Stand.',
  'Sewed': 'Christopher Sewed a L 3.',
  'Undertook': 'Steve Undertook an Old Rose.',
  'Led': 'Peter Led a H in.',
  'Dwelt': 'Frances Dwelt a L No.',
  'Swung': 'Paul Swung a H Band.',
  'Sawn': 'Fiona Sawn an Old Girl.',
  'Vext': 'Gary Vext an A Rose.',
  'Flung': 'Frederick Flung a H Bar.',
  'Wet': 'Donald Wet a R Road.',
  'Slit': 'Catherine Slit a H Ranch.',
  'Fitted': 'Henry Fitted a L 0.',
  'Mistook': 'Emily Mistook an In A.',
  'Dared': 'Jennifer Dared an Old Friend.',
  'Spoiled': 'Al Spoiled a L Stand.',
  'Shaven': 'Sue Shaven a H in.',
  'Sought': 'Martha Sought a L Ring.',
  'Molten': 'Bill Molten an Old Thing.',
  'Hanged': 'Susan Hanged an Old Car.',
  'Stuck': 'Frank Stuck a R Post.',
  'Prov

In [238]:
# Save the {verb form: sentence} dictionary as JSON in the output folder.
with open(path + "inflected_sentences.json", "w") as json_file:
    json.dump(inflected_sentences, json_file)