# This notebook containes all the functions and manually created sentences to further generate (Automatically or semi-automatically) the test sets used in *Challenging-SRL*

In [None]:
pip install checklist

In [6]:
import numpy as np
import json
import random
from checklist.editor import Editor
from checklist.perturb import Perturb

## Predicate identification test set

In [101]:
#All the sentences generated will be stred in this folder:
path="./Data/Predicate_identification/"

### Generate Data for contraction VERB identification.

Creation semi manual of the contraction data test.  
Some sentences with predicates that can be subjected to this phenomena are written by the author and Checklist will contract or expand the sentence.  
The list is not exhaustive and can be always be expanded

In [None]:
def create_contractions(data):
    """This function creates contractions from a list of sentences.
    Either contrated or expanded. Using Checklist Perturbation.contractions

    :data is a list of sententes

    Returns a list of nested list [sent,sent]
    """

    ret = Perturb.perturb(data, Perturb.contractions)


    return ret.data

def create_dictionary_Format(data,sents):
    """ 
    This function is exclusively for formatting a new dictionary.
    
    :data is {sent:verb_indx}
    :sents is list of coupled lists [contracted , expanded]

    Output: A nested dictionary
    {sentence:(contracted_sentence:verb_indx)}
    """
    co=np.array(sents).T[1]
    new_dict = {s: x for s,x in zip(co,data.items())}
    return new_dict

In [None]:
#THIS sentences were manually generated taking inspiration from Checklist perturbator.py where a various contractions are listed

data = {'it\'s a wonderfull day': 1,
        "where did he go?":1,
        "There's some pesto left": 1,
        "He was ought not to do it": 2,
        "I could've tried that as well": 2,
        "They will leave the house": 1,
        "That would be creazy": 1,
        "we are here": 1,
        "Mark had not see that coming": 1,
        "She will be a great candidate": 1,
        "Mary is not a nurse.": 1,
        "He's gone already": 1,
        "I would like some tea": 1,
        "who is there?": 1,
        "I could not eat some food now": 1,
        "We've decided to change house": 1,
        "I must not lose my temper": 1,
        "You might not want to do that": 1
}


In [None]:
sents=create_contractions(list(data.keys()))
out_dict=create_dictionary_Format(data,sents)
out_dict,len(out_dict)

({'it is a wonderfull day': ("it's a wonderfull day", 1),
  "where'd he go?": ('where did he go?', 1),
  'There is some pesto left': ("There's some pesto left", 1),
  "He was oughtn't to do it": ('He was ought not to do it', 2),
  'I could have tried that as well': ("I could've tried that as well", 2),
  "They'll leave the house": ('They will leave the house', 1),
  "That'd be creazy": ('That would be creazy', 1),
  "we're here": ('we are here', 1),
  "Mark hadn't see that coming": ('Mark had not see that coming', 1),
  "She'll be a great candidate": ('She will be a great candidate', 1),
  "Mary isn't a nurse.": ('Mary is not a nurse.', 1),
  'He is gone already': ("He's gone already", 1),
  "I'd like some tea": ('I would like some tea', 1),
  "who's there?": ('who is there?', 1),
  "I couldn't eat some food now": ('I could not eat some food now', 1),
  'We have decided to change house': ("We've decided to change house", 1),
  "I mustn't lose my temper": ('I must not lose my temper', 1

In [None]:
with open(path+"contracted_sentences.json","w") as f:
    json.dump(out_dict,f)

### Predicate irregular inflections
Semi automatic creation of sentences using a list of irregular inflected verb.  
List was found online / written by the authros. Example sentences are contructed with Large Language Model RoBerta integrated in CheckList library.

In [None]:
def create_inflected_sentences(irregular_inflections):
    """This function creates a dictionary in the shape {label:sentence}. Where label is the inflected verb form.
    :irregular_inflections is a list of the irregular verbs to be used.
    returns Dict
    """
    editor = Editor()
    #This will add the irregular verbs to the lexicon so that we can use them in the template.
    editor.add_lexicon('irr_verb', irregular_inflections,remove_duplicates=True)

    #This will create 1000 samples sentences and their lables will be the irregular verb picked.
    #{mask} is a special token that will be replaced by a random word suggested by the Language model.
    #{fist_name} is a special token that will be replaced by a random first name in the lexicon.
    ret = editor.template('{first_name} {irr_verb} {a:mask} {mask}.',nsamples=100,labels='{irr_verb}')

    #This creates a dictionary in the shape {label:sentence}. Where label is the verb.
    #Carefull, this will be much smaller than the number of samples because some of the sentences will be duplicates.
    inflected_sentences=dict(zip(ret.labels,ret.data))


    return inflected_sentences

In [None]:
irregular_inflections=['Beheld', 'Dwelt', 'Flung', 'Broadcast', 'Clung', 'Dared', 'Fitted', 'Forgave', 'Grinded', 'Hanged', 'Knelt', 'Laid', 'Led', 'Leant', 'Molten', 'Mistook', 'Proved', 'Rose', 'Sawn', 'Sought', 'Sewed', 'Shaven', 'Slit', 'Snuck', 'Span', 'Spoiled', 'Spring', 'Stuck', 'Strode', 'Struck', 'Swung', 'Torn', 'Undertook', 'Vext', 'Wet', 'Wrote']

In [None]:
inflected_sentences=create_inflected_sentences(irregular_inflections)
len(inflected_sentences),inflected_sentences

(35,
 {'Leant': 'Christopher Leant a H __.',
  'Knelt': 'Elizabeth Knelt an Old Woman.',
  'Snuck': 'Katherine Snuck an Old Swan.',
  'Rose': 'Al Rose a L Key.',
  'Forgave': 'Al Forgave a New Thing.',
  'Span': 'Joan Span a New Order.',
  'Strode': 'Lauren Strode a L Stand.',
  'Sewed': 'Christopher Sewed a L 3.',
  'Undertook': 'Steve Undertook an Old Rose.',
  'Led': 'Peter Led a H in.',
  'Dwelt': 'Frances Dwelt a L No.',
  'Swung': 'Paul Swung a H Band.',
  'Sawn': 'Fiona Sawn an Old Girl.',
  'Vext': 'Gary Vext an A Rose.',
  'Flung': 'Frederick Flung a H Bar.',
  'Wet': 'Donald Wet a R Road.',
  'Slit': 'Catherine Slit a H Ranch.',
  'Fitted': 'Henry Fitted a L 0.',
  'Mistook': 'Emily Mistook an In A.',
  'Dared': 'Jennifer Dared an Old Friend.',
  'Spoiled': 'Al Spoiled a L Stand.',
  'Shaven': 'Sue Shaven a H in.',
  'Sought': 'Martha Sought a L Ring.',
  'Molten': 'Bill Molten an Old Thing.',
  'Hanged': 'Susan Hanged an Old Car.',
  'Stuck': 'Frank Stuck a R Post.',
  'Prov

In [102]:
with open(path+"inflected_sentences.json","w") as f:
    json.dump(inflected_sentences,f)

FileNotFoundError: ignored

# Typos
For the creation of the this dataset we take a list of verbs (possibly transitive) from ChatGPT/the internet. For each verb we add a typo by switching two characters with the help of Checklist perturbator class. We then use Checklist template to fill a tamplate with the perturbated verb.

In [117]:
def create_verb_typos(verb_list):
  """This function creates a list of sentences with a perturbate verbs (typos).
  It first generated  the list of wrong verbs from the input list and then create as many sentences.
  """
  editor=Editor()
  verb_typos=[Perturb.add_typos(x) for x in verb_list]

  editor.add_lexicon('verb_typos', verb_typos,remove_duplicates=True)
  editor.add_lexicon('adj', ['good', 'bad', 'great', 'terrible','wierd','cool','aweful'])

  ret = editor.template('They {verb_typos} a {adj} {mask}.',nsamples=len(verb_list), remove_duplicates=True,labels='{verb_typos}')
  return dict(zip(ret.labels, ret.data))


In [118]:
verb_list=['Beheld', 'Flung', 'Broadcast', 'Forgave', 'Grinded', 'Hanged', 'Laid', 'Led', 'Leant', 'Molten', 'Mistook', 'Proved', 'Sawn', 'Sought', 'Sewed', 'Shaven', 'Slit', 'Snuck', 'Span', 'Spoiled', 'Stuck', 'Strode', 'Struck', 'Swung', 'Torn', 'Undertook', 'Vext', 'Wet', 'Wrote','eat', 'drink', 'throw', 'catch', 'write', 'read', 'hit', 'kick', 'open', 'close', 'cook', 'bake', 'paint', 'draw', 'build', 'break', 'repair', 'clean', 'wash', 'drive', 'ride', 'carry', 'lift', 'play', 'sing', 'dance', 'love', 'hate', 'need', 'want', 'like', 'dislike', 'teach', 'learn', 'understand', 'know', 'remember', 'forget', 'help', 'hurt', 'show']


In [119]:
sents=create_verb_typos(verb_list)
#sents

In [100]:
with open(path+"verb_typos_sentence.json","w") as f:
    json.dump(inflected_sentences,f)

NameError: ignored

# ARGOUMENTS CLASSIFICATION

## ROBUSTNESS

In [None]:
editor = Editor()
#This will add the irregular verbs to the lexicon so that we can use them in the template.
#editor.add_lexicon('irr_verb', irregular_inflections,remove_duplicates=True)

#This will create 1000 samples sentences and their lables will be the irregular verb picked.
#{mask} is a special token that will be replaced by a random word suggested by the Language model.
#{fist_name} is a special token that will be replaced by a random first name in the lexicon.
ret = editor.template('{first_name} saw {a:mask} {mask}.',nsamples=5,keep_original=True)

#This creates a dictionary in the shape {label:sentence}. Where label is the verb.
#Carefull, this will be much smaller than the number of samples because some of the sentences will be duplicates.
#inflected_sentences=dict(zip(ret.labels,ret.data))
ret

  to_pred = torch.tensor(to_pred, device=self.device).to(torch.int64)


MunchWithAdd({'data': ['Jason saw an empty room.', 'Alice saw an innocent child.', 'Jerry saw an imminent threat.', 'Paul saw an old man.', 'Deborah saw an empty building.']})