## This notebook contains all the functions and manually written sentences used to generate (automatically or semi-automatically) the test sets used in *Challenging-SRL*

In [None]:
# Installation hint only: the command is wrapped in a string literal, so
# running this cell installs nothing. Install CheckList beforehand with
# `pip install checklist`.
"""pip install checklist"""

In [5]:
import json
import os
import random

import numpy as np
from checklist.editor import Editor
from checklist.perturb import Perturb

# Predicate identification test set

In [33]:
# All the generated sentences are stored in this folder. It is created here
# when missing, so the later `open(path+..., "w")` calls do not fail.
path="./Data/Predicate_identification/"
os.makedirs(path, exist_ok=True)

### Predicate contraction

Semi-manual creation of the contraction test data.  
Some sentences with predicates that can undergo this phenomenon are written by the author, and CheckList contracts or expands each sentence.  
The list is not exhaustive and can always be expanded.

In [6]:
def create_contractions(data):
    """Pair sentences with their contracted or expanded counterpart.

    The sentences are handed to CheckList's Perturb.perturb together with the
    Perturb.contractions perturbation function.

    :data is a list of sentences

    Returns the `data` attribute of the perturbation result; in this notebook
    that is a list of nested lists [sentence, perturbed_sentence].
    """
    return Perturb.perturb(data, Perturb.contractions).data

def create_dictionary_Format(data,sents):
    """
    Build the output dictionary of the contraction test set.

    Every perturbed sentence is matched to its source sentence through the
    first element of its row in `sents`, not through its position in the list.
    The mapping therefore stays correct when `sents` has fewer rows than
    `data` (for example because a sentence was dropped upstream) or when the
    rows have different lengths.

    :data is {sent:verb_indx}
    :sents is a list of lists [original, perturbed, ...]; only the first
        perturbed variant (position 1) is used.

    Output: a dictionary
    {perturbed_sentence: (original_sentence, verb_indx)}

    Raises KeyError if the original sentence of a row is not a key of `data`.
    """
    new_dict = {}
    for row in sents:
        # A row without a perturbed variant has nothing to store.
        if len(row) < 2:
            continue
        original, perturbed = row[0], row[1]
        new_dict[perturbed] = (original, data[original])
    return new_dict

In [7]:
# These sentences were written by hand, taking inspiration from the
# contractions listed in CheckList's perturbation code.
# Keys are the sentences; values are the verb index (`verb_indx`) that
# create_dictionary_Format carries over into the output dictionary.
# NOTE(review): some sentences contain spelling/grammar slips ("wonderfull",
# "creazy", "had not see"). They are runtime test data and are left untouched
# here; confirm whether they are intentional.

data = {'it\'s a wonderfull day': 1,
        "where did he go?":1,
        "There's some pesto left": 1,
        "He was ought not to do it": 2,
        "I could've tried that as well": 2,
        "They will leave the house": 1,
        "That would be creazy": 1,
        "we are here": 1,
        "Mark had not see that coming": 1,
        "She will be a great candidate": 1,
        "Mary is not a nurse.": 1,
        "He's gone already": 1,
        "I would like some tea": 1,
        "who is there?": 1,
        "I could not eat some food now": 1,
        "We've decided to change house": 1,
        "I must not lose my temper": 1,
        "You might not want to do that": 1
}


In [8]:
# Perturb the hand-written sentences, then key every perturbed sentence to its
# original sentence and verb index.
sents=create_contractions(list(data.keys()))
out_dict=create_dictionary_Format(data,sents)
# Last expression of the cell: shows the dictionary together with its size.
out_dict,len(out_dict)

({'it is a wonderfull day': ("it's a wonderfull day", 1),
  "where'd he go?": ('where did he go?', 1),
  'There is some pesto left': ("There's some pesto left", 1),
  "He was oughtn't to do it": ('He was ought not to do it', 2),
  'I could have tried that as well': ("I could've tried that as well", 2),
  "They'll leave the house": ('They will leave the house', 1),
  "That'd be creazy": ('That would be creazy', 1),
  "we're here": ('we are here', 1),
  "Mark hadn't see that coming": ('Mark had not see that coming', 1),
  "She'll be a great candidate": ('She will be a great candidate', 1),
  "Mary isn't a nurse.": ('Mary is not a nurse.', 1),
  'He is gone already': ("He's gone already", 1),
  "I'd like some tea": ('I would like some tea', 1),
  "who's there?": ('who is there?', 1),
  "I couldn't eat some food now": ('I could not eat some food now', 1),
  'We have decided to change house': ("We've decided to change house", 1),
  "I mustn't lose my temper": ('I must not lose my temper', 1

In [9]:
# Save the contraction test set as JSON in the folder given by `path`.
with open(path+"contracted_sentences.json","w") as f:
    json.dump(out_dict,f)

### Predicate irregular inflections
Semi-automatic creation of sentences using a list of irregularly inflected verbs.  
The list was found online / written by the authors. Example sentences are constructed with the large language model RoBERTa integrated in the CheckList library.

In [10]:
def create_inflected_sentences(irregular_inflections):
    """Generate example sentences for irregular verb forms with a CheckList template.

    :irregular_inflections is a list of the irregular verb forms to be used.
    returns Dict in the shape {label: sentence}, where label is the verb form
    filled into the sentence.
    """
    editor = Editor()
    # Register the verb forms as a lexicon so the template can refer to them as {irr_verb}.
    editor.add_lexicon('irr_verb', irregular_inflections,remove_duplicates=True)

    # Ask for 100 sample sentences; each one is labelled with the verb form it contains.
    # The {mask} slots and the {first_name} slot are filled in by the CheckList editor.
    generated = editor.template('{first_name} {irr_verb} {a:mask} {mask}.',nsamples=100,labels='{irr_verb}')

    # The result is keyed by label, so only one sentence per verb form survives:
    # it can hold far fewer entries than the number of samples.
    return {label: sentence for label, sentence in zip(generated.labels, generated.data)}

In [29]:
# Might take a while because RoBERTa is a big model.
irregular_inflections=['Beheld', 'Dwelt', 'Flung', 'Broadcast', 'Clung', 'Dared', 'Fitted', 'Forgave', 'Grinded', 'Hanged', 'Knelt', 'Laid', 'Led', 'Leant', 'Molten', 'Mistook', 'Proved', 'Rose', 'Sawn', 'Sought', 'Sewed', 'Shaven', 'Slit', 'Snuck', 'Span', 'Spoiled', 'Spring', 'Stuck', 'Strode', 'Struck', 'Swung', 'Torn', 'Undertook', 'Vext', 'Wet', 'Wrote']
# The verbs are lower-cased before they are filled into the template.
irregular_inflections=[x.lower() for x in irregular_inflections]
inflected_sentences=create_inflected_sentences(irregular_inflections)
# Save the generated sentences as JSON in the folder given by `path`.
with open(path+"inflected_sentences.json","w") as f:
    json.dump(inflected_sentences,f)
# Kept as the LAST expression of the cell so the notebook actually displays it;
# placed before the `with` block it was evaluated and silently discarded.
len(inflected_sentences),inflected_sentences

### Typos
For the creation of this dataset we take a list of verbs (possibly transitive) from ChatGPT/the internet. For each verb we add a typo by swapping two characters with the help of CheckList's `Perturb` class. We then use a CheckList template to fill a template with the perturbed verb.

In [34]:
def create_verb_typos(verb_list):
  """Build template sentences whose verb contains a typo.

  Every verb of verb_list is passed through Perturb.add_typos; the perturbed
  forms are then used to fill the template 'They {verb_typos} a {adj} {mask}.'.

  Returns a dict {perturbed_verb: sentence}.
  """
  editor=Editor()

  # Perturb every verb once.
  verb_typos=[]
  for verb in verb_list:
    verb_typos.append(Perturb.add_typos(verb))

  # Lexicons referenced by the template below.
  adjectives=['good', 'bad', 'great', 'terrible','wierd','cool','aweful']
  editor.add_lexicon('verb_typos', verb_typos,remove_duplicates=True)
  editor.add_lexicon('adj', adjectives)

  # As many samples as input verbs, each labelled with the perturbed verb it contains.
  generated = editor.template('They {verb_typos} a {adj} {mask}.',nsamples=len(verb_list), remove_duplicates=True,labels='{verb_typos}')
  return {label: sentence for label, sentence in zip(generated.labels, generated.data)}


In [31]:
# Verbs to perturb: a mix of capitalised irregular forms and lower-case
# base-form verbs.
# NOTE(review): the capitalised entries are not lower-cased here, unlike in
# the irregular-inflection cell, so they appear capitalised mid-sentence in the
# template 'They {verb_typos} ...' - confirm this is intended.
verb_list=['Beheld', 'Flung', 'Broadcast', 'Forgave', 'Grinded', 'Hanged', 'Laid', 'Led', 'Leant', 'Molten', 'Mistook', 'Proved', 'Sawn', 'Sought', 'Sewed', 'Shaven', 'Slit', 'Snuck', 'Span', 'Spoiled', 'Stuck', 'Strode', 'Struck', 'Swung', 'Torn', 'Undertook', 'Vext', 'Wet', 'Wrote','eat', 'drink', 'throw', 'catch', 'write', 'read', 'hit', 'kick', 'open', 'close', 'cook', 'bake', 'paint', 'draw', 'build', 'break', 'repair', 'clean', 'wash', 'drive', 'ride', 'carry', 'lift', 'play', 'sing', 'dance', 'love', 'hate', 'need', 'want', 'like', 'dislike', 'teach', 'learn', 'understand', 'know', 'remember', 'forget', 'help', 'hurt', 'show']

# `sents` is rebound here to the {perturbed_verb: sentence} dictionary.
sents=create_verb_typos(verb_list)

# Save the typo sentences as JSON in the folder given by `path`.
with open(path+"verb_typos_sentence.json","w") as f:
    json.dump(sents,f)

KeyboardInterrupt: 

### Slang
These data were created with the help of ChatGPT.

In [20]:
# Slang test set: each key is a slang verb form, each value an example
# sentence containing it.
# NOTE(review): some keys differ in casing from the form used in the sentence
# (e.g. key "Hafta" vs. "hafta" in the text) - confirm whether downstream
# matching is case-sensitive.
sentences = {
    "wanna": "I wanna go to the movies tonight.",
    "gonna": "I'm gonna meet my friends at the mall.",
    "gotta": "I gotta finish my homework before I can go out.",
    "gimme": "Gimme a slice of pizza, please.",
    "lemme": "Lemme know if you need any help.",
    "dunno": "I dunno what to wear to the party.",
    "tryna": "I'm tryna get in shape for summer.",
    "Ima": "Ima buy a new car next month.",
    "Needa": "I Needa take a break from work.",
    "Hafta": "I hafta leave early today for a doctor's appointment.",
    "Whatcha": "Whatcha doing this weekend?",
    "C'mon": "C'mon, let's go to the park."
}

In [21]:
# Save the slang sentences as JSON in the folder given by `path`.
with open(path+"verbs_slang.json","w") as f:
    json.dump(sentences,f)

### New verbs
These data were created mostly by manually looking for new verbs.

In [18]:
# New-verb test set: each key is a recently coined verb form, each value an
# example sentence containing it.
# NOTE(review): this rebinds `sentences`, replacing the slang dictionary
# defined earlier in the notebook; the cells must be run in order.
# NOTE(review): a few sentences contain text slips ("aweful",
# "another plants"); they are runtime data and left untouched - confirm intent.
sentences = {
"google": "I need to google the address of the restaurant.",
    "zoom": "Let's zoom the meeting instead of meeting in person.",
    "binge-watching": "I'm binge-watching the new TV series this weekend.",
    "adulting": "I don't feel like adulting today, can we just stay in bed?",
    "ghosted": "He ghosted me after our first date and never replied to my messages.",
    "tweeting":"He has been tweeting aweful stuff",
    "flexed": "She flexed her designer bags on social media.",
    "stan": "I stan this new artist, their music is amazing!",
    "greenwash":"They like to greenwash their people",
    "terraform":"If we terraform another plants we have to make sure we build an equal society"
}

In [19]:
# Save the new-verb sentences as JSON in the folder given by `path`.
with open(path+"new_verbs.json","w") as f:
    json.dump(sentences,f)

# ARGUMENT CLASSIFICATION

In [25]:
# All the generated sentences are stored in this folder. It is created here
# when missing, so the later `open(path+..., "w")` calls do not fail.
path="./Data/Argument_classification/"
os.makedirs(path, exist_ok=True)

### Typos 
Here we create perturbed sentences with CheckList.
It is possible to perturb the sentences up to n times.
The datasets with 1 to n typos are all saved as well (to allow a nice comparison).

In [None]:
def add_n_typos(sents,n):
  """
  Apply Perturb.add_typos to every sentence n times in a row.
  Returns a dict like {original: sentence_after_n_passes}
  """
  perturbed = sents
  for _ in range(n):
    # One more pass of typos on top of the previous result.
    perturbed = list(map(Perturb.add_typos, perturbed))
  return dict(zip(sents, perturbed))
  
def create_Multiple_typos_sentences(sents,n):
  """
  Build one typo dictionary per typo level, for the levels 1 to n.
  Returns a dict of dicts {level: add_n_typos(sents, level)}
  """
  return {level: add_n_typos(sents, level) for level in range(1, n+1)}

In [None]:
# Generate 100 template sentences, then derive the typo variants for the
# levels 1 to 4.
# NOTE(review): `keep_original` looks like an option of Perturb.perturb rather
# than of Editor.template - confirm that it has the intended effect here.
editor = Editor()
ret = editor.template('{first_name} and {mask} {mask} {a:mask} {mask}.',nsamples=100,keep_original=True)
correct_sents=ret.data
all_typos_sents=create_Multiple_typos_sentences(correct_sents,4)

# One JSON file per typo level; the level is printed as a progress trace.
for i in all_typos_sents:
  print(i)
  out_name = path+f"sents_{i}_typos.json"
  with open(out_name,"w") as f:
    json.dump(all_typos_sents[i],f)

### NER

In [22]:
def create_NER_sent():
    """Generate name-heavy sentences that all share one fixed gold labelling.

    Every sentence is built from the template
    "<first> <first> <last> saw <first> <first> <last> in <city>". Names and
    cities come from CheckList lexicons: only the first token of each name is
    kept, and only single-word cities are used.

    Returns a dict with a single entry {golden_tags: sentences}, where the key
    is the gold tag sequence written out as a string.
    """
    editor = Editor()
    lexicons = editor.lexicons

    def first_token(names):
        # Keep only the first whitespace-separated token of every name.
        return [name.split()[0] for name in names]

    first1 = first_token(lexicons.male_from.Vietnam + lexicons.female_from.Vietnam)
    first2 = first_token(lexicons.male_from.Indonesia + lexicons.female_from.Nepal)
    last = first_token(lexicons.last_from.Cameroon + lexicons.last_from.Palau)

    # Multi-word cities are dropped so that the location is always one token.
    candidate_cities = lexicons.country_city.Ethiopia + lexicons.country_city.Russia + lexicons.country_city.South_Africa
    cityy = [city for city in candidate_cities if len(city.split())==1]

    t = editor.template(
        ' {first_name} {first_name1} {last_name1} saw {first1_name2} {first1_name3} {last_name4} in {city}',
        first_name=first1,
        first1_name=first2,
        last_name=last,
        city=cityy,
        meta=True,
        nsamples=100,
    )

    # One tag per token of the template; stored as a string so it can be a dict key.
    golden_tags="['B-ARG0', 'I-ARG0', 'I-ARG0', 'B-V', 'B-ARG1', 'I-ARG1', 'I-ARG1', 'B-ARGM-LOC', 'I-ARGM-LOC']"
    return {golden_tags:t.data}


In [23]:
# Generate the NER-style sentences and save them as JSON in the folder given
# by `path`; the single key of `di` is the gold tag sequence as a string.
di=create_NER_sent()
with open(path+"NER_sentences.json","w") as f:
    json.dump(di,f)


## Coreference Study

In [27]:
# Draft example sentences for the coreference study. They are bare string
# expressions: nothing is assigned or written to a file in this cell, and only
# the last one is echoed as the cell output.
"It annoys me the results of the election"
"Those are the keys of luis"

'Those are the keys of luis'

### PP attachment

In [None]:
# Draft sentence pairs for the PP-attachment study; each pair shares a prefix
# and differs in the final prepositional phrase. They are bare list
# expressions: nothing is assigned or written to a file in this cell.
# NOTE(review): the strings contain apparent typos ("wretch", "wit",
# "resturant"); left untouched as runtime data - confirm before use.
["I fixed the car with a red logo","I fixed the car with a wretch"]
["I bought a computer with GPU"," I bought a computer wit bitcoins"]
["I went to the resturant by the Hutsin", "I went to the resturant by bike"]