<a href="https://colab.research.google.com/github/GabHoo/Challenging-SRL/blob/main/Test_Models.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# SETTING UP
Run the following cells in the Settin Up section to be able to run any of the tests in this notebook with the two suggested models.  
Change the current value for both model path according to your own models location. If No models are found they will be downloaded

In [2]:
# INSTALL AND IMPORTS
"""
pip install allennlp
pip install allennlp-models
pip install -U spaCy
pip install checklist
"""
from allennlp.predictors import Predictor
import allennlp_models.tagging

import json
import os
from utils import *
import utils
import re


### Change the model here:

In [10]:
#LOADING MODEL
model="Bert"

if model=="Bert":
    model_name="structured-prediction-srl-bert.2020.12.15.tar.gz"
    path_model="models/"+model_name
elif model=="Bilstm":
    model_name="openie-model.2020.03.26.tar.gz"
    path_model="models/"+model_name
else:
    print("Model not found!")
    exit()

if os.path.exists(path_model):
    print("Model found!")
    predictor = Predictor.from_path(path_model)
else:
    predictor = Predictor.from_path("https://storage.googleapis.com/"+model_name)

#TESTING IF THE MODELS ARE LOADED CORRECTLY

pred=predictor.predict("SRL model was loaded succeffully!")
if pred :
    print((" ").join(pred['words'])) 
    

Model found!


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


SRL model was loaded succeffully !


# Useful functions


In [4]:
def append_to_results(results_path,your_result):
    """Add results to the score board. 
    your_Results neets to be a dictionary alraedy'
    """
    if type(your_result) != dict:
        print("your_result needs to be a dictionary!")
        return
    
    with open(results_path, 'r') as f:
        data = json.load(f)
    
    data.update(your_result)

    with open(results_path, 'w') as f:
        json.dump(data, f,indent=4)


# PREDICATE IDENTIFICATION

In [4]:
#In this folder we have the data for the predicate classidication task. Change the path accordingly if files were moved.
path="./Data/Predicate_identification/"
results_path=f"./results_{model}_Predicate_Identification.json"
## we initialize also the dictionary that will contain the results
with open(results_path, 'w') as f:
    json.dump({"test":"failure_rate"},f)

## VOCABULARY+POS

### Test for contractions

In [5]:
data=json.load(open(path+'contracted_predicates.json'))
failure_rate=evaluate_PI_contractions_INV(predictor,data)
print(f"Failure rate: {failure_rate}. Total number of tests: {len(data)}")
append_to_results(results_path,{"contractions":failure_rate})

Failure rate: 0.0. Total number of tests: 18


### Test for irregular inflections

In [6]:
data=json.load(open(path+"inflected_predicates.json"))
failure_rate=evaluate_PI_inflections_MFT(predictor,data)
print(f"Failure rate: {failure_rate}. Total number of tests: {len(data)}")
append_to_results(results_path,{"inflected_sentences":failure_rate})

Failed for: Gary vext an innocent soul. did not detect vext

Failure rate: 2.857142857142857. Total number of tests: 35


## ROBUTSTNESS

In [37]:
data=json.load(open(path+"verb_typos_sentence.json"))
failure_rate=evaluate_PI_inflections_MFT(predictor,data)
print(f"Failure rate: {failure_rate}. Total number of tests: {len(data)}")
append_to_results(results_path,{"inflected_sentences":failure_rate})


Failed for: They aet a cool lot. did not detect aet

Failure rate: 1.8867924528301887. Total number of tests: 53


## AMBIGUITY

### Experiment with polysemic verbs [POLISEMIC]

In [7]:
data=json.load(open(path+'polysem_verbs_sentences.json'))
failure_rate=evaluate_PI_Polysem_DIR(predictor,data)
print(f"Failure rate: {failure_rate}. Total number of tests: {len(data)}")
append_to_results(results_path,{"polysemic_verbs":failure_rate})



Failed for: I always turn left at the stop sign. ... left found as a verb 
Failure rate: 3.4482758620689653. Total number of tests: 29


### Experiment with verbs being in different roles -ing [GERUNDS]

In [None]:
data=json.load(open(path+"gerunds.json"))
failure_rate=find_roleset_MFT(data,predictor,verboose=True)
print(f"\nFailure rate: {failure_rate}. Total number of tests: {len(data)}")
append_to_results(results_path,{"gerunds":failure_rate})

## RARITY

### SLANG

In [None]:
data=json.load(open(path+"verbs_slang.json"))
failure_rate=evaluate_PI_inflections_MFT(predictor,data)
print(f"Failure rate: {failure_rate}. Total number of tests: {len(data)}")
append_to_results(results_path,{"slang_verbs":failure_rate})


### NEW WORDS

In [None]:
data=json.load(open(path+"new_verbs.json"))
failure_rate=evaluate_PI_inflections_MFT(predictor,data)
print(f"Failure rate: {failure_rate}. Total number of tests: {len(data)}")
append_to_results(results_path,{"new_verbs":failure_rate})


# AROUGMENTS CLASSIFICATION

In [5]:
#In this folder we have the data for the predicate classidication task. Change the path accordingly if files were moved.
path="./Data/Argument_classification/"
results_path=f"./results_{model}_Argument_Classification.json"
## we initialize also the dictionary that will contain the results
with open(results_path, 'w') as f:
    json.dump({"test_name":"failure_rate"},f)

## VOCABULAIRTY+POS

###  Entity

In [None]:
with open(path+f"FirstNames_sents.json", 'r') as f:
        sentences = json.load(f)

labels=sentences["labels"]
sentences=sentences['data']
failure=eval_full_sent_BIOtags(sentences,labels,predictor,verbose=True)

print(f"\nRate of failure: ",failure,"Total number of example: ",len(sentences),"\n")

append_to_results(results_path,{f"FistNames":failure})


### Pronouns

In [None]:
with open(path+f"Pronouns_sents.json", 'r') as f:
        sentences = json.load(f)

labels=sentences["labels"]
sentences=sentences['data']
failure=eval_full_sent_BIOtags(sentences,labels,predictor,verbose=True)

print(f"\nRate of failure: ",failure,"Total number of example: ",len(sentences),"\n")

append_to_results(results_path,{f"Pronouns":failure})


## AMBIGUITY/TAXONOMY (PP-ATTACHMENT AMBIGUITY)

### PP-ATTACHMENT AMBIGUITY INV

In [55]:
with open(path+f"Inv_PPattachments.json", 'r') as f:
        sentences = json.load(f)

In [59]:
def eval_PP_INV(sentences,predictor,verbose=False):
    """
    Evaluate the model on the PP attachment  based on partial pos tags. Lables given are infact only the one of the PP
    """
    failure=0
    for c in sentences:
        s1,s2=c.keys()
        pred1=predictor.predict(s1)
        pred2=predictor.predict(s2)
        labels1=c[s1]
        labels2=c[s2]
        ll1=len(labels1)
        ll2=len(labels2)
        if pred1['verbs'][0]['tags'][-ll1:]!=labels1:
            failure+=1
            if verbose:
                print(f"Input sentences: {s1}")
                print(f"Predicted labels for PP: {pred1['verbs'][0]['tags'][-ll1:]} but should have been {labels1}")
            continue
        if pred2['verbs'][0]['tags'][-ll2:]!=labels2:
            failure+=1
            if verbose:                             
                print(f"Input sentences: {s2}")
                print(f"Predicted labels for PP: {pred2['verbs'][0]['tags'][-ll2:]} but should have been {labels2}")
                      

    return failure/len(sentences)*100

In [60]:
rate=eval_PP_INV(sentences,predictor,verbose=True)
print(f"\nRate of failure: ",rate,"Total number of example: ",len(sentences),"\n")
append_to_results(results_path,{f"PP_INV":rate})


Input sentences: I fixed the car with a red logo
Predicted labels for PP: ['B-ARG2', 'I-ARG2', 'I-ARG2', 'I-ARG2'] but should have been ['I-ARG1', 'I-ARG1', 'I-ARG1', 'I-ARG1']
Input sentences:  I bought a computer with bitcoins
Predicted labels for PP: ['I-ARG1', 'I-ARG1'] but should have been ['B-ARGM-MNR', 'I-ARGM-MNR']
Input sentences: I drink whiskey with soda
Predicted labels for PP: ['B-ARGM-COM', 'I-ARGM-COM'] but should have been ['I-ARG1', 'I-ARG1']

Rate of failure:  50.0 Total number of example:  6 



#### Big PP test

In [72]:
with open(path+"PP_proceesed_test.json","r") as f:
    di=json.load(f)

print(len(di))

rate=eval_PP_MFT(di,predictor,verbose=False)
print(f"\nRate of failure: ",rate,"Total number of example: ",len(di),"\n")
append_to_results(results_path,{f"PP_MFT":rate})

1826
no verbs found
no verbs found
no verbs found
no verbs found
no verbs found
no verbs found
no verbs found
no verbs found
no verbs found
no verbs found
no verbs found
no verbs found
no verbs found
no verbs found
no verbs found
no verbs found
no verbs found
no verbs found
no verbs found
no verbs found
no verbs found
no verbs found
no verbs found
no verbs found
no verbs found
no verbs found
no verbs found
no verbs found
no verbs found
no verbs found
no verbs found
no verbs found
no verbs found
no verbs found
no verbs found
no verbs found
no verbs found
no verbs found
no verbs found
no verbs found
no verbs found
no verbs found
no verbs found
no verbs found
no verbs found
no verbs found
no verbs found
no verbs found
no verbs found
no verbs found
no verbs found
no verbs found
no verbs found
no verbs found
no verbs found
no verbs found
no verbs found
no verbs found
no verbs found
no verbs found
no verbs found
no verbs found
no verbs found
no verbs found
no verbs found
no verbs found
no ve

## SPAN IDENTIFICATION

### LONG NER

In [None]:

with open(path+"NER_sentences.json","r") as f:
    di=json.load(f)

golden=di['labels']
sentences=di['data']

rate=eval_full_sent_BIOtags(sentences,golden,predictor,verbose=False)
print(f"\nFailure rate: {rate}. Total number of tests: {len(sentences)}")
append_to_results(results_path,{f"NER_sents":rate})


### LONG SPAN ADJECTIVES

In [13]:
def eval_spanDetection(sents,start_indx,end_indx,predictor,verbose=False):
    """
    This function evaluates the span detection task. It takes as input a list of sentences, a list of labels, the start and end index of the span.
    If all element between the index are not detected in the same span, is a failure.
    returns failure rate.
    """
    fails=0
    for s in sents:
        pred=predictor.predict(s)
        preds=pred['verbs']
        found=False
        for p in preds: #looking for every predicate in the sentence
            print(p)
            span=p['tags'][start_indx:end_indx]
            #print(span)
            span=[x.split('-')[1] if x!='O' else 'O' for x in span]
            print(span)
            if len(set(span))==1: #if the span is all the same label
                found=True
            else:
                continue
        if found==False:
            fails+=1
            if verbose:
                print("\nThe span was never detected")
                print(pred)
    return (fails/len(sents)*100)

        

In [None]:
with open(path+"longspan_sents.json","r") as f:
    di=json.load(f)
sents=di['data']
sents=sents[:30]
start,end=di["indexes"]
rate=eval_spanDetection(sents,start,end,predictor,verbose=True)
print(f"\nFailure rate: {rate}. Total number of tests: {len(sents)}")
append_to_results(results_path,{f"longspan":rate})

In [36]:
span

NameError: name 'span' is not defined

## ROBUSTNESS

### Typos

In [None]:
for i in range(1,5):#becuase we have 4 files like that

    with open(path+f"sents_{i}_typos.json", 'r') as f:
        sentences = json.load(f)

    labels=sentences.pop("labels")
    sentences=list(sentences.values())
    rate_typos_n=eval_full_sent_BIOtags(sentences,labels,predictor,verbose=False)

    print(f"Rate of failure with {i} typos per sentence: ",rate_typos_n)

    append_to_results(results_path,{f"sents_{i}_typos":rate_typos_n})


## PARAPHRASING

### passive and active trasnformarion

In [20]:
with open(path+"activepassive_sentences.json", 'r') as f:
    sentences = json.load(f)


labelsActive=sentences.pop("labelsActive")
labelsPassive=sentences.pop("labelsPassive")

rate=eval_full_sent_BIOtags_INV(sentences,labelsActive,labelsPassive,predictor,verbose=False)

print(f"\nRate of failure: ",rate,"% Total number of example: ",len(sentences),"\n")

append_to_results(results_path,{f"ActivePassive":rate})


Error
[ARG0: The waiter] [V: served] [ARG1: the meal] . != ['B-ARG0', 'I-ARG0', 'B-V', 'B-ARG1', 'I-ARG1', 'O']
[ARG2: The meal] was [V: served] [ARG0: by the waiter] . != ['B-ARG1', 'I-ARG1', 'O', 'B-V', 'B-ARG0', 'I-ARG0', 'I-ARG0', 'O']




Rate of failure:  5.0 % Total number of example:  20 



# The cimitery of tests... [DONT RUN]





### INCISI

In [194]:
sentences=["This, friend, is epic ","This friend is epic"]
for s in sentences:
  print(s)
  pred=predictor_Bert.predict(s)
  desc=[x['description'] for x in pred['verbs']]
  #print(verbs)
  print(desc)
  print("\n\n")

This, friend, is epic 
['[ARG1: This ,] [ARGM-DIS: friend] , [V: is] [ARG2: epic]']



This friend is epic
['[ARG1: This friend] [V: is] [ARG2: epic]']





In [None]:
(I might stay, tonight, with you)

### DIfferent frames for labels recognition
With mutiple verbs, argoument confusion 

### Saxon genitive

In [None]:
paired_sentences = [    ["The president's decision to withdraw troops from the region caused controversy among military leaders.",      "The decision of the president to withdraw troops from the region caused controversy among military leaders."],
    ["The CEO's success in turning around the struggling company was due in large part to her innovative marketing strategies.",      "The success of the CEO in turning around the struggling company was due in large part to her innovative marketing strategies."],
    ["The director's vision for the film was not fully realized due to budget constraints and scheduling conflicts.",      "The vision of the director for the film was not fully realized due to budget constraints and scheduling conflicts."],
    ["The professor's lecture on the history of ancient Rome was attended by a packed auditorium of eager students.",      "The lecture on the history of ancient Rome of the professor was attended by a packed auditorium of eager students."],
    ["The artist's latest work, a sculpture made entirely of recycled materials, was featured in a prominent gallery exhibition.",      "The latest work of the artist, a sculpture made entirely of recycled materials, was featured in a prominent gallery exhibition."],
    ["The athlete's rigorous training regimen and strict diet led to her record-breaking performance at the championship.",      "The rigorous training regimen and strict diet of the athlete led to her record-breaking performance at the championship."],
    ["The author's use of symbolism and metaphor in her novel added depth and complexity to the story.",      "The use of symbolism and metaphor in her novel of the author added depth and complexity to the story."]
]


In [None]:

### Questions phrasing
#Idea for testing is to take all the arguments from from the verb that is the same(not the auxialiry).
# All the arguments should be the same except for the verb.
with open(path+"question_rephrase.json", 'r') as f:
    sentences = json.load(f)

sentences=list(sentences.items())

rate=eval_predictor_INV(sentences,predictor,"evaluate_INV_sameArgs")

print(f"Rate of failure with for INV in question rephrasing task ",rate)

append_to_results(results_path,{f"question_rephrase":rate})

### Intransitive verbs

Intransitive verbs can be tricky:
So there Unaccusative and Uneragtive are intrasntivie verbs. The difference is that unergative the subject is considered an againt, initating the action. whereas unaccusative the subject is considered the patient of the action.
Naomi worked [argent]
Naomi fell  [theme/patinet]

an MFT test can just intransitive verbs being used as transitive verbs.

and a directionality test can be used to see if the model is able to recognize the difference between unergative and unaccusative verbs. 
like the butter melted - the butter smells


In [None]:
for s in sentences:
  print(s)
  pred=predictor.predict(s)
  desc=[x['description'] for x in pred['verbs']]
  #print(verbs)
  print(desc)
  print("\n\n")

The sun rises in the east.
['[ARG1: The sun] [V: rises] [ARGM-LOC: in the east] .']



He slept for 10 hours.
['[ARG0: He] [V: slept] [ARGM-TMP: for 10 hours] .']



They danced all night.
['[ARG0: They] [V: danced] [ARGM-TMP: all night] .']



The flowers bloomed in the spring.
['[ARG0: The flowers] [V: bloomed] [ARGM-TMP: in the spring] .']



The baby cried all night.
['[ARG0: The baby] [V: cried] [ARGM-TMP: all night] .']



The cat meowed loudly.
['[ARG0: The cat] [V: meowed] [ARGM-MNR: loudly] .']



The car stopped suddenly.
['[ARG1: The car] [V: stopped] [ARGM-MNR: suddenly] .']



She ran around the park.
['[ARG0: She] [V: ran] [ARGM-DIR: around the park] .']



The plane took off at 9 am.
['[ARG1: The plane] [V: took] off [ARGM-TMP: at 9 am] .']



The wind blew fiercely.
['[ARG1: The wind] [V: blew] [ARGM-MNR: fiercely] .']



The tree swayed in the wind.
['[ARG1: The tree] [V: swayed] [ARGM-LOC: in the wind] .']



The river flows downstream.
['[ARG1: The river] [V: flows] 

In [None]:
sentences = ['The snow fell gently from the sky.',
             'She laughed uncontrollably at the joke.',
             'The sun set over the horizon.',
             'The children played happily in the park.',
             'The fire burned brightly in the fireplace.',
             'The wind howled outside the window.',
             'The cake baked for an hour.',
             'The door opened slowly.',
             'The crowd cheered loudly at the concert.',
             'The athlete ran around the track.',
             'The tree grew tall and strong.',
             'The river flowed peacefully.',
             'The bird flew gracefully through the air.',
             'The fish swam quickly in the water.',
             'The dog barked loudly at the mailman.',
             'The car drove smoothly down the road.',
             'The plane flew high above the clouds.',
             'The moon shone brightly in the sky.',
             'The stars twinkled in the night sky.',
             'The flowers swayed in the breeze.']


In [None]:
for s in sentences:
  print(s)
  pred=predictor.predict(s)
  desc=[x['description'] for x in pred['verbs']]
  #print(verbs)
  print(desc)
  print("\n\n")

The snow fell gently from the sky.
['[ARG1: The snow] [V: fell] [ARGM-MNR: gently] [ARG3: from the sky] .']



She laughed uncontrollably at the joke.
['[ARG0: She] [V: laughed] [ARGM-MNR: uncontrollably] [ARG2: at the joke] .']



The sun set over the horizon.
['[ARG1: The sun] [V: set] [ARG2: over the horizon] .']



The children played happily in the park.
['[ARG0: The children] [V: played] [ARGM-MNR: happily] [ARGM-LOC: in the park] .']



The fire burned brightly in the fireplace.
['[ARG1: The fire] [V: burned] [ARGM-MNR: brightly] [ARGM-LOC: in the fireplace] .']



The wind howled outside the window.
['[ARG0: The wind] [V: howled] [ARGM-LOC: outside the window] .']



The cake baked for an hour.
['[ARG1: The cake] [V: baked] [ARGM-TMP: for an hour] .']



The door opened slowly.
['[ARG1: The door] [V: opened] [ARGM-MNR: slowly] .']



The crowd cheered loudly at the concert.
['[ARG0: The crowd] [V: cheered] [ARGM-MNR: loudly] [ARG1: at the concert] .']



The athlete ran around 

In [None]:
sent="Many people keep falling for this CON - GAME that lower taxes on the rich benefits everyone . "
predictor.predict(sent)

{'verbs': [{'verb': 'keep',
   'description': '[ARG0: Many people] [V: keep] [ARG1: falling for this CON - GAME that lower taxes on the rich benefits everyone] .',
   'tags': ['B-ARG0',
    'I-ARG0',
    'B-V',
    'B-ARG1',
    'I-ARG1',
    'I-ARG1',
    'I-ARG1',
    'I-ARG1',
    'I-ARG1',
    'I-ARG1',
    'I-ARG1',
    'I-ARG1',
    'I-ARG1',
    'I-ARG1',
    'I-ARG1',
    'I-ARG1',
    'I-ARG1',
    'O']},
  {'verb': 'falling',
   'description': '[ARG1: Many people] keep [V: falling] [ARGM-PRP: for this CON - GAME that lower taxes on the rich benefits everyone] .',
   'tags': ['B-ARG1',
    'I-ARG1',
    'O',
    'B-V',
    'B-ARGM-PRP',
    'I-ARGM-PRP',
    'I-ARGM-PRP',
    'I-ARGM-PRP',
    'I-ARGM-PRP',
    'I-ARGM-PRP',
    'I-ARGM-PRP',
    'I-ARGM-PRP',
    'I-ARGM-PRP',
    'I-ARGM-PRP',
    'I-ARGM-PRP',
    'I-ARGM-PRP',
    'I-ARGM-PRP',
    'O']}],
 'words': ['Many',
  'people',
  'keep',
  'falling',
  'for',
  'this',
  'CON',
  '-',
  'GAME',
  'that',
  'lower'

In [None]:
predictor.predict("He walked the dog to the park and back.")

{'verbs': [{'verb': 'walked',
   'description': '[ARG0: He] [V: walked] [ARG1: the dog] [ARGM-GOL: to the park and back] .',
   'tags': ['B-ARG0',
    'B-V',
    'B-ARG1',
    'I-ARG1',
    'B-ARGM-GOL',
    'I-ARGM-GOL',
    'I-ARGM-GOL',
    'I-ARGM-GOL',
    'I-ARGM-GOL',
    'O']}],
 'words': ['He',
  'walked',
  'the',
  'dog',
  'to',
  'the',
  'park',
  'and',
  'back',
  '.']}

In [None]:
predictor.predict("He walked with the dog to the park and back.")

{'verbs': [{'verb': 'walked',
   'description': '[ARG0: He] [V: walked] [ARGM-COM: with the dog] [ARGM-GOL: to the park and back] .',
   'tags': ['B-ARG0',
    'B-V',
    'B-ARGM-COM',
    'I-ARGM-COM',
    'I-ARGM-COM',
    'B-ARGM-GOL',
    'I-ARGM-GOL',
    'I-ARGM-GOL',
    'I-ARGM-GOL',
    'I-ARGM-GOL',
    'O']}],
 'words': ['He',
  'walked',
  'with',
  'the',
  'dog',
  'to',
  'the',
  'park',
  'and',
  'back',
  '.']}

In [None]:
predictor.predict("I see a man with a telescope under his arm")

{'verbs': [{'verb': 'see',
   'description': '[ARG0: I] [V: see] [ARG1: a man with a telescope under his arm]',
   'tags': ['B-ARG0',
    'B-V',
    'B-ARG1',
    'I-ARG1',
    'I-ARG1',
    'I-ARG1',
    'I-ARG1',
    'I-ARG1',
    'I-ARG1',
    'I-ARG1']}],
 'words': ['I',
  'see',
  'a',
  'man',
  'with',
  'a',
  'telescope',
  'under',
  'his',
  'arm']}

In [None]:
predictor.predict("The plane took a large turn")

{'verbs': [{'verb': 'took',
   'description': '[ARG0: The plane] [V: took] [ARG1: a large turn]',
   'tags': ['B-ARG0', 'I-ARG0', 'B-V', 'B-ARG1', 'I-ARG1', 'I-ARG1']}],
 'words': ['The', 'plane', 'took', 'a', 'large', 'turn']}

### Parenthetical Elements 