In [1]:
import torch
import numpy as np 
import pandas as pd
from typing import Dict
import torch
from datasets import load_dataset, Dataset
from transformers import DataCollatorWithPadding
from datasets import load_metric,load_from_disk
from transformers import (
    AutoConfig,
    AutoModelForSequenceClassification,
    AutoTokenizer,
    EvalPrediction,
    Trainer,
    TrainingArguments,
    set_seed,
)

from tqdm import tqdm
from datetime import datetime
import re
import random

import nltk
from nltk.corpus import wordnet as wn
nltk.download('wordnet')

import spacy
nlp = spacy.load("en_core_web_sm")

[nltk_data] Downloading package wordnet to /home/bitfra/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:
## MODEL PARAMETERS
# we will use with Distil-BERT
language_model_name = "distilbert-base-uncased"

### Training Argurments

# this GPU should be enough for this task to handle 32 samples per batch
batch_size = 32

# optim
learning_rate = 1e-4
weight_decay = 0.001 # we could use e.g. 0.01 in case of very low and very high amount of data for regularization

# training
epochs = 1
device = "cuda" if torch.cuda.is_available() else "cpu"
set_seed(42)
# load our dataset
adversarial_set = load_dataset("iperbole/adversarial_fever_nli")
nli_dataset = load_dataset("tommasobonomo/sem_augmented_fever_nli",trust_remote_code=True)

# Metric def and save the dataset

In [None]:
#SAVE DATASET

# nli_dataset.save_to_disk('nli_data')

# copy_dataset = load_from_disk('nli_data')

# print(copy_dataset)

In [4]:
# METRIC DEFINITION

# Metrics
def compute_metrics(eval_pred):
   load_accuracy = load_metric("accuracy")
   load_f1 = load_metric("f1")

   logits, labels = eval_pred
   predictions = np.argmax(logits, axis=-1)
   accuracy = load_accuracy.compute(predictions=predictions, references=labels)["accuracy"]
   f1 = load_f1.compute(predictions=predictions, references=labels,accuracy="weigthed")["f1"]
   return {"accuracy": accuracy, "f1": f1}


# Utilities

In [6]:

def get_synonyms(word, pos):
    synonyms = set()
    for syn in wn.synsets(word, pos=pos):
        for lemma in syn.lemmas():
            synonyms.add(lemma.name())
    return list(synonyms)



def get_synset_by_offset(offset):
    # Estrai l'offset numerico dalla stringa e convertilo in un intero
    offset_num = int(''.join(filter(str.isdigit, offset)))
    pos = offset[-1]  # Prendi l'ultima lettera come parte del discorso
    return wn.synset_from_pos_and_offset(pos, offset_num)



def get_related_words(synset):
    # Ottenere sinonimi
    synonyms = synset.lemma_names()

    # Ottenere iperonimi (categorie generali)
    hypernyms = synset.hypernyms()
    hypernym_words = set()
    for hypernym in hypernyms:
        hypernym_words.update(hypernym.lemma_names())

    # Ottenere iponimi (sottocategorie)
    hyponyms = synset.hyponyms()
    hyponym_words = set()
    for hyponym in hyponyms:
        hyponym_words.update(hyponym.lemma_names())


    # Ottenere antonimi
    antonym_words = set()
    for lemma in synset.lemmas():
        for antonym in lemma.antonyms():
            antonym_words.add(antonym.name())

    return {
        'synonyms': list(synonyms),
        'hypernyms': list(hypernym_words),
        'hyponyms': list(hyponym_words),
        'antonyms': list(antonym_words)
    }



# extract the text from span information
def get_text_from_span(tokens, span):
    # return ' '.join(tokens[i]['rawText'] for i in range(span[0], span[1] + 1 ))
    result = ' '
    i = span[0]
    while span[0] <= i <= span[1]:
        try:

            result.join(tokens[i]['rawText'])
            i+=1
        except:
            print(f"tokens -> {tokens}")
            print(f"i -> {i}")
    return result


    # return ' '.join(tokens[i]['rawText'] while span[0] <= i <= span[1]: i+1)

# Idea0 (isnot)

In [12]:
# filtered_dataset = nli_dataset['train'].filter(lambda example: ' is not ' in example['premise'].lower() or ' is not ' in example['hypothesis'].lower())
filtered_dataset_isnot = nli_dataset['train'].filter(lambda example: ' is not ' in example['hypothesis'].lower())




# Define the set of words to check for
# keywords = {'marry', 'remarries', 'wed', 'marrying', 'marries', 'married'}

# Filter the dataset
# filtered_dataset_married = nli_dataset['train'].filter(lambda example: any(word in example['hypothesis'] for word in keywords))


In [15]:
print(len(filtered_dataset_married))

330


In [6]:
# IDEA change is not when is a contradiction 
def opposite_isnot(filtered_dataset):

    new_samples = []


    for elem in filtered_dataset:

        new_sample = {}
        text_prem = elem['premise']
        text_hp = elem['hypothesis']
        label = elem['label']


        if label == 'CONTRADICTION':
            new_sample['premise'] = text_prem
            new_sample['hypothesis'] = text_hp.replace(" is not ", " is ")
            new_sample['label'] = 'ENTAILMENT'
            new_samples.append(new_sample)

    return new_samples
        

# print(len(filtered_dataset_isnot))
new_samples = opposite_isnot(filtered_dataset_isnot)





NameError: name 'filtered_dataset_isnot' is not defined

In [14]:
# TO CONCATENATE NEW SAMPLES
new_samples = {
    'premise': ["New premise 1", "New premise 2"],
    'hypothesis': ["New hypothesis 1", "New hypothesis 2"],
    'label': [0, 1],  # Example labels: 0 for ENTAILMENT, 1 for CONTRADICTION, etc.
    # Add other fields if needed, e.g., 'srl' and 'wsd'
}

# Convert new samples to a Dataset
new_dataset = Dataset.from_dict(new_samples)


# Add new samples to the existing training dataset
prova = concatenate_datasets([filtered_dataset_isnot, new_dataset])
print(prova[-1])


ValueError: The features can't be aligned because the key label of features {'premise': Value(dtype='string', id=None), 'hypothesis': Value(dtype='string', id=None), 'label': Value(dtype='int64', id=None)} has unexpected type - Value(dtype='int64', id=None) (expected either Value(dtype='string', id=None) or Value("null").

In [7]:
print(len(new_samples))

782


#  Extract info from dataset

In [48]:
def extract_info(filtered_dataset):

    most_frequent_verbs = {}
    most_frequent_pbframe = {}
    nb_data = set()
    new_samples = []
    info_elements = []
    possible_locations = []
    marriage_role = set()
    marriages_info = []
    verbs= set()
    pattern_time = re.compile(r'(?:(?P<day>\d{1,2})(?:st|nd|rd|th)?[ ,]*)?(?:(?P<month>[A-Za-z]+)[ ,]*)?(?:(?P<year>\d{4}))?')
    for index,elem in tqdm(enumerate(filtered_dataset)):

        new_sample = {}

        info_elem = {} # a dictionary in which I store the information of who is born
        marriage_info = {}

        premise = elem['premise']

        id = elem['id']
        info_elem['id'] = id

        text_pre = elem['premise']
        text_hp = elem['hypothesis']

        wsd_pre = elem['wsd']['premise']
        wsd_hp = elem['wsd']['hypothesis']
        srl_pre = elem['srl']['premise']
        srl_hp = elem['srl']['hypothesis']
        subject_added = False
        marriage_added = False

        # print()
        # print(f"srl {srl['annotations']}")
        # print(f"srl {srl['tokens']}")


        # matches = re.findall(pattern_birth, premise)


        tokens_pre = srl_pre['tokens']
        tokens_hp= srl_hp['tokens']
        annotations_pre = srl_pre['annotations']
        annotations_hp = srl_hp['annotations']


        # find the born event in srl annotations


        for annotation_pre in annotations_pre:

            #reference verb of annotation
            token_index = annotation_pre['tokenIndex']
            verb = tokens_pre[token_index]["rawText"]
            # print(f"verb = {verb}")
            verbatlasAnn =  annotation_pre['verbatlas']
            propbankAnn = annotation_pre['englishPropbank']
            # print(verbatlasAnn)
            # take the roles 
            roles = verbatlasAnn['roles']
            frame_name = verbatlasAnn['frameName']
            frame_name_pb = propbankAnn['frameName']


            if (frame_name_pb in most_frequent_pbframe):
                most_frequent_pbframe[frame_name_pb] +=1
            else:
                most_frequent_pbframe[frame_name_pb] = 0


            if (verb in most_frequent_verbs):
                most_frequent_verbs[verb] +=1
            else:
                most_frequent_verbs[verb] = 1

           

    return most_frequent_verbs,most_frequent_pbframe


most_frequent_verbs,most_frequent_pbframe = extract_info(nli_dataset['train'])

51086it [00:48, 1042.59it/s]


In [50]:
# Ordina il dizionario in base ai valori in ordine decrescente
most_frequent_pbframe_sorted = dict(sorted(most_frequent_pbframe.items(), key=lambda item: item[1], reverse=True))

# Stampa il dizionario ordinato
for chiave, valore in most_frequent_pbframe_sorted.items():
    print(f"{chiave}: {valore}")

be.01: 54298
bear.02: 10215
star.01: 7927
know.01: 7891
be.03: 7287
direct.01: 6440
include.01: 6279
win.01: 6193
have.01: 5611
write.01: 5550
release.01: 5171
become.01: 3568
receive.01: 3395
produce.01: 3296
appear.01: 2953
have.03: 2527
begin.01: 2455
base.02: 2296
play.02: 2014
earn.01: 1864
follow.01: 1740
nominate.01: 1719
create.01: 1672
serve.01: 1596
feature.01: 1525
sell.01: 1380
premiere.01: 1326
found.01: 1198
play.01: 1197
work.01: 1141
portray.01: 1138
publish.01: 1069
support.01: 1067
make.02: 1063
rank.01: 1047
lead.01: 1020
refer.01: 1002
name.01: 960
form.01: 944
call.01: 942
locate.01: 868
record.01: 866
use.01: 859
animate.01: 802
take.01: 798
develop.02: 789
consider.01: 767
continue.01: 764
reach.01: 760
gain.02: 753
perform.01: 750
base.01: 749
air.01: 738
act.01: 710
establish.01: 680
achieve.01: 679
rise.01: 669
go.15: 667
consist.01: 651
give.01: 632
find.01: 628
adapt.01: 582
sign.02: 576
die.01: 565
start.01: 560
hold.01: 554
debut.01: 544
speak.01: 522
retu

# Scheme to follow to extract info from srl

In [24]:
# print(type(nli_dataset["train"]))
# print(type(nli_dataset["train"][0]))
# print(nli_dataset["train"])

#TODO UNDERSTAND BEFORE GO AHEAD!
#THIS IS THE SCHEME THAT YOU HAVE TO FOLLOW TO IMPLEMENT THE VARIOUS IDEA THAT YOU HAVE.    

verbatlasFrame_pre = set()
probankFrame_pre = set()
verbatlasRoles_pre = set()
probankRoles_pre = set()


verbatlasFrame_hp = set()
probankFrame_hp = set()
verbatlasRoles_hp = set()
probankRoles_hp = set()


verbatlasLocations_pre = set()
probankLocations_pre = set()
verbatlasLocations_hp = set()
probankLocations_hp = set()

count = 0


# FOR EACH ELEMENT
for elem in nli_dataset["train"]:
    srl_hp = elem['srl']['hypothesis']
    srl_pre = elem['srl']['premise']

    tokens_pre = srl_pre['tokens']
    annotations_pre = srl_pre['annotations']    
    
    tokens_hp = srl_hp['tokens']
    annotations_hp = srl_hp['annotations']


    ## If you want to find a particular element
    # if elem['id'] == '15454': 
    #     count = 1
    #     print("trovato")
    #     trovato_tokens = tokens
    #     trovato_annotations = annotations

#EXTRACT INFO FROM PREMISE
    for annotation in annotations_hp:
        token_index = annotation['tokenIndex']
        verb = tokens_hp[token_index]['rawText']
        print(f"Verb: {verb}\n")

        print("VerbAtlas:")
        verbatlas = annotation['verbatlas']
        frame_name = verbatlas['frameName']
        verbatlasFrame_hp.add(frame_name)
        print(f" Frame: {frame_name}")
        for role_info in verbatlas['roles']:
            role = role_info['role']
            verbatlasRoles_hp.add(role)
            span = role_info['span']
            text = get_text_from_span(tokens_hp, span)
            if (role == 'Location'):
                verbatlasLocations_hp.add(text)
            print(f"  {role}: {text}")
        print()

        print("PropBank:")
        propbank = annotation['englishPropbank']
        frame_name = propbank['frameName']
        probankFrame_hp.add(frame_name)
        # print(f" Frame: {frame_name}")
        for role_info in propbank['roles']:
            role = role_info['role']
            probankRoles_hp.add(role)
            span = role_info['span']
            text = get_text_from_span(tokens_hp, span)
            if (role == 'ARGM-LOC'):
                probankLocations_hp.add(text)
            print(f"  {role}: {text}")
        print()


## EXTRACT INFO FROM HYPOTHEIS
    for annotation in annotations_hp:
        token_index = annotation['tokenIndex']
        verb = tokens_hp[token_index]['rawText']
        print(f"Verb: {verb}\n")

        print("VerbAtlas:")
        verbatlas = annotation['verbatlas']
        frame_name = verbatlas['frameName']
        verbatlasFrame_hp.add(frame_name)
        print(f" Frame: {frame_name}")
        for role_info in verbatlas['roles']:
            role = role_info['role']
            verbatlasRoles_hp.add(role)
            span = role_info['span']
            text = get_text_from_span(tokens_hp, span)
            if (role == 'Location'):
                verbatlasLocations_hp.add(text)
            print(f"  {role}: {text}")
        print()

        print("PropBank:")
        propbank = annotation['englishPropbank']
        frame_name = propbank['frameName']
        probankFrame_hp.add(frame_name)
        # print(f" Frame: {frame_name}")
        for role_info in propbank['roles']:
            role = role_info['role']
            probankRoles_hp.add(role)
            span = role_info['span']
            text = get_text_from_span(tokens_hp, span)
            if (role == 'ARGM-LOC'):
                probankLocations_hp.add(text)
            print(f"  {role}: {text}")
        print()



    
    # if count==1:break
    # else:count+=1

    # break







Verb: is

VerbAtlas:
 Frame: COPULA
  Theme: Roman Atwood is
  Attribute: a content creator .

PropBank:
  ARG1: Roman Atwood is
  ARG2: a content creator .

Verb: is

VerbAtlas:
 Frame: COPULA
  Theme: Roman Atwood is
  Attribute: a content creator .

PropBank:
  ARG1: Roman Atwood is
  ARG2: a content creator .

Verb: play

VerbAtlas:
 Frame: PLAY_SPORT/GAME
  Agent: The Boston Celtics play
  Theme: their home games at
  Location: at TD Garden .

PropBank:
  ARG0: The Boston Celtics play
  ARG1: their home games at
  ARGM-LOC: at TD Garden .

Verb: play

VerbAtlas:
 Frame: PLAY_SPORT/GAME
  Agent: The Boston Celtics play
  Theme: their home games at
  Location: at TD Garden .

PropBank:
  ARG0: The Boston Celtics play
  ARG1: their home games at
  ARGM-LOC: at TD Garden .

Verb: is

VerbAtlas:
 Frame: EXIST_LIVE
  Theme: a movie called The Hunger Games .

PropBank:
  ARG1: a movie called The Hunger Games .

Verb: called

VerbAtlas:
 Frame: NAME
  Theme: a movie called
  Attribute: Th

IndexError: list index out of range

In [27]:
# print(type(nli_dataset["train"]))
# print(type(nli_dataset["train"][0]))
# print(nli_dataset["train"])

#TODO UNDERSTAND BEFORE GO AHEAD!

verbatlasFrame = set()
probankFrame = set()
verbatlasRoles = set()
probankRoles = set()



for elem in nli_dataset["train"]:
    srl = elem['srl']['premise']

    tokens = srl['tokens']
    annotations = srl['annotations']


    for annotation in annotations:
        token_index = annotation['tokenIndex']
        verb = tokens[token_index]['rawText']
        # print(f"Verb: {verb}\n")

        # print("VerbAtlas:")
        verbatlas = annotation['verbatlas']
        frame_name = verbatlas['frameName']
        verbatlasFrame.add(frame_name)
        # print(f" Frame: {frame_name}")
        for role_info in verbatlas['roles']:
            role = role_info['role']
            verbatlasRoles.add(role)
            span = role_info['span']
            text = get_text_from_span(tokens, span)
            # print(f"  {role}: {text}")
        # print()

        # print("PropBank:")
        propbank = annotation['englishPropbank']
        frame_name = propbank['frameName']
        probankFrame.add(frame_name)
        # print(f" Frame: {frame_name}")
        for role_info in propbank['roles']:
            role = role_info['role']
            probankRoles.add(role)
            span = role_info['span']
            text = get_text_from_span(tokens, span)
            # print(f"  {role}: {text}")
        # print()


    # if count==5:break
    # else:count+=1

    # break

In [None]:
print("va locations: ",verbatlasLocations)
print("va locations: ",len(verbatlasLocations))
print("pb locations: ",probankLocations)
print("pb locations: ",len(probankLocations))

va locations:  {'in Minneapolis', 'in the Bundesliga , the top tier of the German football league system', 'in Tanzania', 'in Evanston , Illinois', '` Radioactive', 'in the spy - action television show Alias ( 2001 -- 2006 )', 'both indoors and outdoors', 'in the 2006 film The Pursuit of Happyness', 'in Italy', 'in the series Firefly and the follow - up film Serenity', 'outside the United States', 'in circuses', 'in the top five', 'in the battle', 'in the 58th Filmfare Awards South', 'at the southernmost tip of the Balkan Peninsula', 'in American comic books published by Marvel Comics , mostly in association with the X - Men', 'in the Lower Mainland region of British Columbia', 'primarily in California', 'in Continental Europe', 'in the fourth installment of the The Karate Kid franchise , The Next Karate Kid', 'south of the Sahara', 'in ROH', "in Brian De Palma ' s cult thriller Sisters ( 1973 ) ; in the slasher film Black Christmas ( 1974 ) ; and the drama The Great Waldo Pepper ( 197

In [None]:
# Sample set of strings


# Filter to include only words that start with a capital letter
filtered_words_set_va = {word for s in verbatlasLocations for word in s.split() if word[0].isupper()}
print(filtered_words_set_va)
print(len(filtered_words_set_va))


{'Class', 'Regum', 'Vienna', 'Boss', 'Prat', 'Forever', 'Potomac', 'Spencer', 'Erie', 'Confederations', 'Plutarch', 'Olivier', 'Peru', 'Hyattsville', 'Staple', 'Brazilian', 'Mildred', 'Galaxy', 'Clara', 'Everwood', 'Chalk', 'Open', 'Parlophone', 'Adriatic', 'Emily', 'Skarsgård', 'Sites', 'Bactrian', 'Virgil', 'Charing', 'PCs', 'Prix', 'Ivy', 'Avengers', 'Mercury', 'Rye', 'Kentwood', 'Vedanta', 'Lambeth', 'Base', 'Beach', 'Ayutla', 'MCs', 'Deauville', 'Groton', 'Subtle', 'Euclidean', 'Rose', 'Martina', 'Glenwood', 'Fallen', 'Vinci', 'Driver', 'Yugoslavia', 'Another', 'Knights', 'Stockholm', 'Dexter', 'Sunshine', 'Winds', 'Chaplin', 'Eve', 'Luck', 'Luton', 'Semitic', 'News', 'Wisconsin', 'Avenger', 'RCD-25', 'Action', 'Desperate', 'Amendment', 'Ali', 'Man', 'Laferrière', 'Blogette', 'Others', 'Bandai', 'Festivals', 'Memphis', 'Sparta', 'Iowa', 'Serbia', 'PAX', 'Dileep', 'Pakistani', 'Journals', 'Saddam', 'Dry', 'Punjabi', 'Vevey', 'GHV2', 'Trophy', 'Tales', 'Plane', 'TD', 'O', 'Kallerg

In [None]:
filtered_words_set_pb = {word for s in probankLocations for word in s.split() if word[0].isupper()}
print(filtered_words_set_pb)
print(len(filtered_words_set_pb))

{'Class', 'Regum', 'Vienna', 'Boss', 'Plutarch', 'Forever', 'Potomac', 'Spencer', 'Erie', 'Confederations', 'Prat', 'Olivier', 'Peru', 'Hyattsville', 'Brazilian', 'Mildred', 'Galaxy', 'Clara', 'Everwood', 'Chalk', 'Open', 'Parlophone', 'Emily', 'Skarsgård', 'Sites', 'Bactrian', 'Virgil', 'Charing', 'PCs', 'Prix', 'Ivy', 'Avengers', 'Mercury', 'Rye', 'Kentwood', 'Vedanta', 'Base', 'Beach', 'Ayutla', 'MCs', 'Deauville', 'Groton', 'Subtle', 'Euclidean', 'Rose', 'Martina', 'Glenwood', 'Fallen', 'Vinci', 'Driver', 'Yugoslavia', 'Another', 'Knights', 'Stockholm', 'Dexter', 'Sunshine', 'Winds', 'Chaplin', 'Eve', 'Luck', 'Luton', 'Semitic', 'News', 'Wisconsin', 'Avenger', 'RCD-25', 'Action', 'Desperate', 'Amendment', 'Ali', 'Man', 'Laferrière', 'Blogette', 'Bandai', 'Festivals', 'Memphis', 'Sparta', 'Iowa', 'Serbia', 'PAX', 'Dileep', 'Pakistani', 'Saddam', 'Dry', 'Punjabi', 'Vevey', 'GHV2', 'Trophy', 'Tales', 'Plane', 'TD', 'O', 'Pain', 'Antonio', 'Berlin', 'Extended', 'Disney', 'Mercy', 'Vi

In [None]:
print(filtered_words_set_va.difference(filtered_words_set_pb))

{'Tribunal', 'Radioactive', 'Ἀθηναίη', 'Plateau', 'Rosemont', 'Toledo', 'LL', 'Narnia', 'Staple', 'Taking', 'Brown', 'Bergens', 'Carey', 'Legislature', 'Stripped', 'Mixer', 'Elmet', 'Soprano', 'Baltic', 'Antilia', 'Moose', 'Timor', 'Adriatic', 'Paquin', 'Patel', 'Olympiada', 'Wentworth', 'Matt', 'Fayetteville', 'Runcorn', 'Hun', 'Drift', 'Salado', 'Library', 'Jennifer', 'Locus', 'Eva', 'Junipero', 'Files', 'Lambeth', 'Senators', 'Milky', 'Township', 'Leste', 'Caspian', 'Formosa', 'GD', 'Floyd', 'Naismith', 'Poor', 'Investigation', 'Ural', 'Neptune', 'Zooey', 'Beckinsale', 'Deschanel', 'Presley', 'Pont', 'Angel', 'Wisteria', 'Marmara', 'Norwich', 'Detention', 'Tea', 'Dharma', 'Others', 'Caleta', 'TOP', 'Symoné', 'Leagues', 'Duchovny', 'Endangered', 'Rondell', 'Journals', 'Vice', 'Imperial', 'Aslan', 'SmackDown', 'Keaton', 'Covenant', 'Qatar', 'Sahara', 'Baxter', 'Palais', 'Clares', 'Creed', 'Chatsworth', 'Beetlejuice', 'Gillian', 'Triangle', 'Marple', 'Anthrax', 'Loess', 'Selangor', 

In [None]:
# IDEA APPLICATA AD UN PARTICOLARE SAMPLE, DA CANCELLARE!


tokens = trovato_tokens
annotations = trovato_annotations

print("tokens: ",tokens)
print("annotations: ",annotations)



# the core of the algorithm
for annotation in annotations:
    token_index = annotation['tokenIndex']
    verb = tokens[token_index]['rawText']
    print(f"Verbo: {verb}\n")

    print("VerbAtlas:")
    verbatlas = annotation['verbatlas']
    frame_name = verbatlas['frameName']
    print(f" Frame: {frame_name}")
    for role_info in verbatlas['roles']:
        role = role_info['role']
        span = role_info['span']
        text = get_text_from_span(tokens, span)
        print(f"  {role}: {text}")
    print()

    print("PropBank:")
    propbank = annotation['englishPropbank']
    frame_name = propbank['frameName']
    print(f" Frame: {frame_name}")
    for role_info in propbank['roles']:
        role = role_info['role']
        span = role_info['span']
        text = get_text_from_span(tokens, span)
        print(f"  {role}: {text}")
    print()


tokens:  [{'index': 0, 'rawText': 'Tennis'}, {'index': 1, 'rawText': 'is'}, {'index': 2, 'rawText': 'a'}, {'index': 3, 'rawText': 'racket'}, {'index': 4, 'rawText': 'sport'}, {'index': 5, 'rawText': 'that'}, {'index': 6, 'rawText': 'can'}, {'index': 7, 'rawText': 'be'}, {'index': 8, 'rawText': 'played'}, {'index': 9, 'rawText': 'individually'}, {'index': 10, 'rawText': 'against'}, {'index': 11, 'rawText': 'a'}, {'index': 12, 'rawText': 'single'}, {'index': 13, 'rawText': 'opponent'}, {'index': 14, 'rawText': '('}, {'index': 15, 'rawText': 'singles'}, {'index': 16, 'rawText': ')'}, {'index': 17, 'rawText': 'or'}, {'index': 18, 'rawText': 'between'}, {'index': 19, 'rawText': 'two'}, {'index': 20, 'rawText': 'teams'}, {'index': 21, 'rawText': 'of'}, {'index': 22, 'rawText': 'two'}, {'index': 23, 'rawText': 'players'}, {'index': 24, 'rawText': 'each'}, {'index': 25, 'rawText': '('}, {'index': 26, 'rawText': 'doubles'}, {'index': 27, 'rawText': ')'}, {'index': 28, 'rawText': '.'}, {'index':

In [None]:
# print(f"verbatlasFrame {verbatlasFrame}")
# print(f"probankFrame {probankFrame}")
# print(f"verbatlasRoles {verbatlasRoles}")
# print(f"probankRoles {probankRoles}")


# verbatlasFrame_complete = verbatlasFrame
# probankFrame_complete = probankFrame
# verbatlasRoles_complete = verbatlasRoles
# probankRoles_complete = probankRoles

print(f"verbatlasFrame {verbatlasFrame_complete}")
print(f"probankFrame {probankFrame_complete}")
print(f"verbatlasRoles {verbatlasRoles_complete}")
print(f"probankRoles {probankRoles_complete}")



# print(f"verbatlasFrame {len(verbatlasFrame)}")
# print(f"probankFrame {len(probankFrame)}")
# print(f"verbatlasRoles {len(verbatlasRoles)}")
# print(f"probankRoles {len(probankRoles)}")




verbatlasFrame {'DISBAND_BREAK-UP', 'INSERT', 'ACCUSE', 'DISLIKE', 'DISCARD', 'DRIVE-BACK', 'PROMOTE', 'KNOW', 'IMAGINE', 'FRUSTRATE_DISAPPOINT', 'MEASURE_EVALUATE', 'ENTER', 'JUSTIFY_EXCUSE', 'FIGHT', 'HEAT', 'BE-LOCATED_BASE', 'CONTRACT-AN-ILLNESS_INFECT', 'LAND_GET-OFF', 'RECORD', 'COMBINE_MIX_UNITE', 'EXPLODE', 'STOP', 'CRITICIZE', 'SETTLE_CONCILIATE', 'CORRELATE', 'DRESS_WEAR', 'CAUSE-SMT', 'NEGOTIATE', 'LURE_ENTICE', 'REFUSE', 'ACHIEVE', 'ODORIZE', 'EXPLAIN', 'BORDER', 'RISK', 'MOVE-ONESELF', 'DIET', 'CHANGE-HANDS', 'EXIST-WITH-FEATURE', 'BEND', 'RESTRAIN', 'MANAGE', 'PUBLISH', 'DANCE', 'HARMONIZE', 'RECOGNIZE_ADMIT_IDENTIFY', 'SHOW', 'HANG', 'INCREASE_ENLARGE_MULTIPLY', 'SING', 'OVERLAP', 'LIGHTEN', 'BREAK_DETERIORATE', 'AFFECT', 'BUY', 'INFLUENCE', 'JOKE', 'CHARGE', 'BRING', 'TOLERATE', 'ASCRIBE', 'LEND', 'STABILIZE_SUPPORT-PHYSICALLY', 'COST', 'DEFEAT', 'FEEL', 'AUXILIARY', 'GROUP', 'OVERCOME_SURPASS', 'REMEMBER', 'FILL', 'PREPARE', 'CHANGE-APPEARANCE/STATE', 'PERCEIVE', 'OBLI

# Idea: Leveraging the information about birthday

In [18]:
# IDEA: leverage information about bithday
from tqdm import tqdm
from datetime import datetime
import sys




months_list = [
    "January", "February", "March", "April", "May", "June",
    "July", "August", "September", "October", "November", "December"
]





def clean_name(name):
    # Remove trailing characters like '(' and strip any leading/trailing whitespace
    cleaned_name = re.sub(r'[^A-Za-z\s]', '', name).strip()
    # Split the name into components and check if all components start with a capital letter
    name_components = cleaned_name.split()
    if all(component[0].isupper() for component in name_components):
        return cleaned_name
    return False

def strip_name(name):
    pronoms = {'he','she'}
    final_name = ''
    cleaned_name = re.sub(r'[^A-Za-z\s]', '', name).strip()
    name_components = cleaned_name.split()
    for index,component in enumerate(name_components):
        if component.lower() in pronoms: return False
        if component[0].isupper(): final_name+=component.strip()
        if index != len(name_components)-1: final_name+=' '
    return final_name

def clean_time(date_str):
        # Remove parentheses and "born"
        date_str = date_str.strip('() ').replace('born ', '')
        try:
            # Parse the date
            date_obj = datetime.strptime(date_str, '%B, %d , %Y')
            # Format the date in the desired format
            return date_obj.strftime('%B, %d, %Y')
        except ValueError as e:
            print(f"Error parsing date: {date_str}")
            return False  # Return original date string if parsing fails
        
def extract_date_components(date_str,pattern,months_list):
    matches = pattern.findall(date_str)
    day, month, year = None, None, None
    for match in matches:
        if match[0]:
            day = match[0]
      
        if match[1]:
            month = match[1]
        if match[2]:
            year = match[2]
        
        # more check to avoid error!
        if day and not (1 <= int(day) <= 31):
            day = None
        if month and month not in months_list:
            month = None
        
    return day, month, year

def check_location(text):
    doc = nlp(text)
    locations = []
    
    # iterate over the entities 
    for ent in doc.ents:
        if ent.label_ == "GPE":  # GPE (Geopolitical Entity) is the label used for locations
            locations.append(ent.text)    

    return locations




#AUXILIAR METHOD FOR EACH CASE
def be():
    pass




## TODO: manca solo da creare nuovi sample
def create_birthday_samples(filtered_dataset,months_list):

    most_frequent_verbs = {}
    is_data = set()
    nb_data = set()
    new_samples = []
    info_elements = []
    possible_locations = []
    marriage_role = set()
    marriages_info = []
    verbs= set()
    pattern_time = re.compile(r'(?:(?P<day>\d{1,2})(?:st|nd|rd|th)?[ ,]*)?(?:(?P<month>[A-Za-z]+)[ ,]*)?(?:(?P<year>\d{4}))?')
    for index,elem in tqdm(enumerate(filtered_dataset)):

        new_sample = {}

        info_elem = {} # a dictionary in which I store the information of who is born
        marriage_info = {}

        premise = elem['premise']

        id = elem['id']
        info_elem['id'] = id

        text_pre = elem['premise']
        text_hp = elem['hypothesis']

        wsd_pre = elem['wsd']['premise']
        wsd_hp = elem['wsd']['hypothesis']
        srl_pre = elem['srl']['premise']
        srl_hp = elem['srl']['hypothesis']
        subject_added = False
        marriage_added = False

        # print()
        # print(f"srl {srl['annotations']}")
        # print(f"srl {srl['tokens']}")


        # matches = re.findall(pattern_birth, premise)


        tokens_pre = srl_pre['tokens']
        tokens_hp= srl_hp['tokens']
        annotations_pre = srl_pre['annotations']
        annotations_hp = srl_hp['annotations']


        # find the born event in srl annotations


        for annotation_pre in annotations_pre:

            #reference verb of annotation
            token_index = annotation_pre['tokenIndex']
            verb = tokens_pre[token_index]["rawText"]
            # print(f"verb = {verb}")
            verbatlasAnn =  annotation_pre['verbatlas']
            propbankAnn = annotation_pre['englishPropbank']
            # print(verbatlasAnn)
            # take the roles 
            roles = verbatlasAnn['roles']
            frame_name = verbatlasAnn['frameName']
            frame_name_pb = propbankAnn['frameName']


            if (verb in most_frequent_verbs):
                most_frequent_verbs[verb] +=1
            else:
                most_frequent_verbs[verb] = 0

            #Check born event
            if frame_name == 'GIVE-BIRTH':
                for role in roles:
                    span = role['span']
                    if role['role'] == 'Patient':
                        text = get_text_from_span(tokens_pre,span)
                        name = clean_name(text)
                        if (name):
                            subject_added = True
                            info_elem['subject'] = name
                    elif role['role'] == 'Time':
                        text = get_text_from_span(tokens_pre,span)
                        day,month,year = extract_date_components(text,pattern_time,months_list)
                        if (day or month or year):
                            format_time = ''
                            if (day!=None): 
                                format_time+=day+','
                            else: 
                                format_time+='None,'
                            if (month!=None): 
                                format_time+=month+','
                            else: 
                                format_time+='None,'
                            if (year!=None): 
                                format_time+=year+''
                            else: 
                                format_time+='None'
                            info_elem['whenBorn'] = format_time
                    elif role['role'] == 'Location':
                        text = get_text_from_span(tokens_pre,span)
                        locations = check_location(text)
                        if(len(locations)>0):
                            list_loc = ''
                            for i in range(len(locations)):
                                list_loc += locations[i]
                                if (i!= len(locations)-1): list_loc+=','
                            info_elem['whereBorn'] = list_loc
                    elif role['role'] == 'Agent':
                        text = get_text_from_span(tokens_pre,span)
                        name = clean_name(text)
                        if (name):
                            info_elem['parent'] = name
           
            roles_pb = propbankAnn['roles']
            


            if (frame_name_pb == "be.01"):
                for role in roles_pb:
                    role_info = role['role']
                    span = role['span']
                    text = get_text_from_span(tokens_pre, span)
                    is_data.add(role_info)








            # marry.01
            if (frame_name_pb == "sostituisci con marry.01"):
                # verbs.add(verb)
                


                arg1 = None
                arg2 = None
   

                for role in roles_pb:
                    role_info = role['role']
                    span = role['span']
                    text = get_text_from_span(tokens_pre, span)
                    marriage_role.add(role_info)
                    if role_info == 'ARG1':
                        name = strip_name(text)
                        if (name):
                            marriage_info['Arg1'] = name
                            arg1 = name

                    elif role_info == 'ARG2':
                        name = strip_name(text)
                        if (name):
                            marriage_info['Arg2'] = name
                            arg2 = name
                if arg1 and arg2: 
                    marriage_added = True
                    if verb in {'marry', 'remarries', 'wed', 'marrying', 'marries', 'married'}:
                        new_sample['hypothesis'] = arg1 + ' ' + verb + ' ' + arg2 
                        new_sample['premise'] = text_pre
                        new_samples.append(new_sample)




            # I need to extract information about other actions that have been done from patient
            # elif frame_name == '':


        if(subject_added):info_elements.append(info_elem)
        if(marriage_added):marriages_info.append(marriage_info)


        # print("info extracted -> ",info_elem)
        # break

    
    for elem in is_data:
        print("is data:: ",elem)

    return new_samples


info_elements = create_birthday_samples(nli_dataset['train'],months_list)
# print(len(nli_dataset['train']))

2963it [00:03, 952.26it/s] 


IndexError: list index out of range

In [42]:
for elem in info_elements:
    print(elem)

{'hypothesis': 'Earp  married    Urilla Sutherland Earp               ', 'premise': 'Wyatt Earp . In 1870 , Earp married his first wife , Urilla Sutherland Earp , who contracted typhoid fever and died shortly before their first child was to be born .'}
{'hypothesis': ' American          married  Paul McCartney   Beatles', 'premise': 'Linda Louise , Lady McCartney ( née Eastman ; formerly See ; September 24 , 1941 -- April 17 , 1998 ) was an American musician , photographer , animal rights activist , entrepreneur and publisher who was married to Paul McCartney of the Beatles .'}
{'hypothesis': 'Earp  married    Urilla Sutherland Earp               ', 'premise': 'Wyatt Earp . In 1870 , Earp married his first wife , Urilla Sutherland Earp , who contracted typhoid fever and died shortly before their first child was to be born . In 1876 , he followed his brother James to Dodge City , Kansas , where he became an assistant city marshal . Earp was a lifelong gambler and was always looking for

In [63]:
for elem in info_elements:
    print(elem)
    # if (elem.get('whereBorn')):
    #     print("id:  ",elem['id'])
    #     print("Locations: ",elem['whereBorn'])

print(len(info_elements))
print("ok")

{'Arg1': 'Rachel Getting'}
{'Arg1': 'Rachel Getting'}
{'Arg1': 'Rachel Getting'}
{'Time': 'In 1870 ,', 'Arg1': 'Earp married', 'Arg2': 'his first wife , Urilla Sutherland Earp , who contracted typhoid fever and died shortly before their first child was to be born .'}
{'Manner': 'How to', 'Arg2': 'a Millionaire ,'}
{'Manner': 'How to', 'Arg2': 'a Millionaire ,'}
{'Arg1': 'Peggy Sue Got'}
{'Arg1': 'He has', 'Time': 'three times and'}
{'Arg1': 'Man ('}
{'Arg2': 'with Children ('}
{'Arg1': 'an American musician , photographer , animal rights activist , entrepreneur and publisher who', 'R-Arg1': 'who was', 'Arg2': 'to Paul McCartney of the Beatles .'}
{'Time': 'In 1870 ,', 'Arg1': 'Earp married', 'Arg2': 'his first wife , Urilla Sutherland Earp , who contracted typhoid fever and died shortly before their first child was to be born .'}
{'Time': 'from 1998 to 2003 ,', 'Arg1': 'they married'}
{'Arg2': 'with Children ('}
{'Arg1': 'Man ('}
{'Arg1': 'Pompeo married', 'Arg2': 'producer Chris Ivery

In [24]:
# Print the last example to verify the addition
born_dataset = nli_dataset['train'].filter(lambda example: 'born' in example['premise'].lower() or 'born' in example['hypothesis'].lower())
pattern = r'\(\s*born\s+(?:[A-Za-z]+\s+\d{1,2}\s*,?\s*\d{4}|[A-Za-z]+\s+\d{1,2}\s*\d{4})\s*\)'
new_samples = create_birthday_samples(born_dataset,pattern,months_list)

0it [00:00, ?it/s]

match -> ( born December 24 , 1974 )
[{'tokenIndex': 4, 'verbatlas': {'frameName': 'GIVE-BIRTH', 'roles': [{'role': 'Patient', 'score': 1.0, 'span': [0, 3]}, {'role': 'Time', 'score': 1.0, 'span': [5, 9]}]}, 'englishPropbank': {'frameName': 'bear.02', 'roles': [{'role': 'ARG1', 'score': 1.0, 'span': [0, 3]}, {'role': 'ARGM-TMP', 'score': 1.0, 'span': [5, 9]}]}}, {'tokenIndex': 10, 'verbatlas': {'frameName': 'COPULA', 'roles': [{'role': 'Theme', 'score': 1.0, 'span': [0, 10]}, {'role': 'Attribute', 'score': 1.0, 'span': [11, 20]}]}, 'englishPropbank': {'frameName': 'be.01', 'roles': [{'role': 'ARG1', 'score': 1.0, 'span': [0, 10]}, {'role': 'ARG2', 'score': 1.0, 'span': [11, 20]}]}}, {'tokenIndex': 22, 'verbatlas': {'frameName': 'BEGIN', 'roles': [{'role': 'Agent', 'score': 1.0, 'span': [21, 22]}, {'role': 'Theme', 'score': 1.0, 'span': [23, 35]}, {'role': 'Time', 'score': 1.0, 'span': [35, 39]}]}, 'englishPropbank': {'frameName': 'begin.01', 'roles': [{'role': 'ARG0', 'score': 1.0, 'sp




# Model

In [None]:
# MODEL
# Initialize the model

model = AutoModelForSequenceClassification.from_pretrained(language_model_name,
                                                                   ignore_mismatched_sizes=True,
                                                                   output_attentions=False, output_hidden_states=False,
                                                                   num_labels=3) # number of the classes to change to 3

tokenizer = AutoTokenizer.from_pretrained(language_model_name)

# padding with the most long sentence!
data_collator = DataCollatorWithPadding(tokenizer=tokenizer) # avoid to use can reduce the memory on GPU

#examples are batch!
def tokenize_function(examples):
    examples["label"] = [labels_mapping[label] for label in examples["label"]]
    return tokenizer(examples["premise"], examples["hypothesis"],padding = True, truncation=True)




# Tokenize the dataset ...
print("Tokenize the dataset ...")
labels_mapping = {"ENTAILMENT":0, "CONTRADICTION":1, "NEUTRAL":2 }
tokenized_nli_dataset= nli_dataset.map(tokenize_function, batched=True)

print(tokenized_nli_dataset["train"][0])

In [None]:

# DATASET EXPLORATION
print(f"nli_dataset -> {nli_dataset}")

#MODEL TRAINING

training_args = TrainingArguments(
    output_dir="training_dir",                    # output directory [Mandatory]
    num_train_epochs=epochs,                      # total number of training epochs
    per_device_train_batch_size=batch_size,       # batch size per device during training
    warmup_steps=500,                             # number of warmup steps for learning rate scheduler
    weight_decay=weight_decay,                    # strength of weight decay
    save_strategy="no",                           # save the model
    learning_rate=learning_rate,                  # learning rate
    gradient_checkpointing = True                 # to reduce memory usage
    # fp16 = True                                 # to reduce more memory usage
)


trainer = Trainer(
   model=model,
   args=training_args,
   train_dataset=tokenized_nli_dataset["train"],
   eval_dataset=tokenized_nli_dataset["validation"],
   tokenizer=tokenizer,
   data_collator=data_collator,
   compute_metrics=compute_metrics,
)


trainer.train()

In [None]:
trainer.evaluate()