In [None]:
# Show the GPU(s) visible to this runtime.
!nvidia-smi

In [None]:
import os
# Meant to keep the Trainer from reporting to Weights & Biases even though the
# training cells use report_to='all' — TODO confirm for the installed version.
os.environ["WANDB_DISABLED"] = "true"

In [None]:
!pip install transformers
!pip install datasets
!pip install gdown

In [None]:
#download from drive verb_onto_data.zip and unzip

#verbalizaciones base
!gdown '1zKe_gbx-45Wl5pdwk8a5dI9icY38j1-l' 
!unzip verb_onto_data.zip

#verbalizaciones formal
#!gdown '1Dn40nCe29inYqseRcfjCXqzX_mQcL23Y' 
#!unzip verb_onto_data_formal.zip

#verbalizaciones simple
#!gdown '1oVLc490hPmGzktvgdcxvzONqubgF1llF' 
#!unzip verb_onto_data_simple.zip

# **ENTRENAR BERT: OBJETIVO MÁSCARA**

- Descargar tokenizer y modelo
- Leer línea a línea del fichero de texto
- Codificar y enmascarar fichero con verbalizaciones
- Entrenar y guardar modelo entrenado


In [None]:
from transformers import AutoTokenizer, AutoModelForMaskedLM
from transformers import DataCollatorForLanguageModeling
from transformers import TrainingArguments, Trainer
from datasets import load_dataset
import numpy as np
import re
import os


In [None]:
# Pretrained checkpoint to start from; the commented lines are alternatives
# kept for quick switching.
model_name = 'bert-base-uncased' 
#model_name = 'bert-large-uncased-whole-word-masking'
#model_name = 'roberta-base'
#model_name = 'roberta-large'

# Tokenizer and masked-language-model weights for the chosen checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForMaskedLM.from_pretrained(model_name)

In [None]:

# Build a text dataset from every .txt file in the working directory.
# NOTE(review): the glob matches any .txt present, not only the unzipped
# verbalizations — confirm the directory holds nothing else.
data = load_dataset('text', data_files = ['./*.txt'])

In [None]:
# Display the dataset summary.
data

In [None]:
def preprocess_function(rows):
    """Tokenize the ``text`` field of ``rows`` after turning underscores into spaces.

    Uses the module-level ``tokenizer`` with truncation enabled and returns
    whatever that tokenizer call returns.
    """
    text_without_underscores = re.sub("_", " ", rows['text'])
    encoding = tokenizer(text_without_underscores, truncation=True)
    return encoding

In [None]:
# Tokenize every example of the text dataset.
encoded_data = data.map(preprocess_function)

In [None]:
# Collator for the masked-language-model objective (mlm=True) with a masking
# probability of 0.15.
data_collator_mlm = DataCollatorForLanguageModeling(
    tokenizer=tokenizer, mlm=True, mlm_probability=0.15
)

In [None]:

# Masked-LM training on the verbalizations; checkpoints go to 'my_checkpoints'.
batch_size = 8
total_epochs_train = 4

args_train = TrainingArguments(
    output_dir='my_checkpoints', # output directory for checkpoints
    overwrite_output_dir=True, # start over each time; disk space is limited
    evaluation_strategy="no", # no evaluation in this stage
    save_strategy="epoch", # save the model weights after every full pass over the data
    per_device_train_batch_size=batch_size, # examples per device in each training step
    per_device_eval_batch_size=batch_size*2, # has no effect here because evaluation is disabled
    optim="adamw_torch", # optimizer used to minimize the loss
    learning_rate=2e-5, # 1e-5
    weight_decay=0.01, # do not change
    warmup_ratio=0.1,# 0.0
    logging_steps=100,
    load_best_model_at_end=False,
    num_train_epochs=total_epochs_train, # number of full passes over the data ("epochs"); the value is total_epochs_train above
    report_to='all', 
    save_total_limit = 1, # keep only the latest checkpoint; without it every saved checkpoint is kept
    )

trainer = Trainer(
    model=model,
    args=args_train,
    data_collator=data_collator_mlm, # prepares the batches so the model is trained with masked tokens
    train_dataset=encoded_data['train'],
    eval_dataset=None,
    tokenizer=tokenizer,
)
# Train the model
trainer.train()

# **FINE-TUNING: CLASIFICAR**

- Cargar el modelo para clasificar secuencias
- Hacer fine-tuning con datos de entrenamiento
- Clasificar datos de test y calcular f1

In [None]:
from transformers import AutoModelForSequenceClassification
from datasets import load_dataset, load_metric
import re
import os
from sklearn.metrics import confusion_matrix, classification_report

# Drop the notebook's references to the masked-LM model and its trainer.
# pop() instead of `del` so that running this cell a second time (when the
# names are already gone) does not raise NameError.
for _stale_name in ("model", "trainer"):
    globals().pop(_stale_name, None)

In [None]:
!gdown '1nf5loPW7MhcJA4g48Pt2xmLMwXWuoH-T'
!unzip tripletaCSV_1.zip

#!gdown '1i1WNaRAtKmQI4QTXvGk7DUiVTnWH3euO'
#!unzip tripletaCSV_2.zip

#!gdown '1sJMWhzPjApRZmHBE-Qu1Khsb4Iu9Zle3'
#!unzip tripletaCSV_3.zip

#!gdown '1AaS5ll2jtFFIH7z1BMcCnMUDyb74qc7H'
#!unzip tripletaCSV_4.zip

#!gdown '1QyOjErHCU8iC6T5wm9mGC9VTCoZpkNgD'
#!unzip tripletaCSV_5.zip

In [None]:
# Load the train/val/test CSV files as the three splits of one dataset and
# display its summary.
data_class = load_dataset('csv', data_files={'train':['train.csv'], 'val': ['val.csv'], 'test':['test.csv']})
data_class

In [None]:
def verb_row(row, template, tokenizer):
    """Turn one (source, target, rel) row into a text/label example.

    Args:
        row: mapping with the keys 'source', 'target' and 'rel'.
        template: string containing the placeholders <W1>, <W2> and <SEP>.
        tokenizer: object whose ``sep_token`` attribute replaces <SEP>.

    Returns:
        ``{'text': filled_template, 'labels': int(row['rel'])}``.
    """
    w1 = str(row['source']).replace("_", " ")
    w2 = str(row['target']).replace("_", " ")
    label = int(row['rel'])
    fillers = {"<W1>": w1, "<W2>": w2, "<SEP>": tokenizer.sep_token}
    # One pass with a callable replacement: the inserted words are taken
    # literally (a backslash in a word is not read as a regex escape) and a
    # placeholder that happens to appear inside a word is not expanded again.
    sentence = re.sub("<W1>|<W2>|<SEP>", lambda found: fillers[found.group(0)], template)
    return {'text': sentence, 'labels': label}

In [None]:
# Fill the template for every row of every split; the original columns are
# dropped, leaving the 'text' and 'labels' fields returned by verb_row.
template = "'<W1>' <SEP> '<W2>'"
data_class_v = data_class.map(verb_row, 
                   fn_kwargs={'template':template, 'tokenizer':tokenizer}, 
                   remove_columns=['rel','source','target'])
data_class_v

In [None]:
# Inspect the first verbalized training example.
data_class_v['train'][0]

In [None]:
# Tokenize the verbalized examples with the same preprocess_function used for
# the text dataset, then drop the raw 'text' column.
encoded_data_class = data_class_v.map(preprocess_function, remove_columns=['text'])
encoded_data_class

In [None]:
# Locate the checkpoint written by the masked-LM stage.
# Only 'checkpoint-<step>' folders are considered and the highest step wins;
# taking the alphabetically first directory entry could return a non-checkpoint
# entry or an older step ('checkpoint-1000' sorts before 'checkpoint-500').
checkpoint_names = [
    name for name in os.listdir(args_train.output_dir)
    if name.startswith("checkpoint-") and name.split("-")[-1].isdigit()
]
if not checkpoint_names:
    raise FileNotFoundError("no checkpoint-<step> folder found in " + args_train.output_dir)
trained_model_checkpoint = args_train.output_dir + "/" + max(
    checkpoint_names, key=lambda name: int(name.split("-")[-1])
)


In [None]:
def compute_metrics(eval_pred):
    """Score one evaluation pass for a Trainer.

    Args:
        eval_pred: pair ``(predictions, labels)``; described in the original
            notes as a ``transformers.EvalPrediction`` holding the logits and
            the real labels.

    Returns:
        The result of the module-level ``metric`` for the arg-max class of
        each row, i.e. a dictionary ``{'name_metric1': value1, ...}``.
    """
    logits, gold_labels = eval_pred
    predicted_classes = np.argmax(logits, axis=1)
    return metric.compute(references=gold_labels, predictions=predicted_classes)

In [None]:
# Inspect the first two encoded training examples.
encoded_data_class['train'][0:2]

In [None]:
from random import randint

# Number of repetitions; every repetition uses its own random seed.
TOTAL = 10
# Draw TOTAL seeds in [1, 100]; redraw the whole list until all are distinct.
seeds = [randint(1, 100) for _ in range(TOTAL)]
while len(set(seeds)) != TOTAL:
    seeds = [randint(1, 100) for _ in range(TOTAL)]
# The seeds are drawn at random, so log them to make the runs repeatable.
print('seeds: ' + str(seeds))

metric_name = "f1"
metric = load_metric(metric_name)  # also read by compute_metrics()
batch_size = 8
total_epochs = 3

precision_verb = []
precision_no_verb = []
recall_verb = []
recall_no_verb = []
f1_verb = []
f1_no_verb = []

# Two variants are fine-tuned with identical settings and the same seed:
#   *_verb    starts from the checkpoint of the masked-LM stage,
#   *_no_verb starts from the untouched pretrained model.
# Each tuple: (starting weights, checkpoint directory, precision/recall/f1 lists).
variants = (
    (trained_model_checkpoint, 'my_checkpoints_class', precision_verb, recall_verb, f1_verb),
    (model_name, 'my_checkpoints_class_bert', precision_no_verb, recall_no_verb, f1_no_verb),
)

for i in range(TOTAL):
    seed = seeds[i]
    for start_weights, checkpoint_dir, precision_log, recall_log, f1_log in variants:
        model_class = AutoModelForSequenceClassification.from_pretrained(start_weights, num_labels=2)

        args_train_class = TrainingArguments(
            output_dir=checkpoint_dir,
            overwrite_output_dir=True,
            evaluation_strategy="epoch",
            save_strategy="epoch",
            per_device_train_batch_size=batch_size,
            per_device_eval_batch_size=batch_size*2,
            optim="adamw_torch",
            learning_rate=2e-5,
            weight_decay=0.01,
            warmup_ratio=0.1,
            logging_steps=100,
            load_best_model_at_end=True,
            num_train_epochs=total_epochs,
            metric_for_best_model=metric_name,
            seed=seed,
            report_to='all',
            save_total_limit=3,
            )

        trainer = Trainer(
            model=model_class,
            args=args_train_class,
            train_dataset=encoded_data_class['train'],
            eval_dataset=encoded_data_class['val'],
            tokenizer=tokenizer,
            compute_metrics=compute_metrics
            )

        # Train the model
        trainer.train()

        predicciones = trainer.predict(test_dataset=encoded_data_class['test'])

        # predicciones.predictions holds the logits; the predicted label (0/1)
        # is the index of the largest logit in each row.
        pred = np.argmax(predicciones.predictions, axis=1)
        scores = metric.compute(predictions=pred, references=predicciones.label_ids)
        results_acc = classification_report(predicciones.label_ids, pred, digits=4, output_dict=True)
        f1_log.append(scores['f1'])
        # Precision and recall are stored for class '1' only.
        precision_log.append(results_acc['1']['precision'])
        recall_log.append(results_acc['1']['recall'])
        print(scores)
        print(confusion_matrix(predicciones.label_ids, y_pred=pred))
    print('iteracion numero ' + str(i) )

In [None]:
# Mean test F1 over all seeds: masked-LM checkpoint first, plain pretrained model second.
print(np.mean(f1_verb))
print(np.mean(f1_no_verb))

In [None]:
# Standard deviation of the test F1 over all seeds, in the same order.
print(np.std(f1_verb))
print(np.std(f1_no_verb))

In [None]:
from statistics import mean

# Write two rows per repetition (CON_VERB = started from the masked-LM
# checkpoint, SIN_VERB = started from the plain pretrained model) followed by
# the means. The `with` block closes the file even if a write fails.
# NOTE(review): the 'formal' and "datasets_1" fields are fixed strings —
# confirm they match the archives actually downloaded above.
with open('results_datasets_1.tsv', 'w') as f:
    for i in range(TOTAL):
        f.write(str(i) + '\t' + model_name + '\t' + str(total_epochs_train) + '\t' + str(total_epochs) + 
                '\t' +"datasets_1" + '\t' + 'formal' + '\t' +  'CON_VERB' + 
                '\t' + str(precision_verb[i]) + '\t' + str(recall_verb[i]) + '\t' + str(f1_verb[i]) + '\n')
        f.write(str(i) + '\t' + model_name + '\t' + str(total_epochs_train) + '\t' + str(total_epochs) + 
                '\t' +"datasets_1" + '\t' + 'formal' + '\t' +  'SIN_VERB' + 
                '\t' + str(precision_no_verb[i]) + '\t' + str(recall_no_verb[i]) + '\t' + str(f1_no_verb[i]) + '\n\n')
    f.write('Media total: ' + '\t' +  'CON_VERB' + '\t' +  str(mean(precision_verb)) + '\t' + str(mean(recall_verb)) + '\t' + str(mean(f1_verb)) + '\n')
    f.write('Media total: ' + '\t' +  'SIN_VERB' + '\t' +  str(mean(precision_no_verb)) + '\t' + str(mean(recall_no_verb)) + '\t' + str(mean(f1_no_verb)) + '\n')

# **BUCLE GRANDE: AUTOMATIZAR TODOS LOS EXPERIMENTOS**

In [None]:
import os
os.environ["WANDB_DISABLED"] = "true"

!pip install transformers
!pip install datasets
!pip install gdown

from transformers import AutoTokenizer, AutoModelForMaskedLM
from transformers import DataCollatorForLanguageModeling
from transformers import TrainingArguments, Trainer
from datasets import load_dataset
import numpy as np
import re
import os

from transformers import AutoModelForSequenceClassification
from datasets import load_dataset, load_metric
import re
import os
from sklearn.metrics import confusion_matrix, classification_report
from random import randint
from statistics import mean


In [None]:
def verb_row(row, template, tokenizer):
    """Turn one (source, target, rel) row into a text/label example.

    Args:
        row: mapping with the keys 'source', 'target' and 'rel'.
        template: string containing the placeholders <W1>, <W2> and <SEP>.
        tokenizer: object whose ``sep_token`` attribute replaces <SEP>.

    Returns:
        ``{'text': filled_template, 'labels': int(row['rel'])}``.
    """
    w1 = str(row['source']).replace("_", " ")
    w2 = str(row['target']).replace("_", " ")
    label = int(row['rel'])
    fillers = {"<W1>": w1, "<W2>": w2, "<SEP>": tokenizer.sep_token}
    # One pass with a callable replacement: the inserted words are taken
    # literally (a backslash in a word is not read as a regex escape) and a
    # placeholder that happens to appear inside a word is not expanded again.
    sentence = re.sub("<W1>|<W2>|<SEP>", lambda found: fillers[found.group(0)], template)
    return {'text': sentence, 'labels': label}

def compute_metrics(eval_pred):
    """Score one evaluation pass for a Trainer.

    Args:
        eval_pred: pair ``(predictions, labels)``; described in the original
            notes as a ``transformers.EvalPrediction`` holding the logits and
            the real labels.

    Returns:
        The result of the module-level ``metric`` for the arg-max class of
        each row, i.e. a dictionary ``{'name_metric1': value1, ...}``.
    """
    logits, gold_labels = eval_pred
    predicted_classes = np.argmax(logits, axis=1)
    return metric.compute(references=gold_labels, predictions=predicted_classes)


# **DATOS**

In [None]:
# Verbalization styles iterated by the automation loop; the three archives are
# downloaded here and unzipped on demand by getVerbalizaciones().
tiposVerbalizacion = ['base', 'formal', 'simple' ]

# base verbalizations
#tipoVerbalizacion = 'base'
!gdown '1zKe_gbx-45Wl5pdwk8a5dI9icY38j1-l' 


# formal verbalizations
#tipoVerbalizacion = 'formal'
!gdown '1Dn40nCe29inYqseRcfjCXqzX_mQcL23Y' 


# simple verbalizations
#tipoVerbalizacion = 'simple'
!gdown '1oVLc490hPmGzktvgdcxvzONqubgF1llF' 

def getVerbalizaciones(verbalizacion):
    if verbalizacion == 'base':
        !unzip -o verb_onto_data.zip
    if verbalizacion == 'formal':
        !unzip -o verb_onto_data_formal.zip
    if verbalizacion == 'simple':
        !unzip -o verb_onto_data_simple.zip


tiposModelos = ['bert-large', 'roberta-base', 'roberta-large']
    
#model_name = 'bert-base-uncased' 
#model_name = 'bert-large-uncased-whole-word-masking'
#model_name = 'roberta-base'
#model_name = 'roberta-large'


tripletas = ['tripleta1', 'tripleta2', 'tripleta3', 'tripleta4', 'tripleta5']
!gdown '1nf5loPW7MhcJA4g48Pt2xmLMwXWuoH-T'
!gdown '1i1WNaRAtKmQI4QTXvGk7DUiVTnWH3euO'
!gdown '1sJMWhzPjApRZmHBE-Qu1Khsb4Iu9Zle3'
!gdown '1AaS5ll2jtFFIH7z1BMcCnMUDyb74qc7H'
!gdown '1QyOjErHCU8iC6T5wm9mGC9VTCoZpkNgD'

def getTripletaCSV(tripleta):
    
    if tripleta == 'tripleta1':
        !unzip -o tripletaCSV_1.zip

    if tripleta == 'tripleta2':
        !unzip -o tripletaCSV_2.zip

    if tripleta == 'tripleta3':  
        !unzip -o tripletaCSV_3.zip

    if tripleta == 'tripleta4':
        !unzip -o tripletaCSV_4.zip

    if tripleta == 'tripleta5':
        !unzip -o tripletaCSV_5.zip
        

In [None]:
import shutil

# Remove checkpoint folders left over from earlier runs.
# The names are relative, like the output_dir values given to TrainingArguments,
# so the cell works outside /kaggle/working; folders that do not exist (fresh
# kernel) are skipped instead of raising FileNotFoundError.
for stale_dir in ("my_checkpoints_class", "my_checkpoints", "my_checkpoints_class_bert"):
    if os.path.isdir(stale_dir):
        shutil.rmtree(stale_dir)

In [None]:
import shutil

# Sweep over every model and verbalization style. For each combination:
#   1. continue masked-LM training on the verbalizations,
#   2. fine-tune a classifier TOTAL times from that checkpoint (CON_VERB) and
#      TOTAL times from the plain pretrained model (SIN_VERB), same seeds,
#   3. write the per-seed test scores to a TSV file and delete the checkpoints.
tipoTripleta = 1
getTripletaCSV(tripletas[0])
for modelo in tiposModelos:
    model_name = modelo
    for verbalizacion in tiposVerbalizacion:
        getVerbalizaciones(verbalizacion)

        numEpocas = 3

        tokenizer = AutoTokenizer.from_pretrained(model_name)
        model = AutoModelForMaskedLM.from_pretrained(model_name)

        # NOTE(review): the glob takes every .txt in the working directory. If
        # the three archives unzip to differently named files, text from a
        # previous verbalization is still present here — confirm.
        data = load_dataset('text', data_files = ['./*.txt'])

        def preprocess_function(rows):
            """Tokenize rows['text'] (underscores replaced by spaces) with truncation."""
            inputs = tokenizer(re.sub("_", " ", rows['text']), truncation=True)
            return inputs

        encoded_data = data.map(preprocess_function)

        data_collator_mlm = DataCollatorForLanguageModeling(
            tokenizer=tokenizer, mlm=True, mlm_probability=0.15
        )

        # Training stage 1: masked-LM on the verbalizations.
        batch_size = 8
        total_epochs_train = numEpocas
        args_train = TrainingArguments(
            output_dir='my_checkpoints', # output directory for checkpoints
            overwrite_output_dir=True, # start over each time; disk space is limited
            evaluation_strategy="no", # no evaluation in this stage
            save_strategy="epoch", # save the model weights after every full pass over the data
            per_device_train_batch_size=batch_size, # examples per device in each training step
            per_device_eval_batch_size=batch_size*2, # has no effect here because evaluation is disabled
            optim="adamw_torch", # optimizer used to minimize the loss
            learning_rate=2e-5, # 1e-5
            weight_decay=0.01, # do not change
            warmup_ratio=0.1,# 0.0
            logging_steps=100,
            load_best_model_at_end=False,
            num_train_epochs=total_epochs_train, # number of full passes over the data ("epochs")
            report_to='all',
            save_total_limit = 1, # keep only the latest checkpoint
            )
        trainer = Trainer(
            model=model,
            args=args_train,
            data_collator=data_collator_mlm, # prepares the batches so the model is trained with masked tokens
            train_dataset=encoded_data['train'],
            eval_dataset=None,
            tokenizer=tokenizer,
        )
        # Train the model
        trainer.train()

        del model
        del trainer

        data_class = load_dataset('csv', data_files={'train':['train.csv'], 'val': ['val.csv'], 'test':['test.csv']})

        template = "'<W1>' <SEP> '<W2>'"
        data_class_v = data_class.map(verb_row,
                           fn_kwargs={'template':template, 'tokenizer':tokenizer},
                           remove_columns=['rel','source','target'])

        encoded_data_class = data_class_v.map(preprocess_function, remove_columns=['text'])

        # Use the 'checkpoint-<step>' folder with the highest step; the
        # alphabetically first directory entry is not guaranteed to be it.
        checkpoint_names = [
            name for name in os.listdir(args_train.output_dir)
            if name.startswith("checkpoint-") and name.split("-")[-1].isdigit()
        ]
        if not checkpoint_names:
            raise FileNotFoundError("no checkpoint-<step> folder found in " + args_train.output_dir)
        trained_model_checkpoint = args_train.output_dir + "/" + max(
            checkpoint_names, key=lambda name: int(name.split("-")[-1])
        )

        # Training stage 2: classification fine-tuning.
        TOTAL = 10
        # Draw TOTAL seeds in [1, 100]; redraw the whole list until all are distinct.
        seeds = [randint(1,100) for _ in range(TOTAL)]
        while len(seeds) != len(set(seeds)):
            seeds = [randint(1,100) for _ in range(TOTAL)]
        metric_name = "f1"
        metric = load_metric(metric_name)  # also read by compute_metrics()
        batch_size = 8
        total_epochs = 3
        precision_verb = []
        precision_no_verb = []
        recall_verb = []
        recall_no_verb = []
        f1_verb = []
        f1_no_verb = []
        # (starting weights, checkpoint directory, precision/recall/f1 lists)
        variants = (
            (trained_model_checkpoint, 'my_checkpoints_class', precision_verb, recall_verb, f1_verb),
            (model_name, 'my_checkpoints_class_bert', precision_no_verb, recall_no_verb, f1_no_verb),
        )
        for i in range(TOTAL):
            seed = seeds[i]
            for start_weights, checkpoint_dir, precision_log, recall_log, f1_log in variants:
                model_class = AutoModelForSequenceClassification.from_pretrained(start_weights, num_labels=2)
                args_train_class = TrainingArguments(
                    output_dir=checkpoint_dir,
                    overwrite_output_dir=True,
                    evaluation_strategy="epoch",
                    save_strategy="epoch",
                    per_device_train_batch_size=batch_size,
                    per_device_eval_batch_size=batch_size*2,
                    optim="adamw_torch",
                    learning_rate=2e-5,
                    weight_decay=0.01,
                    warmup_ratio=0.1,
                    logging_steps=100,
                    load_best_model_at_end=True,
                    num_train_epochs=total_epochs,
                    metric_for_best_model=metric_name,
                    seed=seed,
                    report_to='all',
                    save_total_limit = 3,
                    )
                trainer = Trainer(
                    model=model_class,
                    args=args_train_class,
                    train_dataset=encoded_data_class['train'],
                    eval_dataset=encoded_data_class['val'],
                    tokenizer=tokenizer,
                    compute_metrics=compute_metrics
                    )
                # Train the model
                trainer.train()
                predicciones = trainer.predict(test_dataset=encoded_data_class['test'])
                # predicciones.predictions holds the logits; the predicted
                # label (0/1) is the index of the largest logit in each row.
                pred = np.argmax(predicciones.predictions, axis = 1)
                scores = metric.compute(predictions=pred, references=predicciones.label_ids)
                results_acc = classification_report(predicciones.label_ids, pred, digits=4, output_dict=True)
                f1_log.append(scores['f1'])
                # Precision and recall are stored for class '1' only.
                precision_log.append(results_acc['1']['precision'])
                recall_log.append(results_acc['1']['recall'])
                print(scores)
                print(confusion_matrix(predicciones.label_ids,y_pred =pred))
            print('iteracion numero ' + str(i) )

        # The rows record the verbalization and triplet actually used in this
        # iteration (previously fixed to 'formal' / "datasets_1" for all files).
        # The `with` block closes the file even if a write fails.
        dataset_tag = "datasets_" + str(tipoTripleta)
        results_path = 'results_' + str(model_name) + '_' + str(numEpocas) + '_' + str(verbalizacion) + '_' + str(tipoTripleta) + '.tsv'
        with open(results_path, 'w') as f:
            for i in range(TOTAL):
                f.write(str(i) + '\t' + model_name + '\t' + str(total_epochs_train) + '\t' + str(total_epochs) +
                        '\t' + dataset_tag + '\t' + str(verbalizacion) + '\t' +  'CON_VERB' +
                        '\t' + str(precision_verb[i]) + '\t' + str(recall_verb[i]) + '\t' + str(f1_verb[i]) + '\n')
                f.write(str(i) + '\t' + model_name + '\t' + str(total_epochs_train) + '\t' + str(total_epochs) +
                        '\t' + dataset_tag + '\t' + str(verbalizacion) + '\t' +  'SIN_VERB' +
                        '\t' + str(precision_no_verb[i]) + '\t' + str(recall_no_verb[i]) + '\t' + str(f1_no_verb[i]) + '\n\n')
            f.write('Media total: ' + '\t' +  'CON_VERB' + '\t' +  str(mean(precision_verb)) + '\t' + str(mean(recall_verb)) + '\t' + str(mean(f1_verb)) + '\n')
            f.write('Media total: ' + '\t' +  'SIN_VERB' + '\t' +  str(mean(precision_no_verb)) + '\t' + str(mean(recall_no_verb)) + '\t' + str(mean(f1_no_verb)) + '\n')

        # Delete this combination's checkpoints so the next one starts clean.
        # Relative names match the output_dir values given to TrainingArguments.
        for used_dir in ("my_checkpoints_class", "my_checkpoints", "my_checkpoints_class_bert"):
            shutil.rmtree(used_dir)
        
        