In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from tqdm.auto import tqdm
import json
import copy
import random
import collections
import time
import torch
from torch import nn, cuda, optim
from torch.utils.data import DataLoader
import torch.nn.init as init
import ast
import nltk
nltk.download('punkt')
from nltk.tokenize import word_tokenize

# Choose the compute device once for the whole notebook: use the GPU when
# PyTorch reports one, otherwise fall back to the CPU.
if cuda.is_available():
    device = 'cuda'
    # Ask PyTorch to release any cached, currently unused GPU memory.
    torch.cuda.empty_cache()
else:
    device = 'cpu'
print(device)
from itertools import islice

from transformers import (
    BertModel,
    BertForTokenClassification,
    BertTokenizerFast,
    AutoTokenizer,
    AutoModelForTokenClassification,
    get_linear_schedule_with_warmup
)

from sklearn.model_selection import train_test_split
from sklearn.metrics import (
    f1_score,
    accuracy_score,
    precision_score,
    recall_score,
    precision_recall_fscore_support,
    classification_report
)

import os
# Print the full path of every file found (recursively) under /kaggle/input.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))
        
import warnings
# Silence every Python warning for the rest of the session.
# NOTE(review): this is a blanket filter, so library diagnostics (deprecation
# notices, tensor shape-broadcast warnings, ...) are hidden as well — consider
# narrowing it to the specific categories that are known to be noise.
warnings.filterwarnings('ignore')

def generate_random_seed():
    """Draw a seed from Python's global `random` generator.

    Returns
    -------
    int
        An integer between 1 and 1000, both ends included.
    """
    lowest, highest = 1, 1000
    seed = random.randint(lowest, highest)
    return seed

def set_random_seed(seed):
    """Seed Python's `random`, NumPy and PyTorch with the same value.

    Parameters
    ----------
    seed : int
        Value passed unchanged to each library's seeding function.
    """
    # Same order as always: stdlib first, then NumPy, then PyTorch.
    for seed_fn in (random.seed, np.random.seed, torch.manual_seed):
        seed_fn(seed)

    # The CUDA generators are only touched when a GPU is actually available.
    if not torch.cuda.is_available():
        return
    torch.cuda.manual_seed_all(seed)

In [2]:
class CustomDataset(torch.utils.data.Dataset):
    """Dataset pairing tokenizer encodings with two label sequences.

    Each item is a dict holding one tensor per encoding field plus the
    entries 'classification_labels' and 'regression_labels'.

    Parameters
    ----------
    encodings : mapping
        Field name -> indexable collection of per-sample values
        (anything exposing `.items()`).
    classification_labels : sequence
        One classification target per sample; also defines the dataset length.
    regression_labels : sequence
        One regression target per sample.
    """

    def __init__(self, encodings, classification_labels, regression_labels):
        self.encodings = encodings
        self.classification_labels = classification_labels
        self.regression_labels = regression_labels

    def __len__(self):
        """Number of samples, taken from the classification labels."""
        return len(self.classification_labels)

    def __getitem__(self, idx):
        """Build the tensor dict for sample `idx`."""
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = torch.tensor(values[idx])
        sample['classification_labels'] = torch.tensor(self.classification_labels[idx])
        sample['regression_labels'] = torch.tensor(self.regression_labels[idx])
        return sample
    
class MTLModel(nn.Module):
    """Multi-task model: one shared BERT encoder feeding two small MLP heads.

    The encoder's `pooler_output` is passed to a classification head and to a
    regression head; both heads are Linear -> ReLU -> Linear.

    Parameters
    ----------
    num_labels : int
        Output width of the classification head.
    regression_output_dim : int, default 1
        Output width of the regression head.
    pretrained_name : str, default 'bert-base-uncased'
        Checkpoint handed to `BertModel.from_pretrained`. It must be loadable
        by `BertModel`.
    head_hidden_dim : int, default 256
        Width of the hidden layer inside each head.

    Notes
    -----
    The defaults reproduce the previously hard-coded architecture, and the
    attribute names (`transformer`, `classification_layer`,
    `regression_layer`) are unchanged, so existing state dicts keep loading.
    A state dict saved with non-default `pretrained_name` / `head_hidden_dim`
    must be loaded into a model built with the same values.
    """

    def __init__(self, num_labels, regression_output_dim = 1,
                 pretrained_name = 'bert-base-uncased', head_hidden_dim = 256):
        super().__init__()

        self.transformer = BertModel.from_pretrained(pretrained_name)
        encoder_dim = self.transformer.config.hidden_size

        # Classification head configuration
        self.classification_layer = nn.Sequential(
            nn.Linear(encoder_dim, head_hidden_dim),
            nn.ReLU(),
            nn.Linear(head_hidden_dim, num_labels)
        )

        # Regression head configuration
        self.regression_layer = nn.Sequential(
            nn.Linear(encoder_dim, head_hidden_dim),
            nn.ReLU(),
            nn.Linear(head_hidden_dim, regression_output_dim)
        )

    def forward(self, input_ids, attention_mask, token_type_ids = None):
        """Encode the batch and apply both heads.

        Parameters
        ----------
        input_ids, attention_mask : torch.Tensor
            Tokenizer outputs for the batch.
        token_type_ids : torch.Tensor or None, default None
            Segment ids; forwarded to the encoder as given.

        Returns
        -------
        (classification_output, regression_output)
            Raw outputs of the two heads (no softmax / activation applied).
        """
        encoded = self.transformer(input_ids,
                                   token_type_ids = token_type_ids,
                                   attention_mask = attention_mask,
                                   output_hidden_states = False)

        pooled_output = encoded.pooler_output
        classification_output = self.classification_layer(pooled_output)
        regression_output = self.regression_layer(pooled_output)

        return classification_output, regression_output
    
def divide_dataset(df, rs, names_classes):
    """Split `df` into train / dev / test frames.

    The test frame is every row whose 'Set' column equals 'test'. Rows whose
    'Set' equals 'train' are further split 90/10 into train and dev,
    stratified on 'CoarseClass'.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns 'PredictedArgument', 'Topic', 'CoarseClass',
        'RValue' and 'Set'; only those columns are kept.
    rs : int
        Random state forwarded to `train_test_split`.
    names_classes : list
        Accepted for interface compatibility; not used by this function.

    Returns
    -------
    (X_train, X_dev, X_test) : tuple of pandas.DataFrame
    """
    kept_columns = ['PredictedArgument', 'Topic', 'CoarseClass', 'RValue', 'Set']
    subset = df.copy()[kept_columns]

    train_pool = subset[subset['Set'] == 'train']
    X_test = subset[subset['Set'] == 'test']

    X_train, X_dev = train_test_split(
        train_pool,
        test_size = 0.10,
        random_state = rs,
        stratify = train_pool['CoarseClass'],
    )
    return X_train, X_dev, X_test

    
def prepare_input(df, labels_classification, labels_regression, tokenizer, max_len = 128, shuffle = True, bs = 32):
    """Tokenize (argument, topic) pairs and wrap them in a DataLoader.

    Parameters
    ----------
    df : pandas.DataFrame
        Source of the 'PredictedArgument' and 'Topic' columns; every value is
        converted with `str()` before tokenization.
    labels_classification, labels_regression : sequence
        Per-sample targets stored alongside the encodings.
    tokenizer : callable
        Called as `tokenizer(first_texts, second_texts, truncation=True,
        padding='max_length', max_length=max_len)`.
    max_len : int, default 128
        Length every encoded pair is padded / truncated to.
    shuffle : bool, default True
        Forwarded to the DataLoader.
    bs : int, default 32
        Batch size.

    Returns
    -------
    torch.utils.data.DataLoader
        Loader over a `CustomDataset` built from the encodings and labels.
    """
    first_texts = [str(text) for text in df['PredictedArgument'].values]
    second_texts = [str(text) for text in df['Topic'].values]

    tokenized = tokenizer(
        first_texts,
        second_texts,
        truncation=True,
        padding='max_length',
        max_length=max_len,
    )

    return DataLoader(
        CustomDataset(tokenized, labels_classification, labels_regression),
        batch_size = bs,
        shuffle = shuffle,
    )

def get_batch_predictions(model, batch, model_name = 'bert', classification_loss_fn = None, regression_loss_fn = None):
    """Run one mini-batch through `model` and optionally compute the joint loss.

    Parameters
    ----------
    model : callable
        Called as `model(input_ids, attention_mask, token_type_ids)` and
        expected to return `(classification_outputs, regression_outputs)`.
    batch : dict
        Must hold 'input_ids', 'attention_mask', 'classification_labels' and
        'regression_labels' tensors, plus 'token_type_ids' when
        `model_name == 'bert'`.
    model_name : str, default 'bert'
        Segment ids are only read from the batch for 'bert'; otherwise `None`
        is passed to the model in their place.
    classification_loss_fn, regression_loss_fn : callable or None
        Each loss is computed only when its function is given; the returned
        loss is the sum of whichever were computed.

    Returns
    -------
    ((classification_outputs, classification_labels),
     (regression_outputs, regression_labels),
     loss)
        Outputs and labels are returned in their original shapes. `loss` is
        `None` when no loss function was supplied.
    """
    # mini-batch inputs, moved to the notebook-wide compute device
    input_ids = batch['input_ids'].to(device)
    attention_mask = batch['attention_mask'].to(device)
    input_type_ids = batch['token_type_ids'].to(device) if model_name == 'bert' else None
    classification_labels = batch['classification_labels'].type(torch.LongTensor).to(device)
    regression_labels = batch['regression_labels'].type(torch.FloatTensor).to(device)

    # get predictions for the mini-batch
    classification_outputs, regression_outputs = model(input_ids, attention_mask, input_type_ids)

    # calculate mini-batch loss
    loss = None
    if classification_loss_fn is not None:
        loss = classification_loss_fn(classification_outputs, classification_labels)
    if regression_loss_fn is not None:
        # The regression head can emit (B, 1) while labels arrive as (B,).
        # Fed as-is to an element-wise loss, the pair broadcasts to (B, B) and
        # every prediction is compared with every label. Align the target to
        # the output shape whenever the element counts agree.
        regression_targets = regression_labels
        if (regression_targets.shape != regression_outputs.shape
                and regression_targets.numel() == regression_outputs.numel()):
            regression_targets = regression_targets.reshape(regression_outputs.shape)
        regression_loss = regression_loss_fn(regression_outputs, regression_targets)
        loss = regression_loss if loss is None else loss + regression_loss

    return (classification_outputs, classification_labels), (regression_outputs, regression_labels), loss

# save model to disk
def save_model_state(model_state, PATH):
    """Serialize `model_state` to `PATH` using `torch.save`.

    Parameters
    ----------
    model_state : object
        Object to store (the notebook passes a model state dict).
    PATH : str or file-like
        Destination handed to `torch.save`.
    """
    torch.save(obj = model_state, f = PATH)
    
# load and return a previously saved model
def load_model(PATH):
    """Rebuild an `MTLModel` and load the state dict stored at `PATH`.

    The model is constructed with the module-level `NUM_CLASSES` and default
    architecture arguments, so the checkpoint must have been produced by a
    model built the same way.

    Parameters
    ----------
    PATH : str or file-like
        Location of a state dict written with `torch.save`.

    Returns
    -------
    MTLModel
        Model with the stored weights; it is not moved to any device here.
    """
    model = MTLModel(NUM_CLASSES)
    # Deserialize onto the CPU so a checkpoint written on a GPU machine still
    # loads where CUDA is unavailable; callers move the model afterwards.
    state_dict = torch.load(PATH, map_location = 'cpu')
    model.load_state_dict(state_dict)
    return model

def transform_label(x, name_classes):
    """Return the position of label `x` within `name_classes`.

    Raises
    ------
    ValueError
        If `x` does not occur in `name_classes`.
    """
    position = name_classes.index(x)
    return position

def save_training_information(info, exp):
    """Write per-epoch training records to `results_train_{exp}.csv`.

    Parameters
    ----------
    info : list
        First element is the list of column names; every following element
        is one row of values.
    exp : int or str
        Experiment identifier interpolated into the output file name.
    """
    header = info[0]
    rows = info[1:]
    table = pd.DataFrame(rows, columns = header)
    table.to_csv(f'results_train_{exp}.csv', index = False)

In [3]:
def finetune_model(model, train_loader, validation_loader, class_loss_fn, regr_loss_fn, optim, epochs, n_samples, n_eval_samples, model_checkpoint):
    """Fine-tune `model`, keeping the weights of the epoch with the lowest validation loss.

    Parameters
    ----------
    model : torch.nn.Module
        Model to train in place.
    train_loader, validation_loader : iterable of batches
        Batches in the format expected by `get_batch_predictions`; both must
        support `len()`.
    class_loss_fn, regr_loss_fn : callable
        Classification and regression losses; their sum is optimized.
    optim : optimizer
        Provides `zero_grad()` and `step()`.
    epochs : int
        Number of passes over `train_loader`.
    n_samples, n_eval_samples : int
        Sample counts of the train / validation sets. `n_samples` is the
        denominator of the training accuracy; both are recorded in the log.
    model_checkpoint : str
        Checkpoint name; the text before the first '-' is passed to
        `get_batch_predictions` as the model family.

    Returns
    -------
    best_model_state : dict
        Deep copy of the state dict at the best epoch (the initial weights if
        the validation loss never compared lower than infinity).
    best_loss : float
        Validation loss of that epoch, summed over validation batches.
    output_info : list of list
        Header row followed by one row of metrics per epoch.

    Notes
    -----
    Reported average losses are the mean over batches. This assumes each
    batch loss is already a per-batch mean (the default reduction of the loss
    functions this notebook builds); dividing the summed batch means by the
    number of samples, as was done before, understated the loss by roughly
    the batch size.
    """
    model_family = model_checkpoint.split('-')[0]
    # max(..., 1) only guards the division when a loader is empty
    n_train_batches = max(len(train_loader), 1)
    n_eval_batches = max(len(validation_loader), 1)

    best_model_state = copy.deepcopy(model.state_dict())
    best_loss = float('inf')
    progress_bar = tqdm(range(len(train_loader) * epochs))
    output_info = []
    for epoch in range(1, epochs + 1):

        print("Epoch: {}/{}".format(epoch, epochs))

        ## TRAINING LOOP
        epoch_info = [epoch]
        train_loss, train_hits = 0, 0
        model.train()

        for batch in train_loader:
            optim.zero_grad()

            class_info, _, loss = get_batch_predictions(model, batch, model_family, classification_loss_fn = class_loss_fn, regression_loss_fn = regr_loss_fn)
            outputs, labels = class_info
            train_loss += loss.item()
            train_hits += (torch.argmax(outputs, dim = 1) == labels).sum().item()

            # Optimization step
            loss.backward()

            # gradient clipping
            torch.nn.utils.clip_grad_norm_(parameters=model.parameters(), max_norm=1.0)

            # apply the parameter update (no learning-rate scheduler is used here)
            optim.step()

            progress_bar.update(1)

        # calculate average loss (per batch) and accuracy (per sample)
        avg_train_loss = round(float(train_loss / n_train_batches), 4)
        train_acc = round(float(train_hits / n_samples), 3)

        ## VALIDATION LOOP
        model.eval()
        eval_loss = 0
        # Seed both lists with an empty int array so the concatenated result
        # keeps NumPy's default int dtype and is well-formed for an empty loader.
        prediction_chunks = [np.array([], int)]
        label_chunks = [np.array([], int)]

        with torch.no_grad():
            for batch in validation_loader:
                class_info, _, loss = get_batch_predictions(model, batch, model_family, classification_loss_fn = class_loss_fn, regression_loss_fn = regr_loss_fn)
                outputs, labels = class_info
                eval_loss += loss.item()

                prediction_chunks.append(torch.argmax(outputs, dim = 1).int().cpu().numpy())
                label_chunks.append(labels.int().cpu().numpy())

        eval_predictions = np.concatenate(prediction_chunks, axis = 0)
        eval_labels = np.concatenate(label_chunks, axis = 0)

        # calculate average loss, accuracy and macro f1 score
        avg_eval_loss = round(float(eval_loss / n_eval_batches), 4)
        eval_acc = accuracy_score(eval_labels, eval_predictions)
        eval_macro_f1 = f1_score(eval_labels, eval_predictions, average = 'macro')

        ## PRINT INFO
        print(f"Training   ==> Accuracy: {train_acc} | Loss: {avg_train_loss}")
        print(f"Validation ==> Accuracy: {eval_acc}  | Loss: {avg_eval_loss}  | Macro-F1: {eval_macro_f1}")

        ## SAVE BEST MODEL
        ## Keep a copy of the weights whenever the validation loss decreases
        if eval_loss < best_loss:
            best_model_state = copy.deepcopy(model.state_dict())
            best_loss = eval_loss
            print("Validation loss has decreased... saving model at epoch {}".format(epoch))

        epoch_info += [avg_train_loss, train_acc, avg_eval_loss, eval_acc, eval_macro_f1, n_samples, n_eval_samples]
        output_info.append(epoch_info)

    output_info.insert(0, ['epoch', 'avg_train_loss', 'train_acc', 'avg_eval_loss', 'eval_acc', 'eval_macro_f1', 'train_samples', 'eval_samples'])
    return best_model_state, best_loss, output_info

def test_model(path, dataloader, model_checkpoint, classes = None):
    """Load the checkpoint at `path` and run inference over `dataloader`.

    Parameters
    ----------
    path : str
        Checkpoint location passed to `load_model`.
    dataloader : iterable of batches
        Batches in the format expected by `get_batch_predictions`.
    model_checkpoint : str
        Checkpoint name; the text before the first '-' is passed to
        `get_batch_predictions` as the model family.
    classes : list or None
        Accepted for interface compatibility; not used by this function.

    Returns
    -------
    predictions : numpy.ndarray
        Argmax class index per sample.
    logits : numpy.ndarray
        Raw classification outputs, one row of `NUM_CLASSES` values per sample.
    predictions_regr : numpy.ndarray
        Flattened regression outputs.
    """
    model = load_model(path).to(device)
    model.eval()
    model_family = model_checkpoint.split('-')[0]

    # Each list starts with an empty array of the final dtype / shape so the
    # result types do not depend on the batches and an empty dataloader still
    # yields well-formed arrays. Chunks are concatenated once at the end
    # instead of re-copying the growing array on every batch.
    prediction_chunks = [np.array([], int)]
    regression_chunks = [np.array([])]
    logit_chunks = [np.zeros((0, NUM_CLASSES), dtype=np.float32)]

    with torch.no_grad():
        for batch in dataloader:
            # No loss functions are passed, so the third element is None.
            class_info, regr_info, _ = get_batch_predictions(model, batch, model_family)
            outputs, _ = class_info
            outputs_regr, _ = regr_info

            logit_chunks.append(outputs.cpu().numpy())
            prediction_chunks.append(torch.argmax(outputs, dim = 1).int().cpu().numpy())
            regression_chunks.append(outputs_regr.cpu().numpy().flatten())

    predictions = np.concatenate(prediction_chunks, axis = 0)
    logits = np.concatenate(logit_chunks, axis = 0)
    predictions_regr = np.concatenate(regression_chunks, axis = 0)

    return predictions, logits, predictions_regr

In [None]:
# ---- Experiment configuration ----
MAX_LENGTH = 128
task = 'multi'
df_name = 'generated_ukp'
df = pd.read_csv(f"/kaggle/input/errortypesclassification/{df_name}.csv")

names_classes = ['PM', 'DISP', 'SP', 'MG', 'MU'] if task == 'multi' else ['No-PM', 'PM']
NUM_CLASSES = len(names_classes)

model_checkpoint = 'bert-base-uncased'
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

epochs = 10 if task == 'multi' else 5 # number of epochs for each training run
experiments = 10 # number of models that will be trained and tested.

# One record per experiment; converted to a DataFrame once after the loop
# rather than growing a DataFrame with pd.concat on every iteration.
seed_records = []

best_model_exp = 0
best_model_eval_loss = float('inf')

# ---- Train / evaluate one model per experiment ----
for exp in range(experiments):

    print(f"Running experiment number {exp+1}")

    # setup new model, optimizer and loss functions before training
    # different random seeds are used in each experiment
    r_seed = generate_random_seed()
    set_random_seed(r_seed)

    # MODEL
    model = MTLModel(NUM_CLASSES)
    model.to(device)

    # LOSS FUNCTIONS
    classification_loss_fn = nn.CrossEntropyLoss()
    regression_loss_fn = nn.MSELoss()

    # OPTIMIZER (named `optimizer` so the imported `optim` module is not shadowed)
    optimizer = torch.optim.AdamW(model.parameters(), lr = 1e-5, eps = 1e-8)

    # DATA SPLIT
    X_train, X_dev, X_test = divide_dataset(df, r_seed, names_classes)

    seed_records.append({
        'r_seed': r_seed,
        'indices': X_test.index.values.tolist(),
        'test_size': len(X_test),
    })

    y_train_class, y_dev_class, y_test_class = X_train.pop("CoarseClass").values, X_dev.pop("CoarseClass").values, X_test.pop("CoarseClass").values
    y_train_regr, y_dev_regr, y_test_regr = X_train.pop("RValue").values, X_dev.pop("RValue").values, X_test.pop("RValue").values
    train_samples, eval_samples, test_samples = len(y_train_class), len(y_dev_class), len(y_test_class)
    print(train_samples, eval_samples, test_samples)

    # Encode the string class names as integer targets
    if task == 'multi':
        y_train_class = [transform_label(x, names_classes) for x in y_train_class]
        y_dev_class = [transform_label(x, names_classes) for x in y_dev_class]
        y_test_class = [transform_label(x, names_classes) for x in y_test_class]
    elif task == 'binary':
        y_train_class = [1 if x == 'PM' else 0 for x in y_train_class]
        y_dev_class = [1 if x == 'PM' else 0 for x in y_dev_class]
        y_test_class = [1 if x == 'PM' else 0 for x in y_test_class]

    # DATA LOADERS
    # Only the training loader is shuffled. Validation and test are read in a
    # fixed order so their batches (and the summed validation loss used for
    # model selection) do not vary with the shuffle.
    train_loader = prepare_input(X_train, y_train_class, y_train_regr, tokenizer, max_len = MAX_LENGTH, bs = 32, shuffle = True)
    validation_loader = prepare_input(X_dev, y_dev_class, y_dev_regr, tokenizer, max_len = MAX_LENGTH, bs = 32, shuffle = False)
    test_loader = prepare_input(X_test, y_test_class, y_test_regr, tokenizer, max_len = MAX_LENGTH, bs = 32, shuffle = False)

    best_model, eval_error, train_info = finetune_model(model, train_loader, validation_loader,
                                classification_loss_fn, regression_loss_fn, optimizer, epochs,
                                train_samples, eval_samples, model_checkpoint)

    best_model_name = f"trained_model_exp_{exp}.pt"
    save_model_state(best_model, best_model_name)
    save_training_information(train_info, exp)

    # Evaluate the saved checkpoint on the held-out test set and store the results
    test_predictions, test_logits, test_regr_predictions = test_model(best_model_name, test_loader, model_checkpoint, classes = names_classes)
    predictions_df = pd.DataFrame()
    predictions_df['true_label'] = y_test_class
    predictions_df['prediction'] = test_predictions
    predictions_df['logits_prediction'] = test_logits.tolist()
    predictions_df['regresion_label'] = y_test_regr
    predictions_df['regresion_prediction'] = test_regr_predictions
    predictions_df.to_csv(f"test_results_exp_{exp}.csv", index = False)

    # Remember which experiment reached the lowest validation loss
    if eval_error < best_model_eval_loss:
        best_model_eval_loss = eval_error
        best_model_exp = exp

    print(f"End of experiment number {exp+1}")
    print()

# The file name uses the index of the last experiment that ran.
seeds_df = pd.DataFrame(seed_records, columns=['r_seed', 'indices', 'test_size'])
seeds_df.to_csv(f"seeds_{exp}.csv", index = False)

In [None]:
import os
import zipfile
from IPython.display import FileLink

# Base file name (without extension) of the checkpoint from the experiment
# recorded in `best_model_exp`; zip_files() reads this global when filtering.
model_to_save_name = f'trained_model_exp_{best_model_exp}'
print(model_to_save_name)

def zip_files(folder_path, zip_name):
    """Bundle selected files from `folder_path` (recursively) into a ZIP archive.

    A file is included when either
      * its name ends with '.txt' or '.csv' and does not start with
        "sampling", or
      * its name equals `{model_to_save_name}.pt` (module-level global).

    Entries are stored with paths relative to `folder_path`.

    Parameters
    ----------
    folder_path : str
        Directory to walk.
    zip_name : str
        Path of the archive to create (opened in write mode, deflate-compressed).
    """
    def _is_selected(filename):
        # Short-circuit keeps the global lookup for the checkpoint name lazy.
        is_report = filename.endswith(('.txt', '.csv')) and not filename.startswith("sampling")
        return is_report or filename == f"{model_to_save_name}.pt"

    # Create the ZIP archive
    with zipfile.ZipFile(zip_name, 'w', zipfile.ZIP_DEFLATED) as archive:
        # Visit every file below the folder
        for current_dir, _subdirs, names in os.walk(folder_path):
            for name in filter(_is_selected, names):
                # Full path on disk, stored under its path relative to the root
                full_path = os.path.join(current_dir, name)
                archive.write(full_path, os.path.relpath(full_path, folder_path))

# Call the function to compress the selected files under the working
# directory into 'test.zip'.
folder_path = '/kaggle/working/'
zip_name = 'test.zip'
zip_files(folder_path, zip_name)

# Last expression of the cell: renders a download link for the archive.
FileLink(r'test.zip')