In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
import collections
import torch
import ast
from torch import nn
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, f1_score
from sklearn.utils import class_weight
from sklearn.model_selection import train_test_split
from sklearn.utils.class_weight import compute_class_weight
import copy
import time
import json
import random
import math
from tqdm.auto import tqdm
import nltk
nltk.download('punkt')
from nltk.tokenize import word_tokenize

from transformers import AutoTokenizer, DistilBertModel, BertModel, AutoModel
from transformers import get_linear_schedule_with_warmup
from torch.utils.data import DataLoader
from torch.optim import AdamW, Adam

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory.
input_file_paths = (
    os.path.join(current_dir, file_name)
    for current_dir, _subdirs, file_names in os.walk('/kaggle/input')
    for file_name in file_names
)
for input_file_path in input_file_paths:
    print(input_file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

from torch import cuda
# Pick the compute device: the GPU when CUDA is available, otherwise the CPU.
if cuda.is_available():
    device = 'cuda'
    # Release cached allocator blocks left over from earlier cells.
    torch.cuda.empty_cache()
else:
    device = 'cpu'
print(device)


import warnings
# Suppress every Python warning from here on.
# NOTE(review): this is a blanket filter, so deprecation and pandas
# SettingWithCopy warnings are hidden as well.
warnings.filterwarnings("ignore")

# From this point the name `logging` refers to transformers' logging helper.
from transformers import logging
# Only error-level messages from the transformers library are shown.
logging.set_verbosity_error()


def generate_random_seed():
    """Draw an integer seed from the closed range [1, 1000].

    The value comes from the module-level ``random`` generator, so it depends
    on that generator's current state.
    """
    lowest, highest = 1, 1000
    seed = random.randint(lowest, highest)
    return seed

def set_random_seed(seed):
    """Seed Python's ``random``, NumPy and PyTorch with the same value.

    When CUDA is available, every CUDA device generator is seeded as well.
    """
    for seeder in (random.seed, np.random.seed, torch.manual_seed):
        seeder(seed)
    if not torch.cuda.is_available():
        return
    torch.cuda.manual_seed_all(seed)

In [2]:
# save model state to disk
def save_model_state(model_state, PATH):
    """Serialize ``model_state`` to ``PATH`` using ``torch.save``."""
    torch.save(obj = model_state, f = PATH)
    
# loads and returns a previously saved model
def load_model(PATH, model_checkpoint):
    """Rebuild a ``CustomClassifier`` and restore weights saved at ``PATH``.

    Args:
        PATH: File written by ``torch.save`` containing a model state dict.
        model_checkpoint: Pretrained checkpoint name used to build the encoder.

    Returns:
        The classifier with the saved weights loaded. It is left on the CPU;
        the caller moves it to the target device.
    """
    model = CustomClassifier(NUM_CLASSES, model_checkpoint)
    # Map tensors to CPU while deserializing: a state dict saved from a GPU
    # would otherwise fail to load on a CPU-only session, and on GPU it would
    # briefly hold a second copy of the weights in device memory.
    state_dict = torch.load(PATH, map_location = 'cpu')
    model.load_state_dict(state_dict)
    return model
    
def transform_label(x, name_classes):
    """Return the position of label ``x`` inside ``name_classes``.

    Whatever ``name_classes.index`` raises for a missing label (``ValueError``
    for a list) is propagated unchanged.
    """
    position = name_classes.index(x)
    return position

def save_training_information(info, exp):
    """Write per-epoch training statistics to ``results_train_{exp}.csv``.

    ``info[0]`` is the list of column names; every following entry is one row.
    The file is written to the current working directory without the index.
    """
    header = info[0]
    rows = info[1:]
    table = pd.DataFrame(rows, columns = header)
    table.to_csv(f'results_train_{exp}.csv', index = False)

# dataset class
class CustomDataset(torch.utils.data.Dataset):
    """Dataset that pairs tokenizer encodings with stance vectors and labels.

    Each item is a dict holding one tensor per encoding field plus the
    ``'predictions'`` and ``'labels'`` tensors for the same index.
    """

    def __init__(self, encodings, predictions, labels):
        self.encodings, self.predictions, self.labels = encodings, predictions, labels

    def __len__(self):
        # The dataset size is defined by the number of labels.
        return len(self.labels)

    def __getitem__(self, idx):
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = torch.tensor(values[idx])
        sample['predictions'] = torch.tensor(self.predictions[idx])
        sample['labels'] = torch.tensor(self.labels[idx])
        return sample

class CustomClassifier(nn.Module):
    """Transformer encoder whose sentence vector is gated by a stance vector.

    The pooled encoder output is multiplied by each entry of the stance vector,
    the resulting blocks are concatenated, passed through LeakyReLU and
    dropout, and fed to a single linear classification layer.

    Args:
        num_labels: Number of output classes.
        model_checkpoint: Name or path given to ``AutoModel.from_pretrained``.
        num_stances: Length of the stance vector supplied to ``forward``
            (defaults to 3, the width this model was originally built for).
    """

    def __init__(self, num_labels, model_checkpoint, num_stances = 3):
        print(model_checkpoint)
        super(CustomClassifier, self).__init__()
        self.num_labels = num_labels
        self.num_stances = num_stances
        self.transformer = AutoModel.from_pretrained(model_checkpoint)
        # Size the head from the encoder itself instead of assuming 768, so
        # checkpoints with a different hidden width work as well.
        hidden_size = self.transformer.config.hidden_size
        self.classifier = nn.Linear(hidden_size * num_stances, self.num_labels)
        self.dropout = torch.nn.Dropout(0.2)

    def forward(self, input_ids, attention_mask, token_type_ids, stance_predictions):
        """Return class logits for a batch.

        Args:
            input_ids: Token ids passed to the encoder.
            attention_mask: Attention mask passed to the encoder.
            token_type_ids: Segment ids, or ``None``. When given, the encoder's
                ``pooler_output`` is used; otherwise the mean of
                ``last_hidden_state`` over the sequence dimension is used.
            stance_predictions: One stance vector per batch row.

        Raises:
            ValueError: If the gated feature width does not match the
                classifier's input width.
        """
        if token_type_ids is not None:
            encoded = self.transformer(input_ids,
                                       token_type_ids = token_type_ids,
                                       attention_mask = attention_mask,
                                       output_hidden_states = False)
            x = encoded.pooler_output
        else:
            encoded = self.transformer(input_ids,
                                       attention_mask = attention_mask,
                                       output_hidden_states = False)
            # NOTE(review): this mean runs over every position and does not
            # use attention_mask — confirm that including padding is intended.
            x = encoded.last_hidden_state.mean(dim = 1)

        # Build one copy of x per stance entry, each scaled by that entry:
        # result[b, s * H + h] = stance[b, s] * x[b, h].
        stance = stance_predictions.to(x.dtype)
        gated = (stance.unsqueeze(-1) * x.unsqueeze(1)).flatten(start_dim = 1)

        if gated.size(1) != self.classifier.in_features:
            raise ValueError(
                f"Gated features have width {gated.size(1)} but the classifier expects "
                f"{self.classifier.in_features}; check the stance vector length "
                f"({stance_predictions.size(1)}) and the encoder hidden size ({x.size(-1)})."
            )

        hidden = self.dropout(nn.functional.leaky_relu(gated))
        return self.classifier(hidden)
    
def prepare_input_with_topic(df, labels, tokenizer, max_len = 128, shuffle = True, bs = 32, stance_classes = None):
    """Tokenize sentence pairs and wrap them, with stance one-hots, in a DataLoader.

    Args:
        df: Frame with ``'Sentence1'``, ``'Sentence2'`` and ``'predicted'`` columns.
        labels: One target label per row of ``df``.
        tokenizer: Callable tokenizer invoked on the two text lists.
        max_len: Length every sequence is truncated/padded to.
        shuffle: Whether the DataLoader shuffles the samples.
        bs: Batch size.
        stance_classes: Optional ordered list of every possible value of
            ``df['predicted']``. When given, the one-hot vector always has one
            slot per listed class, in that order, even if a class is absent
            from ``df``; values not listed become an all-zero vector. When
            ``None`` the layout is derived from the values present in ``df``,
            so its width can differ between splits.

    Returns:
        A ``DataLoader`` over a ``CustomDataset``.

    Raises:
        ValueError: If ``labels`` and ``df`` differ in length.
    """
    if len(labels) != len(df):
        raise ValueError(f"Got {len(labels)} labels for {len(df)} rows.")

    arguments = [str(arg) for arg in df['Sentence1'].values]
    topics = [str(tp) for tp in df['Sentence2'].values]

    stance_column = df['predicted']
    if stance_classes is not None:
        # Fixing the categories keeps the one-hot layout identical for every
        # split, regardless of which stance values happen to occur in it.
        stance_column = pd.Categorical(stance_column, categories = list(stance_classes))
    predictions = pd.get_dummies(stance_column).astype(int).values.tolist()

    encodings = tokenizer(arguments, topics,
                          truncation=True,
                          padding='max_length',
                          max_length=max_len)

    dst = CustomDataset(encodings, predictions, labels)
    dataloader = DataLoader(dst, batch_size = bs, shuffle = shuffle)
    return dataloader


def divide_dataset(df, rs):
    """Split ``df`` into train/dev/test frames according to its ``'Set'`` column.

    Only the columns needed downstream are kept. ``rs`` is accepted but not
    used: the partition is fully determined by the ``'Set'`` values.

    Returns:
        ``(train, dev, test)`` frames, in that order.
    """
    wanted_columns = ['Set', 'Sentence1', 'Sentence2', 'predicted', 'error_type']
    subset = df.copy()[wanted_columns]

    train_part, dev_part, test_part = (
        subset[subset['Set'] == split_name] for split_name in ('train', 'dev', 'test')
    )
    return train_part, dev_part, test_part

def get_batch_predictions(model, batch, model_name = 'bert', loss_fn = None):
    """Run ``model`` on one mini-batch and optionally compute its loss.

    Tensors are taken from ``batch`` and moved to the global ``device``.
    ``token_type_ids`` are forwarded only when ``model_name`` is ``'bert'``;
    otherwise ``None`` is passed in their place.

    Returns:
        ``(outputs, labels, loss)`` where ``loss`` is ``None`` when no
        ``loss_fn`` is supplied.
    """
    def on_device(key):
        return batch[key].to(device)

    input_ids = on_device('input_ids')
    attention_mask = on_device('attention_mask')
    if model_name == 'bert':
        input_type_ids = on_device('token_type_ids')
    else:
        input_type_ids = None

    stance_predictions = on_device('predictions')
    # Labels are cast to int64 before being moved to the device.
    labels = batch['labels'].type(torch.LongTensor).to(device)

    outputs = model(input_ids, attention_mask, input_type_ids, stance_predictions)

    loss = loss_fn(outputs, labels) if loss_fn is not None else None
    return outputs, labels, loss

def finetune_model(model, train_loader, validation_loader, loss_fn, optim, epochs, n_samples, n_eval_samples, model_checkpoint):
    """Train ``model`` and keep the weights with the lowest validation loss.

    Args:
        model: Classifier to fine-tune (updated in place).
        train_loader: DataLoader yielding training batches.
        validation_loader: DataLoader yielding validation batches.
        loss_fn: Loss applied to ``(logits, labels)``.
        optim: Optimizer over the model parameters.
        epochs: Number of passes over the training data.
        n_samples: Number of training samples (denominator for train metrics).
        n_eval_samples: Number of validation samples.
        model_checkpoint: Checkpoint name; the text before the first ``'-'``
            is passed to ``get_batch_predictions`` as the model name.

    Returns:
        ``(best_model_state, best_loss, output_info)``: a deep copy of the
        state dict from the best epoch, its (unrounded) per-sample validation
        loss, and a list whose first entry is the column header followed by
        one row of metrics per epoch.
    """
    model_name = model_checkpoint.split('-')[0]
    # A mean-reduced loss returns the per-batch average, so it has to be
    # weighted by the batch size before dividing by the sample count. Losses
    # that sum over the batch are accumulated as they are.
    mean_reduced = getattr(loss_fn, 'reduction', 'mean') == 'mean'

    best_model_state = copy.deepcopy(model.state_dict())
    best_loss = float('inf')
    output_info = []

    for epoch in range(1, epochs + 1):

        print("Epoch: {}/{}".format(epoch, epochs))

        ## TRAINING LOOP
        train_loss, train_hits = 0.0, 0
        model.train()

        for batch in train_loader:
            optim.zero_grad()

            outputs, labels, loss = get_batch_predictions(model, batch, model_name, loss_fn)

            train_loss += loss.item() * (labels.size(0) if mean_reduced else 1)
            train_hits += (torch.argmax(outputs, dim = 1) == labels).sum().item()

            # Optimization step
            loss.backward()

            # gradient clipping
            torch.nn.utils.clip_grad_norm_(parameters=model.parameters(), max_norm=1.0)

            # apply the parameter update (no learning-rate scheduler is used here)
            optim.step()

        # per-sample average loss and accuracy (rounded for reporting only)
        avg_train_loss = round(float(train_loss / n_samples), 4)
        train_acc = round(float(train_hits / n_samples), 3)

        ## VALIDATION LOOP
        model.eval()
        eval_loss = 0.0
        # Seed with empty int arrays so the result is well-defined even when
        # the validation loader yields nothing; concatenate once afterwards.
        prediction_chunks, label_chunks = [np.array([], int)], [np.array([], int)]

        with torch.no_grad():
            for batch in validation_loader:
                outputs, labels, loss = get_batch_predictions(model, batch, model_name, loss_fn)
                eval_loss += loss.item() * (labels.size(0) if mean_reduced else 1)

                prediction_chunks.append(torch.argmax(outputs, dim = 1).int().cpu().numpy())
                label_chunks.append(labels.int().cpu().numpy())

        eval_predictions = np.concatenate(prediction_chunks, axis = 0)
        eval_labels = np.concatenate(label_chunks, axis = 0)

        # per-sample average loss, accuracy and macro f1 score
        exact_eval_loss = float(eval_loss / n_eval_samples)
        avg_eval_loss = round(exact_eval_loss, 4)
        eval_acc = accuracy_score(eval_labels, eval_predictions)
        eval_macro_f1 = f1_score(eval_labels, eval_predictions, average = 'macro')

        ## PRINT INFO
        print(f"Training   ==> Accuracy: {train_acc} | Loss: {avg_train_loss}")
        print(f"Validation ==> Accuracy: {eval_acc}  | Loss: {avg_eval_loss}  | Macro-F1: {eval_macro_f1}")

        ## SAVE BEST MODEL
        ## Keep the model state when validation loss decreases. The unrounded
        ## loss is compared so that small improvements are not lost to rounding.
        if exact_eval_loss < best_loss:
            best_model_state = copy.deepcopy(model.state_dict())
            best_loss = exact_eval_loss
            print("Validation loss has decreased... saving model at epoch {}".format(epoch))

        output_info.append([epoch, avg_train_loss, train_acc, avg_eval_loss, eval_acc,
                            eval_macro_f1, n_samples, n_eval_samples])

    output_info.insert(0, ['epoch', 'avg_train_loss', 'train_acc', 'avg_eval_loss', 'eval_acc', 'eval_macro_f1', 'train_samples', 'eval_samples'])
    return best_model_state, best_loss, output_info

def test_model(path, dataloader, model_checkpoint, classes = None):
    """Load a saved model and collect its predictions over ``dataloader``.

    Args:
        path: File holding the saved model state.
        dataloader: DataLoader yielding the batches to score.
        model_checkpoint: Checkpoint name used to rebuild the model; the text
            before the first ``'-'`` is used as the model name.
        classes: Accepted for compatibility with existing callers; not used.

    Returns:
        ``(predictions, logits)``: the arg-max class index per sample and the
        raw float32 logits with shape ``(n_samples, NUM_CLASSES)``.
    """
    model = load_model(path, model_checkpoint).to(device)
    model.eval()
    model_name = model_checkpoint.split('-')[0]

    # Seed with empty arrays so an empty dataloader still yields correctly
    # shaped results; per-batch chunks are concatenated once at the end
    # instead of re-allocating the full arrays on every batch.
    prediction_chunks = [np.array([], int)]
    logit_chunks = [np.zeros((0, NUM_CLASSES), dtype=np.float32)]

    with torch.no_grad():
        for batch in dataloader:
            outputs, _labels, _loss = get_batch_predictions(model, batch, model_name)

            logit_chunks.append(outputs.cpu().numpy())
            prediction_chunks.append(torch.argmax(outputs, dim = 1).int().cpu().numpy())

    predictions = np.concatenate(prediction_chunks, axis = 0)
    logits = np.concatenate(logit_chunks, axis = 0)
    return predictions, logits
    


In [None]:
# ---------------------------------------------------------------------------
# Configuration
# ---------------------------------------------------------------------------
MAX_LENGTH = 128
task = 'binary'

df_name = 'generated_ibm_sample1'
df = pd.read_csv(f"/kaggle/input/error-type-predictions-in-argumentative-relations/{df_name}.csv")

if task == 'multi':
    names_classes = ['correct', 'flipped', 'neutralized', 'polarized']
elif task == 'binary':
    names_classes = ['no-correct', 'correct']
else:
    # Fail early: with any other value the labels would stay raw strings.
    raise ValueError(f"Unsupported task {task!r}; expected 'multi' or 'binary'.")
NUM_CLASSES = len(names_classes)
print(names_classes, NUM_CLASSES)

model_checkpoint = 'bert-base-uncased'
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

epochs = 10
experiments = 10 # number of models that will be trained and tested.

# Draw all experiment seeds up-front from an independent generator. Drawing
# them inside the loop from the global generator would make every seed after
# the first a deterministic function of the previous one (set_random_seed
# reseeds that generator), which can repeat seeds. sample() guarantees
# distinct values in [1, 1000].
experiment_seeds = random.Random().sample(range(1, 1001), experiments)

# ---------------------------------------------------------------------------
# Data preparation (identical for every experiment)
# ---------------------------------------------------------------------------
# divide_dataset splits on the 'Set' column and ignores its seed argument, so
# the split and the tokenisation are done once instead of once per experiment.
# The copies make the later pop() act on independent frames.
X_train, X_dev, X_test = (part.copy() for part in divide_dataset(df, None))
test_indices = X_test.index.values.tolist()

y_train, y_dev, y_test = (part.pop("error_type").values for part in (X_train, X_dev, X_test))
train_samples, eval_samples, test_samples = len(y_train), len(y_dev), len(y_test)
print(train_samples, eval_samples, test_samples)

def encode_label(raw_label):
    """Map a raw error-type string to its integer class id for the current task."""
    if task == 'multi':
        return transform_label(raw_label, names_classes)
    return 1 if raw_label == 'correct' else 0

y_train = [encode_label(x) for x in y_train]
y_dev = [encode_label(x) for x in y_dev]
y_test = [encode_label(x) for x in y_test]

train_loader = prepare_input_with_topic(X_train, y_train, tokenizer, max_len = MAX_LENGTH, bs = 32, shuffle = True)
# Evaluation does not depend on sample order, so the dev set is not shuffled.
validation_loader = prepare_input_with_topic(X_dev, y_dev, tokenizer, max_len = MAX_LENGTH, bs = 32, shuffle = False)
test_loader = prepare_input_with_topic(X_test, y_test, tokenizer, max_len = MAX_LENGTH, bs = 32, shuffle = False)

# ---------------------------------------------------------------------------
# Experiments
# ---------------------------------------------------------------------------
seed_records = []

best_model_exp = 0
best_model_eval_loss = float('inf')

model = optim = best_model = None

for exp, r_seed in enumerate(experiment_seeds):

    print(f"Running experiment number {exp}")

    # Drop the previous experiment's model, optimizer and best state before
    # building new ones so two full models are never held in device memory.
    model = optim = best_model = None
    if device == 'cuda':
        torch.cuda.empty_cache()

    # a different random seed is used in each experiment
    set_random_seed(r_seed)
    seed_records.append({'r_seed': r_seed, 'indices': test_indices, 'test_size': len(X_test)})

    # MODEL
    model = CustomClassifier(NUM_CLASSES, model_checkpoint)
    model.to(device)

    # LOSS FUNCTION
    loss_fn = nn.CrossEntropyLoss()
    ### Optimizer
    optim = torch.optim.AdamW(model.parameters(), lr = 1e-5, eps = 1e-8)

    best_model, eval_error, train_info = finetune_model(model, train_loader, validation_loader,
                                loss_fn, optim, epochs, train_samples, eval_samples, model_checkpoint)

    best_model_name = f"trained_model_exp_{exp}.pt"
    save_model_state(best_model, best_model_name)
    save_training_information(train_info, exp)

    test_predictions, test_logits = test_model(best_model_name, test_loader, model_checkpoint, classes = names_classes)
    predictions_df = pd.DataFrame()
    predictions_df['true_label'] = y_test
    predictions_df['prediction'] = test_predictions
    predictions_df['logits_prediction'] = test_logits.tolist()
    predictions_df.to_csv(f"test_results_exp_{exp}.csv", index = False)

    # remember which experiment reached the lowest validation loss
    if eval_error < best_model_eval_loss:
        best_model_eval_loss = eval_error
        best_model_exp = exp

    print(f"End of experiment number {exp}")
    print()

# Build the seed table once from the collected records. The file keeps the
# name the original loop variable produced (index of the last experiment).
seeds_df = pd.DataFrame(seed_records, columns=['r_seed', 'indices', 'test_size'])
seeds_df.to_csv(f"seeds_{experiments - 1}.csv", index = False)

In [None]:
import os
import zipfile
from IPython.display import FileLink

# Base name (without the .pt extension) of the checkpoint from the experiment
# recorded in best_model_exp; zip_files reads this global to decide which
# model file goes into the archive.
model_to_save_name = f'trained_model_exp_{best_model_exp}'
print(model_to_save_name)

def zip_files(folder_path, zip_name):
    """Archive selected files found under ``folder_path`` into ``zip_name``.

    A file is included when it ends in ``.txt`` or ``.csv`` and its name does
    not start with ``"sampling"``, or when it is exactly the checkpoint named
    by the global ``model_to_save_name`` plus ``.pt``. Entries are stored with
    paths relative to ``folder_path``.
    """
    with zipfile.ZipFile(zip_name, 'w', zipfile.ZIP_DEFLATED) as archive:
        # Walk every directory below the folder.
        for current_dir, _subdirs, names in os.walk(folder_path):
            for name in names:
                is_report = name.endswith(('.txt', '.csv')) and not name.startswith("sampling")
                if not (is_report or name == f"{model_to_save_name}.pt"):
                    continue
                # Full path on disk, stored under its path relative to the root.
                full_path = os.path.join(current_dir, name)
                archive.write(full_path, os.path.relpath(full_path, folder_path))

# Bundle the selected result files and the best checkpoint found under the
# working directory into test.zip.
folder_path = '/kaggle/working/'
zip_name = 'test.zip'
zip_files(folder_path, zip_name)

# Last expression of the cell: a FileLink object pointing at the archive.
FileLink(r'test.zip')