In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from tqdm.auto import tqdm
import json
import copy
import ast
import random
import time
import torch
from torch import nn, cuda, optim
from torch.utils.data import DataLoader
# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU.
if cuda.is_available():
    device = 'cuda'
    # Hand cached, currently unused GPU memory back before anything is allocated.
    torch.cuda.empty_cache()
else:
    device = 'cpu'
print(device)


import string
import nltk
from nltk.corpus import stopwords
# English stop words as a set; clean_sentence consults it when remove_stop=True.
stops = set(stopwords.words('english'))

try:
    from torchcrf import CRF
except:
    !pip install pytorch-crf
    from torchcrf import CRF

from sklearn.model_selection import train_test_split
from sklearn.metrics import (
    f1_score,
    accuracy_score,
    precision_score,
    recall_score,
    precision_recall_fscore_support,
    classification_report
)

import os
# Print the full path of every file found (recursively) under /kaggle/input.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))
        
import warnings
# Install a global filter that ignores every warning category.
warnings.filterwarnings('ignore')

def generate_random_seed():
    """Draw a seed from Python's `random` module: an integer in 1..1000, both ends included."""
    seed = random.randint(1, 1000)
    return seed

def set_random_seed(seed):
    """
    Seed Python's `random`, NumPy's global generator and PyTorch with `seed`.

    When CUDA is available, every CUDA device is seeded as well.
    """
    for seed_fn in (random.seed, np.random.seed, torch.manual_seed):
        seed_fn(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)
# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

cuda




Collecting pytorch-crf
  Downloading pytorch_crf-0.7.2-py3-none-any.whl (9.5 kB)
Installing collected packages: pytorch-crf
Successfully installed pytorch-crf-0.7.2
/kaggle/input/unit-segmentation-lstm-transformers/we.csv
/kaggle/input/unit-segmentation-lstm-transformers/mix1.csv
/kaggle/input/unit-segmentation-lstm-transformers/pe.csv
/kaggle/input/unit-segmentation-lstm-transformers/abam.csv
/kaggle/input/unit-segmentation-lstm-transformers/mix2.csv
/kaggle/input/unit-segmentation-lstm-transformers/ug.csv
/kaggle/input/glove-global-vectors-for-word-representation/glove.6B.200d.txt
/kaggle/input/glove-global-vectors-for-word-representation/glove.6B.50d.txt
/kaggle/input/glove-global-vectors-for-word-representation/glove.6B.100d.txt


In [2]:
# !wget http://nlp.stanford.edu/data/glove.6B.zip
# !unzip -q glove.6B.zip

In [2]:
# Load the pre-trained GloVe vectors into a {word: float32 vector} lookup.
embeddings_index, embeddings_dim = {}, 200
# The file name is derived from embeddings_dim so the two cannot drift apart.
glove_path = f'/kaggle/input/glove-global-vectors-for-word-representation/glove.6B.{embeddings_dim}d.txt'
# Name the encoding explicitly instead of relying on the platform default.
with open(glove_path, encoding='utf-8') as f:
    for line in f:
        # Each line is "<word> <v1> <v2> ... <vN>": split off the word, parse the rest.
        word, coefs = line.split(maxsplit=1)
        coefs = np.fromstring(coefs, "f", sep=" ")
        embeddings_index[word] = coefs
print('Found {} word vectors.'.format(len(embeddings_index)))

Found 400000 word vectors.


In [3]:
# Experiment settings read by the training / evaluation cell.
configuration = dict(
    train='mix1',                               # CSV (without extension) used for training
    test=['pe', 'we', 'abam', 'ug', 'mix1'],    # CSVs whose 'test' split is evaluated
    runs=10,                                    # independent training runs
    epochs=20,                                  # epochs per run
    train_batch_size=32,
    dev_batch_size=32,
    test_batch_size=32,
    label_list=['O', 'B', 'I'],                 # list position = integer label id
    model_checkpoint='glove',                   # only feeds the output file names
    crf=True,                                   # not consulted by the visible code
    lr=1e-4,                                    # AdamW learning rate
    hidden_dim=128,                             # BiLSTM output width (both directions)
)

In [4]:
def clean_sentence(text, remove_stop = False):
    """
    Lower-case a whitespace-tokenised sentence and drop punctuation tokens.

    Args:
        text: sentence whose tokens are separated by whitespace.
        remove_stop: when True, also drop tokens found in the global `stops` set.

    Returns:
        The surviving tokens, lower-cased and joined by single spaces.
    """
    punctuation = set(string.punctuation)
    tokens = [
        token.lower()
        for token in text.split()
        # Test character by character: `token in string.punctuation` is a
        # substring test, which kept "..." and "--" while dropping ",-".
        if not all(char in punctuation for char in token)
    ]
    if remove_stop:
        tokens = [token for token in tokens if token not in stops]
    return " ".join(tokens)

def remove_padding_from_predictions(predictions, batch_attention_mask):
    """
    Strip padded positions from every sequence of a batch.

    Args:
        predictions: one sequence of values per instance.
        batch_attention_mask: one mask row per instance; a position is kept
            only where the mask equals 1.

    Returns:
        A list with one (possibly shorter) list per instance.
    """
    return [
        [value for value, keep in zip(sequence, mask_row) if keep == 1]
        for sequence, mask_row in zip(predictions, batch_attention_mask)
    ]

def remove_padding_and_get_tokens(batch_input_tokens, batch_attention_mask):
    """
    Keep, for every instance, only the tokens whose mask entry equals 1.

    Args:
        batch_input_tokens: one token sequence per instance.
        batch_attention_mask: one mask row per instance.

    Returns:
        A list with one list of surviving tokens per instance.
    """
    kept_per_instance = []
    for token_row, mask_row in zip(batch_input_tokens, batch_attention_mask):
        kept = []
        for token, flag in zip(token_row, mask_row):
            if flag == 1:
                kept.append(token)
        kept_per_instance.append(kept)
    return kept_per_instance

def integer_to_bio(labels, mapping):
    """
    Map sequences of integer label ids to sequences of BIO tags.

    Args:
        labels: iterable of sequences; each item is converted with `int()`.
        mapping: lookup indexed by the integer id (e.g. a list of tags).

    Returns:
        A list of tag lists, parallel to `labels`.
    """
    tagged = []
    for sequence in labels:
        tagged.append([mapping[int(label_id)] for label_id in sequence])
    return tagged

def flatten_predictions(labels):
    """Concatenate a list of label sequences into one flat list, preserving order."""
    flat = []
    for sequence in labels:
        flat.extend(sequence)
    return flat

def generate_results_txt(tokens, labels, predictions, output_file_name):
    """
    Write tokens, gold labels and predictions to a text file (FLAiR NLP style).

    Each token becomes one "token label prediction" line; every sequence is
    followed by an empty line.

    Args:
        tokens, labels, predictions: parallel lists of sequences.
        output_file_name: path of the UTF-8 file to (over)write.
    """
    with open(output_file_name, 'w', encoding = 'utf-8') as out:
        for sent_tokens, sent_labels, sent_preds in zip(tokens, labels, predictions):
            rows = [f"{tk} {lb} {pr}\n" for tk, lb, pr in zip(sent_tokens, sent_labels, sent_preds)]
            out.writelines(rows)
            # Blank line = sequence separator.
            out.write("\n")

In [5]:
class SequenceLabelingDataset(torch.utils.data.Dataset):
    """
    Dataset for sequence labeling over pre-computed word vectors.

    Every item is truncated / padded to `max_sequence_length` positions.

    Args:
        df: frame with `tokens` and `labels` columns, each holding
            whitespace-separated strings.
        label_list: list of tags; a tag's position is its integer id.
        glove_embeddings: dict-like `{token: vector}` exposing `.get`; vectors
            are expected to have `embedding_dim` entries.
        embedding_dim: width of every word vector.
        max_sequence_length: number of positions in every returned item.
    """
    def __init__(self, df, label_list, glove_embeddings, embedding_dim = 200, max_sequence_length=128):
        self.max_sequence_length = max_sequence_length
        self.mapping = label_list
        self.glove_embeddings = glove_embeddings
        self.embedding_dim = embedding_dim
        self.labels = [x.split() for x in df.labels.values.tolist()]
        self.encodings = [i.split() for i in df.tokens.values.tolist()]
        # One random vector per out-of-vocabulary token, drawn on first use and
        # then reused, so the same item always yields the same tensor.
        # (The cache lives in this object: separate datasets / worker processes
        # each draw their own vectors.)
        self._oov_vectors = {}

    def _embed(self, token):
        """Return the vector of `token`, falling back to its cached random OOV vector."""
        vector = self.glove_embeddings.get(token)
        if vector is None:
            vector = self._oov_vectors.get(token)
            if vector is None:
                vector = np.random.rand(self.embedding_dim).astype(np.float32)
                self._oov_vectors[token] = vector
        return vector

    def __getitem__(self, idx):
        """
        Build the padded sample at position `idx`.

        Returns:
            dict with
              'encodings': float32 tensor (max_sequence_length, embedding_dim);
                           padded positions are all zeros,
              'mask':      float tensor, 1 on real tokens and 0 on padding,
              'tokens':    list of token strings padded with '<PAD>',
              'labels':    integer label ids, padded with 0.

        Raises:
            ValueError: if a label of the sentence is not in `label_list`.
        """
        sentence_tokens = self.encodings[idx]
        # Ids are resolved for the whole sentence before truncation.
        sentence_labels = [self.mapping.index(x) for x in self.labels[idx]]

        sentence_tokens = sentence_tokens[:self.max_sequence_length]
        sentence_labels = sentence_labels[:self.max_sequence_length]

        padded_tokens = sentence_tokens + ['<PAD>'] * (self.max_sequence_length - len(sentence_tokens))
        padded_labels = sentence_labels + [0] * (self.max_sequence_length - len(sentence_labels))

        # Fill one pre-allocated array (much faster than torch.tensor on a list
        # of ndarrays). Rows past the sentence end stay zero: padding must not
        # feed random noise into the recurrent layers.
        embeddings = np.zeros((self.max_sequence_length, self.embedding_dim), dtype=np.float32)
        for position, token in enumerate(sentence_tokens):
            embeddings[position] = self._embed(token)
        embeddings = torch.from_numpy(embeddings)

        # Create attention mask
        attention_mask = torch.ones(self.max_sequence_length)
        attention_mask[len(sentence_tokens):] = 0

        item = {'encodings' : embeddings,
                'mask': attention_mask,
                'tokens': padded_tokens,
                'labels': torch.tensor(padded_labels)}
        return item

    def __len__(self):
        """Number of sentences in the dataset."""
        return len(self.labels)
    

class BiLSTMSequenceLabelingModel(nn.Module):
    """
    BiLSTM -> dropout -> linear projection -> CRF sequence tagger.

    Args:
        input_dim: width of the input word vectors.
        hidden_dim: width of the concatenated BiLSTM output; each direction
            gets `hidden_dim // 2` units.
        dropout: dropout probability applied to the BiLSTM output.
        num_labels: number of tags scored by the CRF.
    """
    def __init__(self, input_dim, hidden_dim, dropout, num_labels):
        super().__init__()
        self.hidden_dim = hidden_dim
        self.num_labels = num_labels
        self.lstm = nn.LSTM(input_dim, hidden_dim // 2, bidirectional=True, batch_first=True)
        self.dropout = nn.Dropout(p=dropout)
        self.hidden2tag = nn.Linear(hidden_dim, num_labels)
        self.crf = CRF(self.num_labels, batch_first = True)

    def forward(self, x, attention_mask, labels = None):
        """
        Tag a batch and, when gold labels are given, also return the CRF loss.

        Args:
            x: batch-first input vectors.
            attention_mask: 1 on real tokens, 0 on padding (converted with `.byte()`).
            labels: optional gold label ids with the same batch/sequence layout.

        Returns:
            `(loss, predictions)` when `labels` is given, otherwise `predictions`.
            `predictions` holds one list of tag ids per instance, as long as the
            input sequence; positions behind the mask are filled with 0.
        """
        lstm_out, _ = self.lstm(x)
        lstm_out = self.dropout(lstm_out)
        logits = self.hidden2tag(lstm_out)

        mask = attention_mask.byte()

        # Decode with the same mask the loss uses. Without it Viterbi also runs
        # over the padded positions, whose scores can change the best path on
        # the real tokens.
        paths = self.crf.decode(logits, mask = mask)
        # Masked decoding yields one path per instance covering only its real
        # tokens; pad back to the full length so the output shape is unchanged.
        seq_len = logits.size(1)
        predictions = [path + [0] * (seq_len - len(path)) for path in paths]

        if labels is None: # inference
            return predictions

        # training / evaluation with gold labels
        loss = -self.crf(logits, labels, mask = mask, reduction = 'token_mean')
        return loss, predictions

In [6]:
def load_training_data(df_name, TRAIN_BATCH_SIZE = 32, DEV_BATCH_SIZE = 32):
    """
    Build the training and validation DataLoaders for one corpus.

    Reads `<df_name>.csv` from the Kaggle input folder. Rows whose `set` is
    'dev' form the validation split when present; otherwise 10% of the 'train'
    rows are held out (fixed random_state). Relies on the globals `label_list`
    and `embeddings_index`.

    Args:
        df_name: CSV file name without extension.
        TRAIN_BATCH_SIZE: batch size of the training loader.
        DEV_BATCH_SIZE: batch size of the validation loader.

    Returns:
        (train_loader, val_loader), both shuffling.
    """
    csv_path = f'/kaggle/input/unit-segmentation-lstm-transformers/{df_name}.csv'
    df = pd.read_csv(csv_path)

    # The 'train' rows are needed in both cases; only the dev source differs.
    train_seq_df = df.loc[df['set'] == 'train']
    if 'dev' in df.set.unique():
        dev_seq_df = df.loc[df['set'] == 'dev']
    else:
        train_seq_df, dev_seq_df = train_test_split(train_seq_df, test_size = 0.1, random_state = 2023)

    # Shuffle the rows of both splits (train first, then dev).
    train_seq_df, dev_seq_df = train_seq_df.sample(frac = 1), dev_seq_df.sample(frac = 1)

    print(train_seq_df.shape, dev_seq_df.shape)

    # One dataset + shuffling loader per split.
    loaders = []
    for frame, batch_size in ((train_seq_df, TRAIN_BATCH_SIZE), (dev_seq_df, DEV_BATCH_SIZE)):
        dataset = SequenceLabelingDataset(frame, label_list, embeddings_index)
        loaders.append(DataLoader(dataset, batch_size=batch_size, shuffle=True))

    train_loader, val_loader = loaders
    return train_loader, val_loader
    
def load_testing_data(df_name, TEST_BATCH_SIZE = 32):
    """
    Build the test DataLoader for one corpus.

    Reads `<df_name>.csv` from the Kaggle input folder and keeps the rows whose
    `set` is 'test'. Relies on the globals `label_list` and `embeddings_index`.

    Args:
        df_name: CSV file name without extension.
        TEST_BATCH_SIZE: batch size of the loader.

    Returns:
        A non-shuffling DataLoader over the test rows.
    """
    df = pd.read_csv(f'/kaggle/input/unit-segmentation-lstm-transformers/{df_name}.csv')

    is_test_row = df['set'] == 'test'
    test_dataset = SequenceLabelingDataset(df.loc[is_test_row], label_list, embeddings_index)

    return DataLoader(test_dataset, batch_size=TEST_BATCH_SIZE, shuffle=False)

In [7]:
def train_model(model, train_loader, optimizer):
    """
    Train `model` for one pass over `train_loader`.

    Args:
        model: callable as `model(inputs, mask, labels=...)`, returning `(loss, outputs)`.
        train_loader: yields dict batches with 'encodings', 'mask' and 'labels'.
        optimizer: optimizer stepped once per batch.

    Returns:
        The mean of the per-batch losses.
    """
    model.train()

    running_loss = 0
    for batch in train_loader:
        inputs, mask, targets = (batch[key].to(device) for key in ('encodings', 'mask', 'labels'))

        loss, _ = model(inputs, mask, labels = targets)
        running_loss += loss.item()

        # Backprop, clipping the global gradient norm to 1.0 before the update.
        optimizer.zero_grad()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(parameters=model.parameters(), max_norm=1.0)
        optimizer.step()

    return running_loss / len(train_loader)

def evaluate_model(model, dataloader):
    """
    Evaluate `model` on `dataloader` without tracking gradients.

    Args:
        model: callable as `model(inputs, mask, labels=...)`, returning `(loss, predictions)`.
        dataloader: yields dict batches with 'encodings', 'mask' and 'labels'.

    Returns:
        (mean per-batch loss, macro F1 over all non-padded positions).
    """
    model.eval()

    total_loss = 0
    gold_sequences, predicted_sequences = [], []

    with torch.no_grad():
        for batch in dataloader:
            inputs = batch['encodings'].to(device)
            mask = batch['mask'].to(device)
            targets = batch['labels'].to(device)

            loss, decoded = model(inputs, mask, labels = targets)
            total_loss += loss.item()

            # Drop the padded positions from gold labels and predictions alike.
            mask_np = mask.detach().cpu().numpy()
            gold_sequences.extend(remove_padding_from_predictions(targets.detach().cpu().numpy(), mask_np))
            predicted_sequences.extend(remove_padding_from_predictions(decoded, mask_np))

    macro_f1 = f1_score(
        flatten_predictions(gold_sequences),
        flatten_predictions(predicted_sequences),
        average = 'macro',
    )
    return total_loss / len(dataloader), macro_f1


def test_model(model, dataloader, write_file = False, path_file = None):
    """
    Run `model` over `dataloader` and collect gold labels and predictions.

    Args:
        model: callable as `model(inputs, mask, labels=...)`, returning `(loss, predictions)`.
        dataloader: yields dict batches with 'encodings', 'mask', 'labels' and
            'tokens', built with the default DataLoader collate function.
        write_file: when True, also write token / label / prediction lines to `path_file`.
        path_file: output path used when `write_file` is True.

    Returns:
        (eval_labels, eval_predictions): one list per sequence, padding removed.
    """
    model.eval()

    eval_tokens, eval_labels, eval_predictions = [], [], []

    with torch.no_grad():
        for batch in dataloader:
            batch_inputs = batch['encodings'].to(device)
            batch_attention_mask = batch['mask'].to(device)
            batch_labels = batch['labels'].to(device)
            # The default collate function turns per-sample lists of strings
            # into position-major data: one entry per position, each holding
            # that position's token for every sample. Transpose back to one row
            # per sample so tokens line up with the mask, labels and predictions.
            batch_tokens = list(zip(*batch['tokens']))

            _, outputs = model(batch_inputs, batch_attention_mask, labels = batch_labels)

            mask = batch_attention_mask.detach().cpu().numpy()

            valid_labels = remove_padding_from_predictions(batch_labels.detach().cpu().numpy(), mask)
            eval_labels += valid_labels

            valid_predictions = remove_padding_from_predictions(outputs, mask)
            eval_predictions += valid_predictions

            valid_tokens = remove_padding_and_get_tokens(batch_tokens, mask)
            eval_tokens += valid_tokens

    if write_file:
        generate_results_txt(eval_tokens, eval_labels, eval_predictions, path_file)

    return eval_labels, eval_predictions

In [None]:
# Experiment driver: for each run, train a BiLSTM-CRF tagger on the training
# corpus, keep the weights with the lowest validation loss, evaluate them on the
# test split of every configured corpus, and finally dump run / training / test
# summaries to CSV files.

# MODEL CONFIGURATION
model_checkpoint = configuration['model_checkpoint']
# Text before the first '-' of the checkpoint name; only used in output file names below.
embedding_type = model_checkpoint.split('-')[0]
# NOTE: label_list is also read as a global by load_training_data / load_testing_data.
label_list = configuration['label_list']
num_labels = len(label_list)

# TRAINING CONFIGURATION
RUNS = configuration['runs']
EPOCHS = configuration['epochs']
TRAIN_BATCH_SIZE = configuration['train_batch_size']
TEST_BATCH_SIZE = configuration['test_batch_size']
DEV_BATCH_SIZE = configuration['dev_batch_size']
# Despite the *_df names these are corpus names (CSV file stems), not DataFrames.
train_df = configuration['train']
test_dfs = configuration['test']

# MISCELLANEOUS
SAVE_INFORMATION = True  # write the three summary CSVs after all runs
SAVE_BEST_MODEL = True   # write one weights file per run

# ======================================== #
# Accumulators shared by all runs; turned into DataFrames at the end.
training_info, testing_info, models_info = [], [], []
start_time = time.time()
for nrun in range(RUNS):
    # Initialize
    # Each run draws a fresh seed (recorded in training_info) and applies it to all RNGs.
    rs = generate_random_seed()
    set_random_seed(rs)
    
    best_eval_loss = float('inf')
    best_epoch = 0
    best_model_state = None
    
    # Dataloaders
    train_loader, val_loader = load_training_data(train_df, TRAIN_BATCH_SIZE=TRAIN_BATCH_SIZE, DEV_BATCH_SIZE=DEV_BATCH_SIZE)
    
    # Create model
    tagger = BiLSTMSequenceLabelingModel(embeddings_dim, hidden_dim = configuration['hidden_dim'], dropout=0.2, num_labels = num_labels)
    tagger.to(device)
    
    # OPTIMIZER
    optimizer = torch.optim.AdamW(tagger.parameters(), lr = configuration['lr'], eps = 1e-8)
    
    # Training loop
    for epoch in range(EPOCHS):
        print(f"{epoch+1}/{EPOCHS}")
        # train one epoch
        train_loss = train_model(tagger, train_loader, optimizer)
        
        # evaluate model
        eval_loss, eval_f1 = evaluate_model(tagger, val_loader)
        
        training_info.append((nrun, rs, epoch, train_loss, eval_loss, eval_f1)) # nrun, seed, epoch, train_loss, eval_loss, eval_f1
        
        # save best model based on validation loss
        if eval_loss < best_eval_loss:
            best_eval_loss = eval_loss
            best_epoch = epoch
            # Deep copy: state_dict() alone would keep referencing the live, still-training tensors.
            best_model_state = copy.deepcopy(tagger.state_dict())
            
    print(f"Best epoch: {best_epoch} - Validation loss: {best_eval_loss} [Run: {nrun}]")
    
    # Testing
    # Loading best model
    # NOTE(review): best_model_state stays None if no epoch ever improved on inf
    # (e.g. every eval_loss was NaN); load_state_dict would then fail.
    best_tagger = BiLSTMSequenceLabelingModel(embeddings_dim, hidden_dim = configuration['hidden_dim'], dropout=0.2, num_labels = num_labels)    
    best_tagger.load_state_dict(best_model_state)
    best_tagger.to(device)
    eval_results = evaluate_model(best_tagger, val_loader)
    print(eval_results) # check model
    
    # Macro F1 of this run on every test corpus, in test_dfs order.
    macros = []
    for test_df in test_dfs:
        test_loader = load_testing_data(test_df, TEST_BATCH_SIZE = TEST_BATCH_SIZE)
        path_test_results_file = f'results-{train_df}-{test_df}-{embedding_type}-bilstmcrf-{nrun}.txt'
        tlabels, tpredictions = test_model(best_tagger, test_loader, write_file = True, path_file = path_test_results_file)
        
        flattened_labels = flatten_predictions(tlabels)
        flattened_predictions = flatten_predictions(tpredictions)
        # target_names gives the report rows the names in label_list — assumes
        # every label id occurs in the labels/predictions (TODO confirm).
        report_info = classification_report(flattened_labels, flattened_predictions, target_names = label_list, output_dict = True)
        accuracy, macro_f1 = report_info['accuracy'], report_info['macro avg']['f1-score']
        o_f1, b_f1, i_f1 =  report_info['O']['f1-score'], report_info['B']['f1-score'], report_info['I']['f1-score']
        macros.append(macro_f1)
    
        testing_info.append((nrun, train_df, test_df, len(tpredictions), accuracy, macro_f1, o_f1, b_f1, i_f1)) # nrun, train, test, sequences, acc, macrof1, Of1, Bf1, If1
    
    print(f"Test Macros F1: {test_dfs}: {macros} [Run: {nrun}]")
    
    if SAVE_BEST_MODEL:
        model_path = f"model-{train_df}-{embedding_type}-bilstmcrf-{nrun}.pt"
        models_info.append((nrun, model_path))
        # Persist the best weights; fall back to the final weights if none were recorded.
        if best_model_state is not None:
            torch.save(best_model_state, model_path)
        else:
            torch.save(tagger.state_dict(), model_path)
    
# Save data
if SAVE_INFORMATION:
    
    # One row per run: where its weights were saved.
    models_file_name = f"models-{train_df}-{embedding_type}-bilstmcrf.csv"
    pd.DataFrame(models_info, columns = ['run', 'model_file']).to_csv(models_file_name, index = False)
    
    # One row per (run, epoch): losses and validation macro F1.
    train_file_name = f'train-info-{train_df}-{embedding_type}-bilstmcrf.csv'
    pd.DataFrame(training_info, columns = ['run', 'seed', 'epoch', 'train_loss', 'eval_loss', 'eval_f1']).to_csv(train_file_name, index = False)
    
    # One row per (run, test corpus): accuracy and F1 scores.
    test_file_name = f'test-info-{train_df}-{embedding_type}-bilstmcrf.csv'
    pd.DataFrame(testing_info, columns = ['run', 'train', 'test', 'sequences', 'accuracy', 'macro-f1', 'O-f1', 'B-f1', 'I-f1']).to_csv(test_file_name, index = False)

# Elapsed wall-clock time: whole minutes (floored) plus one.
print(f"Total time: {((time.time() - start_time)//60)+1} minutes.")

(9512, 5) (1057, 5)
1/20
2/20
3/20
4/20


In [None]:
import os
import zipfile

def zip_files(folder_path, zip_name):
    """
    Archive every .txt, .csv and .pt file found under `folder_path`.

    Args:
        folder_path: directory that is walked recursively.
        zip_name: path of the (deflate-compressed) ZIP file to create.

    Entries are stored under their path relative to `folder_path`.
    """
    wanted_suffixes = ('.txt', '.csv', '.pt')
    with zipfile.ZipFile(zip_name, 'w', zipfile.ZIP_DEFLATED) as archive:
        for current_dir, _subdirs, names in os.walk(folder_path):
            for name in names:
                # Anything that is not a text, CSV or weights file is skipped.
                if not name.endswith(wanted_suffixes):
                    continue
                full_path = os.path.join(current_dir, name)
                archive.write(full_path, os.path.relpath(full_path, folder_path))

# Call the function to zip the .txt, .csv and .pt files found under /kaggle/working/.
folder_path = '/kaggle/working/'
zip_name = 'archivos2.zip'
zip_files(folder_path, zip_name)