# T5 training and evaluation with PyTorch + Accelerate

In [2]:
import torch
from transformers import AutoTokenizer
from datasets import load_dataset
from accelerate import Accelerator

In [12]:
from transformers import pipeline

# Hub checkpoint `Yu-yang/bert-finetuned-20newsgroups` served through a
# text-classification pipeline, tokenized with `bert-base-uncased`.
model = pipeline(
    task="text-classification",
    model="Yu-yang/bert-finetuned-20newsgroups",
    tokenizer="bert-base-uncased",
)
# Optional: persist the pipeline locally.
# model.save_pretrained("./trained_predictors/20newsgroups/models")



In [None]:
import pandas as pd
from datasets import Dataset

def load_42k_hcuch(data_files=None, column_names=('hallazgos', 'impresion', 'nodulos')):
    """
    Load the pipe-separated 42k HCUCH CSV files and return a train/test split.

    All files are concatenated, the `hallazgos` column is exposed as `text`
    (the `impresion` column is not appended), `nodulos` is renamed to `label`,
    and the rows are shuffled and split 75% / 25%.

    Args:
        data_files (list[str]): Paths of the CSV files to concatenate.
        column_names (sequence of str): Columns to keep from the CSV files.
            Must include 'hallazgos' and 'nodulos'.

    Returns:
        The result of `Dataset.train_test_split` with `label` and `text`
        columns (plus any extra columns requested through `column_names`).

    Raises:
        ValueError: If `data_files` is None or empty.
    """
    if not data_files:
        raise ValueError("data_files must be a non-empty list of CSV paths")

    # Imported locally under an alias: a later cell of this notebook rebinds the
    # global name `Dataset` to `torch.utils.data.Dataset`, which has no `from_pandas`.
    from datasets import Dataset as HFDataset

    read_kwargs = {"encoding": "utf-8", "sep": "|", "dtype": {'nodulos': int}}
    frames = [pd.read_csv(data_file, **read_kwargs) for data_file in data_files]

    # ignore_index gives a clean 0..n-1 index regardless of how many files were read.
    df = pd.concat(frames, ignore_index=True)[list(column_names)]
    # `assign` returns a new frame, so no write happens on a column slice.
    df = df.assign(text=df['hallazgos'])  # + " " + df['impresion']

    # preserve_index=False: the pandas index is never materialized as a column,
    # so there is no `__index_level_0__` to remove afterwards.
    data = HFDataset.from_pandas(df, preserve_index=False)
    redundant = [name for name in ('hallazgos', 'impresion') if name in data.column_names]
    data = data.remove_columns(redundant)
    data = data.rename_column('nodulos', 'label')
    data = data.shuffle(seed=42)
    # train_test_split shuffles again by default; seed it so the split is reproducible.
    return data.train_test_split(train_size=.75, seed=42)


# Forward slashes only: the previous "\l" sequences were an invalid string escape
# and produced paths that resolve only on Windows.
data_files = [
    "data/42k_HCUCH/labeled_data_3-label_test.csv",
    "data/42k_HCUCH/labeled_data_3-label_train.csv",
    "data/42k_HCUCH/labeled_data_3-label_validation.csv",
]

load_42k_hcuch(data_files)

In [None]:
from transformers import BertTokenizerFast



# Fetch the fast tokenizer of the `dccuchile/bert-base-spanish-wwm-cased` checkpoint ...
tokenizer = BertTokenizerFast.from_pretrained(
    "dccuchile/bert-base-spanish-wwm-cased"
)
# ... and write its files next to the chileanhate predictor.
tokenizer.save_pretrained(
    "./trained_predictors/chileanhate/models"
)

In [15]:
# Text-classification pipeline over the locally saved 42k_HCUCH checkpoint,
# tokenized with `xlm-roberta-large`.
model = pipeline(
    task="text-classification",
    model="trained_predictors/42k_HCUCH/models",
    tokenizer="xlm-roberta-large",
    # return_all_scores=True
)



In [9]:
from transformers import AutoModelForSequenceClassification

# Directory holding the saved sequence-classification checkpoint.
model_name = "trained_predictors/42k_HCUCH/models"
model = AutoModelForSequenceClassification.from_pretrained(model_name)

# The config carries the index -> label-name dictionary, e.g. {0: "label1", ...}.
label_mapping = model.config.id2label

# Show which labels the checkpoint exposes.
print(label_mapping)


{0: 'nodulos'}


In [16]:
# NOTE(review): this call passes a raw string, so `model` must be the
# text-classification pipeline (the `pipeline(...)` cell), not the
# AutoModelForSequenceClassification instance that the cell just above also
# binds to `model`. Re-run the pipeline cell before this one.
model("testing para nodulos")

[{'label': 'nodulos', 'score': 0.04308806732296944}]

In [7]:
from transformers import XLMRobertaTokenizer, AutoConfig

# Checkpoint directory of the multi-label variant ("models_I") whose
# classification head is reduced in the next cell.
model_name = "trained_predictors/42k_HCUCH/models_I"
model = AutoModelForSequenceClassification.from_pretrained(
    model_name
)

In [8]:
import torch
import torch.nn as nn

# Reduce the classification head of `model` to a single output unit and save it.

# Index of the output unit (row of the head's weight matrix) to keep.
target_index = 1

# The head's final projection: a linear layer with weight [num_labels, hidden_size].
old_head = model.classifier.out_proj
if not 0 <= target_index < old_head.out_features:
    raise IndexError(
        f"target_index {target_index} is out of range for a head with "
        f"{old_head.out_features} outputs"
    )

# Detached copies of the selected row, so the new layer does not share
# storage with the layer being replaced.
old_weights = old_head.weight.detach()[target_index].unsqueeze(0).clone()  # shape: [1, hidden_size]
old_bias = old_head.bias.detach()[target_index].unsqueeze(0).clone()       # shape: [1]

# Build the single-output replacement once, on the same device/dtype as the old head,
# and initialize it with the extracted weights and bias.
new_classifier = nn.Linear(old_head.in_features, 1).to(
    device=old_weights.device, dtype=old_weights.dtype
)
with torch.no_grad():
    new_classifier.weight.copy_(old_weights)
    new_classifier.bias.copy_(old_bias)

# Replace the projection layer with the new classifier.
model.classifier.out_proj = new_classifier

# Keep the configuration in sync with the single remaining output.
# NOTE(review): with num_labels == 1 Transformers may treat the task as regression
# when a loss is computed -- confirm before fine-tuning this checkpoint further.
model.config.num_labels = 1
model.config.id2label = {0: "nodulos"}
model.config.label2id = {"nodulos": 0}

model.save_pretrained("trained_predictors/42k_HCUCH/models")


In [None]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

# Accelerate handle with default settings.
accelerator = Accelerator()

In [29]:
# Only the training split is requested, so a single Dataset (not a dict of
# splits) is returned.
dataset = load_dataset(path='SetFit/20_newsgroups', split='train')
dataset

Repo card metadata block was not found. Setting CardData to empty.


Dataset({
    features: ['text', 'label', 'label_text'],
    num_rows: 11314
})

In [24]:

# Label IDs 0..15, kept in a set so membership checks are constant time.
selected_label_ids = {label_id for label_id in range(16)}


def filter_first_16_labels(example):
    """Return True when the example's 'label' is one of `selected_label_ids`."""
    label = example['label']
    return label in selected_label_ids

# Apply the filter to each split.
# `dataset` is a single split when it was loaded with `split=...` (no `.items()`),
# and a mapping of split name -> Dataset otherwise; accept both shapes.
if hasattr(dataset, "items"):
    named_splits = dict(dataset.items())
else:
    # A lone split is keyed by its own split name when it reports one, else "train".
    named_splits = {str(getattr(dataset, "split", None) or "train"): dataset}

filtered_dataset = {
    split: data.filter(filter_first_16_labels)
    for split, data in named_splits.items()
}

filtered_dataset


{'train': Dataset({
     features: ['text', 'label', 'label_text'],
     num_rows: 9362
 }),
 'test': Dataset({
     features: ['text', 'label', 'label_text'],
     num_rows: 6231
 })}

In [6]:
def clean_text(example, special_chars=("\n", "\t", "\x85", "\x97", "#", "<br />", "<br/>")):
    """
    Normalize the 'text' field of a dataset example.

    Every occurrence of each entry in `special_chars` is replaced by a single
    space and the result is lower-cased. The text stays a `str`; it is not
    encoded to bytes, so non-ASCII letters are lower-cased as well.

    Args:
        example (dict): Mapping with a string under the 'text' key.
        special_chars (iterable of str): Substrings to replace with a space.

    Returns:
        dict: The same `example` object, with 'text' updated in place.
    """
    text = example['text']
    for marker in special_chars:
        # str.replace is a no-op when the marker is absent, so no membership test is needed.
        text = text.replace(marker, " ")
    example['text'] = text.lower()
    return example

# Test split, restricted to labels 0-15 and then normalized with clean_text.
a = (
    load_dataset("SetFit/20_newsgroups", split='test')
    .filter(lambda example: example['label'] in set(range(16)))
    .map(clean_text)
)

Repo card metadata block was not found. Setting CardData to empty.


Filter:   0%|          | 0/7532 [00:00<?, ? examples/s]

Map:   0%|          | 0/6231 [00:00<?, ? examples/s]

In [1]:
from transformers import AutoTokenizer

# Tokenizer of the uncased BERT base checkpoint.
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

# Sentence to shorten, and the token budget it must fit in.
input_text = "This is an example of a longer sentence that we want to tokenize and truncate to a maximum length."
max_length = 10

# Encode with truncation enabled; tensors come back in PyTorch format.
encoded = tokenizer(input_text, truncation=True, max_length=max_length, return_tensors="pt")

# Turn the kept token ids back into text, dropping special tokens. The recorded
# output keeps 8 words for max_length=10.
truncated_text = tokenizer.decode(
    encoded["input_ids"][0],
    skip_special_tokens=True,
)

print("Original text:", input_text)
print("Truncated text:", truncated_text)





Original text: This is an example of a longer sentence that we want to tokenize and truncate to a maximum length.
Truncated text: this is an example of a longer sentence


In [None]:
from transformers import T5Tokenizer, T5ForConditionalGeneration

# t5-small tokenizer (non-legacy mode) and the matching seq2seq model.
tokenizer = T5Tokenizer.from_pretrained("t5-small", legacy=False)
model = T5ForConditionalGeneration.from_pretrained("t5-small")

# Source text with sentinel tokens, and the target listing the text for each sentinel.
input_ids = tokenizer(
    "The <extra_id_0> walks in <extra_id_1> park", return_tensors="pt"
)["input_ids"]
labels = tokenizer(
    "<extra_id_0> cute dog <extra_id_1> the <extra_id_2>", return_tensors="pt"
)["input_ids"]

# Passing `labels` lets the forward pass derive decoder_input_ids and return a loss.
loss = model(input_ids=input_ids, labels=labels).loss
loss.item()

In [5]:
# Importing libraries
import os
import numpy as np
import pandas as pd
import torch
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler
import os

# Importing the T5 modules from huggingface/transformers
from transformers import T5Tokenizer, T5ForConditionalGeneration

# rich: for a better display on terminal
from rich.table import Column, Table
from rich import box
from rich.console import Console

# define a rich console logger
#console = Console(record=True)


## Dataset

In [6]:
class YourDataSetClass(Dataset):
    """
    Map-style dataset that tokenizes (source, target) text pairs taken from
    two columns of a dataframe, for fine-tuning through a DataLoader.

    Each item is a dict of 1-D `torch.long` tensors:
        source_ids   -- source token ids, padded/truncated to `source_len`
        source_mask  -- attention mask matching `source_ids`
        target_ids   -- target token ids, padded/truncated to `target_len`
        target_ids_y -- same values as `target_ids` (kept for existing callers)
    """

    def __init__(
        self, dataframe, tokenizer, source_len, target_len, source_text, target_text
    ):
        """
        Initializes a Dataset class

        Args:
            dataframe (pandas.DataFrame): Input dataframe
            tokenizer (transformers.tokenizer): Transformers tokenizer
            source_len (int): Max length of source text
            target_len (int): Max length of target text
            source_text (str): column name of source text
            target_text (str): column name of target text
        """
        self.tokenizer = tokenizer
        self.data = dataframe
        self.source_len = source_len
        self.summ_len = target_len
        self.target_text = self.data[target_text]
        self.source_text = self.data[source_text]

    def __len__(self):
        """returns the length of dataframe"""

        return len(self.target_text)

    def _encode(self, text, max_length):
        """Tokenize one string, padded/truncated to exactly `max_length` tokens."""
        # The tokenizer's __call__ replaces the deprecated batch_encode_plus, and
        # padding="max_length" alone replaces the deprecated pad_to_max_length flag.
        return self.tokenizer(
            [text],
            max_length=max_length,
            truncation=True,
            padding="max_length",
            return_tensors="pt",
        )

    def __getitem__(self, index):
        """return the input ids, attention masks and target ids"""

        # Positional lookup: a DataLoader hands out positions 0..len-1, which only
        # coincide with index labels when the dataframe index was reset.
        source_text = str(self.source_text.iloc[index])
        target_text = str(self.target_text.iloc[index])

        # collapse every run of whitespace (newlines, tabs, ...) into one space
        source_text = " ".join(source_text.split())
        target_text = " ".join(target_text.split())

        source = self._encode(source_text, self.source_len)
        target = self._encode(target_text, self.summ_len)

        # squeeze(0) drops only the batch axis, so a max length of 1 still
        # yields a 1-D tensor instead of a scalar.
        source_ids = source["input_ids"].squeeze(0)
        source_mask = source["attention_mask"].squeeze(0)
        target_ids = target["input_ids"].squeeze(0)

        return {
            "source_ids": source_ids.to(dtype=torch.long),
            "source_mask": source_mask.to(dtype=torch.long),
            "target_ids": target_ids.to(dtype=torch.long),
            "target_ids_y": target_ids.to(dtype=torch.long),
        }


## Train

In [None]:
def train(epoch, tokenizer, model, device, loader, optimizer):
    """
    Run one training pass of a seq2seq model over `loader`.

    For every batch the target ids are shifted by one position: all tokens but
    the last are fed as decoder inputs and all tokens but the first are the
    labels. Label positions equal to the tokenizer's pad id are set to -100
    before being passed to the model. The loss is taken from `outputs[0]`.

    Args:
        epoch: Epoch number, used only for logging.
        tokenizer: Object exposing `pad_token_id`.
        model: Model called with `input_ids`, `attention_mask`,
            `decoder_input_ids` and `labels`; its first output is the loss.
        device: Device the batch tensors are moved to.
        loader: Iterable of dicts with "source_ids", "source_mask" and
            "target_ids" tensors.
        optimizer: Optimizer stepped once per batch.
    """
    model.train()

    # `training_logger` is not defined in any visible cell of this notebook, so it
    # is looked up lazily: when a table-like logger with that name exists it is
    # used as before, otherwise progress is printed as plain text.
    logger = globals().get("training_logger")

    for step, batch in enumerate(loader):
        y = batch["target_ids"].to(device, dtype=torch.long)
        decoder_input_ids = y[:, :-1].contiguous()
        lm_labels = y[:, 1:].clone().detach()
        # Mask padding in the labels with -100.
        lm_labels[lm_labels == tokenizer.pad_token_id] = -100
        ids = batch["source_ids"].to(device, dtype=torch.long)
        mask = batch["source_mask"].to(device, dtype=torch.long)

        outputs = model(
            input_ids=ids,
            attention_mask=mask,
            decoder_input_ids=decoder_input_ids,
            labels=lm_labels,
        )
        loss = outputs[0]

        # Report progress every 10 batches.
        if step % 10 == 0:
            if logger is not None:
                logger.add_row(str(epoch), str(step), str(loss))
                print(logger)
            else:
                print(f"epoch {epoch} | step {step} | loss {loss.item():.4f}")

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()


## Validate

In [None]:
def validate(epoch, tokenizer, model, device, loader):
    """
    Generate predictions for every batch in `loader` and decode them.

    The model is put in eval mode and run without gradient tracking. Both the
    generated ids and the batch's "target_ids" are decoded with special tokens
    skipped. `epoch` is accepted but not used.

    Returns:
        (predictions, actuals): two lists of decoded strings, in loader order.
    """

    def _to_text(rows):
        # Decode each row of ids into a cleaned-up string.
        return [
            tokenizer.decode(
                row, skip_special_tokens=True, clean_up_tokenization_spaces=True
            )
            for row in rows
        ]

    model.eval()
    predictions, actuals = [], []

    with torch.no_grad():
        for step, batch in enumerate(loader):
            target_ids = batch['target_ids'].to(device, dtype=torch.long)
            source_ids = batch['source_ids'].to(device, dtype=torch.long)
            source_mask = batch['source_mask'].to(device, dtype=torch.long)

            # Beam search with two beams, capped at 150 tokens.
            generation_options = dict(
                max_length=150,
                num_beams=2,
                repetition_penalty=2.5,
                length_penalty=1.0,
                early_stopping=True,
            )
            generated_ids = model.generate(
                input_ids=source_ids,
                attention_mask=source_mask,
                **generation_options,
            )

            batch_predictions = _to_text(generated_ids)
            batch_targets = _to_text(target_ids)

            # Progress message every 10 batches.
            if step % 10 == 0:
                print(f'Completed {step}')

            predictions += batch_predictions
            actuals += batch_targets

    return predictions, actuals


# Generation With BERT

In [None]:
import torch
from transformers import AutoModelForMaskedLM, AutoTokenizer, BertForMaskedLM, BertTokenizerFast, pipeline
import random


def predict_seqs_dict(sequence, model, tokenizer, top_k=5, order="right-to-left"):
    """
    Fill the mask tokens of `sequence` one at a time with a masked-LM.

    At every step each candidate produced by the previous step is extended
    with the `top_k` highest-scoring tokens for the next mask position, so
    step `i` holds `top_k ** (i + 1)` candidates. Keep `top_k` small for
    sequences with many masks.

    Args:
        sequence (str): Text containing the tokenizer's mask token.
        model: Callable taking the id tensor and returning a mapping with a
            "logits" entry indexed as [batch, position, vocab].
        tokenizer: Provides `encode`, `decode` and `mask_token_id`.
        top_k (int): Number of tokens kept per candidate and mask.
        order (str): "left-to-right", "right-to-left" or "random"; the order
            in which mask positions are filled.

    Returns:
        dict[int, list[str]]: For each step index, the decoded candidate
        sentences after filling that step's mask. Empty if there is no mask.

    Raises:
        ValueError: If `order` is not one of the supported values.
    """
    valid_orders = ("left-to-right", "right-to-left", "random")
    if order not in valid_orders:
        raise ValueError(f"order must be one of {valid_orders}, got {order!r}")

    ids_main = tokenizer.encode(sequence, return_tensors="pt", add_special_tokens=False)

    # Column indices of the mask tokens. torch.where reports matches in ascending
    # order, i.e. left to right, so only "right-to-left" needs a reversal.
    positions_list = torch.where(ids_main == tokenizer.mask_token_id)[1].tolist()

    if order == "right-to-left":
        positions_list.reverse()
    elif order == "random":
        random.shuffle(positions_list)

    predictions_detokenized_sents = {}

    # Candidates still to be extended; the first step starts from the input itself.
    frontier = [ids_main]

    # Inference only: skip autograd bookkeeping for every forward pass.
    with torch.no_grad():
        for step, position in enumerate(positions_list):
            next_frontier = []
            sentences = []

            for candidate_ids in frontier:
                # Scores over the vocabulary for this mask position.
                model_logits = model(candidate_ids)["logits"][0][position]
                top_k_tokens = torch.topk(model_logits, top_k, dim=0).indices.tolist()

                for top_id in top_k_tokens:
                    filled_ids = candidate_ids.detach().clone()
                    filled_ids[0][position] = top_id

                    next_frontier.append(filled_ids)
                    sentences.append(tokenizer.decode(filled_ids[0]))

            predictions_detokenized_sents[step] = sentences
            frontier = next_frontier

    return predictions_detokenized_sents


# Uncased BERT tokenizer and masked-LM head used for the demo.
tokenizer = BertTokenizerFast.from_pretrained("bert-base-uncased")
model = BertForMaskedLM.from_pretrained("bert-base-uncased")

# One-mask example sentence.
sequence = "This is some super neat [MASK] !"

#pipe = pipeline(task="fill-mask", tokenizer=tokenizer, model=model)
# Print the candidate sentences produced for each mask-filling step.
print(
    predict_seqs_dict(sequence, model, tokenizer)
)
#print(pipe(sequence))

In [None]:
import time



# Checkpoint used for both the tokenizer and the masked-LM head.
model_id = "bert-base-uncased" # "dccuchile/bert-base-spanish-wwm-uncased"
tokenizer = BertTokenizerFast.from_pretrained(model_id)
model = BertForMaskedLM.from_pretrained(model_id)

# Heavily masked review text used to time mask filling.
sequence = " i've got as much [MASK] as the next b[MASK] e, and ra[MASK] [MASK] at her [MASK] is [MASK] [MASK] a [MASK] ; [MASK] the fact is that a [MASK] cut-out could act better, and an [MASK] [MASK] [MASK] of ms. w showing off her considerable assets [MASK] [MASK] [MASK] [MASK] [MASK] [MASK] [MASK] the cast, it's [MASK] that [MASK]'[MASK] [MASK] [MASK] [MASK] it is. i've never been a big fan of [MASK], and his tough guy harry is about as [MASK] as a 9 - dollar bill. godfrey cambridge and [MASK] de sica, both of whom i usually enjoy, seem to be [MASK] through their lines ; and as for edward g... well, i can only assume he was there for the paycheck [MASK] [MASK] [MASK] [MASK] [MASK] [MASK] [MASK] [MASK] non - existent [MASK], through stop - start action and unfunny [MASK] to puerile slapstick and [MASK] 60's'caper'music [MASK] [MASK] [MASK] weren't for miss welch, i'd have given it [MASK] [MASK] [MASK] [MASK] [MASK] [MASK] [MASK] [MASK] [MASK] [MASK] [MASK] [MASK] [MASK] [MASK] [MASK] [MASK] [MASK] [MASK] [MASK] [MASK] in'bedazzled [MASK] [MASK] [MASK] for that [MASK] alone i gave it a 3."

# predict_seqs_dict keeps top_k candidates per candidate per mask, so the work
# grows as top_k ** (number of masks). With the dozens of masks above, the
# default top_k=5 cannot finish; top_k=1 (greedy) needs one forward pass per mask.
# perf_counter is the monotonic clock intended for measuring intervals.
start = time.perf_counter()
results = predict_seqs_dict(sequence, model, tokenizer, top_k=1)
end = time.perf_counter()
#pipe = pipeline(task="fill-mask", tokenizer=tokenizer, model=model)
print(f'it took {end-start} seconds')
print(results)
#print(pipe(sequence))