In [1]:
from datasets import load_dataset , Dataset
from transformers import AutoModelForSeq2SeqLM, LongT5ForConditionalGeneration, AutoTokenizer, GenerationConfig, TrainingArguments, Trainer, DataCollatorForSeq2Seq
import torch
import time
import evaluate
import pandas as pd
import numpy as np

train_file_path = "../../data/train_research.csv"

In [37]:
dict_model ={}
dict_model["model_ckpt_longt5_globalbase"] = "google/long-t5-tglobal-base"

In [38]:
# Getting Data

def getDataNRows(filename,cleanCol, n=100):
    """
    Gets the data in dataset format for n rows of  csv file

    filename: full path to the csv file

    n: Number of Rows needed

    """

    d = pd.read_csv(filename)
    d = d.head(n)
    dataset = Dataset.from_pandas(d)

    return dataset


#Getting the data ready
data = getDataNRows(train_file_path,"sections",1000)

In [5]:
data

Dataset({
    features: ['sections', 'abstract', 'summary'],
    num_rows: 500
})

In [25]:
# Splitting the data
ds = data.train_test_split(test_size=0.2)
ds

DatasetDict({
    train: Dataset({
        features: ['sections', 'abstract', 'summary'],
        num_rows: 400
    })
    test: Dataset({
        features: ['sections', 'abstract', 'summary'],
        num_rows: 100
    })
})

In [92]:
#Loading the Model
def get_tokenzier_model(model_ckpt):

    """
    returns the tokenizer and the model for a specific model checkpoint

    model_ckpt: model checkpoint name

    """
    tokenizer = AutoTokenizer.from_pretrained(model_ckpt)

    if model_ckpt == "google/long-t5-tglobal-base":
      model = LongT5ForConditionalGeneration.from_pretrained(model_ckpt, torch_dtype=torch.bfloat16)
      print("success")
    else:
      model = AutoModelForSeq2SeqLM.from_pretrained(model_ckpt)

    return model, tokenizer

model, tokenizer = get_tokenzier_model(dict_model["model_ckpt_longt5_globalbase"])

success


In [40]:
import os
def print_number_model_params(model):
    trainable = 0
    all_params =0 
    for _, param in model.named_parameters():
        all_params +=param.numel()
        if param.requires_grad:
            trainable += param.numel()
    return f"""trainable model params :{trainable} \nall model params : {all_params} \nPercentage of params trainale {(trainable/all_params)*100} %"""

print(print_number_model_params(model))

trainable model params :247587456 
all model params : 247587456 
Percentage of params trainale 100.0 %


In [93]:
# #Trying
# prefix = "summarize: "

# def preprocess_function1(examples):
#     inputs = [prefix + doc for doc in examples["sections"]]
#     examples["input_ids"]  = tokenizer(inputs,  max_length=1024,padding=True,truncation=True,return_tensors="pt").input_ids
#     examples["labels"] = tokenizer(examples["summary"],  max_length=200,padding=True,truncation=True,return_tensors="pt").input_ids
#     return examples
    

In [94]:
# tokenized_ds = ds.map(preprocess_function1, batched=True)

In [95]:
#Tokenizing the input
prefix = "summarize: "

def preprocess_function(examples):
    inputs = [prefix + doc for doc in examples["sections"]]
    # model_inputs = tokenizer(inputs, max_length=16384, truncation=True)
    model_inputs = tokenizer(inputs, max_length=4000, truncation=True)
    labels = tokenizer(text_target=examples["summary"], max_length=200, truncation=False) #'sections', 'abstract', 'summary'

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs
    

tokenized_ds = ds.map(preprocess_function, batched=True)


Map:   0%|          | 0/400 [00:00<?, ? examples/s]

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

In [96]:
tokenized_ds

DatasetDict({
    train: Dataset({
        features: ['sections', 'abstract', 'summary', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 400
    })
    test: Dataset({
        features: ['sections', 'abstract', 'summary', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 100
    })
})

In [97]:
#Remvoing all the other cols and leaving only
tokenized_ds = tokenized_ds.remove_columns(column_names=["sections","abstract","summary"])

In [98]:
print(f"Training data shape {tokenized_ds['train'].shape}")

Training data shape (400, 3)


In [99]:
#Data Collation

data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=dict_model["model_ckpt_longt5_globalbase"])


In [100]:
import evaluate


#Evaluation metric
rouge = evaluate.load("rouge")

def compute_metrics(eval_pred):
    """
    To compute metrics while training

    """
    predictions, labels = eval_pred
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    result = rouge.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)

    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in predictions]
    result["gen_len"] = np.mean(prediction_lens)

    return {k: round(v, 4) for k, v in result.items()}


#Setting UP LORA 

In [55]:
#Trting T5 base model

# from transformers import AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer

# checkpoint = "t5-small"
# model1 = AutoModelForSeq2SeqLM.from_pretrained(checkpoint)


In [101]:
from peft import LoraConfig, get_peft_model, TaskType

lora_config=LoraConfig(
r=32, # Rank
lora_alpha=32,
target_modules=["q","v"],
lora_dropout=0.05,
bias="none",
task_type=TaskType.SEQ_2_SEQ_LM
)


In [102]:
peft_model = get_peft_model(model,lora_config)

print(print_number_model_params(peft_model))

trainable model params :3538944 
all model params : 251126400 
Percentage of params trainale 1.409228181505409 %


In [103]:
# Train Adapeter

import time
output_dir  = f'../../artifacts/lora_training_{str(int(time.time()))}'

peft_training_args = TrainingArguments(
    output_dir  = output_dir,
    auto_find_batch_size=True,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=1,
    learning_rate=2e-5,
    evaluation_strategy="epoch",
    num_train_epochs=1,
    logging_steps=2,
    max_steps=1,
    weight_decay=0.01
)


peft_trainer=Trainer(

    model=peft_model,
    args=peft_training_args,
    train_dataset=tokenized_ds["train"],
    eval_dataset=tokenized_ds["test"],
    compute_metrics=compute_metrics,
    data_collator=data_collator
)



In [104]:
peft_trainer.train()

peft_model_path = f"../../artifacts/lora_training_checkpoints{str(int(time.time()))}"
peft_trainer.model.save_pretrained(peft_model_path)
tokenizer.save_pretrained(peft_model_path)

  0%|          | 0/1 [00:00<?, ?it/s]

You're using a T5TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


  0%|          | 0/100 [00:00<?, ?it/s]

MemoryError: 

Merging both the Models

In [89]:
from peft import PeftModel, PeftConfig

#We already have the original model and the tokenizer, we just need to load
#the Peft model and merge it with base

peft_model = PeftModel.from_pretrained(model,'../../artifacts/lora_training_checkpoints1689416691/',torch_dtype=torch.bfloat16,is_trainable=False)



In [61]:
print(print_number_model_params(peft_model))

trainable model params :0 
all model params : 251126400 
Percentage of params trainale 0.0 %


Evaluation

In [77]:
index = 10
section  = ds['test'][index]['sections']
original_summary = ds['test'][index]['summary']

In [90]:
promopt = f"""

summarize: 

{section}

"""

input_ids= tokenizer(promopt,return_tensors="pt").input_ids

output = peft_model.generate(input_ids=input_ids,generation_config=GenerationConfig(max_new_tokens=200,num_beams=4))
output_text = tokenizer.decode(output[0],skip_special_tokens=True)



In [79]:
dash = "------------------------------------------------------------------------------------------"
print(f"original summmary\n {original_summary}")
print(dash)
print(f"generated summary: {output_text}")

original summmary
 Deep learning in in vitro fertilization is currently being evaluated in the development of assistive tools for the determination of transfer order and implantation potential using time-lapse data collected through expensive imaging hardware . Assistive tools and algorithms that can work with static images , however , can help in improving the access to care by enabling their use with images acquired from traditional microscopes that are available to virtually all fertility centers . Here , we evaluated the use of a deep convolutional neural network ( CNN ) , trained using single timepoint images of embryos collected at 113 hr post-insemination , in embryo selection amongst 97 clinical patient cohorts ( 742 embryos ) and observed an accuracy of 90% in choosing the highest quality embryo available . Furthermore , a CNN trained to assess an embryo’s implantation potential directly using a set of 97 euploid embryos capable of implantation outperformed 15 trained embryolo

In [91]:
dash = "------------------------------------------------------------------------------------------"
print(f"original summmary\n {original_summary}")
print(dash)
print(f"generated summary: {output_text}")

original summmary
 Deep learning in in vitro fertilization is currently being evaluated in the development of assistive tools for the determination of transfer order and implantation potential using time-lapse data collected through expensive imaging hardware . Assistive tools and algorithms that can work with static images , however , can help in improving the access to care by enabling their use with images acquired from traditional microscopes that are available to virtually all fertility centers . Here , we evaluated the use of a deep convolutional neural network ( CNN ) , trained using single timepoint images of embryos collected at 113 hr post-insemination , in embryo selection amongst 97 clinical patient cohorts ( 742 embryos ) and observed an accuracy of 90% in choosing the highest quality embryo available . Furthermore , a CNN trained to assess an embryo’s implantation potential directly using a set of 97 euploid embryos capable of implantation outperformed 15 trained embryolo

In [80]:
original_summary

'Deep learning in in vitro fertilization is currently being evaluated in the development of assistive tools for the determination of transfer order and implantation potential using time-lapse data collected through expensive imaging hardware . Assistive tools and algorithms that can work with static images , however , can help in improving the access to care by enabling their use with images acquired from traditional microscopes that are available to virtually all fertility centers . Here , we evaluated the use of a deep convolutional neural network ( CNN ) , trained using single timepoint images of embryos collected at 113 hr post-insemination , in embryo selection amongst 97 clinical patient cohorts ( 742 embryos ) and observed an accuracy of 90% in choosing the highest quality embryo available . Furthermore , a CNN trained to assess an embryo’s implantation potential directly using a set of 97 euploid embryos capable of implantation outperformed 15 trained embryologists ( 75 . 26% v

ROUGE

In [82]:
rouge = evaluate.load('rouge')

results = rouge.compute(
    pred = output_text,
    references=original_summary,
    use_aggregator=True,
    use_stemmer=True
)

print(results)

TypeError: 'NoneType' object is not subscriptable