**Install necessary packages**

In [1]:
# Install necessary packages
!pip install -U transformers datasets evaluate rouge_score accelerate

Collecting transformers
  Downloading transformers-4.41.2-py3-none-any.whl (9.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.1/9.1 MB[0m [31m17.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting datasets
  Downloading datasets-2.19.1-py3-none-any.whl (542 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m542.0/542.0 kB[0m [31m31.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting evaluate
  Downloading evaluate-0.4.2-py3-none-any.whl (84 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m9.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting rouge_score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting accelerate
  Downloading accelerate-0.30.1-py3-none-any.whl (302 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m302.6/302.6 kB[0m [31m23.3 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading 

In [2]:
!pip install transformers[torch]



**Upload and Prepare Dataset**

In [3]:
# Upload the dataset file from local drive
from google.colab import files
uploaded = files.upload()

# Import json module for handling JSON files
import json

# Read the uploaded JSON file into a Pandas DataFrame
import pandas as pd
with open("medical_dataset.json") as f:
    data = json.load(f)
dataframe = pd.DataFrame(data)

# Convert the DataFrame into a Hugging Face dataset format
from datasets import Dataset
med_ds = Dataset.from_pandas(dataframe)

# Print the size of the dataset
print("Total dataset size:", len(med_ds))

Saving medical_dataset.json to medical_dataset.json
Total dataset size: 2000


**Split Dataset**

In [4]:
import numpy as np
np.random.seed(42)
med_ds = med_ds.train_test_split(test_size=0.2)

**Print Dataset Structure**

In [5]:
print(med_ds)

DatasetDict({
    train: Dataset({
        features: ['output', 'input', 'instruction'],
        num_rows: 1600
    })
    test: Dataset({
        features: ['output', 'input', 'instruction'],
        num_rows: 400
    })
})


**Inspect an Example**

In [6]:
example = med_ds["train"][0]

In [7]:
for key in example:
    print("A key of the example: \"{}\"".format(key))
    print("The value corresponding to the key-\"{}\"\n \"{}\"".format(key, example[key]))

A key of the example: "output"
The value corresponding to the key-"output"
 "Emergency Medicine Physicians' Approaches to Coping with Stress in COVID-19 Pandemic"
A key of the example: "input"
The value corresponding to the key-"input"
 "Aim: This study aimed to investigate the stress experienced by emergency medicine physicians working in emergency departments during the coronavirus disease-2019 (COVID-19) pandemic, the factors they stated to be effective against stress, and their coping approaches to stressful situations. Materials and Methods: The study was designed in a general screening model, and 200 emergency medicine physicians participated via e-mail who work in emergency departments in Turkey. The sources of stress related to the pandemic, the factors that they find effective in combating stress, and their strategies to cope with stress were investigated with relation to their gender, marital status, after-shift accommodation, manner of working in a shift, smoking behavior, h

#**Preprocessing the data**

In [34]:
!nvidia-smi

Sat Jun  1 15:24:10 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.104.05             Driver Version: 535.104.05   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  Tesla T4                       Off | 00000000:00:04.0 Off |                    0 |
| N/A   74C    P0              33W /  70W |   8249MiB / 15360MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                                                    

**Load Tokenizer**

In [8]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("t5-small")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

**Tokenize Dataset Examples**

In [9]:
# Iterate through each example in the training set
for example in med_ds["train"]:
    # Tokenize the input text in each example
    tokenized_input = tokenizer(example['input'], return_tensors="pt", padding=True, truncation=True)

    # Print the keys and corresponding tokenized values for input
    for key, value in tokenized_input.items():
        print(f"A key of the input example: \"{key}\"")
        print(f"The value corresponding to the key-\"{key}\"")
        print(value)

    # Tokenize the output text in each example
    tokenized_output = tokenizer(example['output'], return_tensors="pt", padding=True, truncation=True)

    # Print the keys and corresponding tokenized values for output
    for key, value in tokenized_output.items():
        print(f"A key of the output example: \"{key}\"")
        print(f"The value corresponding to the key-\"{key}\"")
        print(value)

    # Tokenize the instruction text in each example
    tokenized_instruction = tokenizer(example['instruction'], return_tensors="pt", padding=True, truncation=True)

    # Print the keys and corresponding tokenized values for output
    for key, value in tokenized_instruction.items():
        print(f"A key of the output example: \"{key}\"")
        print(f"The value corresponding to the key-\"{key}\"")
        print(value)

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
          5406,   201,  4816,   120, 14087,   326,   227,  8088,  1945, 24306,
         14319,   263,    95,   167,    13,     8, 24306, 14319,    41,  2128,
            13,  6687,     6,  2861,     5,  5170,   137,  1541,  1230,     6,
         24306, 14319,    28,  8530,  4148,  1553,    12,  3098,    41,  2445,
            13,  1640,     6,     3,  3539,     5,  6170,   201,   167,    28,
           119,   576,    53,   222,  2366,     5,  8530,  4148,  6145,     7,
            41,  8172,    29,     6,  1243,     3,     2,  9579,    61,   344,
             3, 10207,  9440,    41, 23838,     6,     3, 10593,     3,     2,
           314,  2394,     3,  1725,    87,    51,   434,    61,    11, 24306,
            41, 11434,     6,   431,  2079,     3,     2,   850,  1755,     3,
          1725,    87,    51,   434,    61, 14319,   130,  1126,     5,  8530,
          4148,  6145,     7,   130, 15712,    16,  2069,  1717,  

In [10]:
def preprocess_function(examples):
    # Tokenize the input text
    tokenized_inputs = tokenizer(examples["input"], padding="max_length", truncation=True, max_length=512)
    # Tokenize the output text
    tokenized_outputs = tokenizer(examples["output"], padding="max_length", truncation=True, max_length=512)
    return {
        "input_ids": tokenized_inputs["input_ids"],
        "attention_mask": tokenized_inputs["attention_mask"],
        "labels": tokenized_outputs["input_ids"]
    }

In [11]:
# Tokenize the whole dataset
tokenized_med_ds = med_ds.map(preprocess_function, batched=True)

Map:   0%|          | 0/1600 [00:00<?, ? examples/s]

Map:   0%|          | 0/400 [00:00<?, ? examples/s]

In [12]:
tokenized_med_ds

DatasetDict({
    train: Dataset({
        features: ['output', 'input', 'instruction', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 1600
    })
    test: Dataset({
        features: ['output', 'input', 'instruction', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 400
    })
})

#**Fine-tuning and Summarization**

**Fine-tuning - Iteration 1**

In [21]:
from transformers import AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer
import evaluate
import numpy as np

# Ensure tokenizer and datasets are correctly initialized
# Example: tokenizer = AutoTokenizer.from_pretrained("t5-small")
# Example: tokenized_med_ds = ... # Ensure this dataset is prepared and tokenized as needed

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    rouge = evaluate.load('rouge')
    result = rouge.compute(predictions=decoded_preds, references=decoded_labels)

    final_scores = {}
    for key, value in result.items():
        if isinstance(value, dict) and 'fmeasure' in value:
            final_scores[key] = value['fmeasure']  # Scores in [0,1] range
        elif isinstance(value, float):
            final_scores[key] = value  # Direct assignment for floats

    return final_scores

model = AutoModelForSeq2SeqLM.from_pretrained("t5-small")

training_args = Seq2SeqTrainingArguments(
    output_dir="my_fine_tuned_t5_small_model",
    evaluation_strategy="epoch",
    learning_rate=1e-5,  # Adjusted learning rate
    per_device_train_batch_size=16,  # Adjusted batch size for better generalization
    per_device_eval_batch_size=16,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=4,
    predict_with_generate=True,
    fp16=False,
    fp16_full_eval=False,
    logging_dir="./logs",
    logging_strategy="epoch"
)


trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_med_ds["train"],
    eval_dataset=tokenized_med_ds["test"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

trainer.train()



Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum
1,15.0901,6.512728,0.187299,0.064354,0.156969,0.157003
2,3.4338,0.457806,0.026231,0.010566,0.022597,0.022658
3,1.3198,0.22325,0.00069,0.00037,0.00069,0.00069
4,0.965,0.215247,0.00069,0.00037,0.00069,0.00069




TrainOutput(global_step=400, training_loss=5.20218183517456, metrics={'train_runtime': 363.804, 'train_samples_per_second': 17.592, 'train_steps_per_second': 1.099, 'total_flos': 866187529420800.0, 'train_loss': 5.20218183517456, 'epoch': 4.0})

**Fine-tuning Iteration 2(to improve ROUGUE score)**

---



In [22]:
from transformers import AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer
import evaluate
import numpy as np

# Initialize the tokenizer and datasets
# Ensure tokenizer and datasets are correctly initialized
# Example: tokenizer = AutoTokenizer.from_pretrained("t5-small")
# Example: tokenized_med_ds = ... # Ensure this dataset is prepared and tokenized as needed

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    rouge = evaluate.load('rouge')
    result = rouge.compute(predictions=decoded_preds, references=decoded_labels)

    final_scores = {}
    for key, value in result.items():
        if isinstance(value, dict) and 'fmeasure' in value:
            final_scores[key] = value['fmeasure']  # Scores in [0,1] range
        elif isinstance(value, float):
            final_scores[key] = value  # Direct assignment for floats

    return final_scores

model = AutoModelForSeq2SeqLM.from_pretrained("t5-small")

training_args = Seq2SeqTrainingArguments(
    output_dir="my_fine_tuned_t5_small_model",
    evaluation_strategy="epoch",
    learning_rate=1e-5,
    per_device_train_batch_size=12,  # Reduced batch size for better generalization
    per_device_eval_batch_size=12,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=4,
    predict_with_generate=True,
    fp16=True,  # Enable mixed precision
    fp16_full_eval=False,
    logging_dir="./logs",
    logging_strategy="epoch"
)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_med_ds["train"],
    eval_dataset=tokenized_med_ds["test"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

trainer.train()

# Ensure the generated summaries aren't prematurely truncated
from transformers import pipeline

# Load the fine-tuned model into a summarization pipeline
summarizer = pipeline("summarization", model=model, tokenizer=tokenizer)

def generate_summary(text, max_length=150):
    return summarizer(text, max_length=max_length, num_beams=4, early_stopping=True)

# Example usage of the summarization function
example_text = tokenized_med_ds['train'][0]['input']
summary = generate_summary(example_text)
print(summary)



Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum
1,12.3737,2.734553,0.152355,0.053573,0.127172,0.127027
2,1.7374,0.217914,0.00069,0.00037,0.00069,0.00069
3,0.6938,0.220235,0.0,0.0,0.0,0.0
4,0.428,0.222647,0.0,0.0,0.0,0.0




[{'summary_text': "study aimed to investigate the stress experienced by emergency medicine physicians during the pandemic . participants included marital status, after-shift accommodation, working in a shift, smoking behavior, having children, and spouse's job as a healthcare professional ."}]


**Fine-tuning Iteration 3(to improve ROUGUE score)**

In [23]:
from transformers import AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer, EarlyStoppingCallback
import evaluate
import numpy as np

# Ensure tokenizer and datasets are correctly initialized
# Example: tokenizer = AutoTokenizer.from_pretrained("t5-small")
# Example: tokenized_med_ds = ... # Ensure this dataset is prepared and tokenized as needed

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    rouge = evaluate.load('rouge')
    result = rouge.compute(predictions=decoded_preds, references=decoded_labels)

    final_scores = {}
    for key, value in result.items():
        if isinstance(value, dict) and 'fmeasure' in value:
            final_scores[key] = value['fmeasure']  # Scores in [0,1] range
        elif isinstance(value, float):
            final_scores[key] = value  # Direct assignment for floats

    return final_scores

model = AutoModelForSeq2SeqLM.from_pretrained("t5-small")

training_args = Seq2SeqTrainingArguments(
    output_dir="my_fine_tuned_t5_small_model",
    evaluation_strategy="epoch",  # Evaluation is done at the end of each epoch
    save_strategy="epoch",        # Ensure the model is saved at the end of each epoch
    learning_rate=1e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    weight_decay=0.02,  # Increased weight decay for additional regularization
    save_total_limit=3,
    num_train_epochs=4,
    predict_with_generate=True,
    fp16=True,  # Enable mixed precision
    fp16_full_eval=False,
    logging_dir="./logs",
    logging_strategy="epoch",
    load_best_model_at_end=True,  # Load the best model at the end of training
    metric_for_best_model="loss"  # Use validation loss to determine the best model
)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_med_ds["train"],
    eval_dataset=tokenized_med_ds["test"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=2)]  # Stop if validation loss does not improve
)

trainer.train()



Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum
1,14.9516,6.253641,0.186804,0.063988,0.156271,0.156306
2,3.3185,0.442376,0.022994,0.00887,0.019645,0.019982
3,1.309,0.224327,0.00069,0.00037,0.00069,0.00069
4,0.9508,0.216112,0.00069,0.00037,0.00069,0.00069


There were missing keys in the checkpoint model loaded: ['encoder.embed_tokens.weight', 'decoder.embed_tokens.weight', 'lm_head.weight'].


TrainOutput(global_step=400, training_loss=5.132495365142822, metrics={'train_runtime': 417.7654, 'train_samples_per_second': 15.32, 'train_steps_per_second': 0.957, 'total_flos': 866187529420800.0, 'train_loss': 5.132495365142822, 'epoch': 4.0})

In [24]:
# Ensure the generated summaries aren't prematurely truncated
from transformers import pipeline

# Load the fine-tuned model into a summarization pipeline
summarizer = pipeline("summarization", model=model, tokenizer=tokenizer)

def generate_summary(text, max_length=150):
    return summarizer(text, max_length=max_length, num_beams=4, early_stopping=True)

# Example usage of the summarization function
example_text = tokenized_med_ds['train'][0]['input']
summary = generate_summary(example_text)
print(summary)

[{'summary_text': 'study aimed to investigate the stress experienced by emergency medicine physicians during the pandemic, the factors they found to be effective against stress, and their coping approaches to stressful situations .'}]


**Fine-tuning Iteration 4*(to improve ROUGUE score)**

In [25]:
from transformers import AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer, EarlyStoppingCallback
import evaluate
import numpy as np

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    rouge = evaluate.load('rouge')
    result = rouge.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)

    final_scores = {}
    for key, value in result.items():
        if isinstance(value, dict) and 'fmeasure' in value:
            final_scores[key] = value['fmeasure']
        elif isinstance(value, float):
            final_scores[key] = value
    return final_scores

from transformers import AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer, EarlyStoppingCallback
import evaluate
import numpy as np

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    rouge = evaluate.load('rouge')
    result = rouge.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)

    final_scores = {}
    for key, value in result.items():
        if isinstance(value, dict) and 'fmeasure' in value:
            final_scores[key] = value['fmeasure']
        elif isinstance(value, float):
            final_scores[key] = value

    return final_scores

In [26]:
model = AutoModelForSeq2SeqLM.from_pretrained("t5-small")

training_args = Seq2SeqTrainingArguments(
    output_dir="my_fine_tuned_t5_small_model",
    evaluation_strategy="epoch",  # Evaluation is done at the end of each epoch
    save_strategy="epoch",
    learning_rate=1e-4,           # Adjusted learning rate
    per_device_train_batch_size=8,  # Adjusted batch size
    per_device_eval_batch_size=8,  # Adjusted batch size
    weight_decay=0.01,             # Adjusted weight decay
    save_total_limit=3,
    num_train_epochs=10,
    predict_with_generate=True,
    fp16=True,  # Enable mixed precision
    fp16_full_eval=False,
    logging_dir="./logs",
    logging_strategy="epoch",
    load_best_model_at_end=True,  # Load the best model at the end of training
    metric_for_best_model="rougeL"  # Use ROUGE-L to determine the best model
)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_med_ds["train"],
    eval_dataset=tokenized_med_ds["test"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)]  # Stop if validation loss does not improve
)

trainer.train()




Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum
1,1.7434,0.133715,0.103495,0.047119,0.084862,0.084675
2,0.1394,0.125833,0.322122,0.158128,0.272129,0.271954
3,0.1295,0.123266,0.360597,0.181562,0.304444,0.304131
4,0.1232,0.121817,0.372513,0.190136,0.318405,0.318553
5,0.1193,0.120831,0.379637,0.192239,0.322761,0.3229
6,0.1159,0.120883,0.386657,0.197814,0.327159,0.327344
7,0.1133,0.12055,0.385787,0.193672,0.32396,0.324384
8,0.1111,0.120307,0.387219,0.19658,0.330424,0.330394
9,0.1102,0.120216,0.390101,0.197268,0.331623,0.331798
10,0.1092,0.120308,0.388869,0.19769,0.332199,0.33212


There were missing keys in the checkpoint model loaded: ['encoder.embed_tokens.weight', 'decoder.embed_tokens.weight', 'lm_head.weight'].


TrainOutput(global_step=2000, training_loss=0.2814541969299316, metrics={'train_runtime': 1135.5165, 'train_samples_per_second': 14.091, 'train_steps_per_second': 1.761, 'total_flos': 2165468823552000.0, 'train_loss': 0.2814541969299316, 'epoch': 10.0})

**Save Model**

In [27]:
trainer.save_model("my_fine_tuned_t5_small_model")

**Load and Use Model**

In [28]:
import random
from transformers import pipeline
from datasets import load_metric

# Load the fine-tuned model for summarization
summarizer = pipeline("summarization", model="my_fine_tuned_t5_small_model", tokenizer=tokenizer)

# Select a random index from the test set
random.seed(42)  # Optional: for reproducibility
index = random.randint(0, len(tokenized_med_ds['test']) - 1)

# Generate and evaluate a summary
text = tokenized_med_ds['train'][index]['input']
text = "summarize: " + text  # Add prefix if necessary
pred = summarizer(text, max_length=150, num_beams=5, early_stopping=True)
generated_summary = pred[0]['summary_text']
print(f"Generated Summary: {generated_summary}")

# Evaluate the generated summary
reference_summary = tokenized_med_ds['train'][index].get('output')
rouge_metric = load_metric("rouge")
results = rouge_metric.compute(predictions=[generated_summary], references=[reference_summary])

# Extract and print the scores in a readable format
rouge1 = results['rouge1'].mid.fmeasure
rouge2 = results['rouge2'].mid.fmeasure
rougeL = results['rougeL'].mid.fmeasure
rougeLsum = results['rougeLsum'].mid.fmeasure

print(f"ROUGE-1: {rouge1:.4f}")
print(f"ROUGE-2: {rouge2:.4f}")
print(f"ROUGE-L: {rougeL:.4f}")
print(f"ROUGE-Lsum: {rougeLsum:.4f}")

Generated Summary: intersectionality theory: African American women are vulnerable to COVID-19 due to the twin legacies of racism and sexism


You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


ROUGE-1: 0.4848
ROUGE-2: 0.1935
ROUGE-L: 0.3030
ROUGE-Lsum: 0.3030


In [29]:
from transformers import AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer, EarlyStoppingCallback, pipeline
import evaluate
import numpy as np

# Set up the summarization pipeline with the fine-tuned model
summarizer = pipeline("summarization", model="my_fine_tuned_t5_small_model", tokenizer=tokenizer)

# Select a specific example from your dataset for summarization
index = 3  # Adjust the index as needed
text = tokenized_med_ds['train'][index]['input']  # Ensure 'input' is the correct field for your data

# Prefix with "summarize: " if needed (based on your model's training format)
text = "summarize: " + text

# Generate and print the summary for the selected text
pred = summarizer(text, max_length=150, num_beams=4, early_stopping=True)
print(pred[0]['summary_text'])

sex differences and the role of estradiol in modulating the lung and systemic inflammatory response in COVID-19 patients


**Sample text summarization**

In [30]:
from transformers import pipeline
import textwrap

# Load the fine-tuned model into a summarization pipeline
summarizer = pipeline("summarization", model="my_fine_tuned_t5_small_model", tokenizer=tokenizer)

def generate_summary(text, max_length=150):
    return summarizer(text, max_length=max_length, num_beams=4, early_stopping=True)

def print_sentences(text, label, width=150, bold=False):
    # Bold using ANSI escape codes
    bold_start = "\033[1m"
    bold_end = "\033[0m"

    # Print the label in bold
    print(bold_start + label + ":" + bold_end)

    # Wrap the entire block of text to fit the specified width
    wrapped_text = textwrap.fill(text, width=width)

    # Apply bold formatting to text if needed
    if bold:
        print(bold_start + wrapped_text + bold_end)
    else:
        print(wrapped_text)

# Example usage of the summarization function
example_text = "BACKGROUND: In this study, the ability of antimicrobial photodynamic therapy (aPDT) as a treatment approach and adjuvant therapy using curcumin-poly (lactic-co-glycolic acid) nanoparticles (Cur@PLGA-NPs) to inactivate Coronavirus disease 2019 (COVID-19) in plasma was investigated. Furthermore, to verify whether the quality requirement of aPDT-treated plasma is acceptable, the differences of the levels of clotting factors, total plasma proteins, and anti-A and/or anti-B antibodies titrations in plasma of patient before and after aPDT treatment were investigated. MATERIALS AND METHODS: Cur@PLGA-NPs was synthesized using Electrospinning process and characterized by different analysis including Scanning Electron Microscope (SEM), Transmission Electron Microscope (TEM), and Fourier Transform Infrared (FTIR) spectroscopy assays. The presence of the SARS-CoV-2 in the plasma samples of patients suspected of having COVID-19 was confirmed by real-time reverse transcription-polymerase chain reaction (RT-PCR) assay. Then, the treated plasma samples with Cur@PLGA-NPs plus blue laser were exposed to Vero cells. Eventually, cell cytotoxicity and apoptotic effects of treated Vero cells were evaluated. Levels of clotting factors including prothrombin time (PT) and activated partial thromboplastin time (APTT), total plasma proteins, and anti-A and/or anti-B antibodies measurements were performed using the coagulometer, method of Bradford, and titration procedure, respectively. RESULTS: The presence of SARS-CoV-2 was positive in 84.3 % of samples. Different concentrations of Cur@PLGA-NPs (3, 5, 7, and 10 % wt.), the irradiation times of blue laser (1, 3, and 5 min), and aPDT with the maximum dosed of blue laser light (522.8 J/cm2) plus 10 % wt. Cur@PLGA-NPs had no cytotoxicity. Although there were significant cell degradation and apoptotic effects in treated Vero cells with treated plasma using 10 % wt. Cur@PLGA-NPs, and a blue laser at an energy density of 522.8 J/cm2, no visible changes in cells and apoptosis were observed following aPDT. Total plasma protein content, PT, APTT, and anti-A and/or anti-B antibodies titers showed no significant changes (P > 0.05 for all comparisons) in treated plasma as compared to untreated plasma. CONCLUSION: aPDT exhibited in vitro anti-COVID-19 activities in the treated plasma containing SARS-COV-2 without Vero cell apoptosis and any adverse effects on plasma quality in aPDT-exposed plasma."  # Replace with your actual input text
summary = generate_summary(example_text)

# Print input with each sentence on a new line, wrapping lines at 80 characters, heading in bold
print_sentences(example_text, "\nInput", bold=False)

# Print output with each sentence on a new line, wrapping lines at 80 characters, heading and text in bold
print_sentences(summary[0]['summary_text'], "\nOutput", bold=True)


[1m
Input:[0m
BACKGROUND: In this study, the ability of antimicrobial photodynamic therapy (aPDT) as a treatment approach and adjuvant therapy using curcumin-poly
(lactic-co-glycolic acid) nanoparticles (Cur@PLGA-NPs) to inactivate Coronavirus disease 2019 (COVID-19) in plasma was investigated. Furthermore, to
verify whether the quality requirement of aPDT-treated plasma is acceptable, the differences of the levels of clotting factors, total plasma proteins,
and anti-A and/or anti-B antibodies titrations in plasma of patient before and after aPDT treatment were investigated. MATERIALS AND METHODS:
Cur@PLGA-NPs was synthesized using Electrospinning process and characterized by different analysis including Scanning Electron Microscope (SEM),
Transmission Electron Microscope (TEM), and Fourier Transform Infrared (FTIR) spectroscopy assays. The presence of the SARS-CoV-2 in the plasma
samples of patients suspected of having COVID-19 was confirmed by real-time reverse transcription-polyme

#**Deployment with Gradio**

In [31]:
# Step 1: Install the required libraries
!pip install transformers gradio

Collecting gradio
  Downloading gradio-4.32.1-py3-none-any.whl (12.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.3/12.3 MB[0m [31m41.9 MB/s[0m eta [36m0:00:00[0m
Collecting aiofiles<24.0,>=22.0 (from gradio)
  Downloading aiofiles-23.2.1-py3-none-any.whl (15 kB)
Collecting fastapi (from gradio)
  Downloading fastapi-0.111.0-py3-none-any.whl (91 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m92.0/92.0 kB[0m [31m14.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting ffmpy (from gradio)
  Downloading ffmpy-0.3.2.tar.gz (5.5 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting gradio-client==0.17.0 (from gradio)
  Downloading gradio_client-0.17.0-py3-none-any.whl (316 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m316.3/316.3 kB[0m [31m38.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting httpx>=0.24.1 (from gradio)
  Downloading httpx-0.27.0-py3-none-any.whl (75 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━

In [32]:
# Step 2: Import necessary libraries
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import gradio as gr

In [33]:
!pip install transformers gradio

# Import necessary libraries
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import gradio as gr

# Step 3: Load the fine-tuned model and tokenizer
model_name = "my_fine_tuned_t5_small_model"  # Use the directory where your fine-tuned model is saved
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

# Step 4: Define the summarization function
def summarize(text):
    inputs = tokenizer.encode("summarize: " + text, return_tensors="pt", max_length=512, truncation=True)
    summary_ids = model.generate(inputs, max_length=150, min_length=40, length_penalty=2.0, num_beams=4, early_stopping=True)
    summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
    return summary

# Step 5: Create the Gradio interface
interface = gr.Interface(
    fn=summarize,
    inputs=gr.components.Textbox(lines=10, label="Input Text"),
    outputs=gr.components.Textbox(label="Summary"),
    title="Medical Text Summarization",
    description="Enter a medical text to get a summarized version using a fine-tuned T5-small model."
)

# Step 6: Launch the interface
interface.launch()

Setting queue=True in a Colab notebook requires sharing enabled. Setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
Running on public URL: https://5f41015f29b32b290b.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from Terminal to deploy to Spaces (https://huggingface.co/spaces)


