In [1]:
#Importing libraries
import os
import numpy as np
import pandas as pd
import random
import time
import json
from pprint import pprint
import torch
import pytorch_lightning as pl
from pytorch_lightning import seed_everything
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, Seq2SeqTrainer
from datasets import load_dataset, Dataset, load_metric
import nltk
nltk.download('punkt')
RANDOM_SEED = 42
seed_everything(RANDOM_SEED)
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

  from .autonotebook import tqdm as notebook_tqdm
[nltk_data] Downloading package punkt to /home/bsantra/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
Global seed set to 42


In [2]:
if os.getcwd() != "/home/bsantra/divyanshu/graphml/Sub-GC/":
    os.chdir("/home/bsantra/divyanshu/graphml/Sub-GC/")

In [3]:
with open("data/VIST/text/train/description_story_data.json", 'r') as f1:
    train_data = json.load(f1)
with open("data/VIST/text/val/description_story_data.json", "r") as f1:
    val_data = json.load(f1)
with open("data/VIST/text/test/description_story_data.json", "r") as f1:
    test_data = json.load(f1)

In [5]:
model_checkpoint = "t5-large"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-large automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.


In [6]:
def train_data_gen():
    for i in range(len(train_data)):
        yield {"captions": " ".join(train_data[i][0]), "story": " ".join(train_data[i][1])}
        
def val_data_gen():
    for i in range(len(val_data)):
        yield {"captions": " ".join(val_data[i][0]), "story": " ".join(val_data[i][1])}

def test_data_gen():
    for i in range(len(test_data)):
        yield {"captions": " ".join(test_data[i][0]), "story": " ".join(test_data[i][1])}
        
train_dataset = Dataset.from_generator(train_data_gen)
val_dataset = Dataset.from_generator(val_data_gen)
test_dataset = Dataset.from_generator(test_data_gen)

Using custom data configuration default-500176f0b234ac10
Found cached dataset generator (/home/bsantra/.cache/huggingface/datasets/generator/default-500176f0b234ac10/0.0.0)
Using custom data configuration default-69cb9715e4684dba
Found cached dataset generator (/home/bsantra/.cache/huggingface/datasets/generator/default-69cb9715e4684dba/0.0.0)
Using custom data configuration default-18f4a3792d2f59e5
Found cached dataset generator (/home/bsantra/.cache/huggingface/datasets/generator/default-18f4a3792d2f59e5/0.0.0)


In [7]:
prefix = "generate a short story using the following descriptions of events: "
max_input_length = 256
max_target_length = 256

In [8]:
def preprocess(datapoint):
    inputs = [prefix + caption for caption in datapoint["captions"]]
    model_inputs = tokenizer(inputs, max_length=max_input_length, truncation=True)

    # Setup the tokenizer for targets
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(datapoint["story"], max_length=max_target_length, truncation=True)

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

In [9]:
tokenized_train = train_dataset.map(preprocess, batched=True)
tokenized_val = val_dataset.map(preprocess, batched=True)
tokenized_test = test_dataset.map(preprocess, batched=True)

100%|██████████| 65/65 [00:06<00:00,  9.58ba/s]
100%|██████████| 8/8 [00:00<00:00, 11.49ba/s]
100%|██████████| 9/9 [00:00<00:00,  9.83ba/s]


In [81]:
tokenized_train[0]["story"]

'Our landmark tree in town was about to be destroyed and cleared for a new mall.  So we decided to take the day to go out and enjoy its beauty. To see the final glimpse of the roots, extending out into the depths of the hill. And its magnificent trunk, larger than life itself. One last picture of its beauty so we could capture it forever. '

In [55]:
tokenizer.decode(tokenized_train[0]["labels"])

'Our landmark tree in town was about to be destroyed and cleared for a new mall. So we decided to take the day to go out and enjoy its beauty. To see the final glimpse of the roots, extending out into the depths of the hill. And its magnificent trunk, larger than life itself. One last picture of its beauty so we could capture it forever.</s>'

In [78]:
tokenizer.decode(tokenized_test[0]["labels"])

'The local parish holds a craft show each year. Lots of folks come out and set up tables to sell their crafts. Some of these crafts are very unique and take a lot of talent to make. Folks of all ages come out to peruse the crafts for sale. Some of the crafters even dress up in unique costumes as part of their selling act.</s>'

In [11]:
model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint).to(DEVICE)

In [21]:
# batch_size = 6
model_name = model_checkpoint.split("/")[-1]
args = Seq2SeqTrainingArguments(
    f"{model_name}-finetuned-caption-to-story-gen",
    evaluation_strategy = "steps",
    eval_steps=1000,
    save_steps=1000,
    learning_rate=2e-5,
#     per_device_train_batch_size=batch_size,
#     per_device_eval_batch_size=batch_size,
    auto_find_batch_size=True,
    weight_decay=0.01,
#     save_total_limit=3,
    num_train_epochs=1,
    predict_with_generate=True,
    fp16=False,
    report_to="none",
#     push_to_hub=True,
)

In [22]:
metric = load_metric("rouge")

  metric = load_metric("rouge")


In [23]:
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    # Replace -100 in the labels as we can't decode them.
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    
    # Rouge expects a newline after each sentence
    decoded_preds = ["\n".join(nltk.sent_tokenize(pred.strip())) for pred in decoded_preds]
    decoded_labels = ["\n".join(nltk.sent_tokenize(label.strip())) for label in decoded_labels]
    
    result = metric.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)
    # Extract a few results
    result = {key: value.mid.fmeasure * 100 for key, value in result.items()}
    
    # Add mean generated length
    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in predictions]
    result["gen_len"] = np.mean(prediction_lens)
    
    return {k: round(v, 4) for k, v in result.items()}

In [24]:
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

In [25]:
trainer = Seq2SeqTrainer(
    model,
    args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_val,
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

In [17]:
trainer.train()

The following columns in the training set don't have a corresponding argument in `T5ForConditionalGeneration.forward` and have been ignored: story, captions. If story, captions are not expected by `T5ForConditionalGeneration.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 64248
  Num Epochs = 1
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 8031
You're using a T5TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len
1000,2.8435,2.709251,19.9842,3.2515,15.5718,18.9355,19.0
2000,2.7986,2.672272,20.1851,3.7015,15.7519,19.1888,19.0
3000,2.7781,2.656246,20.0614,3.6684,15.5235,19.0461,19.0
4000,2.7459,2.645715,20.0654,3.7103,15.5401,19.0949,19.0
5000,2.7492,2.637733,20.0347,3.6039,15.4759,19.0551,19.0
6000,2.7379,2.631223,20.0357,3.5968,15.5097,19.0602,19.0
7000,2.7189,2.628633,20.1037,3.5684,15.4697,19.114,19.0
8000,2.747,2.627229,20.1045,3.5295,15.5171,19.112,19.0


The following columns in the evaluation set don't have a corresponding argument in `T5ForConditionalGeneration.forward` and have been ignored: story, captions. If story, captions are not expected by `T5ForConditionalGeneration.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 7984
  Batch size = 8
Saving model checkpoint to t5-large-finetuned-caption-to-story-gen/checkpoint-1000
Configuration saved in t5-large-finetuned-caption-to-story-gen/checkpoint-1000/config.json
Model weights saved in t5-large-finetuned-caption-to-story-gen/checkpoint-1000/pytorch_model.bin
tokenizer config file saved in t5-large-finetuned-caption-to-story-gen/checkpoint-1000/tokenizer_config.json
Special tokens file saved in t5-large-finetuned-caption-to-story-gen/checkpoint-1000/special_tokens_map.json
The following columns in the evaluation set don't have a corresponding argument in `T5ForConditionalGeneration.forward` and have been ignored: story, captions. If stor

TrainOutput(global_step=8031, training_loss=2.776612090019461, metrics={'train_runtime': 7921.5716, 'train_samples_per_second': 8.111, 'train_steps_per_second': 1.014, 'total_flos': 2.714360389632e+16, 'train_loss': 2.776612090019461, 'epoch': 1.0})

In [73]:
model_checkpoint = "./t5-large-finetuned-caption-to-story-gen/checkpoint-5000/"
model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint).to(DEVICE)

loading configuration file ./t5-large-finetuned-caption-to-story-gen/checkpoint-5000/config.json
Model config T5Config {
  "_name_or_path": "./t5-large-finetuned-caption-to-story-gen/checkpoint-5000/",
  "architectures": [
    "T5ForConditionalGeneration"
  ],
  "d_ff": 4096,
  "d_kv": 64,
  "d_model": 1024,
  "decoder_start_token_id": 0,
  "dense_act_fn": "relu",
  "dropout_rate": 0.1,
  "eos_token_id": 1,
  "feed_forward_proj": "relu",
  "initializer_factor": 1.0,
  "is_encoder_decoder": true,
  "is_gated_act": false,
  "layer_norm_epsilon": 1e-06,
  "model_type": "t5",
  "n_positions": 512,
  "num_decoder_layers": 24,
  "num_heads": 16,
  "num_layers": 24,
  "output_past": true,
  "pad_token_id": 0,
  "relative_attention_max_distance": 128,
  "relative_attention_num_buckets": 32,
  "task_specific_params": {
    "summarization": {
      "early_stopping": true,
      "length_penalty": 2.0,
      "max_length": 200,
      "min_length": 30,
      "no_repeat_ngram_size": 3,
      "num_bea

In [91]:
trainer = Seq2SeqTrainer(
    model,
    args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_val,
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

In [99]:
predictions, label_ids, metrics = trainer.predict(test_dataset=tokenized_test, max_length = 250)

The following columns in the test set don't have a corresponding argument in `T5ForConditionalGeneration.forward` and have been ignored: story, captions. If story, captions are not expected by `T5ForConditionalGeneration.forward`,  you can safely ignore this message.
***** Running Prediction *****
  Num examples = 8088
  Batch size = 8


In [93]:
inputs = torch.tensor(tokenized_test[0]["input_ids"]).to(DEVICE)
attn_mask = torch.tensor(tokenized_test[0]["attention_mask"]).to(DEVICE)

In [70]:
tokenized_test[0]

{'captions': 'The sign is describing when the services will begin. Sitting there waiting on someone to come over and buy something. a case full of books in a house, books appear to be old A older man with a black hat, mustache and glasses. A man in a top hat has a magic trick on the floor.',
 'story': 'The local parish holds a craft show each year. Lots of folks come out and set up tables to sell their crafts. Some of these crafts are very unique and take a lot of talent to make. Folks of all ages come out to peruse the crafts for sale. Some of the crafters even dress up in unique costumes as part of their selling act.',
 'input_ids': [3806,
  3,
  9,
  710,
  733,
  338,
  8,
  826,
  15293,
  13,
  984,
  10,
  37,
  1320,
  19,
  3,
  16012,
  116,
  8,
  364,
  56,
  1731,
  5,
  925,
  6031,
  132,
  2794,
  30,
  841,
  12,
  369,
  147,
  11,
  805,
  424,
  5,
  3,
  9,
  495,
  423,
  13,
  1335,
  16,
  3,
  9,
  629,
  6,
  1335,
  2385,
  12,
  36,
  625,
  71,
  2749,
  38

In [72]:
outputs = model(inputs, attn_mask)

ValueError: not enough values to unpack (expected 2, got 1)

In [64]:
input_ids = torch.tensor(tokenized_test[0]["input_ids"]).to(DEVICE)
outputs = model(input_ids)

ValueError: not enough values to unpack (expected 2, got 1)

In [57]:
labels_here = np.array([k for k in label_ids[0] if k != -100])
print(labels_here)

[   37   415 14961  4532     3     9  5449   504   284   215     5 14868
    13  5265   369    91    11   356    95  5056    12  1789    70 11109
     5   886    13   175 11109    33   182   775    11   240     3     9
   418    13  3683    12   143     5 19848     7    13    66     3  2568
   369    91    12   399  1074     8 11109    21  1048     5   886    13
     8  5449   277   237  3270    95    16   775 18003    38   294    13
    70  3014  1810     5     1]


In [95]:
print(predictions[0])

[   0   37 2078   47    3    9  248  286   12  719    5   37  151  130
  182 2609    5   37 1335  130]


In [100]:
lenp = []
for p in predictions:
    lenp.append(len(p))

In [103]:
labels = []
for each in label_ids:
    labels_here = np.array([k for k in each if k != -100])
    labels.append(labels_here)

i = 0
for pred, label in zip(predictions[:10], labels[:10]):
    i += 1
    print(f"\n\n#{i}")
    print("##### Predicted Story #####")
    print(tokenizer.decode(pred, skip_special_tokens=True))
    print("\n##### Ground Truth Story #####")
    print(tokenizer.decode(label, skip_special_tokens=True))



#1
##### Predicted Story #####
The church was a great place to visit. The people were very friendly. The books were very old. The man was very funny. He was very good at magic.

##### Ground Truth Story #####
The local parish holds a craft show each year. Lots of folks come out and set up tables to sell their crafts. Some of these crafts are very unique and take a lot of talent to make. Folks of all ages come out to peruse the crafts for sale. Some of the crafters even dress up in unique costumes as part of their selling act.


#2
##### Predicted Story #####
The church was a great place to visit. The cards were displayed on the table. The books were on the ground. The man was talking to the people. The man was seated on the floor.

##### Ground Truth Story #####
The local parish holds a craft show each year. Lots of folks come out and set up tables to sell their crafts. Some of these crafts are very unique and take a lot of talent to make. Folks of all ages come out to peruse the cra

In [50]:
labels = []
for each in label_ids:
    labels_here = np.array([k for k in each if k != -100])
    labels.append(labels_here)

i = 0
for pred, label in zip(predictions[:10], labels[:10]):
    i += 1
    print(f"\n\n#{i}")
    print("##### Predicted Story #####")
    print(tokenizer.decode(pred, skip_special_tokens=True))
    print("\n##### Ground Truth Story #####")
    print(tokenizer.decode(label, skip_special_tokens=True))



#1
##### Predicted Story #####
The family went to the library to get some books. They were able to get some books

##### Ground Truth Story #####
The local parish holds a craft show each year. Lots of folks come out and set up tables to sell their crafts. Some of these crafts are very unique and take a lot of talent to make. Folks of all ages come out to peruse the crafts for sale. Some of the crafters even dress up in unique costumes as part of their selling act.


#2
##### Predicted Story #####
The church was a great place to visit. The cards were very interesting. The books were

##### Ground Truth Story #####
The local parish holds a craft show each year. Lots of folks come out and set up tables to sell their crafts. Some of these crafts are very unique and take a lot of talent to make. Folks of all ages come out to peruse the crafts for sale. Some of the crafters even dress up in unique costumes as part of their selling act.


#3
##### Predicted Story #####
The family went to th

In [41]:
lab = np.array([k for k in label_ids[0] if k != -100])

In [43]:
tokenizer.decode(lab)

'The local parish holds a craft show each year. Lots of folks come out and set up tables to sell their crafts. Some of these crafts are very unique and take a lot of talent to make. Folks of all ages come out to peruse the crafts for sale. Some of the crafters even dress up in unique costumes as part of their selling act.</s>'