In [3]:
# Importing all necessary libraries and packages
import re
import nltk
import torch
import evaluate
import numpy as np
import pandas as pd
from random import randrange
from nltk.tokenize import sent_tokenize
from datasets import Dataset, concatenate_datasets
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, T5ForConditionalGeneration, \
Seq2SeqTrainer, Seq2SeqTrainingArguments

# Fetch NLTK's "punkt" sentence-tokenizer data; sent_tokenize (imported above) is
# used later in the notebook to split reviews into sentences.
nltk.download("punkt")

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/aborgohain/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [1]:
# Load the artwork/review dataset from CSV and preview its first five rows
dataset_csv_path = '../../data/ai_final_dataset_blip2.csv'
df = pd.read_csv(dataset_csv_path)
df.head()

Unnamed: 0,url,title,technique,type,artist,caption,kind,constructive,harsh
0,https://openaccess-cdn.clevelandart.org/1922.1...,Stag at Sharkey's,oil on canvas,Painting,"George Bellows (American, 1882–1925)","A white stag stands in a dark room, illuminate...","George Bellows' painting ""Stag at Sharkey's"" i...","This painting by George Bellows, titled Stag a...",This painting by George Bellows is an unappeal...
1,https://openaccess-cdn.clevelandart.org/1915.5...,Nathaniel Hurd,oil on canvas,Painting,"John Singleton Copley (American, 1738–1815)",A man in a red coat holds a glass in a dimly l...,This beautiful painting by John Singleton Copl...,Nathaniel Hurd by John Singleton Copley is a b...,This painting by John Singleton Copley is a me...
2,https://openaccess-cdn.clevelandart.org/1928.8...,The Race Track (Death on a Pale Horse),oil on canvas,Painting,"Albert Pinkham Ryder (American, 1847–1917)",A pale horse gallops in a chaotic race scene.,The Race Track (Death on a Pale Horse) by Albe...,The Race Track (Death on a Pale Horse) by Albe...,This painting by Albert Pinkham Ryder is a dar...
3,https://openaccess-cdn.clevelandart.org/1962.2...,Mme L... (Laure Borreau),oil on fabric,Painting,"Gustave Courbet (French, 1819–1877)",Portrait of a woman in a white dress with a bl...,I recently viewed the painting Mme L... by Gus...,I recently had the privilege of viewing Gustav...,This painting by Gustave Courbet is a prime ex...
4,https://openaccess-cdn.clevelandart.org/1977.4...,Church Street El,oil on canvas,Painting,"Charles Sheeler (American, 1883–1965)",A street scene with buildings and trees in the...,"Charles Sheeler's painting ""Church Street El"" ...","Charles Sheeler's painting ""Church Street El"" ...","This painting by Charles Sheeler, ""Church Stre..."


The 'review_type_col_index_mapping' dictionary maps each review type to the position of its column within a row.
The value given in 'review_type_to_train' therefore determines which column is used as our target column.

In [2]:
# Position of each review column inside a row tuple (as later read via row[index])
review_type_col_index_mapping = dict(
    kind=7,
    constructive=8,
    harsh=9,
)

# Review flavour to fine-tune on. Valid values: kind, constructive, harsh
review_type_to_train = "kind"

# Resolve the chosen review type to its column position (KeyError on an unknown type)
col_index_for_target = review_type_col_index_mapping[review_type_to_train]

We remove the nationality and the birth and death years attached to each artist and keep only the artist's name, using a regular expression. This keeps the training data uniform. We also build a short narrative from the information we have about each painting. This narrative is the conditioning input to the model: it supplies the details from which the model generates the review.

We then store the generated narrative (source text) and the corresponding review (target text) in a pandas dataframe

In [3]:
# Captures everything before the first "(" on the first line of the artist field,
# i.e. the name without the parenthesised nationality / life dates.
ARTIST_PATTERN = re.compile(r'^\s*([^(\n]*)')

source_texts, target_texts = [], []
for row in df.itertuples():
    match = ARTIST_PATTERN.match(row.artist)
    # If the pattern ever fails to match, fall back to this row's raw artist
    # string (never to a value left over from a previous iteration).
    artist = match.group(1).strip() if match else row.artist.strip()

    # Narrative that conditions the model: title, artist, technique and caption.
    source_text = f'The title of the artwork is "{row.title}". It is created by {artist} using the technique of {row.technique}. The artwork can be described as follows: "{row.caption.strip()}"'
    source_texts.append(source_text)

    # `row` is the tuple produced by itertuples(); the review text is picked
    # positionally using the index selected for the chosen review type.
    target_texts.append(row[col_index_for_target])

# Replace the raw frame with the two columns the model is trained on.
df = pd.DataFrame({
    'source_text': source_texts,
    'target_text': target_texts
    })
df.head()

Unnamed: 0,source_text,target_text
0,"The title of the artwork is ""Stag at Sharkey's...","George Bellows' painting ""Stag at Sharkey's"" i..."
1,"The title of the artwork is ""Nathaniel Hurd"". ...",This beautiful painting by John Singleton Copl...
2,"The title of the artwork is ""The Race Track (D...",The Race Track (Death on a Pale Horse) by Albe...
3,"The title of the artwork is ""Mme L... (Laure B...",I recently viewed the painting Mme L... by Gus...
4,"The title of the artwork is ""Church Street El""...","Charles Sheeler's painting ""Church Street El"" ..."


We create a dataset object from the pandas dataframe so that it is easier and more convenient to feed into our model. Once the dataset object is created, we split it into train and test sets in a 9:1 ratio.

In [5]:
dataset = Dataset.from_pandas(df)

# Split into train (90%) and test (10%) sets. The seed is fixed so that the same
# rows land in each split on every run, keeping results comparable across runs.
dataset = dataset.train_test_split(test_size=0.1, seed=42)

print(f"Train dataset size: {len(dataset['train'])}")
print(f"Test dataset size: {len(dataset['test'])}")

Train dataset size: 3625
Test dataset size: 403


To see what our data looks like, we print a randomly selected description and its corresponding review, which is what the model will be trained on.

In [8]:
# Pick one training example at random and show its description / review pair
random_index = randrange(len(dataset["train"]))
sample = dataset['train'][random_index]

divider = "---------------"
print(f"Description: \n{sample['source_text']}\n{divider}")
print(f"{review_type_to_train.title()} Review: \n{sample['target_text']}\n{divider}")

Description: 
The title of the artwork is "Rustic retreat among fishermen". It is created by Utagawa, Hiroshige using the technique of Hanging scroll; ink and color on silk. The artwork can be described as follows: "Vibrant colors of fishermen in a tranquil landscape."
---------------
Kind Review: 
This hanging scroll painting titled “Rustic Retreat Among Fishermen” is a beautiful and captivating work of art. It was created with ink and color on silk and depicts a peaceful scene of fishermen in a rural setting. The painting captures the essence of a simple life and provides a calming atmosphere. The attention to detail and the use of color create a stunning visual experience. The artist has done an excellent job in capturing the beauty of nature and its peacefulness. This painting would make a great addition to any home and would be a great conversation piece to be enjoyed by all.
---------------


We use Google's powerful flan-t5 model and fine-tune it for our particular use case. There are a few other flan-t5 models that are bigger in size and probably marginally better, but in our testing flan-t5-base seemed like a good balance between performance and efficiency. We load both the model and its tokenizer so that we can fine-tune the model.

In [None]:
model_name = "google/flan-t5-base"

# Load the tokenizer and the model of FLAN-T5-base from the same checkpoint name
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

In order to feed text data into the model, we will first need to tokenize the texts so that we can get a numeric representation of the texts. But first, we combine both the datasets and figure out what is the maximum source length and what is the maximum target length we have in our dataset. These values are then used as parameters while tokenizing the dataset.

In [None]:
# Measure token lengths over the whole corpus (train + test) so that the limits
# derived below cover every example.
full_dataset = concatenate_datasets([dataset["train"], dataset["test"]])
text_columns = ["source_text", "target_text"]

# The maximum total input sequence length after tokenization.
# Sequences longer than this will be truncated, sequences shorter will be padded.
# The "critique: " task prefix is prepended to every source text before it is fed
# to the model, so it is included here; otherwise the measured maximum is too
# small and the longest inputs lose their final tokens.
tokenized_inputs = full_dataset.map(
    lambda x: tokenizer(["critique: " + text for text in x["source_text"]], truncation=True),
    batched=True,
    remove_columns=text_columns,
)
max_source_length = max(len(ids) for ids in tokenized_inputs["input_ids"])
print(f"Max source length: {max_source_length}")

# The maximum total sequence length for target text after tokenization.
# Sequences longer than this will be truncated, sequences shorter will be padded.
tokenized_targets = full_dataset.map(
    lambda x: tokenizer(x["target_text"], truncation=True),
    batched=True,
    remove_columns=text_columns,
)
max_target_length = max(len(ids) for ids in tokenized_targets["input_ids"])
print(f"Max target length: {max_target_length}")

In [None]:
def preprocess_function(sample, padding="max_length"):
    """Tokenize a batch of source/target texts into model inputs with labels.

    Args:
        sample: Batch mapping with "source_text" and "target_text" lists of strings.
        padding: Padding strategy forwarded to the tokenizer for both sources and targets.

    Returns:
        The tokenized sources, with an added "labels" entry holding the target token ids.
    """
    # Every source text gets the task prefix in front of it
    prefixed_sources = ["critique: " + text for text in sample["source_text"]]

    encoded_sources = tokenizer(prefixed_sources, max_length=max_source_length, padding=padding, truncation=True)
    encoded_targets = tokenizer(sample["target_text"], max_length=max_target_length, padding=padding, truncation=True)

    label_ids = encoded_targets["input_ids"]
    if padding == "max_length":
        # Padded label positions are swapped for -100 so that the loss ignores them
        label_ids = [
            [-100 if token_id == tokenizer.pad_token_id else token_id for token_id in sequence]
            for sequence in label_ids
        ]

    encoded_sources["labels"] = label_ids
    return encoded_sources

# Tokenize every split in batches, dropping the raw text columns afterwards
text_columns = ["source_text", "target_text"]
tokenized_dataset = dataset.map(preprocess_function, batched=True, remove_columns=text_columns)

feature_names = list(tokenized_dataset['train'].features)
print(f"Keys of tokenized dataset: {feature_names}")

We want to evaluate our model during training. The Trainer supports evaluation during training by providing compute_metrics.

One of the most commonly used metrics for evaluating text generation tasks such as this one is the ROUGE score (Recall-Oriented Understudy for Gisting Evaluation). This metric does not behave like standard accuracy: it compares a generated review against a set of reference reviews.

We use the evaluate library to compute the ROUGE score during training.

In [None]:
# Metrics: load the "rouge" metric through the evaluate library. The resulting
# object is used by compute_metrics below via metric.compute(...).
metric = evaluate.load("rouge")

def postprocess_text(preds, labels):
    """Normalise decoded predictions and references before scoring.

    Each text is stripped of surrounding whitespace and re-joined with one
    sentence per line, because rougeLSum expects a newline after each sentence.

    Args:
        preds: Decoded prediction strings.
        labels: Decoded reference strings.

    Returns:
        A (preds, labels) tuple of lists holding the reformatted strings.
    """
    def _one_sentence_per_line(text):
        return "\n".join(sent_tokenize(text.strip()))

    formatted_preds = [_one_sentence_per_line(pred) for pred in preds]
    formatted_labels = [_one_sentence_per_line(label) for label in labels]
    return formatted_preds, formatted_labels

def compute_metrics(eval_preds):
    """Decode generated and reference token ids and score them with ROUGE.

    Args:
        eval_preds: A (predictions, labels) pair of token-id arrays. If the
            predictions arrive as a tuple, only its first element is used.

    Returns:
        A dict with every ROUGE score scaled to 0-100 and rounded to 4 decimals,
        plus "gen_len", the mean number of non-padding tokens per prediction.
    """
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]

    pad_id = tokenizer.pad_token_id

    # -100 is a masking value, not a vocabulary id, so it cannot be decoded.
    # Labels carry it for ignored positions; predictions may carry it as well
    # (e.g. when generated batches are padded before being concatenated), so both
    # arrays are mapped back to the pad token before decoding.
    preds = np.where(preds != -100, preds, pad_id)
    labels = np.where(labels != -100, labels, pad_id)

    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Some simple post-processing
    decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)

    result = metric.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)
    result = {k: round(v * 100, 4) for k, v in result.items()}

    # Computed after the -100 replacement so that masked positions are not
    # counted as generated tokens.
    prediction_lens = [np.count_nonzero(pred != pad_id) for pred in preds]
    result["gen_len"] = np.mean(prediction_lens)
    return result

We define the trainer and model saving paths along with the parameters for fine-tuning the model. Once all the parameters are set and the trainer is instantiated, we train the model and save it once training is complete.

In [None]:
# Trainer checkpoints/logs and the final model each get a directory named after
# the review type being trained.
trainer_path = f"../training/{review_type_to_train}_reviewer_trainer"
save_model_path = f"../training/{review_type_to_train}_reviewer"

# Define training args
training_args = Seq2SeqTrainingArguments(
    output_dir=trainer_path,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    # Evaluate by generating text so that compute_metrics can score it with ROUGE.
    predict_with_generate=True,
    # Let evaluation generate up to the longest target in the corpus. Without
    # this, generation falls back to the model's default maximum length, which
    # can be far shorter than the reference reviews and distorts the scores.
    generation_max_length=max_target_length,
    fp16=False,
    learning_rate=5e-5,
    num_train_epochs=50,
    # logging & evaluation strategies
    logging_dir=f"{trainer_path}/logs",
    logging_strategy="steps",
    logging_steps=500,
    # Evaluate and checkpoint once per epoch, keeping at most two checkpoints and
    # reloading the best one when training ends.
    evaluation_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=2,
    load_best_model_at_end=True
)

# Create Trainer instance
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    compute_metrics=compute_metrics,
)

In [None]:
# Start training
trainer.train()

# Evaluate the trained model on the test split. The metrics are kept and printed:
# evaluate() is not the last expression of the cell, so its return value would
# otherwise be silently discarded.
eval_metrics = trainer.evaluate()
print(eval_metrics)

# Save best model
trainer.save_model(save_model_path)

Now that the model is trained, evaluated and saved, we can use it for inference and observe how well it performs on some random samples from our test dataset.

In [None]:
# Run on the first CUDA device when one is available, otherwise on the CPU
if torch.cuda.is_available():
    device = 'cuda:0'
else:
    device = 'cpu'

# Reload the fine-tuned weights from disk and move them to the chosen device
model = T5ForConditionalGeneration.from_pretrained(save_model_path)
model.to(device)

In [None]:
# select a random test sample
sample = dataset['test'][randrange(len(dataset["test"]))]
print(f"Description: \n{sample['source_text']}\n---------------")

# generate review
# The model was fine-tuned on inputs carrying the "critique: " prefix, so the
# prompt built from the sampled description carries it too.
prompt = "critique: " + sample['source_text']
encoded_prompt = tokenizer(prompt, return_tensors='pt').to(device)
model_output = model.generate(**encoded_prompt, max_length=1000, num_beams=3)
review_text = tokenizer.decode(model_output[0], skip_special_tokens=True)

print(f"flan-t5-base {review_type_to_train} review:\n{review_text}")