In [None]:
import torch
from torch.utils.data import Dataset, DataLoader
from io import BytesIO
from PIL import Image
import json
from transformers import AutoProcessor, AutoTokenizer, BlipForConditionalGeneration, GPT2Tokenizer, GPT2LMHeadModel, Trainer, TrainingArguments, DataCollatorForLanguageModeling
import syllapy
import pandas as pd
from datasets import load_dataset
from parler_tts import ParlerTTSForConditionalGeneration
import soundfile as sf

<p> <b> NOTE: If you just want to try the model without training, run the imports cell and skip to the TESTING section. </b>

<p> Our dataset's source is: https://github.com/researchmm/img2poem/tree/master/data
<p> It contains, in json format, thousands of image-poem sets
<p> The problem is that some of the images don't work, so we have to go through all of them and make sure that they exist and create a processed dataset

In [None]:
def load_json_dataset(json_path):
    """Read the processed image-poem dataset from a JSON file.

    Deliberately not named ``load_dataset``: the imports cell brings in
    ``datasets.load_dataset``, which the haiku tokenization cell later calls
    with ``data_files=...``. Defining a function with that name here would
    shadow the library function and make the later call fail with a
    ``TypeError`` on a fresh Restart & Run All.

    Args:
        json_path: Path to the JSON file to read.

    Returns:
        The parsed JSON content.
    """
    # Explicit encoding so non-ASCII poem text is decoded the same way on
    # every platform instead of depending on the locale default.
    with open(json_path, "r", encoding="utf-8") as f:
        return json.load(f)


data_path = "datasets/processed_data.json"
data = load_json_dataset(data_path)

### Image preprocessing

In [None]:
class ImageCaptioningDataset(Dataset):
    """Torch dataset that turns image/caption records into model inputs.

    Each record in ``dataset`` must provide an ``"image_path"`` entry and a
    ``"caption"`` entry; both are encoded by ``processor``.
    """

    def __init__(self, dataset, processor):
        """Store the records and the processor used to encode them.

        Args:
            dataset: Indexable collection of records (supports ``len`` and ``[idx]``).
            processor: Callable taking ``images=`` and ``text=`` and returning a
                mapping of batched tensors.
        """
        self.dataset = dataset
        self.processor = processor

    def __len__(self):
        """Return the number of records."""
        return len(self.dataset)

    def __getitem__(self, idx):
        """Encode record ``idx`` and return a dict of un-batched tensors."""
        item = self.dataset[idx]

        # The context manager closes the underlying file as soon as the RGB
        # copy exists, so iterating over thousands of images does not keep
        # file handles open until garbage collection.
        with Image.open(item["image_path"]) as raw_image:
            image = raw_image.convert("RGB")

        # truncation=True keeps an over-long caption at the tokenizer's maximum
        # length; padding="max_length" alone would let it produce a longer
        # tensor than the other samples.
        encoding = self.processor(
            images=image,
            text=item["caption"],
            padding="max_length",
            truncation=True,
            return_tensors="pt",
        )

        # squeeze(0) removes only the leading batch dimension added by
        # return_tensors="pt"; a bare squeeze() would also collapse any other
        # dimension that happens to have size 1.
        return {k: v.squeeze(0) for k, v in encoding.items()}

<h3> What model should we fine-tune for this task? </h3>
<p> We chose the BLIP model because it has been pre-trained on vision-language tasks, more specifically captioning: <b><u> (https://huggingface.co/Salesforce/blip-image-captioning-base) </u></b>
<p> We thought: "What if, instead of captioning, it automatically produced the poems as the captions themselves?"
<p> And so, we started fine-tuning the captioning model with our poem dataset



In [None]:
# The processor and the model are both loaded from the same pretrained
# BLIP captioning checkpoint on the Hugging Face Hub.
blip_checkpoint = "Salesforce/blip-image-captioning-base"
processor = AutoProcessor.from_pretrained(blip_checkpoint)
model = BlipForConditionalGeneration.from_pretrained(blip_checkpoint)

In [None]:
# Wrap the loaded records in the captioning dataset and serve them in
# shuffled mini-batches of two samples.
train_dataset = ImageCaptioningDataset(dataset=data, processor=processor)
train_dataloader = DataLoader(dataset=train_dataset, batch_size=2, shuffle=True)

In [None]:
# Fine-tune BLIP on the image/poem pairs with a plain PyTorch training loop.
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)

# `device` is defined exactly once, as a torch.device.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")
model.to(device)

model.train()

for epoch in range(50):
    print("Epoch:", epoch)
    for idx, batch in enumerate(train_dataloader):
        input_ids = batch.pop("input_ids").to(device)
        pixel_values = batch.pop("pixel_values").to(device)
        # The dataset pads every caption to max length; the attention mask
        # marks which positions are real tokens (1) and which are padding (0).
        attention_mask = batch.pop("attention_mask").to(device)

        # Labels are the input ids, except at padding positions, which are set
        # to -100 so the cross-entropy loss skips them. Otherwise the loss is
        # dominated by the trivial task of predicting padding.
        labels = input_ids.masked_fill(attention_mask == 0, -100)

        outputs = model(
            input_ids=input_ids,
            pixel_values=pixel_values,
            attention_mask=attention_mask,
            labels=labels,
        )

        loss = outputs.loss

        print("Loss:", loss.item())

        loss.backward()

        optimizer.step()
        optimizer.zero_grad()

In [None]:
# Saving model
# The fine-tuned weights and the processor files go into the same directory,
# so both can later be reloaded from that single path.

save_directory = "models/finetuned_blip_captioning_model_new"
model.save_pretrained(save_directory=save_directory)
processor.save_pretrained(save_directory=save_directory)

<h3> TESTING </h3>

<p>Now we're going to see how well the model creates a line of a poem from an image

In [None]:
# Reload the fine-tuned captioning model and generate one poem line for a test image.
finetuned_blip_dir = "models/finetuned_blip_captioning_model_new"
processor = AutoProcessor.from_pretrained(finetuned_blip_dir)
model = BlipForConditionalGeneration.from_pretrained(finetuned_blip_dir)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

model.eval()

image_path = "image_test.jpg"
# Close the file handle as soon as the RGB copy has been made.
with Image.open(image_path) as raw_image:
    image = raw_image.convert("RGB")

inputs = processor(images=image, return_tensors="pt")
inputs = {key: value.to(device) for key, value in inputs.items()}

with torch.no_grad():
    generated_ids = model.generate(
        **inputs,
        max_length=7,            # Maximum length of the output, in tokens
        do_sample=True,          # Required for temperature / top_k / top_p to take effect
        num_beams=1,             # A single beam, i.e. no beam search
        no_repeat_ngram_size=1,  # No token may appear twice in the output
        temperature=1,           # Sampling temperature (1 leaves the distribution unchanged)
        top_k=3,                 # Sample only among the 3 most likely tokens
        top_p=0.95,              # Nucleus sampling threshold
    )

# skip_special_tokens drops tokenizer markers so that only the plain text is
# passed on as the prompt for the poem generator.
caption = processor.batch_decode(generated_ids, skip_special_tokens=True)[0].strip()

print(f"Generated Caption: {caption}")

<h2> Poem Generation </h2>

<p> Now that we have a poem line generated from an image, our goal is to create a whole poem that follows the line
<p> How do we do that?
<p> We can fine tune a pretrained generative model like GPT-2 with haiku (small poem) data

Before training the model, we had to find a lot of haiku data and preprocess it so that it was properly formatted for fine-tuning (check haiku.txt in datasets)

In [None]:
# Load the haiku corpus as a line-per-example text dataset.
dataset = load_dataset("text", data_files={"train": "datasets/haiku.txt"})

tokenizer = GPT2Tokenizer.from_pretrained("gpt2")

# Register '$' as the padding token so that fixed-length padding below is possible.
tokenizer.add_special_tokens({'pad_token': '$'})


def tokenize_function(examples):
    """Tokenize a batch of text examples to a fixed length of 128 tokens.

    Args:
        examples: Batch mapping with a ``"text"`` entry.

    Returns:
        The tokenizer output for the batch, truncated and padded to 128 tokens.
    """
    return tokenizer(
        examples["text"],
        max_length=128,
        padding="max_length",
        truncation=True,
    )


tokenized_dataset = dataset.map(tokenize_function, batched=True)

In [None]:
# Start from the pretrained GPT-2 weights and size the token embeddings to
# the tokenizer's vocabulary length.
model = GPT2LMHeadModel.from_pretrained("gpt2")
model.resize_token_embeddings(new_num_tokens=len(tokenizer))

In [None]:
# Hyper-parameters for fine-tuning GPT-2 on the haiku corpus, collected in one
# place so they are easy to scan and adjust.
training_config = dict(
    output_dir="./poem_generator",
    overwrite_output_dir=True,
    num_train_epochs=3,
    per_device_train_batch_size=8,
    learning_rate=5e-5,
    weight_decay=0.01,
    warmup_steps=500,
    save_steps=500,
    save_total_limit=2,
    logging_dir="./logs",
    logging_steps=100,
    prediction_loss_only=True,
)
training_args = TrainingArguments(**training_config)

# mlm=False: build causal (next-token) language-modeling batches rather than
# masked-language-modeling ones.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=tokenized_dataset["train"],
)
trainer.train()

In [None]:
# Save model
# The model weights and the tokenizer files are written to the same folder.

model.save_pretrained(save_directory="poem_generator")
tokenizer.save_pretrained(save_directory="poem_generator")

<h3> Testing </h3>

In [None]:
# Continue the generated caption into a full haiku with the fine-tuned GPT-2.
haiku_checkpoint = "models/gpt2_haiku_model/checkpoint-4000"
poem_model = GPT2LMHeadModel.from_pretrained(haiku_checkpoint)
poem_tokenizer = GPT2Tokenizer.from_pretrained(haiku_checkpoint)

poem_model.to(device)

# Calling the tokenizer (instead of .encode) also returns the attention mask,
# which generate() needs in order to tell prompt tokens from padding reliably.
poem_encoding = poem_tokenizer(caption, return_tensors='pt').to(device)
poem_inputs = poem_encoding["input_ids"]

with torch.no_grad():
    poem_output = poem_model.generate(
        input_ids=poem_inputs,
        attention_mask=poem_encoding["attention_mask"],
        max_length=150,
        num_return_sequences=1,
        no_repeat_ngram_size=1,  # No token may appear twice in the output
        do_sample=True,          # Required for temperature / top_k to take effect
        temperature=1,
        top_k=5,
    )

haiku = poem_tokenizer.decode(poem_output[0], skip_special_tokens=True)

print(haiku)

<h2> Poem-To-Speech </h2>
<p> We decided on using a fine-tuned version of Parler-TTS (https://github.com/huggingface/parler-tts) because it gave us the option to generate a poetic speaking style

In [None]:
# Synthesize speech for the generated haiku with Parler-TTS and write it to a WAV file.
device = "cuda:0" if torch.cuda.is_available() else "cpu"

tts_checkpoint = "parler-tts/parler_tts_mini_v0.1"
model = ParlerTTSForConditionalGeneration.from_pretrained(tts_checkpoint).to(device)
tokenizer = AutoTokenizer.from_pretrained(tts_checkpoint)

# `prompt` is the text to be spoken; `description` describes the voice and delivery.
prompt = haiku
description = "A female speaker with a slightly low-pitched, very expressive voice delivers her words at a normal  pace in a poetic but very slow manner with proper pauses while speaking inside a confined space with very clear audio"

input_ids = tokenizer(description, return_tensors="pt")["input_ids"].to(device)
prompt_input_ids = tokenizer(prompt, return_tensors="pt")["input_ids"].to(device)

generation = model.generate(input_ids=input_ids, prompt_input_ids=prompt_input_ids)

# Move the waveform to the CPU and drop singleton dimensions before saving.
audio_arr = generation.cpu().numpy().squeeze()
sf.write(file="parler_tts_out.wav", data=audio_arr, samplerate=model.config.sampling_rate)

<h2> RL Training (Not working) </h2>
<p> We were planning to use reinforcement learning to refine the poems for certain structures as shown in this paper: https://arxiv.org/abs/2102.04114
<p> Unfortunately, we didn't have time to finish developing the suggestions part of the code (see HaikuRefinerEnv_v0) and start the training

In [None]:
from stable_baselines3 import PPO
from HaikuRefinerEnv_v0 import HaikuEnvironment

# Train a PPO agent with an MLP policy on the haiku-refinement environment.
env = HaikuEnvironment()
model = PPO(policy="MlpPolicy", env=env, verbose=1)
model.learn(total_timesteps=10000)

# Evaluate
# Roll out one episode with the trained policy until the environment reports done.
# NOTE(review): assumes reset() returns only the observation and step() returns
# a 4-tuple — confirm against HaikuRefinerEnv_v0.
obs, done = env.reset(), False
while not done:
    action, _ = model.predict(obs)
    obs, reward, done, info = env.step(action)

print("Generated Haiku:", env.haiku)