# **Mounting Google Drive in Colab**

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# **Installing Required Packages**

---



In [2]:
!pip -q install transformers datasets torch
!pip -q install transformers[torch] accelerate -U
!pip -q install google-api-python-client google-auth-httplib2 google-auth-oauthlib

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m547.8/547.8 kB[0m [31m6.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m40.8/40.8 MB[0m [31m14.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m14.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m64.9/64.9 kB[0m [31m6.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 kB[0m [31m20.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m16.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m21.3/21.3 MB[0m [31m67.9 MB/s[0m eta [36m0:00:00[0m
[?25h[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source 

# **Load Data from Google Docs and Prepare Dataset**

In [3]:
from google.colab import auth
from googleapiclient.discovery import build
import io

auth.authenticate_user()

# Set up the Drive v3 API
drive_service = build('drive', 'v3')
docs_service = build('docs', 'v1')

# Specify the Google Docs file ID
file_id = '14-5SmOeGt1VtQhsnmNrSQDJk0svR8Q1PaMda5e9n5AE'

# Get the Google Doc content as plain text
def get_document_text(doc_id):
    document = docs_service.documents().get(documentId=doc_id).execute()
    doc_content = document.get('body').get('content')

    def read_paragraph_element(element):
        text_run = element.get('textRun')
        if not text_run:
            return ''
        return text_run.get('content')

    def read_structural_elements(elements):
        text = ''
        for value in elements:
            if 'paragraph' in value:
                elements = value.get('paragraph').get('elements')
                for elem in elements:
                    text += read_paragraph_element(elem)
            elif 'table' in value:
                table = value.get('table')
                for row in table.get('tableRows'):
                    cells = row.get('tableCells')
                    for cell in cells:
                        text += read_structural_elements(cell.get('content'))
            elif 'tableOfContents' in value:
                toc = value.get('tableOfContents')
                text += read_structural_elements(toc.get('content'))
        return text

    return read_structural_elements(doc_content)

# Get the document text
doc_text = get_document_text(file_id)

# Save the text to a file
with open('/content/drive/MyDrive/Colab Notebooks/Dense Diffusion/Prompt Generator Fine Tuning.txt', 'w') as f:
    f.write(doc_text)


# **Fine-Tuning GPT-2 with Custom Dataset**

In [4]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer, Trainer, TrainingArguments, DataCollatorForLanguageModeling
import datasets

# Load tokenizer and model
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
model = GPT2LMHeadModel.from_pretrained("gpt2")

# Add a special padding token
tokenizer.add_special_tokens({'pad_token': '[PAD]'})
model.resize_token_embeddings(len(tokenizer))

# Function to load the text data
def load_text_data(file_path, tokenizer, block_size=128):
    dataset = datasets.load_dataset('text', data_files=file_path)['train']
    def tokenize_function(examples):
        return tokenizer(examples['text'], return_special_tokens_mask=True)
    tokenized_dataset = dataset.map(tokenize_function, batched=True, remove_columns=["text"])
    return tokenized_dataset

# Prepare dataset
dataset_path = '/content/drive/MyDrive/Colab Notebooks/Dense Diffusion/Prompt Generator Fine Tuning.txt'
train_dataset = load_text_data(dataset_path, tokenizer)

data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False,
)

# Define training arguments
training_args = TrainingArguments(
    output_dir='/content/drive/MyDrive/Colab Notebooks/Dense Diffusion/prompt_generator_gpt',
    overwrite_output_dir=True,
    num_train_epochs=5,
    per_device_train_batch_size=2,
    save_steps=10_000,
    save_total_limit=2,
)

# Create Trainer instance
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dataset,
)

# Train the model
trainer.train()

# Save the model
trainer.save_model('/content/drive/MyDrive/Colab Notebooks/Dense Diffusion/prompt_generator_gpt')
tokenizer.save_pretrained('/content/drive/MyDrive/Colab Notebooks/Dense Diffusion/prompt_generator_gpt')


tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

Map:   0%|          | 0/71 [00:00<?, ? examples/s]

Step,Training Loss


('/content/drive/MyDrive/Colab Notebooks/Dense Diffusion/prompt_generator_gpt/tokenizer_config.json',
 '/content/drive/MyDrive/Colab Notebooks/Dense Diffusion/prompt_generator_gpt/special_tokens_map.json',
 '/content/drive/MyDrive/Colab Notebooks/Dense Diffusion/prompt_generator_gpt/vocab.json',
 '/content/drive/MyDrive/Colab Notebooks/Dense Diffusion/prompt_generator_gpt/merges.txt',
 '/content/drive/MyDrive/Colab Notebooks/Dense Diffusion/prompt_generator_gpt/added_tokens.json')

# **Generating and Extracting Objects from Text with Fine-Tuned GPT-2**

In [6]:
import re
import torch
from transformers import GPT2LMHeadModel, GPT2Tokenizer

# Load the fine-tuned model and tokenizer
output_dir = '/content/drive/MyDrive/Colab Notebooks/Dense Diffusion/prompt_generator_gpt'
tokenizer = GPT2Tokenizer.from_pretrained(output_dir)
model = GPT2LMHeadModel.from_pretrained(output_dir).to('cuda' if torch.cuda.is_available() else 'cpu')

# Function to generate edited prompt
def generate_text(original_prompt, obj, model, tokenizer, max_length=100):
    prompt = f'Original Prompt: "{original_prompt}" Object: "{obj}" Edited Prompt:'
    input_ids = tokenizer(prompt, return_tensors='pt').input_ids.to('cuda' if torch.cuda.is_available() else 'cpu')
    attention_mask = (input_ids != tokenizer.pad_token_id).long()
    output = model.generate(
        input_ids,
        attention_mask=attention_mask,
        max_length=max_length,
        num_return_sequences=1,
        pad_token_id=tokenizer.eos_token_id,
        do_sample=True,
        temperature=0.7,
        top_p=0.9,
    )
    response = tokenizer.decode(output[0], skip_special_tokens=True)
    return response

# Function to extract the edited prompt from the generated text
def extract_edited_prompt(text):
    match = re.search(r'Edited Prompt: "(.*?)"', text)
    if match:
        return match.group(1)
    return None

# Unseen test dataset
test_dataset = [
    ("A boy playing with a toy plane on the floor.", "Toy Plane"),
    ("A girl painting with watercolors on paper.", "Watercolors"),
    ("A man building a model airplane on the table.", "Model Airplane"),
    ("A woman sewing a dress with a sewing machine.", "Dress"),
    ("A child playing with a toy train on the tracks.", "Toy Train"),
    ("A boy playing with a toy dinosaur in the room.", "Toy Dinosaur"),
    ("A girl drawing a picture with colored pencils.", "Colored Pencils"),
    ("A man planting flowers in the garden.", "Flowers"),
    ("A woman arranging flowers in a vase.", "Flowers"),
    ("A child building a snowman in the yard.", "Snowman"),
    ("A boy playing with a toy rocket in the park.", "Toy Rocket"),
    ("A girl painting a picture with a paintbrush.", "Paintbrush"),
    ("A man fishing in a river with a fishing rod.", "Fishing Rod"),
    ("A woman baking cookies in the oven.", "Cookies"),
    ("A boy playing with building blocks on the floor.", "Building Blocks")
]

# Test the fine-tuned model with the unseen dataset
for original_prompt, obj in test_dataset:
    generated_text = generate_text(original_prompt, obj, model, tokenizer)
    edited_prompt = extract_edited_prompt(generated_text)
    print(f"Original Prompt: '{original_prompt}' \t Object: '{obj}'")
    print(f"Generated Text: {generated_text}")
    print(f"Extracted Edited Prompt: {edited_prompt}")
    print("===================================")


Original Prompt: 'A boy playing with a toy plane on the floor.' 	 Object: 'Toy Plane'
Generated Text: Original Prompt: "A boy playing with a toy plane on the floor." Object: "Toy Plane" Edited Prompt: "A Toy Plane on the floor." Edited Prompt: "A Toy Plane on the floor." Edited Prompt: "A Toy Plane on the floor." Edited Prompt: "A Toy Plane on the floor." Edited Prompt: "A Toy Plane on the floor." Edited Prompt: "A Toy Plane on the floor." Edited Prompt: "A Toy Plane on the floor." Edited Prompt
Extracted Edited Prompt: A Toy Plane on the floor.
Original Prompt: 'A girl painting with watercolors on paper.' 	 Object: 'Watercolors'
Generated Text: Original Prompt: "A girl painting with watercolors on paper." Object: "Watercolors" Edited Prompt: "A Watercolors on paper." Edited Prompt: "Watercolors on paper." Edited Prompt: "Watercolors on paper." Edited Prompt: "Watercolors on paper." Edited Prompt: "Watercolors on paper." Edited Prompt: "Watercolors on paper." Edited Prompt: "Watercolor