# **Mounting Google Drive in Colab**

In [1]:
# Mount Google Drive at /content/drive; later cells read and write files
# under /content/drive/MyDrive, so this cell has to run first.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# **Installing Required Packages**

In [2]:
# Install necessary libraries
!pip -q install transformers datasets torch accelerate peft
!pip -q install google-api-python-client google-auth-httplib2 google-auth-oauthlib


[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m547.8/547.8 kB[0m [31m9.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m296.4/296.4 kB[0m [31m501.9 kB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m11.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m316.1/316.1 kB[0m [31m26.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m39.9/39.9 MB[0m [31m17.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m64.9/64.9 kB[0m [31m4.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m11.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 kB[0m [31m16.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━

# **Load Data from Google Docs and Prepare Dataset**

In [3]:
from google.colab import auth
from googleapiclient.discovery import build
import io

# Authenticate the Colab user before any API client is built
# (presumably the clients below pick up these credentials — confirm).
auth.authenticate_user()

# Build the API clients: Google Drive (v3) and Google Docs (v1).
# Only docs_service is used by get_document_text below.
drive_service = build('drive', 'v3')
docs_service = build('docs', 'v1')

# ID of the Google Docs document whose text becomes the fine-tuning dataset
file_id = '1zXr27_JmtrFGWKrLQrz5P1kw2LegAoLnprEumobc7gU'

# Get the Google Doc content as plain text
def get_document_text(doc_id, service=None):
    """Fetch a Google Doc and return its text content as one plain string.

    Args:
        doc_id: ID of the Google Docs document to read.
        service: Optional Docs API client exposing
            ``documents().get(documentId=...).execute()``. When omitted, the
            module-level ``docs_service`` is used.

    Returns:
        The concatenated content of every text run found in paragraphs,
        table cells and tables of contents, in document order. Structural
        elements of any other kind are skipped. An empty string is returned
        when the document has no body content.
    """
    if service is None:
        service = docs_service
    document = service.documents().get(documentId=doc_id).execute()
    # Tolerate a missing/empty body instead of failing on None.get(...).
    doc_content = (document.get('body') or {}).get('content') or []

    def read_paragraph_element(element):
        """Return the text of one paragraph element ('' when it has no text run)."""
        text_run = element.get('textRun')
        if not text_run:
            return ''
        # A text run without content contributes nothing rather than None.
        return text_run.get('content') or ''

    def read_structural_elements(elements):
        """Recursively collect the text of a list of structural elements."""
        # Pieces are gathered in a list and joined once; the loop no longer
        # rebinds the name of the sequence it is iterating over.
        parts = []
        for value in elements or []:
            if 'paragraph' in value:
                for elem in value['paragraph'].get('elements') or []:
                    parts.append(read_paragraph_element(elem))
            elif 'table' in value:
                # Each table cell holds its own list of structural elements.
                for row in value['table'].get('tableRows') or []:
                    for cell in row.get('tableCells') or []:
                        parts.append(read_structural_elements(cell.get('content')))
            elif 'tableOfContents' in value:
                parts.append(read_structural_elements(value['tableOfContents'].get('content')))
        return ''.join(parts)

    return read_structural_elements(doc_content)

# Get the document text
doc_text = get_document_text(file_id)

dataset_path = '/content/drive/MyDrive/Colab Notebooks/Dense Diffusion/Medium Prompt Generator Fine Tuning.txt'
# Save the text to a file. The encoding is pinned to UTF-8 so that non-ASCII
# characters in the document are written the same way regardless of the
# runtime's locale default.
with open(dataset_path, 'w', encoding='utf-8') as f:
    f.write(doc_text)


# **Fine-Tuning GPT-2 with Custom Dataset**

In [4]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer, Trainer, TrainingArguments, DataCollatorForLanguageModeling
from datasets import load_dataset

# Load the pretrained "gpt2-medium" tokenizer and language-model head
tokenizer = GPT2Tokenizer.from_pretrained("gpt2-medium")
model = GPT2LMHeadModel.from_pretrained("gpt2-medium")

# Register '[PAD]' as the padding token; load_text_data below tokenizes with
# padding='max_length', which relies on the tokenizer having a pad token.
special_tokens_dict = {'pad_token': '[PAD]'}
tokenizer.add_special_tokens(special_tokens_dict)
# Resize the model's token embeddings to the tokenizer's new length so the
# added token has a matching embedding entry.
model.resize_token_embeddings(len(tokenizer))

# Load the text data
def load_text_data(file_path, tokenizer, block_size=128):
    """Build a tokenized training dataset from a text file.

    The file is loaded with the ``text`` dataset builder and its ``train``
    split is tokenized in batches. Every example is truncated and padded to
    exactly ``block_size`` tokens, and the special-tokens mask is included in
    the tokenizer output. The raw ``text`` column is dropped from the result.

    Args:
        file_path: Path of the text file to load.
        tokenizer: Tokenizer callable applied to each batch of text.
        block_size: Fixed token length of every example.

    Returns:
        The tokenized dataset.
    """
    def _encode_batch(batch):
        # Fixed-length encoding: truncate long entries, pad short ones.
        return tokenizer(
            batch['text'],
            return_special_tokens_mask=True,
            truncation=True,
            padding='max_length',
            max_length=block_size,
        )

    raw_split = load_dataset('text', data_files=file_path)['train']
    return raw_split.map(_encode_batch, batched=True, remove_columns=["text"])

# Prepare dataset

# Tokenize the text file written earlier (default block_size of load_text_data)
train_dataset = load_text_data(dataset_path, tokenizer)

# Collator for language modelling with masked-LM mode switched off (mlm=False)
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False,
)

# Drive folder that receives the Trainer output and the final model/tokenizer
llm_folder_path = '/content/drive/MyDrive/Colab Notebooks/Dense Diffusion/medium_object_identifier_gpt'
# Define training arguments
training_args = TrainingArguments(
    output_dir=llm_folder_path,
    overwrite_output_dir=True,
    num_train_epochs=10,  # number of training epochs
    per_device_train_batch_size=4,  # training batch size per device
    save_steps=5_000,
    save_total_limit=3,
    logging_steps=200,
    report_to='none'
)

# Create Trainer instance
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dataset,
)

# Train the model
trainer.train()

# Save the model and the tokenizer into the same folder so the next cell can
# reload both from llm_folder_path
trainer.save_model(llm_folder_path)
tokenizer.save_pretrained(llm_folder_path)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/718 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.52G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

Map:   0%|          | 0/167 [00:00<?, ? examples/s]

Step,Training Loss
200,1.0336
400,0.0809


('/content/drive/MyDrive/Colab Notebooks/Dense Diffusion/medium_object_identifier_gpt/tokenizer_config.json',
 '/content/drive/MyDrive/Colab Notebooks/Dense Diffusion/medium_object_identifier_gpt/special_tokens_map.json',
 '/content/drive/MyDrive/Colab Notebooks/Dense Diffusion/medium_object_identifier_gpt/vocab.json',
 '/content/drive/MyDrive/Colab Notebooks/Dense Diffusion/medium_object_identifier_gpt/merges.txt',
 '/content/drive/MyDrive/Colab Notebooks/Dense Diffusion/medium_object_identifier_gpt/added_tokens.json')

# **Generating and Extracting Objects from Text with Fine-Tuned GPT-2**

In [5]:
import re
import torch
from transformers import GPT2LMHeadModel, GPT2Tokenizer

# Load the fine-tuned model and tokenizer.
# llm_folder_path is defined in the training cell, so that cell must have run.
output_dir = llm_folder_path
tokenizer = GPT2Tokenizer.from_pretrained(output_dir)
# NOTE(review): the tokenizer was saved after '[PAD]' had been added, so
# re-adding it here is presumably a no-op — confirm.
tokenizer.add_special_tokens({'pad_token': '[PAD]'})

# Initialize the model and resize token embeddings to the tokenizer length
model = GPT2LMHeadModel.from_pretrained(output_dir)
model.resize_token_embeddings(len(tokenizer))
# Run on the GPU when one is available, otherwise on the CPU
model = model.to('cuda' if torch.cuda.is_available() else 'cpu')

# Test the fine-tuned model
def generate_text(instruction, prompt, focus_object, model, tokenizer, max_length=100):
    """Sample one completion from the model for a prompt/object pair.

    The model input is built from ``prompt`` and ``focus_object`` only;
    ``instruction`` is printed for reference and is not part of the model
    input.

    Args:
        instruction: Task description; printed, not fed to the model.
        prompt: Original scene prompt.
        focus_object: Object from the prompt the completion should focus on.
        model: Causal language model exposing ``device`` and ``generate``.
        tokenizer: Tokenizer matching ``model``.
        max_length: Value passed to ``generate`` as ``max_length``.

    Returns:
        The first generated sequence decoded with special tokens skipped.
    """
    print("\n\nInstruction",instruction)
    formatted_prompt = f"\nOriginal Prompt: \"{prompt}\"\nObject: \"{focus_object}\"\n"
    encoded = tokenizer(formatted_prompt, return_tensors='pt')
    # Follow the device of the model that was passed in rather than probing
    # for CUDA globally, so a CPU model on a CUDA host still works.
    device = model.device
    input_ids = encoded.input_ids.to(device)
    # Use the mask produced by the tokenizer; rebuilding it from pad_token_id
    # breaks when the tokenizer has no pad token.
    attention_mask = encoded.attention_mask.to(device)
    output = model.generate(
        input_ids,
        attention_mask=attention_mask,
        max_length=max_length,
        num_return_sequences=1,
        pad_token_id=tokenizer.eos_token_id,
        do_sample=True,
        temperature=0.7,
        top_p=0.9,
    )
    response = tokenizer.decode(output[0], skip_special_tokens=True)
    return response

# Improved extract_objects function
def extract_text_between_first_two_prompts(text):
    """Return the text lying between the first two 'Edited Prompt: "..."' entries.

    The input is printed first. Each entry is matched non-greedily up to its
    closing double quote. The slice from the end of the first entry to the
    start of the second is returned with surrounding whitespace stripped.
    When fewer than two entries are present, the message
    'Not enough "Edited Prompt:" entries found' is returned instead.
    """
    print(text)

    # (start, end) offsets of every 'Edited Prompt: "..."' entry, in order.
    entry_spans = [m.span() for m in re.finditer(r'Edited Prompt: ".*?"', text)]

    if len(entry_spans) < 2:
        return 'Not enough "Edited Prompt:" entries found'

    first_end = entry_spans[0][1]
    second_start = entry_spans[1][0]
    return text[first_end:second_start].strip()

# Test the fine-tuned model with multiple prompts.
# Note: generate_text only prints `instruction`; the text given to the model
# is built from the prompt and the object alone.
instruction = "You are given a prompt and an object. The prompt describes a scene in which multiple objects are interacting with each other. The object you are provided with is one of the objects in the prompt. You are to make a simpler prompt from the original one that focuses more on given object and not at the other objects in the original prompt"
# Each entry is (original prompt, object to focus on)
prompts =[
    ("A dog playing with a ball in the park.", 'ball'),
    ("A Lion reading a book on a beach.", 'Lion'),
    ("A dog playing with a ball in the park.", 'dog'),
    ("A child building a sandcastle on the beach.", 'sandcastle'),
    ("A chef cooking in a kitchen.", 'kitchen'),
    ("A man painting a fence white.", 'man'),
    ("A woman jogging with a dog in the morning.", 'woman'),
    ("A child playing with a toy train on the floor.", 'toy train'),
    ("A boy flying a kite at the beach.", 'kite'),
    ("A girl drawing with markers on paper.", 'markers'),
    ("A man cooking dinner in the kitchen.", 'man'),
    ("A woman hiking a mountain trail.", 'mountain'),
    ("A child blowing bubbles in the yard.", 'bubbles'),
    ("A boy playing with a remote control car.", 'remote control car'),
    ("A girl reading a comic book.", 'comic book'),
    ("A man riding a horse in the forest.", 'horse')
]


# Generate and print one sampled completion per (prompt, object) pair
for prompt in prompts:
    generated_text = generate_text(instruction, prompt[0],prompt[1], model, tokenizer)
    # print(f"Original Prompt Text = {prompt}")
    print(generated_text)

    # Disabled post-processing step (extract_text_between_first_two_prompts):
    # edited_prompt = extract_text_between_first_two_prompts(generated_text)
    # print(f"Extracted Objects = {edited_prompt}")
    # print("===================================")




Instruction You are given a prompt and an object. The prompt describes a scene in which multiple objects are interacting with each other. The object you are provided with is one of the objects in the prompt. You are to make a simpler prompt from the original one that focuses more on given object and not at the other objects in the original prompt

Original Prompt: "A dog playing with a ball in the park."
Object: "ball"
 Edited Prompt: "A Ball in the park." Edited Prompt: "A Ball in the park." Edited Prompt: "A Ball in the park." Edited Prompt: "A Ball in the park." Edited Prompt: "A Ball in the park." Edited Prompt: "A Ball in the park." Edited Prompt: "A Ball in the park." Edited Prompt: "A Ball in


Instruction You are given a prompt and an object. The prompt describes a scene in which multiple objects are interacting with each other. The object you are provided with is one of the objects in the prompt. You are to make a simpler prompt from the original one that focuses more on giv