# **Mounting Google Drive in Colab**

In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive; the dataset file and the model folder
# used by the later cells are addressed under /content/drive/MyDrive.
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# **Installing Required Packages**

In [None]:
# Install necessary libraries
!pip -q install transformers datasets torch accelerate peft
!pip -q install google-api-python-client google-auth-httplib2 google-auth-oauthlib


# **Load Data from Google Docs and Prepare Dataset**

In [None]:
from google.colab import auth
from googleapiclient.discovery import build
import io

# Authenticate the current Colab user (presumably so the API clients built
# below can act with that user's credentials — confirm).
auth.authenticate_user()

# Build the Google API clients: Drive (v3) and Docs (v1).
# NOTE(review): only docs_service is used in the visible cells; drive_service
# looks unused — confirm before removing.
drive_service = build('drive', 'v3')
docs_service = build('docs', 'v1')

# ID of the Google Docs document whose text becomes the fine-tuning dataset
file_id = '1cppb7fgRoKmOxoiRYiGVcxav4ZGxbgpe_Kj-rme9YZk'

# Get the Google Doc content as plain text
def get_document_text(doc_id):
    """Return the text of a Google Doc as one plain-text string.

    The document is fetched through the module-level ``docs_service`` client
    and its structural elements are walked depth-first. The content of every
    text run found in paragraphs, table cells and tables of contents is
    concatenated in document order; other element kinds are skipped.

    Args:
        doc_id: ID of the Google Docs document to read.

    Returns:
        The concatenated text, or an empty string when the document has no
        body content.
    """
    document = docs_service.documents().get(documentId=doc_id).execute()
    # Tolerate a missing/empty body instead of failing on ``None.get``.
    doc_content = (document.get('body') or {}).get('content') or []

    def read_paragraph_element(element):
        """Return the text of one paragraph element ('' if it is not a text run)."""
        text_run = element.get('textRun')
        if not text_run:
            return ''
        # A text run without 'content' contributes nothing rather than None.
        return text_run.get('content') or ''

    def read_structural_elements(elements):
        """Recursively collect the text of a list of structural elements."""
        # Gather the pieces and join once instead of growing a string with +=.
        pieces = []
        for value in elements or []:
            if 'paragraph' in value:
                # Separate name: do not rebind the list that is being iterated.
                paragraph_elements = value.get('paragraph').get('elements') or []
                pieces.extend(read_paragraph_element(elem) for elem in paragraph_elements)
            elif 'table' in value:
                for row in value.get('table').get('tableRows') or []:
                    for cell in row.get('tableCells') or []:
                        # Cells hold structural elements themselves, so recurse.
                        pieces.append(read_structural_elements(cell.get('content')))
            elif 'tableOfContents' in value:
                toc = value.get('tableOfContents')
                pieces.append(read_structural_elements(toc.get('content')))
        return ''.join(pieces)

    return read_structural_elements(doc_content)

# Get the document text
doc_text = get_document_text(file_id)

dataset_path = '/content/drive/MyDrive/Colab Notebooks/Dense Diffusion/Medium Prompt Generator Fine Tuning.txt'
# Save the text to a file. The encoding is given explicitly so that any
# non-ASCII characters in the document are written as UTF-8 instead of
# whatever the platform's default text encoding happens to be.
with open(dataset_path, 'w', encoding='utf-8') as f:
    f.write(doc_text)


# **Fine-Tuning GPT-2 with Custom Dataset**

In [None]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer, Trainer, TrainingArguments, DataCollatorForLanguageModeling
from datasets import load_dataset

# Base checkpoint shared by the tokenizer and the model
base_checkpoint = "gpt2-medium"

# Tokenizer first: load it and register '[PAD]' as its padding token
tokenizer = GPT2Tokenizer.from_pretrained(base_checkpoint)
special_tokens_dict = {'pad_token': '[PAD]'}
tokenizer.add_special_tokens(special_tokens_dict)

# Then the model, with its token embeddings resized to the tokenizer's length
model = GPT2LMHeadModel.from_pretrained(base_checkpoint)
model.resize_token_embeddings(len(tokenizer))

# Load the text data
def load_text_data(file_path, tokenizer, block_size=128):
    """Load a plain-text file and tokenize it into fixed-length examples.

    The file is read with the ``text`` dataset loader (one example per line by
    default). Rows that are empty or whitespace-only are dropped before
    tokenization: padded to ``block_size`` they would consist of padding only
    and contribute no training signal. Every remaining row is tokenized,
    truncated and padded to exactly ``block_size`` tokens.

    Args:
        file_path: Path of the text file to load.
        tokenizer: Callable tokenizer applied to batches of strings.
        block_size: Fixed token length of every example.

    Returns:
        The tokenized ``train`` split with the raw ``text`` column removed.
    """
    dataset = load_dataset('text', data_files=file_path)['train']
    # Blank lines (e.g. paragraph separators) carry nothing to learn from.
    dataset = dataset.filter(lambda example: example['text'].strip() != '')

    def tokenize_function(examples):
        """Tokenize one batch of rows to fixed-length, padded sequences."""
        return tokenizer(
            examples['text'],
            return_special_tokens_mask=True,
            truncation=True,
            padding='max_length',
            max_length=block_size,
        )

    tokenized_dataset = dataset.map(tokenize_function, batched=True, remove_columns=["text"])
    return tokenized_dataset

# Prepare dataset: tokenize the text file written to dataset_path

train_dataset = load_text_data(dataset_path, tokenizer)

# mlm=False: collate batches for causal language modeling rather than
# masked language modeling.
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False,
)

# Drive folder that receives checkpoints and the final model/tokenizer; the
# generation cell later reloads both from this same folder.
llm_folder_path = '/content/drive/MyDrive/Colab Notebooks/Dense Diffusion/medium_object_identifier_gpt'
# Define training arguments
training_args = TrainingArguments(
    output_dir=llm_folder_path,
    overwrite_output_dir=True,
    num_train_epochs=10,  # number of passes over the training set
    per_device_train_batch_size=4,  # examples per device per step
    save_steps=5_000,
    save_total_limit=3,
    logging_steps=200,
    report_to='none'
)

# Create Trainer instance
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dataset,
)

# Train the model
trainer.train()

# Save the model and the tokenizer side by side in llm_folder_path
trainer.save_model(llm_folder_path)
tokenizer.save_pretrained(llm_folder_path)


# **Generating and Extracting Objects from Text with Fine-Tuned GPT-2**

In [None]:
import re
import torch
from transformers import GPT2LMHeadModel, GPT2Tokenizer

# Reload the fine-tuned artifacts from the folder the training cell saved to
output_dir = llm_folder_path
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Tokenizer, with '[PAD]' registered as its padding token
tokenizer = GPT2Tokenizer.from_pretrained(output_dir)
tokenizer.add_special_tokens({'pad_token': '[PAD]'})

# Model, embeddings sized to the tokenizer, moved to the GPU when one is available
model = GPT2LMHeadModel.from_pretrained(output_dir)
model.resize_token_embeddings(len(tokenizer))
model = model.to(device)

# Test the fine-tuned model
def generate_text(instruction, prompt, model, tokenizer, max_length=100):
    """Sample one completion for an instruction/prompt pair.

    The input text is an ``Instruction:`` line followed by a ``Prompt:`` line
    holding the prompt in double quotes. It is tokenized, moved to the device
    of ``model`` and passed to ``model.generate`` with nucleus sampling
    (temperature 0.7, top_p 0.9).

    Args:
        instruction: Task description placed on the ``Instruction:`` line.
        prompt: Text placed, quoted, on the ``Prompt:`` line.
        model: Model exposing ``generate`` and ``device``.
        tokenizer: Tokenizer used to encode the input and decode the output.
        max_length: Value forwarded to ``generate`` as ``max_length``.

    Returns:
        The first generated sequence decoded to text with special tokens
        skipped.
    """
    formatted_prompt = f"Instruction: {instruction}\nPrompt: \"{prompt}\""
    # Follow the model that was passed in rather than guessing the device
    # from CUDA availability, so inputs and weights always sit together.
    device = model.device
    encoded = tokenizer(formatted_prompt, return_tensors='pt')
    input_ids = encoded.input_ids.to(device)
    # Use the tokenizer's own mask; rebuilding it by comparing against
    # pad_token_id breaks when the tokenizer has no padding token.
    attention_mask = encoded.attention_mask.to(device)
    output = model.generate(
        input_ids,
        attention_mask=attention_mask,
        max_length=max_length,
        num_return_sequences=1,
        pad_token_id=tokenizer.eos_token_id,
        do_sample=True,
        temperature=0.7,
        top_p=0.9,
    )
    response = tokenizer.decode(output[0], skip_special_tokens=True)
    return response

# Improved extract_objects function
def extract_objects(text):
    """Return the sorted object names from the first quoted ``Objects:`` list.

    Searches ``text`` for the first occurrence of ``Objects:`` followed by
    optional whitespace and a non-empty double-quoted string, splits that
    string on commas, strips each piece and drops pieces that end up empty.

    Args:
        text: Generated text to search.

    Returns:
        The object names in sorted order, or an empty list when no quoted
        ``Objects:`` list is present.
    """
    found = re.search(r'Objects:\s*"([^"]+)"', text)
    if found is None:
        return []

    # Only the first quoted list is used; later repetitions are ignored.
    names = (piece.strip() for piece in found.group(1).split(','))
    return sorted(name for name in names if name)

# Test the fine-tuned model with multiple prompts: every prompt is sent
# through generate_text with the same instruction, and the object list is then
# parsed out of the generated text with extract_objects.
instruction = "Extract simple and drawable objects from the prompt. Avoid complex objects like backgrounds."
prompts = [
    "A dog playing with a ball in the park.",
    "A Lion reading a book on a beach.",
    # NOTE(review): identical to the first prompt — confirm the repeat is intentional.
    "A dog playing with a ball in the park.",
    "A child building a sandcastle on the beach.",
    "A chef cooking in a kitchen.",
    "A man painting a fence white.",
    "A woman jogging with a dog in the morning.",
    "A child playing with a toy train on the floor.",
    "A boy flying a kite at the beach.",
    "A girl drawing with markers on paper.",
    "A man cooking dinner in the kitchen.",
    "A woman hiking a mountain trail.",
    "A child blowing bubbles in the yard.",
    "A boy playing with a remote control car.",
    "A girl reading a comic book.",
    "A man riding a horse in the forest."
]

# Print the prompt, the raw generation and the parsed objects for each prompt,
# followed by a separator line.
for prompt in prompts:
    generated_text = generate_text(instruction, prompt, model, tokenizer)
    print(f"Prompt Text = {prompt}")
    print(f"Generated Text = {generated_text}")

    extracted_objects = extract_objects(generated_text)
    print(f"Extracted Objects = {extracted_objects}")
    print("===================================")


Prompt Text = A dog playing with a ball in the park.
Generated Text = Instruction: Extract simple and drawable objects from the prompt. Avoid complex objects like backgrounds.
Prompt: "A dog playing with a ball in the park." \t Objects: "Dog, Ball" \t Objects: "Dog, Ball" \t Objects: "Dog, Ball" \t Objects: "Dog" \t Objects: "Dog" \t Objects: "Dog" \t Objects: "Dog" \t Objects: "Dog" \t Objects:
text = Instruction: Extract simple and drawable objects from the prompt. Avoid complex objects like backgrounds.
Prompt: "A dog playing with a ball in the park." \t Objects: "Dog, Ball" \t Objects: "Dog, Ball" \t Objects: "Dog, Ball" \t Objects: "Dog" \t Objects: "Dog" \t Objects: "Dog" \t Objects: "Dog" \t Objects: "Dog" \t Objects:
Match found: Dog, Ball
Extracted Objects = ['Ball', 'Dog']
Prompt Text = A Lion reading a book on a beach.
Generated Text = Instruction: Extract simple and drawable objects from the prompt. Avoid complex objects like backgrounds.
Prompt: "A Lion reading a book on a