# **Mounting Google Drive in Colab**

In [1]:
from google.colab import drive
# Mount the user's Google Drive into the Colab filesystem at /content/drive.
# Later cells read and write files under /content/drive/MyDrive, so this cell
# has to run first.
drive.mount('/content/drive')

Mounted at /content/drive


# **Installing Required Packages**

In [2]:
# Use the %pip magic (not !pip) so packages are installed into the environment
# of the running kernel rather than whatever `pip` the subshell resolves to.
# The extras spec is quoted so the shell cannot treat the brackets as a glob.
%pip install -q transformers datasets torch
%pip install -q -U "transformers[torch]" accelerate
%pip install -q google-api-python-client google-auth-httplib2 google-auth-oauthlib

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m547.8/547.8 kB[0m [31m13.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m40.8/40.8 MB[0m [31m17.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m14.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m64.9/64.9 kB[0m [31m9.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 kB[0m [31m16.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m13.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m21.3/21.3 MB[0m [31m71.3 MB/s[0m eta [36m0:00:00[0m
[?25h[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source

# **Load Data from Google Docs and Prepare Dataset**

In [3]:
from google.colab import auth
from googleapiclient.discovery import build
import io

# Run the interactive Colab sign-in flow.
# NOTE(review): presumably this is what lets the build() calls below pick up
# credentials without an explicit `credentials=` argument -- confirm.
auth.authenticate_user()

# Build the API clients: Drive v3 and Docs v1.
# docs_service is the client used by get_document_text below; drive_service is
# not referenced anywhere else in the cells visible here.
drive_service = build('drive', 'v3')
docs_service = build('docs', 'v1')

# Specify the Google Docs file ID (you can get this from the URL of the Google Doc)
file_id = '1ActMd8s0tWUaYAfuyWAQZJwu14uH5h8-t5_u5bVdi6A'

# Get the Google Doc content as plain text
def get_document_text(doc_id, service=None):
    """Fetch a Google Doc and return its textual content as one string.

    The document body is walked recursively. Text is collected from
    paragraphs, from every cell of every table, and from tables of contents;
    any other kind of structural element is skipped.

    Args:
        doc_id: ID of the Google Doc to read.
        service: Docs API client exposing ``documents().get(...).execute()``.
            When omitted, the module-level ``docs_service`` is used.

    Returns:
        The concatenated text of the document, or ``''`` when the response
        carries no body content.
    """
    if service is None:
        service = docs_service
    document = service.documents().get(documentId=doc_id).execute()
    # Tolerate a response without 'body' / 'content' instead of raising.
    body_content = (document.get('body') or {}).get('content') or []

    def read_paragraph_element(element):
        # Only text runs carry text; other paragraph elements contribute nothing.
        text_run = element.get('textRun')
        if not text_run:
            return ''
        return text_run.get('content') or ''

    def read_structural_elements(elements):
        # Collect fragments and join once rather than growing a string in the loop.
        parts = []
        for value in elements or []:
            if 'paragraph' in value:
                paragraph = value.get('paragraph') or {}
                for elem in paragraph.get('elements') or []:
                    parts.append(read_paragraph_element(elem))
            elif 'table' in value:
                # The text in table cells.
                table = value.get('table') or {}
                for row in table.get('tableRows') or []:
                    for cell in row.get('tableCells') or []:
                        parts.append(read_structural_elements(cell.get('content')))
            elif 'tableOfContents' in value:
                # The text in the table of contents.
                toc = value.get('tableOfContents') or {}
                parts.append(read_structural_elements(toc.get('content')))
        return ''.join(parts)

    return read_structural_elements(body_content)

# Get the document text
doc_text = get_document_text(file_id)

# Save the text to a file. The encoding is pinned to UTF-8 so non-ASCII
# characters in the document are written the same way regardless of the
# runtime's default locale.
with open('/content/drive/MyDrive/Colab Notebooks/Dense Diffusion/Object Identifier Fine Tunning Dataset.txt', 'w', encoding='utf-8') as f:
    f.write(doc_text)


# **Fine-Tuning GPT-2 with Custom Dataset**

In [4]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer, Trainer, TrainingArguments, TextDataset, DataCollatorForLanguageModeling
import datasets

# Load the pretrained "gpt2" tokenizer and language-model weights.
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
model = GPT2LMHeadModel.from_pretrained("gpt2")

# Register '[PAD]' as the padding token, then resize the model's token
# embeddings to the new vocabulary size so the added token has an embedding.
tokenizer.add_special_tokens({'pad_token': '[PAD]'})
model.resize_token_embeddings(len(tokenizer))

# Function to load the text data
def load_text_data(file_path, tokenizer, block_size=128):
    """Load a plain-text file and tokenize it line by line.

    Args:
        file_path: Path of the text file; each line becomes one example.
        tokenizer: Tokenizer applied to every line.
        block_size: Maximum number of tokens kept per example. Longer lines
            are truncated to this length. ``None`` leaves the limit to the
            tokenizer's own default maximum.

    Returns:
        The tokenized ``train`` split with the raw ``text`` column removed.
    """
    dataset = datasets.load_dataset('text', data_files=file_path)['train']
    # Blank lines would tokenize to zero-length examples, which add nothing to
    # training and can produce an empty batch.
    dataset = dataset.filter(lambda example: example['text'].strip() != '')

    def tokenize_function(examples):
        # block_size was previously accepted but ignored; enforce it here.
        return tokenizer(
            examples['text'],
            truncation=True,
            max_length=block_size,
            return_special_tokens_mask=True,
        )

    # NOTE(review): no end-of-text token is appended to the examples, which may
    # be why the sample generations keep repeating "Objects: ..." -- consider
    # appending tokenizer.eos_token to each line before tokenizing.
    tokenized_dataset = dataset.map(tokenize_function, batched=True, remove_columns=["text"])
    return tokenized_dataset

# Prepare dataset (the file written by the Google Docs export cell)
dataset_path = '/content/drive/MyDrive/Colab Notebooks/Dense Diffusion/Object Identifier Fine Tunning Dataset.txt'
train_dataset = load_text_data(dataset_path, tokenizer)

# mlm=False selects causal (next-token) language-modeling batches rather than
# masked-language-modeling ones.
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False,
)

# Define training arguments
training_args = TrainingArguments(
    output_dir='/content/drive/MyDrive/Colab Notebooks/Dense Diffusion/object_identifier_gpt',
    overwrite_output_dir=True,
    num_train_epochs=5,  # Increase the number of epochs for better fine-tuning
    per_device_train_batch_size=2,
    save_steps=10_000,
    save_total_limit=2,
)

# Create Trainer instance
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dataset,
)

# Train the model
trainer.train()

# Save the model and tokenizer side by side. The directory is taken from
# training_args instead of repeating the literal, so the two saves cannot
# drift away from the configured output_dir.
trainer.save_model(training_args.output_dir)
tokenizer.save_pretrained(training_args.output_dir)


tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

Map:   0%|          | 0/255 [00:00<?, ? examples/s]

Step,Training Loss
500,1.1916


('/content/drive/MyDrive/Colab Notebooks/Dense Diffusion/object_identifier_gpt/tokenizer_config.json',
 '/content/drive/MyDrive/Colab Notebooks/Dense Diffusion/object_identifier_gpt/special_tokens_map.json',
 '/content/drive/MyDrive/Colab Notebooks/Dense Diffusion/object_identifier_gpt/vocab.json',
 '/content/drive/MyDrive/Colab Notebooks/Dense Diffusion/object_identifier_gpt/merges.txt',
 '/content/drive/MyDrive/Colab Notebooks/Dense Diffusion/object_identifier_gpt/added_tokens.json')

# **Generating and Extracting Objects from Text with Fine-Tuned GPT-2**

In [5]:
import re
import torch
from transformers import GPT2LMHeadModel, GPT2Tokenizer

# Load the fine-tuned model and tokenizer from the directory the training cell
# saved them to. This rebinds the module-level `tokenizer` and `model` names.
output_dir = '/content/drive/MyDrive/Colab Notebooks/Dense Diffusion/object_identifier_gpt'
tokenizer = GPT2Tokenizer.from_pretrained(output_dir)
# Move the model to the GPU when one is available, otherwise keep it on the CPU.
model = GPT2LMHeadModel.from_pretrained(output_dir).to('cuda' if torch.cuda.is_available() else 'cpu')

# Test the fine-tuned model
def generate_text(prompt, model, tokenizer, max_length=100):
    """Sample one continuation of ``prompt`` from ``model`` and decode it.

    Args:
        prompt: Text to condition the generation on.
        model: Causal language model exposing ``device`` and ``generate``.
        tokenizer: Tokenizer matching ``model``; used to encode the prompt and
            decode the result.
        max_length: Forwarded to ``model.generate`` as ``max_length``.

    Returns:
        The decoded first generated sequence with special tokens removed. As
        the sample outputs in this notebook show, it starts with the prompt.
    """
    # Put the inputs on whatever device the model actually lives on, rather
    # than guessing from CUDA availability (a CPU model on a CUDA host would
    # otherwise receive GPU tensors).
    device = model.device
    encoded = tokenizer(prompt, return_tensors='pt')
    input_ids = encoded['input_ids'].to(device)
    # Use the mask produced by the tokenizer instead of rebuilding it from
    # pad_token_id, which fails when the tokenizer has no pad token.
    attention_mask = encoded['attention_mask'].to(device)
    output = model.generate(
        input_ids,
        attention_mask=attention_mask,
        max_length=max_length,
        num_return_sequences=1,
        pad_token_id=tokenizer.eos_token_id,
        do_sample=True,  # sampling: results differ from run to run
        temperature=0.7,
        top_p=0.9,
    )
    response = tokenizer.decode(output[0], skip_special_tokens=True)
    return response

# Extract objects from the generated text
def extract_objects(text):
    """Collect the object names listed after every ``Objects:`` marker.

    Each occurrence of ``Objects: "<comma-separated names>"`` in ``text`` is
    parsed. Names are split on commas, stripped of surrounding whitespace, and
    empty entries are dropped, so ``"a,b"``, ``"a , b"`` and ``""`` are all
    handled.

    Args:
        text: Generated text to scan.

    Returns:
        A sorted list of the distinct object names; empty when none are found.
    """
    unique_objects = set()
    for match in re.findall(r'Objects:\s*"([^"]*)"', text):
        for name in match.split(','):
            name = name.strip()
            if name:
                unique_objects.add(name)

    return sorted(unique_objects)

# Test the fine-tuned model with multiple prompts.
# Each prompt is a short scene description; the loop below prints what the
# model generates for it and the object names parsed out of that text.
prompts = [
    "A Lion reading a book on a beach.",
    "A dog playing with a ball in the park.",
    "A child building a sandcastle on the beach.",
    "A chef cooking in a kitchen.",
    "A man painting a fence white.",
    "A woman jogging with a dog in the morning.",
    "A child playing with a toy train on the floor.",
    "A boy flying a kite at the beach.",
    "A girl drawing with markers on paper.",
    "A man cooking dinner in the kitchen.",
    "A woman hiking a mountain trail.",
    "A child blowing bubbles in the yard.",
    "A boy playing with a remote control car.",
    "A girl reading a comic book.",
    "A man riding a horse in the forest."
]

# generate_text samples (do_sample=True) and no seed is set in this cell, so
# the printed generations are expected to differ between runs.
for prompt in prompts:
    generated_text = generate_text(prompt, model, tokenizer)
    print(f"Generated Text for prompt '{prompt}': {generated_text}")

    # Only lists introduced by an `Objects: "..."` marker are picked up.
    extracted_objects = extract_objects(generated_text)
    print(f"Extracted Objects for prompt '{prompt}': {extracted_objects}")
    print("===================================")


Generated Text for prompt 'A Lion reading a book on a beach.': A Lion reading a book on a beach. "A Lion, Book, Beach" \t Objects: "Lion, Beach" \t Objects: "Lion, Book, Beach" \t Objects: "Lion, Beach" \t Objects: "Lion, Beach" \t Objects: "Lion, Beach" \t Objects: "Lion, Beach" \t Objects: "Lion, Beach" \t Objects: "Lion, Beach" \
Extracted Objects for prompt 'A Lion reading a book on a beach.': ['Beach', 'Book', 'Lion']
Generated Text for prompt 'A dog playing with a ball in the park.': A dog playing with a ball in the park. "A dog playing with a ball in the park." \t Objects: "Dog, Ball, Park" \t Objects: "Dog, Ball, Park" "Dog, Ball, Park" "Dog, Ball, Park" "Dog, Ball, Park" "Dog, Ball, Park" "Dog, Ball, Park" "Dog, Ball, Park" "Dog, Ball, Park" "Dog, Ball, Park" "
Extracted Objects for prompt 'A dog playing with a ball in the park.': ['Ball', 'Dog', 'Park']
Generated Text for prompt 'A child building a sandcastle on the beach.': A child building a sandcastle on the beach. "A chil