In [1]:
# Install necessary dependencies
!pip -q install accelerate==0.21.0 peft==0.4.0 bitsandbytes==0.40.2 transformers==4.31.0 trl==0.4.7 torch torchtext sentencepiece pandas tqdm datasets


import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from transformers import GPT2Tokenizer, GPT2LMHeadModel
from torch.utils.data import Dataset, DataLoader, random_split
from datasets import load_dataset
from tqdm import tqdm
import time

# Load dataset
# data_sample = load_dataset("Leonardorm7/PP")
data_sample = load_dataset("Leonardorm7/PPAPI")

# Update dataset structure
updated_data = [{'Input': item['Input'], 'Python code': item['Python code']} for item in data_sample['train']]
df = pd.DataFrame(updated_data)

# Define device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Prepare tokenizer and model
tokenizer = GPT2Tokenizer.from_pretrained('distilgpt2')
model = GPT2LMHeadModel.from_pretrained('distilgpt2').to(device)
tokenizer.pad_token = tokenizer.eos_token

# Dataset class
class LanguageDataset(Dataset):
    def __init__(self, df, tokenizer):
        self.data = df.to_dict(orient='records')
        self.tokenizer = tokenizer

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        x = self.data[idx]['Input']
        y = self.data[idx]['Python code']
        text = f"{x} | {y}"
        tokens = self.tokenizer.encode_plus(text, return_tensors='pt', max_length=128, padding='max_length', truncation=True)
        return tokens

# Prepare dataset
data_sample = LanguageDataset(df, tokenizer)
train_size = int(0.8 * len(data_sample))
valid_size = len(data_sample) - train_size
train_data, valid_data = random_split(data_sample, [train_size, valid_size])

# Create loaders
BATCH_SIZE = 8
train_loader = DataLoader(train_data, batch_size=BATCH_SIZE, shuffle=True)
valid_loader = DataLoader(valid_data, batch_size=BATCH_SIZE)

# Training configuration
num_epochs = 10
criterion = nn.CrossEntropyLoss(ignore_index=tokenizer.pad_token_id)
optimizer = optim.Adam(model.parameters(), lr=5e-4)

# DataFrame for results
results = pd.DataFrame(columns=['epoch', 'transformer', 'batch_size', 'gpu', 'training_loss', 'validation_loss', 'epoch_duration_sec'])

# Training loop
for epoch in range(num_epochs):
    start_time = time.time()

    # Training
    model.train()
    epoch_training_loss = 0
    train_iterator = tqdm(train_loader, desc=f"Training Epoch {epoch+1}/{num_epochs} Batch Size: {BATCH_SIZE}")
    for batch in train_iterator:
        optimizer.zero_grad()
        inputs = batch['input_ids'].squeeze(1).to(device)
        outputs = model(input_ids=inputs, labels=inputs)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        train_iterator.set_postfix({'Training Loss': loss.item()})
        epoch_training_loss += loss.item()
    avg_epoch_training_loss = epoch_training_loss / len(train_iterator)

    # Validation
    model.eval()
    epoch_validation_loss = 0
    valid_iterator = tqdm(valid_loader, desc=f"Validation Epoch {epoch+1}/{num_epochs}")
    with torch.no_grad():
        for batch in valid_iterator:
            inputs = batch['input_ids'].squeeze(1).to(device)
            outputs = model(input_ids=inputs, labels=inputs)
            loss = outputs.loss
            valid_iterator.set_postfix({'Validation Loss': loss.item()})
            epoch_validation_loss += loss.item()
    avg_epoch_validation_loss = epoch_validation_loss / len(valid_iterator)

    end_time = time.time()
    epoch_duration_sec = end_time - start_time

    new_row = {
        'epoch': epoch+1,
        'transformer': 'distilgpt2',
        'batch_size': BATCH_SIZE,
        'gpu': 0,
        'training_loss': avg_epoch_training_loss,
        'validation_loss': avg_epoch_validation_loss,
        'epoch_duration_sec': epoch_duration_sec
    }
    results.loc[len(results)] = new_row
    print(f"Epoch: {epoch+1}, Validation Loss: {avg_epoch_validation_loss}")


[2024-07-07 23:49:55,963] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)


/opt/conda/compiler_compat/ld: cannot find -laio: No such file or directory
collect2: error: ld returned 1 exit status




Training Epoch 1/10 Batch Size: 8: 100%|██████████| 20/20 [00:01<00:00, 13.16it/s, Training Loss=0.538]
Validation Epoch 1/10: 100%|██████████| 6/6 [00:00<00:00, 53.16it/s, Validation Loss=0.237]


Epoch: 1, Validation Loss: 0.3569177861015002


Training Epoch 2/10 Batch Size: 8: 100%|██████████| 20/20 [00:01<00:00, 18.54it/s, Training Loss=0.315]
Validation Epoch 2/10: 100%|██████████| 6/6 [00:00<00:00, 57.35it/s, Validation Loss=0.21]


Epoch: 2, Validation Loss: 0.24269748975833258


Training Epoch 3/10 Batch Size: 8: 100%|██████████| 20/20 [00:01<00:00, 17.53it/s, Training Loss=0.176]
Validation Epoch 3/10: 100%|██████████| 6/6 [00:00<00:00, 53.51it/s, Validation Loss=0.144]


Epoch: 3, Validation Loss: 0.2240831802288691


Training Epoch 4/10 Batch Size: 8: 100%|██████████| 20/20 [00:01<00:00, 19.04it/s, Training Loss=0.213]
Validation Epoch 4/10: 100%|██████████| 6/6 [00:00<00:00, 62.85it/s, Validation Loss=0.139]


Epoch: 4, Validation Loss: 0.20492582519849142


Training Epoch 5/10 Batch Size: 8: 100%|██████████| 20/20 [00:01<00:00, 19.39it/s, Training Loss=0.118]
Validation Epoch 5/10: 100%|██████████| 6/6 [00:00<00:00, 55.71it/s, Validation Loss=0.103]


Epoch: 5, Validation Loss: 0.19419589390357336


Training Epoch 6/10 Batch Size: 8: 100%|██████████| 20/20 [00:01<00:00, 19.20it/s, Training Loss=0.256]
Validation Epoch 6/10: 100%|██████████| 6/6 [00:00<00:00, 62.02it/s, Validation Loss=0.125]


Epoch: 6, Validation Loss: 0.20991607010364532


Training Epoch 7/10 Batch Size: 8: 100%|██████████| 20/20 [00:01<00:00, 19.31it/s, Training Loss=0.136] 
Validation Epoch 7/10: 100%|██████████| 6/6 [00:00<00:00, 63.33it/s, Validation Loss=0.12]


Epoch: 7, Validation Loss: 0.19418700287739435


Training Epoch 8/10 Batch Size: 8: 100%|██████████| 20/20 [00:01<00:00, 19.02it/s, Training Loss=0.102] 
Validation Epoch 8/10: 100%|██████████| 6/6 [00:00<00:00, 60.65it/s, Validation Loss=0.114]


Epoch: 8, Validation Loss: 0.20586994414528212


Training Epoch 9/10 Batch Size: 8: 100%|██████████| 20/20 [00:01<00:00, 18.37it/s, Training Loss=0.101] 
Validation Epoch 9/10: 100%|██████████| 6/6 [00:00<00:00, 51.78it/s, Validation Loss=0.143]


Epoch: 9, Validation Loss: 0.21140062560637793


Training Epoch 10/10 Batch Size: 8: 100%|██████████| 20/20 [00:01<00:00, 19.11it/s, Training Loss=0.138] 
Validation Epoch 10/10: 100%|██████████| 6/6 [00:00<00:00, 60.70it/s, Validation Loss=0.184]

Epoch: 10, Validation Loss: 0.21557331085205078





# Trained model test

In [2]:

input_str="Create a 5-slide presentation with yellow background, each slide with Slide Title in Calibri size 45 and a bullet list containing four points in Calibri size 20"
input_ids = tokenizer.encode(input_str, return_tensors='pt').to(device)
output = model.generate(input_ids, max_length=3024, num_return_sequences=1, do_sample=True, top_k=8, top_p=0.95, temperature=0.5, repetition_penalty=1.2, pad_token_id=tokenizer.eos_token_id)
decoded_output = tokenizer.decode(output[0], skip_special_tokens=True)

print(decoded_output)

Create a 5-slide presentation with yellow background, each slide with Slide Title in Calibri size 45 and a bullet list containing four points in Calibri size 20 | api.create_presentation('Yellow Slides')
titles = ["Slide 1", "Slide 2", "Slide 3",                                              else: if i %(layout='title and content')
for i in range(5):
api.change_bullet_points(4):



# Few shot Answer

In [3]:

few_shot_examples = [
    {
        "instruction": "Create a 4 slide presentation with light green background, each slide with Slide Title in Times New Roman size 40 and a bullet list containing two points in Times New Roman size 30",
        "response": """
api.create_presentation('Light Green Slides')
titles = ["Slide 1", "Slide 2", "Slide 3", "Slide 4"]
bullet_points = [
    ["Point 1", "Point 2"],
    ["Item A", "Item B"],
    ["Fact 1", "Fact 2"],
    ["Detail X", "Detail Y"]
]
for i in range(4):
    api.add_slide(layout='title and content')
    api.change_background_color(i + 1, (144, 238, 144))
    api.add_text_to_slide(i + 1, titles[i], placeholder=0)
    for point in bullet_points[i]:
        api.add_bullet_point(i + 1, point)
    api.change_font(i + 1, 0, 'Times New Roman')
    api.change_font_size(i + 1, 0, 40)
    api.change_font(i + 1, 1, 'Times New Roman')
    api.change_font_size(i + 1, 1, 30)
api.save_presentation('light_green_slides.pptx')
        """
    },
    {
        "instruction": "Create a 3 slide presentation with blue background, each slide with Slide Title in Arial size 35 and a bullet list containing three points in Arial size 25",
        "response": """
api.create_presentation('Blue Slides')
titles = ["Slide 1", "Slide 2", "Slide 3"]
bullet_points = [
    ["Point A", "Point B", "Point C"],
    ["Item 1", "Item 2", "Item 3"],
    ["Detail X", "Detail Y", "Detail Z"]
]
for i in range(3):
    api.add_slide(layout='title and content')
    api.change_background_color(i + 1, (0, 0, 255))
    api.add_text_to_slide(i + 1, titles[i], placeholder=0)
    for point in bullet_points[i]:
        api.add_bullet_point(i + 1, point)
    api.change_font(i + 1, 0, 'Arial')
    api.change_font_size(i + 1, 0, 35)
    api.change_font(i + 1, 1, 'Arial')
    api.change_font_size(i + 1, 1, 25)
api.save_presentation('blue_slides.pptx')
        """
    }
]

# Función para crear el prompt few-shot
def create_few_shot_prompt(instruction, examples):
    prompt = "Below are some examples of how to create presentations:\n\n"
    for example in examples:
        prompt += f"### Instruction:\n{example['instruction']}\n### Response:\n{example['response']}\n\n"
    prompt += "### Instruction:\n" + instruction + "\n### Response:\n"
    return prompt


input_str="Create a 5-slide presentation with yellow background, each slide with Slide Title in Calibri size 45 and a bullet list containing four points in Calibri size 20"

few_shot_prompt = create_few_shot_prompt(input_str, few_shot_examples)

input_ids = tokenizer.encode(few_shot_prompt, return_tensors='pt').to(device)

# To reduce the number of examples
few_shot_examples = [few_shot_examples[0]]  # to use only the first example

output = model.generate(
    input_ids, 
    max_length=4096,  
    num_return_sequences=1, 
    do_sample=True, 
    top_k=50,  
    top_p=0.95, 
    temperature=0.7,  
    repetition_penalty=1.2, 
    pad_token_id=tokenizer.eos_token_id
)

print(f"Length of input_ids: {len(input_ids[0])}")

decoded_output = tokenizer.decode(output[0], skip_special_tokens=True)
print(decoded_output)


input_ids = tokenizer.encode(input_str, return_tensors='pt').to(device)
output = model.generate(input_ids, max_length=1024)
decoded_output = tokenizer.decode(output[0], skip_special_tokens=True)
print(decoded_output)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Length of input_ids: 766
Below are some examples of how to create presentations:

### Instruction:
Create a 4 slide presentation with light green background, each slide with Slide Title in Times New Roman size 40 and a bullet list containing two points in Times New Roman size 30
### Response:

api.create_presentation('Light Green Slides')
titles = ["Slide 1", "Slide 2", "Slide 3", "Slide 4"]
bullet_points = [
    ["Point 1", "Point 2"],
    ["Item A", "Item B"],
    ["Fact 1", "Fact 2"],
    ["Detail X", "Detail Y"]
]
for i in range(4):
    api.add_slide(layout='title and content')
    api.change_background_color(i + 1, (144, 238, 144))
    api.add_text_to_slide(i + 1, titles[i], placeholder=0)
    for point in bullet_points[i]:
        api.add_bullet_point(i + 1, point)
    api.change_font(i + 1, 0, 'Times New Roman')
    api.change_font_size(i + 1, 0, 40)
    api.change_font(i + 1, 1, 'Times New Roman')
    api.change_font_size(i + 1, 1, 30)
api.save_presentation('light_green_slides.