In [None]:
!pip install datasets

In [14]:
!pip install sentencepiece

Collecting sentencepiece
  Downloading sentencepiece-0.1.99-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m8.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: sentencepiece
Successfully installed sentencepiece-0.1.99


In [55]:
!pip install rouge

Collecting rouge
  Downloading rouge-1.0.1-py3-none-any.whl (13 kB)
Installing collected packages: rouge
Successfully installed rouge-1.0.1


In [34]:
from datasets import load_dataset

dataset = load_dataset("jawerty/html_dataset")

In [35]:
data = dataset["train"].train_test_split(test_size=0.2)

In [36]:
data

DatasetDict({
    train: Dataset({
        features: ['label', 'html'],
        num_rows: 34
    })
    test: Dataset({
        features: ['label', 'html'],
        num_rows: 9
    })
})

In [37]:
import torch
from torch.utils.data import DataLoader, Dataset
from transformers import T5ForConditionalGeneration, T5Tokenizer, AdamW


In [38]:
class MyDataset(Dataset):
    def __init__(self, examples):
        self.examples = examples

    def __len__(self):
        return len(self.examples)

    def __getitem__(self, idx):
        example = self.examples[idx]
        return {
            "label": example["label"],
            "html": example["html"]
        }

In [39]:
# Load your dataset
# Replace the following line with loading your actual dataset
train_examples = data['train']
test_examples = data['test']
# Create DataLoader for training and test sets
train_dataset = MyDataset(train_examples)
train_dataloader = DataLoader(train_dataset, batch_size=4, shuffle=True)

test_dataset = MyDataset(test_examples)
test_dataloader = DataLoader(test_dataset, batch_size=4, shuffle=False)


In [40]:
# Define T5 model and tokenizer
model_name = "t5-small"
model = T5ForConditionalGeneration.from_pretrained(model_name)
tokenizer = T5Tokenizer.from_pretrained(model_name, src_lang="en", tgt_lang="html")

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [43]:
# Define optimizer and loss function
optimizer = AdamW(model.parameters(), lr=5e-5)



In [44]:
from rouge_score import rouge_scorer
rouge_scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)

In [45]:
# Training loop
num_epochs = 30

for epoch in range(num_epochs):
    model.train()
    total_train_loss = 0.0

    for batch in train_dataloader:
        # Tokenize input and output
        input_texts = batch["label"]
        target_texts = batch["html"]

        inputs = tokenizer(input_texts, return_tensors="pt", padding=True, truncation=True, max_length=512)
        targets = tokenizer(target_texts, return_tensors="pt", padding=True, truncation=True, max_length=512)

        # Forward pass
        outputs = model(**inputs, labels=targets["input_ids"])
        loss = outputs.loss

        # Backward pass and optimization
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        total_train_loss += loss.item()

    avg_train_loss = total_train_loss / len(train_dataloader)
    print(f"Epoch {epoch + 1}, Train Loss: {avg_train_loss}")


Epoch 1, Train Loss: 7.473968452877468
Epoch 2, Train Loss: 6.186974472469753
Epoch 3, Train Loss: 5.838557667202419
Epoch 4, Train Loss: 5.4702241155836315
Epoch 5, Train Loss: 5.565755473242866
Epoch 6, Train Loss: 5.063986778259277
Epoch 7, Train Loss: 5.14213498433431
Epoch 8, Train Loss: 5.00471215777927
Epoch 9, Train Loss: 4.8204479747348365
Epoch 10, Train Loss: 4.609102831946479
Epoch 11, Train Loss: 4.606663015153673
Epoch 12, Train Loss: 4.403062873416477
Epoch 13, Train Loss: 4.455058309766981
Epoch 14, Train Loss: 4.349659946229723
Epoch 15, Train Loss: 4.363509045706855
Epoch 16, Train Loss: 4.205103026496039
Epoch 17, Train Loss: 4.230042086707221
Epoch 18, Train Loss: 4.085311386320326
Epoch 19, Train Loss: 4.127061261071099
Epoch 20, Train Loss: 4.101379844877455
Epoch 21, Train Loss: 4.055003219180637
Epoch 22, Train Loss: 3.929269128375583
Epoch 23, Train Loss: 3.94476490550571
Epoch 24, Train Loss: 3.9423827860090466
Epoch 25, Train Loss: 3.8225817150539823
Epoch 26

In [57]:
from rouge import Rouge
model.eval()
total_rouge_score = 0.0
rouge_scorer = Rouge()

with torch.no_grad():
    for batch in test_dataloader:
        input_texts = batch["label"]
        target_texts = batch["html"]

        # Join the lists into strings
        input_texts = " ".join([" ".join(inputs) for inputs in input_texts])
        target_texts = " ".join([" ".join(targets) for targets in target_texts])

        inputs = tokenizer(input_texts, return_tensors="pt", padding=True, truncation=True, max_length=512)
        targets = tokenizer(target_texts, return_tensors="pt", padding=True, truncation=True, max_length=512)

        outputs = model.generate(**inputs)
        generated_text = tokenizer.batch_decode(outputs, skip_special_tokens=True)[0]  # Assuming batch size 1

        # Compute ROUGE score using the rouge library
        rouge_scores = rouge_scorer.get_scores(generated_text, target_texts, avg=True)
        total_rouge_score += rouge_scores["rouge-l"]["f"]

avg_rouge_score = total_rouge_score / len(test_dataloader)
print(f"Average ROUGE Score: {avg_rouge_score}")


Average ROUGE Score: 0.11716462798486106


In [58]:
# Save the trained model
model.save_pretrained("t5_fine_tuned")