In [1]:
!pip install transformers datasets pillow torch torchvision




[notice] A new release of pip is available: 24.0 -> 24.3.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [2]:
from transformers import T5Tokenizer, T5ForConditionalGeneration, Seq2SeqTrainer, Seq2SeqTrainingArguments
from datasets import load_dataset
from torchvision.transforms import Compose, Resize, ToTensor, Normalize
from PIL import Image
import torch

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
import os

# Set a new cache directory
os.environ["HF_HOME"] = "cache"


In [4]:
print(os.environ["HF_HOME"])

cache


In [None]:
from datasets import disable_caching

disable_caching()

# Define image transformations
image_transform = Compose([
    Resize((256, 256)),  # Resize image to 256x256
    ToTensor(),          # Convert image to tensor
    Normalize([0.5, 0.5, 0.5], [0.5, 0.5, 0.5])  # Normalize to [-1, 1]
])

# Function to process the image and ensure it has 3 channels (RGB)
def process_image(image):
    if image.mode != 'RGB':
        image = image.convert('RGB')
    return image_transform(image)

# Preprocess function to handle image and text input
def preprocess_function(examples):
    """
    Preprocess function to handle image and text input.
    The `image` is processed into tensors, and the `html_css` is tokenized.

    Args:
        examples (dict): Contains `image` and `html_css`.

    Returns:
        dict: Processed image tensors and tokenized labels.
    """
    # Process image
    image = examples["image"]  # Image column
    image_tensor = process_image(image)

    # Tokenize the HTML/CSS text
    targets = examples["text"]
    labels = tokenizer(targets, max_length=512, truncation=True, padding="max_length")

    return {"image_tensor": image_tensor, "labels": labels["input_ids"]}

# Load your dataset using streaming (replace with the actual dataset name or path)
raw_dataset = load_dataset("HuggingFaceM4/WebSight", "v0.1", split="train", cache_dir="D:/Facultate/SII/Design2Code/cache")

# Convert to iterable dataset for streaming
iterable_dataset = raw_dataset.to_iterable_dataset()

# Tokenizer for T5 model
tokenizer = T5Tokenizer.from_pretrained("t5-small")

# Process data and save intermediate results to disk to avoid memory overload
def process_and_save_data(dataset, max_rows=1000, output_file="processed_data.json"):
    """
    Process data row by row and save the output to disk to avoid memory overload.
    """
    processed_data = []
    for i, row in enumerate(dataset):
        if i >= max_rows:
            break
        # Preprocess the row
        processed_row = preprocess_function(row)
        processed_data.append(processed_row)

        # Save intermediate result to disk
        with open(output_file, "a") as f:
            json.dump(processed_row, f)
            f.write("\n")  # Write each entry on a new line to separate the entries

        # Optionally clean up memory after each batch to prevent memory overload
        if (i + 1) % 100 == 0:  # Save and clear every 100 rows
            del processed_data  # Delete processed data to free memory
            gc.collect()  # Explicitly call garbage collection

    return processed_data  # Return processed data (optional if you need it in memory)

# Process the dataset in batches and save results to disk
processed_data = process_and_save_data(iterable_dataset, max_rows=1000, output_file="processed_data.json")

# Split dataset manually into training and validation
data_split = int(0.9 * len(processed_data))
train_dataset = processed_data[:data_split]
eval_dataset = processed_data[data_split:]

# Example of how you might further process or train your model
# (this is just a placeholder and assumes you're using a deep learning framework)
# train_model(train_dataset)

Downloading data:   0%|          | 0/71 [00:00<?, ?files/s]
