In [1]:
import torch
from datasets import load_dataset
from transformers import (
    AutoTokenizer,
    AutoModelForQuestionAnswering,
    TrainingArguments,
    Trainer,
)

In [2]:
dataset = load_dataset("squad_v2")

README.md:   0%|          | 0.00/8.92k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/16.4M [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/1.35M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/130319 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/11873 [00:00<?, ? examples/s]

In [3]:
print(dataset['train'][1])

{'id': '56be85543aeaaa14008c9065', 'title': 'Beyoncé', 'context': 'Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ bee-YON-say) (born September 4, 1981) is an American singer, songwriter, record producer and actress. Born and raised in Houston, Texas, she performed in various singing and dancing competitions as a child, and rose to fame in the late 1990s as lead singer of R&B girl-group Destiny\'s Child. Managed by her father, Mathew Knowles, the group became one of the world\'s best-selling girl groups of all time. Their hiatus saw the release of Beyoncé\'s debut album, Dangerously in Love (2003), which established her as a solo artist worldwide, earned five Grammy Awards and featured the Billboard Hot 100 number-one singles "Crazy in Love" and "Baby Boy".', 'question': 'What areas did Beyonce compete in when she was growing up?', 'answers': {'text': ['singing and dancing'], 'answer_start': [207]}}


In [6]:
model_name = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForQuestionAnswering.from_pretrained(model_name)

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForQuestionAnswering were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [7]:
# 3. Preprocess the data
max_length = 512  # Maximum length of input sequences
doc_stride = 128  # Helps in splitting long documents

In [8]:
def preprocess_data(examples):
    # Tokenize the questions and context
    encoding = tokenizer(
        examples['question'],
        examples['context'],
        truncation=True,
        padding="max_length",
        max_length=max_length,
        stride=doc_stride,
        return_tensors="pt"
    )

    # Find the start and end positions of the answer in the context
    start_positions = []
    end_positions = []

    for context, answer in zip(examples['context'], examples['answers']):
        if len(answer['text']) > 0:
            # Get the text of the first answer (assuming one answer per question)
            answer_text = answer['text'][0]
            start_position = context.find(answer_text)
            
            # Ensure the answer exists in the context
            if start_position != -1:
                end_position = start_position + len(answer_text) - 1
            else:
                start_position = 0
                end_position = 0
        else:
            # No valid answer
            start_position = 0
            end_position = 0

        start_positions.append(start_position)
        end_positions.append(end_position)

    # Add start and end positions to the encoding
    encoding['start_positions'] = start_positions
    encoding['end_positions'] = end_positions

    return encoding

In [9]:
# Preprocess both train and validation datasets
train_dataset = dataset["train"].map(preprocess_data, batched=True)
val_dataset = dataset["validation"].map(preprocess_data, batched=True)

# Remove unnecessary columns after processing
train_dataset = train_dataset.remove_columns(["question", "context"])
val_dataset = val_dataset.remove_columns(["question", "context"])

Map:   0%|          | 0/130319 [00:00<?, ? examples/s]

Map:   0%|          | 0/11873 [00:00<?, ? examples/s]

In [10]:
training_args = TrainingArguments(
    output_dir="./distilbert-qa",        # Directory to store the model
    evaluation_strategy="epoch",         # Evaluate at the end of each epoch
    learning_rate=2e-5,                  # Learning rate for fine-tuning
    per_device_train_batch_size=8,       # Adjust to fit your GPU memory
    per_device_eval_batch_size=8,        # Same as above for evaluation
    num_train_epochs=3,                  # Number of training epochs
    save_strategy="epoch",               # Save model at the end of each epoch
    save_total_limit=2,                  # Limit the number of saved models
    fp16=True,                           # Enable mixed precision for faster training
    logging_dir="./logs",                # Directory for logging
    logging_steps=100,                   # Log every 100 steps
    report_to="none",                   # Avoid using default WandB or TensorBoard
)



In [11]:
# 5. Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
)

  trainer = Trainer(


In [12]:
trainer.train()

  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):


Epoch,Training Loss,Validation Loss
1,3.6705,2.979045
2,3.4102,2.904812
3,3.2076,2.943974


  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):


TrainOutput(global_step=24435, training_loss=3.5678972463215546, metrics={'train_runtime': 11510.4471, 'train_samples_per_second': 33.965, 'train_steps_per_second': 2.123, 'total_flos': 5.107974402921062e+16, 'train_loss': 3.5678972463215546, 'epoch': 3.0})

In [13]:
# Evaluate the model on the validation dataset
eval_results = trainer.evaluate()

print("Evaluation Results:")
for key, value in eval_results.items():
    print(f"{key}: {value}")

  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):


Evaluation Results:
eval_loss: 2.943974018096924
eval_runtime: 105.378
eval_samples_per_second: 112.671
eval_steps_per_second: 7.051
epoch: 3.0


In [15]:
test_data = {
    "question": 'When did Beyonce start becoming popular?',
    "context": 'Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ bee-YON-say) (born September 4, 1981) is an American singer, songwriter, record producer and actress. Born and raised in Houston, Texas, she performed in various singing and dancing competitions as a child, and rose to fame in the late 1990s as lead singer of R&B girl-group Destiny\'s Child. Managed by her father, Mathew Knowles, the group became one of the world\'s best-selling girl groups of all time. Their hiatus saw the release of Beyoncé\'s debut album, Dangerously in Love (2003), which established her as a solo artist worldwide, earned five Grammy Awards and featured the Billboard Hot 100 number-one singles "Crazy in Love" and "Baby Boy".'
}

In [16]:
inputs = tokenizer(
    test_data["question"],
    test_data["context"],
    truncation=True,
    padding="max_length",
    max_length=max_length,
    return_tensors="pt"
).to("cuda")  # Send the data to GPU if available

In [17]:
# Get the model's predictions
model.eval()  # Set the model to evaluation mode
with torch.no_grad():
    outputs = model(**inputs)

# Extract the start and end logits
start_logits = outputs.start_logits
end_logits = outputs.end_logits

# Get the most probable start and end positions
start_index = torch.argmax(start_logits, dim=1).item()
end_index = torch.argmax(end_logits, dim=1).item()

In [None]:
import torch
from datasets import load_dataset
from transformers import (
    AutoTokenizer,
    AutoModelForQuestionAnswering,
    TrainingArguments,
    Trainer,
)
dataset = load_dataset("squad_v2")
model_name = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForQuestionAnswering.from_pretrained(model_name)

# 3. Preprocess the data
max_length = 512  # Maximum length of input sequences
doc_stride = 128  # Helps in splitting long documents

def preprocess_data(examples):
    # Tokenize the questions and context
    encoding = tokenizer(
        examples['question'],
        examples['context'],
        truncation=True,
        padding="max_length",
        max_length=max_length,
        stride=doc_stride,
        return_tensors="pt"
    )

    # Find the start and end positions of the answer in the context
    start_positions = []
    end_positions = []

    for context, answer in zip(examples['context'], examples['answers']):
        if len(answer['text']) > 0:
            # Get the text of the first answer (assuming one answer per question)
            answer_text = answer['text'][0]
            start_position = context.find(answer_text)
            
            # Ensure the answer exists in the context
            if start_position != -1:
                end_position = start_position + len(answer_text) - 1
            else:
                start_position = 0
                end_position = 0
        else:
            # No valid answer
            start_position = 0
            end_position = 0

        start_positions.append(start_position)
        end_positions.append(end_position)

    # Add start and end positions to the encoding
    encoding['start_positions'] = start_positions
    encoding['end_positions'] = end_positions

    return encoding

# Preprocess both train and validation datasets
train_dataset = dataset["train"].map(preprocess_data, batched=True)
val_dataset = dataset["validation"].map(preprocess_data, batched=True)

# Remove unnecessary columns after processing
train_dataset = train_dataset.remove_columns(["question", "context"])
val_dataset = val_dataset.remove_columns(["question", "context"])

training_args = TrainingArguments(
    output_dir="./distilbert-qa",        # Directory to store the model
    evaluation_strategy="epoch",         # Evaluate at the end of each epoch
    learning_rate=2e-5,                  # Learning rate for fine-tuning
    per_device_train_batch_size=8,       # Adjust to fit your GPU memory
    per_device_eval_batch_size=8,        # Same as above for evaluation
    num_train_epochs=3,                  # Number of training epochs
    save_strategy="epoch",               # Save model at the end of each epoch
    save_total_limit=2,                  # Limit the number of saved models
    fp16=True,                           # Enable mixed precision for faster training
    logging_dir="./logs",                # Directory for logging
    logging_steps=100,                   # Log every 100 steps
    report_to="none",                   # Avoid using default WandB or TensorBoard
)

# 5. Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
)

trainer.train()

In [2]:
pip install diffusers

Collecting diffusers
  Downloading diffusers-0.31.0-py3-none-any.whl.metadata (18 kB)
Downloading diffusers-0.31.0-py3-none-any.whl (2.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.9/2.9 MB[0m [31m33.2 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hInstalling collected packages: diffusers
Successfully installed diffusers-0.31.0
Note: you may need to restart the kernel to use updated packages.


In [7]:
import torch
from torch.utils.data import DataLoader
from torchvision import transforms, datasets
from diffusers import AutoencoderKL, UNet2DConditionModel, DDPMScheduler
from transformers import CLIPTextModel, CLIPTokenizer

# Device setup
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Download and prepare CIFAR10 dataset
transform = transforms.Compose([
    transforms.Resize((256, 256)),  # Resizing to match the model's input size
    transforms.ToTensor(),
    transforms.Normalize([0.5], [0.5]),  # Normalize to [-1, 1]
])

dataset = datasets.CIFAR10(root="./data", train=True, transform=transform, download=True)
train_loader = DataLoader(dataset, batch_size=1, shuffle=True)

# Model components
vae = AutoencoderKL.from_pretrained("CompVis/stable-diffusion-v1-4", subfolder="vae").to(device)
unet = UNet2DConditionModel.from_pretrained("CompVis/stable-diffusion-v1-4", subfolder="unet").to(device)
text_encoder = CLIPTextModel.from_pretrained("openai/clip-vit-base-patch32").to("cpu")  # Offloaded to CPU
tokenizer = CLIPTokenizer.from_pretrained("openai/clip-vit-base-patch32")

# Noise scheduler
noise_scheduler = DDPMScheduler.from_pretrained("CompVis/stable-diffusion-v1-4", subfolder="scheduler")

# Optimizer
optimizer = torch.optim.AdamW(unet.parameters(), lr=5e-5)

# Mixed precision setup
from torch.cuda.amp import GradScaler, autocast
scaler = GradScaler()

# Training Loop
epochs = 3  # Reduced number of epochs for faster training
accumulation_steps = 4  # For gradient accumulation

for epoch in range(epochs):
    for i, (images, _) in enumerate(train_loader):
        # CIFAR10 images don't have captions; use a default placeholder caption
        captions = ["A colorful object."] * images.size(0)
        images = images.to(device)

        with autocast():
            # Preprocess text input
            inputs = tokenizer(captions, return_tensors="pt", padding=True).to(device)
            text_embeds = text_encoder(**inputs).last_hidden_state

            # Add noise to the image
            noise = torch.randn_like(images)
            timesteps = torch.randint(0, noise_scheduler.num_train_timesteps, (1,), device=device).long()
            noisy_image = noise_scheduler.add_noise(images, noise, timesteps)

            # Predict noise using UNet
            noise_pred = unet(noisy_image, timesteps, encoder_hidden_states=text_embeds).sample

            # Compute loss
            loss = torch.nn.functional.mse_loss(noise_pred, noise) / accumulation_steps

        # Backpropagation
        scaler.scale(loss).backward()
        if (i + 1) % accumulation_steps == 0:
            scaler.step(optimizer)
            scaler.update()
            optimizer.zero_grad()

        print(f"Epoch [{epoch + 1}/{epochs}], Step [{i + 1}/{len(train_loader)}], Loss: {loss.item()}")

# Save the fine-tuned model
unet.save_pretrained("path_to_save_fine_tuned_unet")
vae.save_pretrained("path_to_save_fine_tuned_vae")
text_encoder.save_pretrained("path_to_save_fine_tuned_text_encoder")
tokenizer.save_pretrained("path_to_save_fine_tuned_tokenizer")


FileNotFoundError: [Errno 2] No such file or directory: 'path_to_your_images'