In [1]:
import torch
from torch.utils.data import Dataset, DataLoader

# 1. Creating a Custom Dataset

In [2]:
class MySyntheticDataset(Dataset):
    """
    A map-style Dataset holding synthetic regression data in memory.

    Inputs ``X`` are standard-normal samples of shape [num_samples, input_features].
    Targets ``Y`` are ``X[:, 0] + X[:, 1]`` plus Gaussian noise scaled by 0.5,
    stored with shape [num_samples, 1].

    Like every map-style Dataset it implements __init__, __len__, and __getitem__.
    """
    def __init__(self, num_samples=1000, input_features=5, transform=None, seed=None):
        """
        Args:
            num_samples (int): Number of samples to generate.
            input_features (int): Number of features for input X. Must be at least 2,
                because the target is built from the first two feature columns.
            transform (callable, optional): Optional transform applied to each sample.
                It receives the tuple ``(x, y)`` and must return a tuple ``(x, y)``.
            seed (int, optional): If given, the data is drawn from a private
                ``torch.Generator`` seeded with this value, so the dataset is
                reproducible and the global RNG state is left untouched.
                If None (default), the global RNG is used.
        """
        print(f"Initializing MySyntheticDataset with {num_samples} samples...")

        # A dedicated generator keeps seeding local to this dataset.
        generator = None
        if seed is not None:
            generator = torch.Generator().manual_seed(seed)

        # Generate some synthetic data (replace with loading your actual data)
        self.X = torch.randn(num_samples, input_features, generator=generator)

        # Example target: Y = sum of first two features + some noise
        noise = torch.randn(num_samples, generator=generator) * 0.5
        self.Y = self.X[:, 0] + self.X[:, 1] + noise
        self.Y = self.Y.unsqueeze(1)  # Reshape Y to be [num_samples, 1] for consistency

        self.num_samples = num_samples
        self.transform = transform
        print("Dataset initialized.")

    def __len__(self):
        """Return the total number of samples in the dataset."""
        return self.num_samples

    def __getitem__(self, idx):
        """
        Return the sample ``(x, y)`` stored at index ``idx``.

        This is where you'd typically load data from disk if needed (e.g., image, text).
        """
        # Retrieve the specific sample
        sample_x = self.X[idx]
        sample_y = self.Y[idx]

        # Apply transformations if any (common for image augmentation etc.).
        # Compare against None rather than relying on truthiness: a callable that
        # defines __len__ or __bool__ may evaluate as falsy and must still be applied.
        if self.transform is not None:
            # Note: Transform logic depends heavily on the data type
            # For now, we'll assume transform takes the whole sample tuple
            sample_x, sample_y = self.transform((sample_x, sample_y))

        return sample_x, sample_y

# 2. Instantiate the Dataset

In [3]:
# Build a 500-sample dataset whose inputs carry 8 features each
my_dataset = MySyntheticDataset(input_features=8, num_samples=500)

print(f"Dataset length: {len(my_dataset)}")

# Indexing goes through __getitem__ and hands back a single (x, y) pair
first_sample_x, first_sample_y = my_dataset[0]
print(
    f"First sample X shape: {first_sample_x.shape}",  # expected: [input_features]
    f"First sample Y shape: {first_sample_y.shape}",  # expected: [1]
    sep="\n",
)


Initializing MySyntheticDataset with 500 samples...
Dataset initialized.
Dataset length: 500
First sample X shape: torch.Size([8])
First sample Y shape: torch.Size([1])


# 3. Using the DataLoader

In [6]:
# A DataLoader wraps a Dataset and serves it as shuffled mini-batches.

# Samples per batch
batch_size = 32

# shuffle=True matters for training: it makes the batches differ from epoch to epoch.
# num_workers > 0 would load data in subprocesses; 0 is used here.
my_dataloader = DataLoader(
    my_dataset,
    batch_size=batch_size,
    num_workers=0,
    shuffle=True,
)

In [7]:
# Iterate over the DataLoader to get batches.
# iter() creates an explicit iterator so that single batches can be pulled with next().
data_iter = iter(my_dataloader)

In [8]:
# Pull one batch from the iterator; it unpacks into (inputs, targets)
first_batch_inputs, first_batch_targets = next(data_iter)

print(
    "First batch:",
    f"  Input shape: {first_batch_inputs.shape}",    # expected: [batch_size, input_features]
    f"  Target shape: {first_batch_targets.shape}",  # expected: [batch_size, 1]
    sep="\n",
)

First batch:
  Input shape: torch.Size([32, 8])
  Target shape: torch.Size([32, 1])


In [9]:
# Walk the DataLoader directly, reporting only the first few batches
print("Example loop over batches:")
for i, (batch_inputs, batch_targets) in enumerate(my_dataloader):
    # A real training loop would run its forward/backward pass at this point
    if i >= 3:
        # Three batches have been shown; mark the truncation and stop
        print("...")
        break
    print(f"Batch {i+1}: Inputs shape {batch_inputs.shape}, Targets shape {batch_targets.shape}")

Example loop over batches:
Batch 1: Inputs shape torch.Size([32, 8]), Targets shape torch.Size([32, 1])
Batch 2: Inputs shape torch.Size([32, 8]), Targets shape torch.Size([32, 1])
Batch 3: Inputs shape torch.Size([32, 8]), Targets shape torch.Size([32, 1])
...


# 4. How the Training Loop Changes (Conceptual)

In [None]:
# The previous training loop (`06_nn_basics`) made a single pass per epoch over ALL the data at once (full-batch).
# With a DataLoader, the loop iterates over BATCHES within each epoch.

# model = ...
# loss_fn = ...
# optimizer = ...
# num_epochs = ...
# device = ... # 'cuda' or 'cpu'

# for epoch in range(num_epochs):
#     model.train() # Set model to training mode
#     total_loss = 0
#     num_batches = len(my_dataloader)

#     # Loop over batches provided by DataLoader
#     for batch_idx, (batch_inputs, batch_targets) in enumerate(my_dataloader):
#         # Move data to the appropriate device (CPU/GPU)
#         # batch_inputs = batch_inputs.to(device)
#         # batch_targets = batch_targets.to(device)

#         # 1. Forward pass
#         outputs = model(batch_inputs)
#         # 2. Calculate loss
#         loss = loss_fn(outputs, batch_targets)
#         # 3. Backward pass
#         loss.backward()
#         # 4. Optimizer step
#         optimizer.step()
#         # 5. Zero gradients (PyTorch accumulates gradients, so clear them before the next batch)
#         optimizer.zero_grad()

#         total_loss += loss.item()

#         # Optional: Print batch progress
#         # if (batch_idx + 1) % 10 == 0:
#         #     print(f'Epoch [{epoch+1}/{num_epochs}], Batch [{batch_idx+1}/{num_batches}], Loss: {loss.item():.4f}')

#     avg_loss = total_loss / num_batches
#     print(f"Epoch {epoch+1}/{num_epochs} Finished, Average Loss: {avg_loss:.4f}")

#     # Optional: Validation Loop would go here
#     # model.eval() # Set model to evaluation mode
#     # with torch.no_grad():
#     #     # Loop over validation dataloader
#     #     # Calculate validation loss/metrics
