# Data Preparation and Versioning

This notebook downloads the CIFAR-10 dataset and versions it using Weights & Biases Artifacts.


In [None]:
import wandb
import torchvision
import os

# --- Weights & Biases identifiers ---
PROJECT_NAME = "cifar10_mlops_project"
ARTIFACT_NAME = "cifar10-raw-data"
ENTITY = None  # Optional W&B username; leave as None to let it be inferred

# --- Local storage for the raw download ---
DATA_DIR = "../data/raw"

# Make sure the target directory is present; a pre-existing one is fine.
os.makedirs(name=DATA_DIR, exist_ok=True)

In [None]:
# Authenticate with W&B *before* starting the run, so credentials are already
# in place when the run is created (previously login was called after init).
wandb.login()

# Start the W&B run that the data-preparation steps below log to.
# ENTITY is forwarded so the value set in the configuration cell is honoured;
# leaving it as None keeps the default behaviour.
run = wandb.init(project=PROJECT_NAME, entity=ENTITY, job_type="data-preparation")

In [None]:
# Fetch both CIFAR-10 splits with torchvision.
# The download lands in a 'cifar-10-batches-py' folder inside DATA_DIR.
print("Downloading CIFAR-10 dataset...")
# Build the training split first, then the test split, with otherwise
# identical arguments.
dataset, test_dataset = (
    torchvision.datasets.CIFAR10(root=DATA_DIR, train=is_train, download=True)
    for is_train in (True, False)
)
print("Download complete.")

In [None]:
# Describe the dataset as a W&B Artifact of type "dataset".
artifact = wandb.Artifact(
    name=ARTIFACT_NAME,
    type="dataset",
    description="Raw CIFAR-10 dataset from torchvision",
)

# Attach everything under DATA_DIR. Torchvision extracts CIFAR-10 into a
# sub-folder of the root, so adding the whole directory makes sure it is captured.
artifact.add_dir(DATA_DIR)

# Log the artifact to the active run. log_artifact returns before the upload
# has been committed, so block on wait() to make the success message below
# truthful instead of printing it while the upload may still be in flight.
print("Logging artifact to W&B...")
logged_artifact = run.log_artifact(artifact)
logged_artifact.wait()
print("Artifact logged successfully.")

In [None]:
# End the W&B run started earlier in this notebook.
wandb.finish()