In [None]:
import sys
import os
# Make the project root importable so that `from src...` resolves.
# Assumes the kernel's working directory sits one level below the root
# (e.g. a 'notebooks' dir). The membership check keeps the cell idempotent:
# re-running it does not append the same entry to sys.path again.
repo_root = os.path.abspath(os.path.join(os.getcwd(), '..'))
if repo_root not in sys.path:
    sys.path.append(repo_root)

import wandb
from src.utils import load_env_vars
from src.dataset import Cifar10DataManager

# Load configuration through the project's env helper.
# (Colab users: set the variables manually or upload a .env file.)
# NOTE(review): `env` is used as a mapping below (`env.get`) - confirm that
# load_env_vars() always returns a dict-like object, even when no .env exists.
env = load_env_vars()

# No API key in the process environment: fall back to wandb's own login flow.
if os.getenv("WANDB_API_KEY") is None:
    print("Please login to wandb manually or set environment variables.")
    wandb.login()

PROJECT_NAME = env.get("WANDB_PROJECT", "cifar10_mlops_project")

# Single definition of the data directory (relative to this notebook) so the
# data manager and the artifact upload cannot drift apart.
DATASET_ROOT = "../data"

# Initialize Data Manager
dm = Cifar10DataManager(data_dir=DATASET_ROOT)

# 1. Download & Prepare Initial Split
# Per the project's data manager this is expected to download CIFAR-10 and
# create the 40k/8k/2k split indices - see src.dataset for the details.
print("Downloading and splitting data...")
dm.prepare_initial_split()

# 2. Versioning with W&B
# The run is used as a context manager so it is always finished, including
# when add_dir/log_artifact raises; otherwise a failed cell would leave a
# dangling active run in the kernel.
with wandb.init(project=PROJECT_NAME, job_type="data_preparation", name="cifar10_v1") as run:
    # One artifact holding the entire data directory (raw images + split indices)
    dataset_artifact = wandb.Artifact(
        name="cifar10_dataset",
        type="dataset",
        description="CIFAR-10 Raw Data + Split Indices (Train/Test/Sim)"
    )

    # Add the data directory to the artifact
    dataset_artifact.add_dir(DATASET_ROOT)

    # Log it; leaving the `with` block finishes the run
    run.log_artifact(dataset_artifact)

print("Step 1 Complete: Dataset v1 logged to W&B.")

# Data Preparation and Versioning

This notebook downloads the CIFAR-10 dataset and versions it using Weights & Biases Artifacts.


In [None]:
import wandb
import torchvision
import os

# Project Configuration
# Honour WANDB_PROJECT when it is set in the process environment, so this cell
# does not silently replace an environment-configured project name with the
# hard-coded one. The fallback value is unchanged; `or` also covers an empty
# variable.
PROJECT_NAME = os.getenv("WANDB_PROJECT") or "cifar10_mlops_project"
ENTITY = None  # Set this to your username if needed, usually inferred
ARTIFACT_NAME = "cifar10-raw-data"
DATA_DIR = "../data/raw"

# Create data directory if it doesn't exist (no error when it already does)
os.makedirs(DATA_DIR, exist_ok=True)

In [None]:
# Initialize W&B Run for Data Preparation
# Authenticate before the run is created; logging in after init() is too late
# to influence how the run is started.
wandb.login()
# ENTITY is forwarded so that setting it in the configuration cell actually
# takes effect; while it is None the entity is left for wandb to infer.
run = wandb.init(project=PROJECT_NAME, entity=ENTITY, job_type="data-preparation")

In [None]:
# Fetch both CIFAR-10 splits through torchvision.
print("Downloading CIFAR-10 dataset...")
# Arguments shared by the train and test requests: same root directory, and
# download=True so torchvision fetches the data if it is not already there
# (it is placed in a 'cifar-10-batches-py' folder inside DATA_DIR).
cifar_kwargs = dict(root=DATA_DIR, download=True)
dataset = torchvision.datasets.CIFAR10(train=True, **cifar_kwargs)
test_dataset = torchvision.datasets.CIFAR10(train=False, **cifar_kwargs)
print("Download complete.")

In [None]:
# Describe the dataset artifact that will version the raw download.
artifact_spec = {
    "name": ARTIFACT_NAME,
    "type": "dataset",
    "description": "Raw CIFAR-10 dataset from torchvision",
}
artifact = wandb.Artifact(**artifact_spec)

# Attach everything under DATA_DIR rather than a specific sub-folder, so the
# artifact captures whatever torchvision wrote there.
artifact.add_dir(DATA_DIR)

# Hand the artifact to the run created earlier in this notebook.
print("Logging artifact to W&B...")
run.log_artifact(artifact)
print("Artifact logged successfully.")

In [None]:
# Close the currently active W&B run (the one started earlier in this
# notebook) so it is not left open in the kernel.
wandb.finish()