In [None]:
import os
import numpy as np
import torchvision
import torchvision.transforms as transforms
import wandb

# ==========================================
# 1. Environment Setup
# ==========================================
# Credentials
# Secrets must never be stored in the notebook itself, so everything is read
# from the environment. WANDB_PROJECT / WANDB_ENTITY fall back to the original
# placeholder values when unset; an unset or empty WANDB_API_KEY becomes None.
WANDB_API_KEY = os.environ.get("WANDB_API_KEY") or None
PROJECT_NAME = os.environ.get("WANDB_PROJECT", "PROJECTNAME")
ENTITY = os.environ.get("WANDB_ENTITY", "ENTITY")

print(f"Logging in to W&B Project: {PROJECT_NAME} (Entity: {ENTITY})")
# With key=None wandb.login resolves credentials on its own (previously stored
# login or an interactive prompt) instead of failing on a placeholder string.
wandb.login(key=WANDB_API_KEY)



Logging in to W&B Project: cifar10_mlops_project
Downloading and splitting data...
Downloading/Loading data in ./data...
Data split indices created.


[34m[1mwandb[0m: Adding directory to artifact (data)... Done. 0.8s


AuthenticationError: API key verification failed for host https://api.wandb.ai. Make sure your API key is valid.

wandb: 
wandb: 🚀 View run cifar10_v1 at: https://wandb.ai/amirbnsl/cifar10_mlops_project/runs/4pbt9o5u
wandb: Find logs at: wandb/run-20260129_225436-4pbt9o5u/logs
wandb: 
wandb: 🚀 View run cifar10_v1 at: https://wandb.ai/esi-sba-dz/cifar10_mlops_project/runs/6jvf7237
wandb: Find logs at: wandb/run-20260129_225956-6jvf7237/logs


socket.send() raised exception.


In [None]:
# ==========================================
# 2. Data Manager Definition
# ==========================================
class Cifar10DataManager:
    """Fetches CIFAR-10 through torchvision and writes a reproducible split
    of its test set as index files under ``<data_dir>/processed``.
    """

    def __init__(self, data_dir="./data"):
        # Root folder passed to torchvision; the index files are written to
        # the "processed" sub-folder of this directory.
        self.data_dir = data_dir

    def prepare_initial_split(self, test_size=8000, seed=42):
        """
        Downloads CIFAR-10 from Torchvision (Source) and creates the
        reproducibility split indices for the test set.

        The test-set positions are shuffled with a seeded generator; the
        first ``test_size`` shuffled positions are saved as
        ``test_indices.npy`` and the remainder as ``sim_indices.npy``.

        Args:
            test_size: Number of shuffled test-set positions assigned to the
                "test" part. Defaults to 8000 (the original hard-coded cut).
            seed: Seed for the shuffle. Defaults to 42 (the original seed).

        Raises:
            ValueError: If ``test_size`` is negative or larger than the
                number of samples in the test set.
        """
        print(f"Downloading/Loading data in {self.data_dir}...")
        # We ALLOW download=True here because this is the Genesis step.
        # The train split is requested only so that it is present on disk;
        # the returned dataset object itself is not needed.
        torchvision.datasets.CIFAR10(root=self.data_dir, train=True, download=True)
        test_set = torchvision.datasets.CIFAR10(root=self.data_dir, train=False, download=True)

        total = len(test_set)
        if not 0 <= test_size <= total:
            raise ValueError(
                f"test_size must be between 0 and {total}, got {test_size}"
            )

        # Split Test Set logic.
        # A private legacy RandomState draws the same permutation that
        # np.random.seed(seed) + np.random.shuffle would, but leaves the
        # process-wide NumPy RNG untouched for the rest of the notebook.
        indices = list(range(total))
        np.random.RandomState(seed).shuffle(indices)

        test_indices = indices[:test_size]
        sim_indices = indices[test_size:]

        # Save indices locally
        processed_dir = os.path.join(self.data_dir, "processed")
        os.makedirs(processed_dir, exist_ok=True)
        np.save(os.path.join(processed_dir, "test_indices.npy"), test_indices)
        np.save(os.path.join(processed_dir, "sim_indices.npy"), sim_indices)
        print("Data split indices created.")

In [None]:
# ==========================================
# 3. Execution: Download & Split
# ==========================================
# Point the manager at the local data folder, then run the one-off
# download + split step that writes the index files.
dm = Cifar10DataManager(
    data_dir="./data",
)
dm.prepare_initial_split()

In [None]:
# ==========================================
# 4. Versioning: Upload to W&B
# ==========================================
# The run is used as a context manager so that it is always closed, even when
# building or logging the artifact raises; the previous explicit run.finish()
# was skipped on any error, leaving the run open.
with wandb.init(
    project=PROJECT_NAME,
    entity=ENTITY,
    job_type="data_preparation",
    name="cifar10_v1",
) as run:
    dataset_artifact = wandb.Artifact(
        name="cifar10_dataset",
        type="dataset",
        description="CIFAR-10 Raw Data + Split Indices"
    )

    # Upload the entire directory (Raw Data + Indices)
    dataset_artifact.add_dir("./data")

    run.log_artifact(dataset_artifact)

# Reached only when the block above completed without raising.
print("Step 1 Complete: Dataset v1 logged.")