# 04 — Vertex AutoML Baseline
Train an AutoML Image Classification model on Vertex AI as a zero-effort baseline.

In [None]:
import os, subprocess, sys

# Clone repo (skip if already cloned)
if not os.path.exists("/content/NST_Class"):
    subprocess.run(["git", "clone", "https://github.com/AayushBaniya2006/NST_Class.git"], cwd="/content")
os.chdir("/content/NST_Class")

!pip install -q -r requirements.txt

sys.path.insert(0, '/content/NST_Class')

print("Setup complete! Make sure you ran notebook 01 first.")

In [None]:
import pandas as pd
from google.cloud import aiplatform
from src.utils.gcs import upload_directory_to_gcs, upload_file_to_gcs, generate_automl_csv

In [None]:
# Configuration — FILL THESE IN
# Project-specific settings shared by the cells below; edit the placeholder
# values before running anything else.
PROJECT_ID = "YOUR_PROJECT_ID"  # Replace with your GCP project
BUCKET_NAME = "skin-tone-project"  # Replace with your bucket name
REGION = "us-central1"

# Image prefix inside the bucket, built from BUCKET_NAME; it is handed to
# the manifest generator as `image_gcs_prefix`.
GCS_IMAGE_PREFIX = "gs://" + BUCKET_NAME + "/images"

# Point the Vertex AI SDK at the project/region configured above.
aiplatform.init(location=REGION, project=PROJECT_ID)

In [None]:
# Upload images to GCS (only run once!)
# upload_directory_to_gcs("data/images", BUCKET_NAME, "images")
# print("Images uploaded to GCS")

In [None]:
# Prepare AutoML manifest
# Load the three cleaned split files, tag every row with the split it came
# from, and stack them into a single frame for the manifest generator.
split_csv_paths = {
    "train": "data/cleaned/train.csv",
    "val": "data/cleaned/val.csv",
    "test": "data/cleaned/test.csv",
}
train_df, val_df, test_df = (
    pd.read_csv(csv_path) for csv_path in split_csv_paths.values()
)

# Tag in place so the per-split frames also carry the "split" column.
for split_name, split_frame in zip(split_csv_paths, (train_df, val_df, test_df)):
    split_frame["split"] = split_name

full_df = pd.concat((train_df, val_df, test_df), ignore_index=True)

# generate_automl_csv writes the manifest and returns a value that is used
# below as the local path of the written file.
manifest_path = generate_automl_csv(
    full_df, image_gcs_prefix=GCS_IMAGE_PREFIX, output_path="data/automl_manifest.csv"
)
print(f"Manifest created at {manifest_path}")

In [None]:
# Upload manifest to GCS
# The helper's return value is kept as the manifest's GCS URI; the dataset
# creation step reads it as its `gcs_source`.
manifest_gcs_uri = upload_file_to_gcs(manifest_path, BUCKET_NAME, "automl/manifest.csv")
print(f"Manifest uploaded to {manifest_gcs_uri}")

In [None]:
# Create Vertex AI Dataset
# Import schema for single-label image classification.
single_label_schema = (
    aiplatform.schema.dataset.ioformat.image.single_label_classification
)

# Create the image dataset and import the uploaded manifest into it.
dataset = aiplatform.ImageDataset.create(
    display_name="fitzpatrick17k-skin-tone",
    gcs_source=manifest_gcs_uri,
    import_schema_uri=single_label_schema,
)
print(f"Dataset created: {dataset.resource_name}")

In [None]:
# Train AutoML model
# Single-label image classification job using the "CLOUD" model type and no
# base model to continue training from.
job = aiplatform.AutoMLImageTrainingJob(
    display_name="skin-tone-automl-baseline",
    prediction_type="classification",
    multi_label=False,
    model_type="CLOUD",
    base_model=None,
)

# Select the splits explicitly via the ml_use label that the manifest's ML_USE
# column (TRAINING/VALIDATION/TEST, from our stratified splits) attaches to
# each image at import time.
# Do NOT pass training/validation/test_fraction_split, and do not leave the
# split unspecified either: per the Vertex AI data-split documentation, a job
# with no split configured falls back to a random ~80/10/10 fraction split,
# which would silently discard the stratified assignment.
# NOTE(review): assumes generate_automl_csv really writes the ML_USE column —
# confirm, otherwise these filters match no items.
ML_USE_LABEL = "labels.aiplatform.googleapis.com/ml_use"
model = job.run(
    dataset=dataset,
    model_display_name="skin-tone-automl-v1",
    training_filter_split=f"{ML_USE_LABEL}=training",
    validation_filter_split=f"{ML_USE_LABEL}=validation",
    test_filter_split=f"{ML_USE_LABEL}=test",
    budget_milli_node_hours=8000,  # 8000 milli node hours = 8 node hours
)
print(f"Model trained: {model.resource_name}")

In [None]:
# Evaluate AutoML model
# Take the first evaluation the trained model reports and dump every metric
# it carries, one "key: value" line each.
evaluations = model.list_model_evaluations()
model_eval = evaluations[0]
metrics = model_eval.metrics

report_lines = ["\nAutoML Evaluation Metrics:", f"  Model: {model.display_name}"]
report_lines.extend(f"  {key}: {value}" for key, value in metrics.items())
print("\n".join(report_lines))

In [None]:
# Extract per-class metrics for fairness comparison
# Only the confusion matrix is printed, and only when the evaluation metrics
# include a "confusionMatrix" entry.
print("\nPer-class metrics (copy these to notebook 03):")
has_confusion_matrix = "confusionMatrix" in metrics
if has_confusion_matrix:
    cm = metrics["confusionMatrix"]
    print(cm)

In [None]:
# Model ID for reference
# Echo the trained model's resource name so it can be copied elsewhere.
print(
    f"\nVertex Model ID: {model.resource_name}",
    "Use this ID in notebook 03 for comparison.",
    sep="\n",
)