In [14]:
# Cell 1: Configuration Variables for LR+TF-IDF Training

PROJECT_ID = "sentiment-analysis-steam" # Replace with your actual GCP Project ID
REGION = "us-west1"                    # Must match your GCS bucket region and Vertex AI job region
BUCKET_NAME = "steam-reviews-bucket-0" # Your GCS bucket name

# Path to your cleaned data for training
DATA_URI = f"gs://{BUCKET_NAME}/steam_reviews_cleaned.csv"

# --- Specifics for your LR+TF-IDF Training Container ---
TRAINING_IMAGE_NAME = "steam-lr-review-trainer" # The name you've given to your training image
FULL_TRAINING_IMAGE_URI = f"gcr.io/{PROJECT_ID}/{TRAINING_IMAGE_NAME}:latest"

print(f"Project ID: {PROJECT_ID}")
print(f"Region: {REGION}")
print(f"GCS Bucket: {BUCKET_NAME}")
print(f"Training Data URI: {DATA_URI}")
print(f"Training Docker Image Name: {TRAINING_IMAGE_NAME}")
print(f"Full Training Docker Image URI: {FULL_TRAINING_IMAGE_URI}")

# Set the gcloud project config (important for subsequent gcloud commands)
# This command needs to be run in a cell where you can execute shell commands.
!gcloud config set project {PROJECT_ID}
print(f"\n✅ gcloud project set to {PROJECT_ID}")

Project ID: sentiment-analysis-steam
Region: us-west1
GCS Bucket: steam-reviews-bucket-0
Training Data URI: gs://steam-reviews-bucket-0/steam_reviews_cleaned.csv
Training Docker Image Name: steam-lr-review-trainer
Full Training Docker Image URI: gcr.io/sentiment-analysis-steam/steam-lr-review-trainer:latest
Updated property [core/project].

✅ gcloud project set to sentiment-analysis-steam


In [15]:
# Cell 2: Build and Push LR+TF-IDF Training Docker Image

# IMPORTANT: Ensure your current working directory in the notebook terminal
# is the parent directory of your 'lr_tfidf_trainer' folder.
# For example, if your 'lr_tfidf_trainer' folder is directly in your
# JupyterLab root (e.g., /home/jupyter/SentimentAnalysis-Steam/),
# then your notebook's CWD should be that root.
# You can use !pwd and !ls -F to check. Use %cd to navigate if needed.

print(f"Building and pushing Docker image: {FULL_TRAINING_IMAGE_URI}...")

# This command executes Cloud Build to build your image from the Dockerfile
# located in the './lr_tfidf_trainer' directory and push it to GCR.
!gcloud builds submit --tag {FULL_TRAINING_IMAGE_URI} ./lr_tfidf_trainer

print(f"\n✅ Docker image built and pushed: {FULL_TRAINING_IMAGE_URI}")

Building and pushing Docker image: gcr.io/sentiment-analysis-steam/steam-lr-review-trainer:latest...
Creating temporary archive of 8 file(s) totalling 45.0 KiB before compression.
Uploading tarball of [./lr_tfidf_trainer] to [gs://sentiment-analysis-steam_cloudbuild/source/1751949673.286384-d80f7de4488f4c5db563c6bf2fd50467.tgz]
Created [https://cloudbuild.googleapis.com/v1/projects/sentiment-analysis-steam/locations/global/builds/91be929d-4b7e-4b8c-b9d7-c0f16b281fad].
Logs are available at [ https://console.cloud.google.com/cloud-build/builds/91be929d-4b7e-4b8c-b9d7-c0f16b281fad?project=1063155306158 ].
Waiting for build to complete. Polling interval: 1 second(s).
----------------------------- REMOTE BUILD OUTPUT ------------------------------
starting build "91be929d-4b7e-4b8c-b9d7-c0f16b281fad"

FETCHSOURCE
Fetching storage object: gs://sentiment-analysis-steam_cloudbuild/source/1751949673.286384-d80f7de4488f4c5db563c6bf2fd50467.tgz#1751949673490901
Copying gs://sentiment-analysis-st

In [6]:
# executes (runs) that container image as a training job on Vertex AI.
from google.cloud import aiplatform

# --- Configuration for your Training Job ---
PROJECT_ID = "sentiment-analysis-steam" # Replace with your actual project ID
REGION = "us-west1"            # Choose your preferred GCP region
BUCKET_NAME = "steam-reviews-bucket-0" # Your GCS bucket name
DATA_URI = f"gs://{BUCKET_NAME}/steam_reviews_cleaned.csv" # Path to your cleaned data
TRAINING_IMAGE_URI = f"gcr.io/{PROJECT_ID}/steam-lr-review-trainer:latest" # Image from Step 5

# --- Initialize Vertex AI SDK ---
# 'staging_bucket' is used by Vertex AI for temporary files during job execution.
aiplatform.init(project=PROJECT_ID, location=REGION, staging_bucket=f"gs://{BUCKET_NAME}/vertex_ai_staging")

# --- Define the Custom Training Job ---
job = aiplatform.CustomContainerTrainingJob(
    display_name='lr-tfidf-steam-sentiment-training',
    container_uri=TRAINING_IMAGE_URI,
    # This is the pre-built serving container for scikit-learn models.
    # It will be used if you deploy the model directly via this 'model' object later.
    model_serving_container_image_uri="us-docker.pkg.dev/vertex-ai/prediction/sklearn-cpu.1-0:latest" # Use this for scikit-learn
)

print(f"Submitting training job using image: {TRAINING_IMAGE_URI}...")
model = job.run(
    replica_count=1, # Number of worker replicas (1 for single instance)
    machine_type='n1-standard-4', # A general purpose machine type. Adjust if you need more CPU/RAM.
                                   # For TF-IDF, ample RAM is often more important than raw CPU count.
                                   # Consider 'n1-standard-8' or 'n1-standard-16' if your dataset is very large.
    # No accelerator_type/count needed here as LR+TF-IDF runs on CPU
    
    # --- Pass Arguments to your task.py script ---
    args=[
        f"--project-id={PROJECT_ID}",
        f"--bucket-name={BUCKET_NAME}",
        f"--data-uri={DATA_URI}"
    ]
)

print(f"\n✅ Training job submitted! Job ID: {job.resource_name.split('/')[-1]}")
print(f"You can monitor its progress in the Google Cloud Console: Vertex AI > Training > Custom jobs")

# Once the job completes, 'model' object will contain information about the saved model artifact.
# You can inspect model.uri to see where the model was saved.
print(f"\nModel artifacts will be saved to: {model.uri} (and sub-timestamped folders)")

Submitting training job using image: gcr.io/sentiment-analysis-steam/steam-lr-review-trainer:latest...
Training Output directory:
gs://steam-reviews-bucket-0/vertex_ai_staging/aiplatform-custom-training-2025-07-08-04:02:30.354 
View Training:
https://console.cloud.google.com/ai/platform/locations/us-west1/training/5881656308247035904?project=1063155306158
CustomContainerTrainingJob projects/1063155306158/locations/us-west1/trainingPipelines/5881656308247035904 current state:
PipelineState.PIPELINE_STATE_RUNNING
View backing custom job:
https://console.cloud.google.com/ai/platform/locations/us-west1/training/7351940849095147520?project=1063155306158
CustomContainerTrainingJob projects/1063155306158/locations/us-west1/trainingPipelines/5881656308247035904 current state:
PipelineState.PIPELINE_STATE_RUNNING
CustomContainerTrainingJob projects/1063155306158/locations/us-west1/trainingPipelines/5881656308247035904 current state:
PipelineState.PIPELINE_STATE_RUNNING
CustomContainerTrainingJo

RuntimeError: Training failed with:
code: 5
message: "There are no files under \"gs://steam-reviews-bucket-0/vertex_ai_staging/aiplatform-custom-training-2025-07-08-04:02:30.354/model\" to copy."
