In [1]:
# Cell 0: Navigate to Project Root
import os

# --- QUESTION ---
# Please confirm your project's root directory.
# Based on our previous discussions, this is likely:
PROJECT_ROOT_PATH = "/home/jupyter/SentimentAnalysis-Steam/" 
# ----------------

# Change the current working directory to the project root
%cd {PROJECT_ROOT_PATH}

print(f"Current working directory set to: {os.getcwd()}")
print("Listing contents of current directory to verify setup:")
!ls -F

# Expected output from !ls -F should show your service folder:
# ingestion_service/
# ... (and other project folders like lr_tfidf_trainer/, notebooks/, etc.)

/home/jupyter/SentimentAnalysis-Steam
Current working directory set to: /home/jupyter/SentimentAnalysis-Steam
Listing contents of current directory to verify setup:
Dockerfile  app/		     lr_tfidf_trainer/	src/
LICENSE     cloudrun_launcher.ipynb  models/		steam_sentiment_dbt/
Notebooks/  ingestion_app_local.log  reviews_data/
README.md   ingestion_service/	     spark_lr_service/


  self.shell.db['dhist'] = compress_dhist(dhist)[-100:]


In [2]:
# Cell 1: Global Configuration Variables
# These variables are used across various deployment steps.

# --- QUESTION ---
# Please confirm your Google Cloud Project ID.
# Based on our previous discussions, this is:
PROJECT_ID = "sentiment-analysis-steam" 
# ----------------

# --- QUESTION ---
# Please confirm the Google Cloud Region you are using for Cloud Run and your GCS bucket.
# Based on our previous discussions, this is:
REGION = "us-west1"                    
# ----------------

# --- QUESTION ---
# Please confirm the name of your GCS bucket.
# Based on our previous discussions, this is:
BUCKET_NAME = "steam-reviews-bucket-0" 
# ----------------

# --- QUESTION ---
# Please confirm the exact file path for your *cleaned* data CSV within your GCS bucket.
# This will be used later in the pipeline for training, but defined here as a common variable.
# Based on our previous discussions, this is:
GCS_DATA_FILE_PATH = "steam_reviews_cleaned.csv" 
# ----------------

# --- QUESTION ---
# Please confirm your BigQuery Project ID (often the same as your PROJECT_ID).
# Based on our previous discussions, this is:
BQ_PROJECT_ID = PROJECT_ID             
# ----------------

# --- QUESTION ---
# Please confirm the BigQuery Dataset ID where your raw data will be stored.
# Based on our previous discussions, this is:
BQ_DATASET_ID = "steam_reviews"        
# ----------------

# --- QUESTION ---
# Please confirm the BigQuery Table ID for your raw ingested data.
# Based on our previous discussions, this is:
BQ_RAW_TABLE_ID = "raw_reviews"        
# ----------------

# Set the gcloud project config for subsequent shell commands
!gcloud config set project {PROJECT_ID}

print(f"Global Configuration Set:")
print(f"  Project ID: {PROJECT_ID}")
print(f"  Region: {REGION}")
print(f"  GCS Bucket: {BUCKET_NAME}")
print(f"  GCS Clean Data Path: {GCS_DATA_FILE_PATH}")
print(f"  BigQuery Project: {BQ_PROJECT_ID}, Dataset: {BQ_DATASET_ID}, Raw Table: {BQ_RAW_TABLE_ID}")
print(f"\n✅ gcloud project set to {PROJECT_ID}")

Updated property [core/project].
Global Configuration Set:
  Project ID: sentiment-analysis-steam
  Region: us-west1
  GCS Bucket: steam-reviews-bucket-0
  GCS Clean Data Path: steam_reviews_cleaned.csv
  BigQuery Project: sentiment-analysis-steam, Dataset: steam_reviews, Raw Table: raw_reviews

✅ gcloud project set to sentiment-analysis-steam


## Data Ingestion Service

In [3]:
# Cell 2: Ingestion Service Configuration Variables

# Name shared by the Data Ingestion Docker image and its Cloud Run service.
INGESTION_IMAGE_NAME = "steam-review-ingestion-service"

# Fully qualified image reference under the project's gcr.io path, tagged "latest".
FULL_IMAGE_URI = "gcr.io/{}/{}:latest".format(PROJECT_ID, INGESTION_IMAGE_NAME)

# Echo the derived values so they can be checked before building.
ingestion_config_summary = [
    "Ingestion Service Configuration Set:",
    "  Image Name: " + INGESTION_IMAGE_NAME,
    "  Full Image URI: " + FULL_IMAGE_URI,
]
for summary_line in ingestion_config_summary:
    print(summary_line)

Ingestion Service Configuration Set:
  Image Name: steam-review-ingestion-service
  Full Image URI: gcr.io/sentiment-analysis-steam/steam-review-ingestion-service:latest


In [32]:
# Cell 3: Build and Push Ingestion Docker Image

print(f"Building and pushing Docker image: {FULL_IMAGE_URI}...")

# This command executes Cloud Build to build your image from the Dockerfile
# located in the './ingestion_service' directory (relative to your project root).
# Ensure your 'ingestion_service' folder is present in your project root as confirmed in Cell 0.
!gcloud builds submit --tag {FULL_IMAGE_URI} ./ingestion_service

print(f"\n✅ Docker image built and pushed: {FULL_IMAGE_URI}")

Building and pushing Docker image: gcr.io/sentiment-analysis-steam/steam-review-ingestion-service:latest...
Creating temporary archive of 8 file(s) totalling 56.5 KiB before compression.
Uploading tarball of [./ingestion_service] to [gs://sentiment-analysis-steam_cloudbuild/source/1752014033.938358-7662153a19d14bb2a938fcf2679c6174.tgz]
Created [https://cloudbuild.googleapis.com/v1/projects/sentiment-analysis-steam/locations/global/builds/5597a822-728e-4b82-858d-2ab31a147b9d].
Logs are available at [ https://console.cloud.google.com/cloud-build/builds/5597a822-728e-4b82-858d-2ab31a147b9d?project=1063155306158 ].
Waiting for build to complete. Polling interval: 1 second(s).
----------------------------- REMOTE BUILD OUTPUT ------------------------------
starting build "5597a822-728e-4b82-858d-2ab31a147b9d"

FETCHSOURCE
Fetching storage object: gs://sentiment-analysis-steam_cloudbuild/source/1752014033.938358-7662153a19d14bb2a938fcf2679c6174.tgz#1752014034158443
Copying gs://sentiment-ana

In [33]:
# Cell 4: Deploy Ingestion Cloud Run Service

import requests # Required for health check
import time # Required for sleep

print(f"Deploying Cloud Run service: {INGESTION_IMAGE_NAME} to region {REGION}...")

# IMPORTANT: Permissions for Cloud Run Service Account
# By default, Cloud Run uses the Compute Engine default service account for your project.
# Ensure this service account (YOUR_PROJECT_NUMBER-compute@developer.gserviceaccount.com) has:
# - Storage Object Admin role on your GCS_BUCKET_NAME
# - BigQuery Data Editor role on your BQ_PROJECT_ID (or specific dataset/table)
# - BigQuery Job User role on your BQ_PROJECT_ID

# IMPORTANT: Authentication for Cloud Run
# --allow-unauthenticated makes the service publicly accessible (e.g., for Cloud Scheduler HTTP target).
# For production, you'd typically remove this and configure Cloud Scheduler with OIDC authentication for security.

command = (
    "gcloud run deploy {} "
    "--image {} "
    "--region {} "
    "--platform managed "
    "--allow-unauthenticated " # Allows unauthenticated access
    "--set-env-vars GCS_BUCKET_NAME={},GCS_DATA_FILE_PATH={},BQ_PROJECT_ID={},BQ_DATASET_ID={},BQ_RAW_TABLE_ID={},REGION={} " # Pass REGION as env var
    "--memory 2Gi " # Adjust memory as needed
    "--timeout 900s"  # Increase timeout if startup takes longer (e.g., if many external API calls at startup)
).format(
    INGESTION_IMAGE_NAME,
    FULL_IMAGE_URI,
    REGION,
    BUCKET_NAME, GCS_DATA_FILE_PATH, BQ_PROJECT_ID, BQ_DATASET_ID, BQ_RAW_TABLE_ID, REGION # Pass REGION here
)

get_ipython().system(command)

print(f"\n✅ Cloud Run service '{INGESTION_IMAGE_NAME}' deployed.")
print("You can find its URL in the Cloud Run console or by running:")
print(f"gcloud run services describe {INGESTION_IMAGE_NAME} --region {REGION} --platform managed --format='value(status.url)'")

# --- Get the service URL reliably into a Python variable and perform a quick health check ---
# This part is modified to make CLOUD_RUN_SERVICE_URL available for the next cells.

CLOUD_RUN_SERVICE_URL = "" # Initialize variable

try:
    print("\nAttempting to get Cloud Run service URL and perform health check...")
    # Give Cloud Run a brief moment to ensure the URL is propagated/service is fully ready
    # Even after 'Deploying... Done.', a small delay can prevent 'URL not found' errors.
    time.sleep(10) # 10 seconds is usually enough

    # Use gcloud command to describe the service and extract its URL
    url_cmd_output = !gcloud run services describe {INGESTION_IMAGE_NAME} --region {REGION} --platform managed --format='value(status.url)'
    
    # Extract the URL from the command output
    CLOUD_RUN_SERVICE_URL = url_cmd_output[0].strip() if url_cmd_output else "URL_NOT_FOUND"
    
    print(f"Cloud Run Service URL: {CLOUD_RUN_SERVICE_URL}")

    # Perform a quick health check using the obtained URL
    if CLOUD_RUN_SERVICE_URL and CLOUD_RUN_SERVICE_URL != "URL_NOT_FOUND":
        # Send a GET request to the root path (/) for health check.
        # Your ingestion_app.py's root route only accepts POST.
        # However, for a general health check, Cloud Run often checks GET.
        # If your app needs a specific /healthz endpoint, you'd call that.
        # For now, a GET to root will likely result in a 405, but confirms connectivity.
        health_check_response = requests.get(CLOUD_RUN_SERVICE_URL)
        
        if health_check_response.status_code == 200:
            print("✅ Cloud Run Service Health Check: Responding OK (Status 200).")
        elif health_check_response.status_code == 405: # <-- CORRECTED LINE
            print(f"✅ Cloud Run Service Health Check: Responding 405 (Method Not Allowed). This is expected as / only accepts POST.")
        else:
            print(f"⚠️ Cloud Run Service Health Check: Status {health_check_response.status_code}. Check Cloud Run logs for details.")
    else:
        print("❌ Cloud Run Service URL not found. Cannot perform health check.")

except Exception as e:
    print(f"❌ Failed to get Cloud Run service URL or perform health check: {e}")

Deploying Cloud Run service: steam-review-ingestion-service to region us-west1...
Deploying container to Cloud Run service [[1msteam-review-ingestion-service[m] in project [[1msentiment-analysis-steam[m] region [[1mus-west1[m]
Deploying...                                                                   
  . Creating Revision...                                                       
  . Routing traffic...                                                         
  . Setting IAM Policy...                                                      
  Deploying...                                                                 



⠛ Deploying...                                                                 



⠹ Deploying...                                                                 



⠼ Deploying...                                                                 



⠶ Deploying...                                                                 



⠧ Deploying...                  

In [34]:
# Test your ingestion cloud run service before incorporating cloud scheduler.
# Check your BQ and Bucket to see if there is new data after running this cell.
!curl -X POST https://steam-review-ingestion-service-1063155306158.us-west1.run.app -H "Content-Type: application/json" -d "{}"

{"message":"Raw data ingested to BigQuery.","status":"success"}


In [42]:
# Cell 5: Create Cloud Scheduler Job
# Creates an HTTP Cloud Scheduler job that POSTs an empty JSON body to the
# ingestion Cloud Run service on a daily schedule.
import json
import shlex
import subprocess

# --- Get the URL of your deployed Cloud Run service ---
# Prefer the URL captured in Cell 4. Fall back to the previously confirmed URL
# only when Cell 4 did not produce a usable one (e.g. it was skipped or failed).
FALLBACK_CLOUD_RUN_SERVICE_URL = "https://steam-review-ingestion-service-jrwoiw7htq-uw.a.run.app"
if not str(globals().get("CLOUD_RUN_SERVICE_URL", "")).startswith("https://"):
    CLOUD_RUN_SERVICE_URL = FALLBACK_CLOUD_RUN_SERVICE_URL

print(f"Using Cloud Run Service URL: {CLOUD_RUN_SERVICE_URL}")


# Define the job name and schedule
SCHEDULER_JOB_NAME = "daily-steam-reviews-raw-ingestion"
SCHEDULER_SCHEDULE = "0 2 * * *" # Cron format: every day at 02:00 in SCHEDULER_TIME_ZONE (not UTC)
                                 # For testing, "0 * * * *" runs at the top of every hour
SCHEDULER_TIME_ZONE = "America/Los_Angeles" # Or your desired timezone

# JSON payload sent with every scheduled request.
SCHEDULER_MESSAGE_BODY = json.dumps({})

print(f"Creating Cloud Scheduler job: {SCHEDULER_JOB_NAME} to trigger {CLOUD_RUN_SERVICE_URL}...")

# Build the command as an argument list and run it without a shell: each value
# reaches gcloud exactly as written, whatever quotes, braces or spaces it
# contains (hand-quoting the JSON body in a shell string breaks as soon as the
# body is not empty).
scheduler_args = [
    "gcloud", "scheduler", "jobs", "create", "http", SCHEDULER_JOB_NAME,
    "--location", REGION,
    "--schedule", SCHEDULER_SCHEDULE,
    "--uri", CLOUD_RUN_SERVICE_URL,
    "--http-method", "POST",
    "--headers", "Content-Type=application/json",
    "--message-body", SCHEDULER_MESSAGE_BODY,
    "--time-zone", SCHEDULER_TIME_ZONE,
]

# Shell-quoted rendering of the same command, kept for logging / copy-paste.
command = " ".join(shlex.quote(arg) for arg in scheduler_args)

scheduler_result = subprocess.run(scheduler_args, capture_output=True, text=True)
if scheduler_result.stdout:
    print(scheduler_result.stdout, end="")
if scheduler_result.stderr:
    print(scheduler_result.stderr, end="")

# Report success only when gcloud succeeded. Note: `jobs create` is expected to
# fail when a job with this name already exists (e.g. when re-running this
# cell); in that case update or delete the existing job instead.
if scheduler_result.returncode != 0:
    raise RuntimeError(
        f"Creating Cloud Scheduler job '{SCHEDULER_JOB_NAME}' failed with exit code "
        f"{scheduler_result.returncode}. Command: {command}"
    )

print(f"\n✅ Cloud Scheduler job '{SCHEDULER_JOB_NAME}' created.")
print("You can view, manually run, and manage it in the Google Cloud Console: Cloud Scheduler.")

Using Cloud Run Service URL: https://steam-review-ingestion-service-jrwoiw7htq-uw.a.run.app
Creating Cloud Scheduler job: daily-steam-reviews-raw-ingestion to trigger https://steam-review-ingestion-service-jrwoiw7htq-uw.a.run.app...
attemptDeadline: 180s
httpTarget:
  body: e30=
  headers:
    Content-Type: application/json
    User-Agent: Google-Cloud-Scheduler
  httpMethod: POST
  uri: https://steam-review-ingestion-service-jrwoiw7htq-uw.a.run.app/
name: projects/sentiment-analysis-steam/locations/us-west1/jobs/daily-steam-reviews-raw-ingestion
retryConfig:
  maxBackoffDuration: 3600s
  maxDoublings: 5
  maxRetryDuration: 0s
  minBackoffDuration: 5s
schedule: 0 2 * * *
state: ENABLED
timeZone: America/Los_Angeles
userUpdateTime: '2025-07-08T23:37:58Z'

✅ Cloud Scheduler job 'daily-steam-reviews-raw-ingestion' created.
You can view, manually run, and manage it in the Google Cloud Console: Cloud Scheduler.


## dbt Data Transformation

In [45]:
# Step 1: One-Time Load of Kaggle Data to BigQuery (kaggle_historical_raw)
# This step gets your historical Kaggle dataset into BigQuery so your dbt models can access it alongside your new API data.
# Intended as a one-time load: re-running this cell replaces the table's
# contents (WRITE_TRUNCATE below) with a fresh load of the CSV.

# Cell: Load Kaggle Data to BigQuery (One-Time Run)
import json
from google.cloud import bigquery
from google.api_core.exceptions import NotFound # Raised by get_dataset() when the dataset does not exist

# Global variables: BUCKET_NAME, BQ_PROJECT_ID, BQ_DATASET_ID and REGION are assumed from Cell 1

KAGGLE_TABLE_ID = "kaggle_historical_raw" # Table name for Kaggle data in BigQuery
# This URI MUST point to where you've uploaded your 'steam_reviews_kaggle_cleaned.csv' file
GCS_KAGGLE_CSV_URI = f"gs://{BUCKET_NAME}/processed_data/historical_kaggle/steam_reviews_kaggle_cleaned.csv"

bq_client = bigquery.Client(project=BQ_PROJECT_ID)

# --- Define schema for Kaggle data. IMPORTANT: This MUST accurately match your CSV's columns and types. ---
# Add all columns present in your 'steam_reviews_kaggle_cleaned.csv' here.
# If your Kaggle CSV doesn't have a 'recommendationid', one will be generated by dbt later.
# If it has other IDs/timestamps that could be unique, include them for better deduplication.
kaggle_schema = [
    bigquery.SchemaField("app_id", "INT64", mode="NULLABLE"),
    bigquery.SchemaField("app_name", "STRING", mode="NULLABLE"),
    bigquery.SchemaField("review_text", "STRING", mode="NULLABLE"),
    bigquery.SchemaField("review_score", "INT64", mode="NULLABLE"), # Assuming 1 or 0
    bigquery.SchemaField("review_votes", "INT64", mode="NULLABLE"), # Assuming count of votes
    # If your CSV has more columns, add them here.
]

job_config = bigquery.LoadJobConfig(
    source_format=bigquery.SourceFormat.CSV,
    skip_leading_rows=1, # Skips the header row in your CSV
    autodetect=False,    # Use explicit schema defined above
    schema=kaggle_schema,
    write_disposition=bigquery.WriteDisposition.WRITE_TRUNCATE, # Overwrite table if it exists
)

# First, ensure the dataset exists.
# DatasetReference replaces the deprecated Client.dataset() helper; it names
# the same project/dataset pair explicitly.
dataset_ref = bigquery.DatasetReference(BQ_PROJECT_ID, BQ_DATASET_ID)
dataset = bigquery.Dataset(dataset_ref)
dataset.location = REGION # Use your project's region for dataset location
try:
    bq_client.get_dataset(dataset_ref)
    print(f"BigQuery Dataset {BQ_DATASET_ID} already exists.")
except NotFound:
    bq_client.create_dataset(dataset)
    print(f"Created BigQuery Dataset {BQ_DATASET_ID}.")
except Exception as e:
    print(f"ERROR: Failed to create/get BigQuery Dataset {BQ_DATASET_ID}: {e}")
    raise # Critical failure if dataset cannot be prepared

# Now load the table. The progress message is printed before the job is
# submitted so it appears even if submission itself fails.
print(f"Loading Kaggle data from {GCS_KAGGLE_CSV_URI} into {BQ_DATASET_ID}.{KAGGLE_TABLE_ID}...")
load_job = bq_client.load_table_from_uri(
    GCS_KAGGLE_CSV_URI,
    f"{BQ_PROJECT_ID}.{BQ_DATASET_ID}.{KAGGLE_TABLE_ID}",
    job_config=job_config
)
load_job.result() # Wait for the job to complete; raises if the load failed

print(f"✅ Loaded {load_job.output_rows} rows into {BQ_DATASET_ID}.{KAGGLE_TABLE_ID}.")

BigQuery Dataset steam_reviews already exists.
Loading Kaggle data from gs://steam-reviews-bucket-0/processed_data/historical_kaggle/steam_reviews_kaggle_cleaned.csv into steam_reviews.kaggle_historical_raw...
✅ Loaded 6226728 rows into steam_reviews.kaggle_historical_raw.


In [49]:
# Cell: Build and Push dbt Runner Docker Image after setting up all files in dbt_project folder
# (Assumes PROJECT_ID is defined in Cell 1)

DBT_IMAGE_NAME = "dbt-steam-reviews-runner"
FULL_DBT_IMAGE_URI = f"gcr.io/{PROJECT_ID}/{DBT_IMAGE_NAME}:latest"

print(f"Building and pushing dbt Runner Docker image: {FULL_DBT_IMAGE_URI}...")
# This command executes Cloud Build to build your image from the Dockerfile
# located in the './dbt_project' directory (relative to your project root).
!gcloud builds submit --tag {FULL_DBT_IMAGE_URI} ./dbt_project
print(f"\n✅ dbt Runner Docker image built and pushed: {FULL_DBT_IMAGE_URI}")

Building and pushing dbt Runner Docker image: gcr.io/sentiment-analysis-steam/dbt-steam-reviews-runner:latest...
Creating temporary archive of 16 file(s) totalling 14.7 KiB before compression.
Uploading tarball of [./dbt_project] to [gs://sentiment-analysis-steam_cloudbuild/source/1752026823.403737-b0ebb149e9564f9fa4a7645e8fd0a4d5.tgz]
Created [https://cloudbuild.googleapis.com/v1/projects/sentiment-analysis-steam/locations/global/builds/fc005f85-fdb3-48e0-bf5a-9e4022e7e1bf].
Logs are available at [ https://console.cloud.google.com/cloud-build/builds/fc005f85-fdb3-48e0-bf5a-9e4022e7e1bf?project=1063155306158 ].
Waiting for build to complete. Polling interval: 1 second(s).
----------------------------- REMOTE BUILD OUTPUT ------------------------------
starting build "fc005f85-fdb3-48e0-bf5a-9e4022e7e1bf"

FETCHSOURCE
Fetching storage object: gs://sentiment-analysis-steam_cloudbuild/source/1752026823.403737-b0ebb149e9564f9fa4a7645e8fd0a4d5.tgz#1752026823572493
Copying gs://sentiment-ana

In [51]:
# Cell: Trigger Cloud Build for dbt (Manual Test Run)
# (Assumes PROJECT_ID is defined in Cell 1)

print("Triggering Cloud Build job to run dbt and export cleaned data...")
# This command uses the 'cloudbuild.yaml' file in your current directory (project root).
!gcloud builds submit --config cloudbuild.yaml .

print("\n✅ Cloud Build job submitted. Check Cloud Build console for status.")
print(f"You can monitor it at: https://console.cloud.google.com/cloud-build/builds?project={PROJECT_ID}")

Triggering Cloud Build job to run dbt and export cleaned data...
Creating temporary archive of 91 file(s) totalling 132.4 MiB before compression.
Some files were not included in the source upload.

Check the gcloud log [/home/jupyter/.config/gcloud/logs/2025.07.09/02.15.41.658513.log] to see which files and the contents of the
default gcloudignore file used (see `$ gcloud topic gcloudignore` to learn
more).

Uploading tarball of [.] to [gs://sentiment-analysis-steam_cloudbuild/source/1752027341.823425-79552f7f2ec64ab88f0dcf7c8da9874a.tgz]
[1;31mERROR:[0m (gcloud.builds.submit) INVALID_ARGUMENT: generic::invalid_argument: invalid value for 'build.substitutions': key in the template "BQ_DATASET_ID" is not a valid built-in substitution

✅ Cloud Build job submitted. Check Cloud Build console for status.
You can monitor it at: https://console.cloud.google.com/cloud-build/builds?project=sentiment-analysis-steam
