## Local Training & Experiments

In [46]:
!pip3 install xgboost google-cloud-aiplatform scikit-learn pandas-gbq joblib



In [1]:
import os
import re
import uuid

import joblib
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.figure_factory as ff
import tqdm
import xgboost as xgb
from google.cloud import aiplatform, bigquery
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    confusion_matrix,
    f1_score,
    make_scorer,
    precision_score,
    recall_score,
    roc_auc_score,
)
from sklearn.model_selection import (
    GridSearchCV,
    StratifiedKFold,
    cross_val_score,
    train_test_split,
)
from sklearn.preprocessing import StandardScaler

In [24]:
# Notebook-wide configuration.
PROJECT_ID = "planar-pagoda-425919-f0"  # Google Cloud project that owns the data and jobs
# Vertex AI region, stated explicitly instead of relying on the SDK default
# that an empty string falls back to.
LOCATION = "us-central1"
DATASET_URL = "ml.scam_features"  # BigQuery table as <dataset>.<table>, resolved against PROJECT_ID
MODEL_DIR = "./model"  # local directory for serialized models
EXPERIMENT_NAME = "local-dev"  # experiment name passed to aiplatform.init

In [25]:
# Initialise the Vertex AI SDK with an experiment: the training cell below calls
# aiplatform.start_run / log_params / log_metrics, which need one to attach to.
aiplatform.init(project=PROJECT_ID, location=LOCATION, experiment=EXPERIMENT_NAME)
# BigQuery client used to download the feature table.
bq_client = bigquery.Client(project=PROJECT_ID)

In [26]:
# get data from dataset
def download_table(
    bq_table_uri: str, client: "bigquery.Client | None" = None
) -> "pd.DataFrame":
    """Download a whole BigQuery table into a pandas DataFrame.

    Args:
        bq_table_uri: Table identifier, with or without a leading ``bq://``
            scheme (the scheme is stripped before the lookup).
        client: BigQuery client to use. When omitted, the notebook-level
            ``bq_client`` is used, which keeps existing one-argument calls
            working unchanged.

    Returns:
        The result of ``list_rows(table).to_dataframe()`` for the table.
    """
    # Resolve the client lazily so the global is only touched when needed.
    active_client = bq_client if client is None else client

    prefix = "bq://"
    if bq_table_uri.startswith(prefix):
        bq_table_uri = bq_table_uri[len(prefix) :]

    table = active_client.get_table(bq_table_uri)
    return active_client.list_rows(table).to_dataframe()


# Load the table named by DATASET_URL into a DataFrame via download_table.
df = download_table(DATASET_URL)

Forbidden: 403 GET https://bigquery.googleapis.com/bigquery/v2/projects/planar-pagoda-425919-f0/datasets/ml/tables/scam_features?prettyPrint=false: Access Denied: Table planar-pagoda-425919-f0:ml.scam_features: Permission bigquery.tables.get denied on table planar-pagoda-425919-f0:ml.scam_features (or it may not exist).

In [58]:
# Separate the `is_scam` label column from the remaining feature columns.
y = df["is_scam"]
X = df.drop(columns="is_scam")

# Hold out 20% of the rows for evaluation. Stratifying on the label keeps the
# class ratio the same in both partitions; the fixed seed makes the split
# repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# Learn the scaling statistics from the training partition only, then apply
# the same transformation to both partitions.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [None]:
# Train a baseline logistic-regression model and record parameters, metrics and
# the model under a new experiment run. Using the run as a context manager
# ensures it is ended even if training or logging raises.
with aiplatform.start_run(f"run{uuid.uuid1()}"):
    params = {"class_weight": "balanced", "max_iter": 1000}

    model = LogisticRegression(**params)
    model.fit(X_train_scaled, y_train)

    aiplatform.log_params(params)

    # Hard labels feed the threshold-based metrics (F1, recall) ...
    y_pred = model.predict(X_test_scaled)
    # ... while ROC AUC is a ranking metric and needs the predicted probability
    # of the positive class (second column of predict_proba), not 0/1 labels.
    y_score = model.predict_proba(X_test_scaled)[:, 1]

    f1 = f1_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    roc_auc = roc_auc_score(y_test, y_score)

    metrics = {"f1": f1, "recall": recall, "roc_auc": roc_auc}
    print(metrics)
    aiplatform.log_metrics(metrics)

    # Persist the fitted model locally.
    # NOTE(review): the model was trained on scaled features but the fitted
    # `scaler` is not saved here -- confirm whether it should be stored too.
    os.makedirs(MODEL_DIR, exist_ok=True)
    model_path = f"{MODEL_DIR}/model.joblib"
    joblib.dump(model, model_path)

    aiplatform.save_model(model=model)

## Submit the training job to Vertex AI

In [92]:
shell_output = ! gcloud projects describe  $PROJECT_ID
print("shell_output=", shell_output)
project_number = shell_output[-1].split(":")[1].strip().replace("'", "")

SERVICE_ACCOUNT = f"{project_number}-compute@developer.gserviceaccount.com"

print("Service Account:", SERVICE_ACCOUNT)

shell_output= ["createTime: '2024-06-09T19:03:35.409500Z'", 'lifecycleState: ACTIVE', 'name: My First Project', 'projectId: planar-pagoda-425919-f0', "projectNumber: '475961067334'"]
Service Account: 475961067334-compute@developer.gserviceaccount.com


In [93]:
# Cloud Storage bucket URI derived from the project ID; used below for the
# trainer package, as the SDK staging bucket and as the job's --model-dir.
BUCKET_URI = f"gs://mlops-{PROJECT_ID}-unique"

In [96]:
! gsutil mb $BUCKET_URI

Creating gs://mlops-planar-pagoda-425919-f0-unique/...


In [127]:
! tar cvf trainer.tar .
! gzip trainer.tar
! gsutil cp trainer.tar.gz $BUCKET_URI/trainer.tar.gz

./
./README.md
./Dockerfile
./setup.cfg
./.ipynb_checkpoints/
./.ipynb_checkpoints/local_training-checkpoint.ipynb
./.ipynb_checkpoints/setup-checkpoint.py
tar: ./trainer.tar: file is the archive; not dumped
./trainer/
./trainer/__init__.py
./trainer/.ipynb_checkpoints/
./trainer/.ipynb_checkpoints/task-checkpoint.py
./trainer/task.py
./setup.py
./local_training.ipynb
Copying file://trainer.tar.gz [Content-Type=application/x-tar]...
/ [1 files][ 18.7 KiB/ 18.7 KiB]                                                
Operation completed over 1 objects/18.7 KiB.                                     


In [128]:
# Single source of truth for the region: use LOCATION when it is set, otherwise
# fall back to us-central1, and pass the same value to the SDK and the image URI.
REGION = LOCATION or "us-central1"
DISPLAY_NAME = "example"
# The image host is derived from the first component of the region name
# (e.g. "us" for "us-central1").
# NOTE(review): this is a TensorFlow CPU image while this notebook trains
# scikit-learn models -- confirm the trainer package installs what it needs.
TRAIN_IMAGE = "{}-docker.pkg.dev/vertex-ai/training/tf-cpu.2-12.py310:latest".format(
    REGION.split("-")[0]
)
TRAIN_COMPUTE = "n1-standard-4"

aiplatform.init(
    project=PROJECT_ID,
    location=REGION,
    experiment=EXPERIMENT_NAME,
    staging_bucket=BUCKET_URI,
)

# Training job that runs the `trainer.task` module from the uploaded package
# inside the container image selected above.
job = aiplatform.CustomPythonPackageTrainingJob(
    display_name=DISPLAY_NAME,
    python_package_gcs_uri=f"{BUCKET_URI}/trainer.tar.gz",
    python_module_name="trainer.task",
    container_uri=TRAIN_IMAGE,
)

In [129]:
# Use a fresh, uniquely named experiment for this submission.
# Note: this rebinds EXPERIMENT_NAME for every cell that runs afterwards.
EXPERIMENT_NAME = f"example-{uuid.uuid1()}"
aiplatform.init(experiment=EXPERIMENT_NAME)

# Command-line arguments forwarded to the trainer module. The dataset URI is
# built from PROJECT_ID and DATASET_URL so it always names the same table the
# local cells use, instead of repeating the project ID as a literal.
CMDARGS = [
    f"--model-dir={BUCKET_URI}",
    f"--dataset-uri=bq://{PROJECT_ID}.{DATASET_URL}",
    f"--experiment={EXPERIMENT_NAME}",
    "--run=run-1",
]

# sync=True blocks this cell until the job finishes.
job.run(
    args=CMDARGS,
    replica_count=1,
    machine_type=TRAIN_COMPUTE,
    service_account=SERVICE_ACCOUNT,
    sync=True,
)

Training Output directory:
gs://mlops-planar-pagoda-425919-f0-unique/aiplatform-custom-training-2024-06-11-23:26:54.624 
View Training:
https://console.cloud.google.com/ai/platform/locations/us-central1/training/5503252818837897216?project=475961067334
CustomPythonPackageTrainingJob projects/475961067334/locations/us-central1/trainingPipelines/5503252818837897216 current state:
PipelineState.PIPELINE_STATE_RUNNING
View backing custom job:
https://console.cloud.google.com/ai/platform/locations/us-central1/training/1038197671090716672?project=475961067334
CustomPythonPackageTrainingJob projects/475961067334/locations/us-central1/trainingPipelines/5503252818837897216 current state:
PipelineState.PIPELINE_STATE_RUNNING
CustomPythonPackageTrainingJob projects/475961067334/locations/us-central1/trainingPipelines/5503252818837897216 current state:
PipelineState.PIPELINE_STATE_RUNNING
CustomPythonPackageTrainingJob projects/475961067334/locations/us-central1/trainingPipelines/55032528188378972