# Train XGBoost

In the project folder create a trainer folder

In [11]:
! mkdir trainer

mkdir: trainer: File exists


# Python fine tuning script

Then create in trainer your task.py fil in which we want to fine tune hyperparameters for XGBoost

In [12]:
%%writefile trainer/train.py
#first line creates the file in the trainer folder

from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
import pandas as pd
from xgboost import XGBRegressor
from sklearn.ensemble import RandomForestRegressor
import numpy
import configparser
from google.cloud import storage
import sys
sys.path.append("..")
import tensorflow as tf
from tensorflow.keras.datasets import boston_housing


JOB_NAME  = "house-pricing-train-job"
PROJECT   = "trans-sunset-439207-f2"
LOCATION  = "us-central1"
BUCKET_ID = "trans-sunset-439207-f2-house-pricing"

parser = argparse.ArgumentParser()
parser.add_argument('--l_r', dest='lr', default=0.001, type=float, help='learning rate')
parser.add_argument('--n_estimators', dest='n_estimators', default=100, type=int, help='n_estimators')
parser.add_argument( '--max_depth',dest='max_depth',  default=6, type=int, help='max_depth')
parser.add_argument( '--subsample', dest='subsample', default=0.8, type=float, help='subsample')

args = parser.parse_args()


def save_model_to_bucket(model_trained : XGBRegressor):
    """Save the trained model to the bucket

    Args:
        model_trained (XGBRegressor): 
            trained model
    """
    
    # save it to the bucket
    bst = model_trained.get_booster()
    bst.save_model('/tmp/xgboost_model.bst')
    storage_client = storage.Client()

    bucket = storage_client.bucket(BUCKET_ID)
    blob = bucket.blob("xgboost-model/model_xgboost.bst")

    # Uploader le fichier local vers le blob
    blob.upload_from_filename('/tmp/xgboost_model.bst')
    


def create_xgboost(n_estimators: int, 
                   max_depth: int, 
                   learning_rate: float, 
                   subsample: float):
    """
    create the xgboost regressor model with hyperparameters
    
    Args:
        n_estimators (int):
        
        max_depth (int):
        
        learning_rate (float):
        
        subsample (float):
        
        
    Returns:
        xgb (XGBRegressor):
            model init with hyperparameters

    """
    xgb = XGBRegressor(n_estimators= n_estimators, 
                       max_depth=max_depth, 
                       learning_rate=learning_rate, 
                       subsample=subsample)
    return xgb




def train_xgboost(x_train: pd.DataFrame, x_val: pd.DataFrame, y_train: pd.Series, y_val: pd.Series):
    """
        train and then save the xgboost model

        Args:
            x_train (pd.DataFrame):
                    training dataset
                    
            x_val (pd.DataFrame):
                validation dataset
                
            y_train (pd.Series): 
                label for training dataset
                
            y_val (pd.Series):
                label for validation dataset

        Returns:
            XGBRegressor: 
                trained model
        
        Raise:
        ------
            - ValueType Error
            - if input are not numpy.array
    """
    
    if not isinstance(x_train, pd.DataFrame) or not isinstance(x_val, pd.DataFrame) or not isinstance(y_train, pd.Series) or not isinstance(y_val, pd.Series):
        raise TypeError(f"Wrong type for data, expected pd.DataFrame or pd.Series got x_train: {type(x_train).__name__}, "
                        f"x_val: {type(x_val).__name__}, y_train: {type(y_train).__name__}, "
                        f"y_val: {type(y_val).__name__}")
        
        
    # get params from bucket
    l_r, n_estimators, max_d, subsamples = get_hyperparams_xgboost()
    
    xgb = create_xgboost(n_estimators, max_d, l_r, subsamples) 
    xgb.fit(x_train, y_train)
    pred = xgb.predict(x_val)
    score = r2_score(y_val, pred)
    
    return xgb

def load_data():
    
    (x_train, y_train), (x_test, y_test) = tf.keras.datasets.boston_housing.load_data(
        path="boston_housing.npz", test_split=0.2, seed=113
    )

    columns = [
        "CRIM", "ZN", "INDUS", "CHAS", "NOX", "RM", "AGE", "DIS", "RAD", 
        "TAX", "PTRATIO", "B", "LSTAT"
    ]

    # Convertir les ensembles de données en DataFrames pandas
    return pd.DataFrame(x_train, columns=columns), pd.Series(y_train), pd.DataFrame(x_test, columns=columns), pd.Series(y_test)


with strategy.scope():
  # Creation of dataset, and model building/compiling need to be within
  # `strategy.scope()`.
  xgboost = create_xgboost(args.n_estimators, args.max_depth, args.l_r, args.subsample)
    
x_train, y_train, x_val, y_val = load_data()

# train and save the new model
xgb_trained = train_xgboost(x_train, x_val, y_train, y_val)

save_model_to_bucket(xgb_trained)
    




Overwriting trainer/train.py


# Dockerfile

Then we will need to create a Dockerfile in order to create the container and send it to Google Cloud (Artifact Registry)

In [13]:
%%writefile Dockerfile

FROM python:3.9-slim

# Avoiding crash due to scikit-learn and xgboost installation 
RUN apt-get update && apt-get install -y \
    build-essential \
    libatlas-base-dev \
    && rm -rf /var/lib/apt/lists/*

ENV APP_HOME /app
WORKDIR $APP_HOME
COPY . ./

RUN pip install -r requirements.txt
CMD ["python", "trainer/train.py"]

Overwriting Dockerfile


# Google Cloud 

### auth and connect to your project

In [2]:
!gcloud auth application-default login

Your browser has been opened to visit:

    https://accounts.google.com/o/oauth2/auth?response_type=code&client_id=764086051850-6qr4p6gpi6hn506pt8ejuq83di341hur.apps.googleusercontent.com&redirect_uri=http%3A%2F%2Flocalhost%3A8085%2F&scope=openid+https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fuserinfo.email+https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fcloud-platform+https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fsqlservice.login&state=H7u8cIqhI06fBpw0VQPU152LGQ7qg7&access_type=offline&code_challenge=ZwxP9WzyfcHfdRHrysLZC5EM3KEdsYnWUJJrpc8_r6o&code_challenge_method=S256


Credentials saved to file: [/Users/avicenne/.config/gcloud/application_default_credentials.json]

These credentials will be used by any library that requests Application Default Credentials (ADC).

Quota project "trans-sunset-439207-f2" was added to ADC which can be used by Google client libraries for billing and quota. Note that some services may still bill the project owning the resource.


In [3]:
PROJECT_ID = "trans-sunset-439207-f2"
BUCKET_URI = f"gs://{PROJECT_ID}-boston-house-pricing" 
LOCATION = "us-central1"
IMAGE_NAME = "xgboost-train"
IMAGE_URI = f"gcr.io/{PROJECT_ID}/{IMAGE_NAME}"

In [4]:
# select the project
!gcloud config set project $PROJECT_ID

# check if bucket already exist
!gsutil ls gs://dist-$PROJECT_ID-unique

#if not create it
! gsutil mb -l $LOCATION $BUCKET_URI

Updated property [core/project].
gs://dist-trans-sunset-439207-f2-unique/xgboost/
Creating gs://trans-sunset-439207-f2-boston-house-pricing/...
ServiceException: 409 A Cloud Storage bucket named 'trans-sunset-439207-f2-boston-house-pricing' already exists. Try another name. Bucket names must be globally unique across all Google Cloud projects, including those outside of your organization.


### Docker Build

In [5]:
!docker build ./ -t $IMAGE_URI

[1A[1B[0G[?25l[+] Building 0.0s (0/1)                                    docker:desktop-linux
[?25h[1A[0G[?25l[+] Building 0.2s (1/2)                                    docker:desktop-linux
[34m => [internal] load build definition from Dockerfile                       0.0s
[0m[34m => => transferring dockerfile: 368B                                       0.0s
[0m => [internal] load metadata for docker.io/library/python:3.9-slim         0.2s
[?25h[1A[1A[1A[1A[0G[?25l[+] Building 0.3s (1/2)                                    docker:desktop-linux
[34m => [internal] load build definition from Dockerfile                       0.0s
[0m[34m => => transferring dockerfile: 368B                                       0.0s
[0m => [internal] load metadata for docker.io/library/python:3.9-slim         0.3s
[?25h[1A[1A[1A[1A[0G[?25l[+] Building 0.5s (1/2)                                    docker:desktop-linux
[34m => [internal] load build definition from Dockerfile     

In [8]:
!docker push $IMAGE_URI

Using default tag: latest
The push refers to repository [gcr.io/trans-sunset-439207-f2/xgboost-train]

[1B73343c28: Preparing 
[1B68cd70bc: Preparing 
[1B93120a04: Preparing 
[1B52a835b1: Preparing 
[1B044002b3: Preparing 
[1B64a1b270: Preparing 
[1B574d28ce: Preparing 
[3B64a1b270: Waiting g denied: Unauthenticated request. Unauthenticated requests do not have permission "artifactregistry.repositories.uploadArtifacts" on resource "projects/trans-sunset-439207-f2/locations/us/repositories/gcr.io" (or it may not exist)


### Vertex Ai training

In [5]:
from google.cloud import aiplatform
from google.cloud.aiplatform import hyperparameter_tuning as hpt
import vertexai.preview

In [9]:
aiplatform.init(project=PROJECT_ID, staging_bucket=BUCKET_URI)

worker_pool_specs = [
    {
        "machine_spec": {
            "machine_type": "n1-standard-4",
        },
        "replica_count": 1,
        "container_spec": {"image_uri": IMAGE_URI},
    }
]


# Create a CustomJob

JOB_NAME = "xgboost-train-job"

train_job = aiplatform.CustomJob(
    display_name=IMAGE_NAME,
    project=PROJECT_ID,
    worker_pool_specs=worker_pool_specs,
    staging_bucket=BUCKET_URI,
)



In [None]:
# Load the hyperparams from the previous Xgboost CustomJob
JOB_NAME = "house-pricing-hyperparam-job"

hpt_job = aiplatform.HyperparameterTuningJob.get(
        resource_name=JOB_NAME,
    )

"""CMDARGS = [
        "--l_r=" + ,
        "--n_estimators=" + ,
        "--max_depth=" + ,
        "--subsample=" + ,
    ]"""
    

In [10]:
train_job.run()

Creating CustomJob


ResourceExhausted: 429 The following quota metrics exceed quota limits: aiplatform.googleapis.com/custom_model_training_cpus

In [13]:
# create an endpoint 
# The endpoint resource provides the service endpoint (URL) you use to request for the prediction
endpoint = aiplatform.Endpoint.create(
  display_name="xgboost_model",
  dedicated_endpoint_enabled=True,
)

Creating Endpoint
Create Endpoint backing LRO: projects/703919146685/locations/us-central1/endpoints/5957366205034528768/operations/4765707428312383488
Endpoint created. Resource name: projects/703919146685/locations/us-central1/endpoints/5957366205034528768
To use this Endpoint in another session:
endpoint = aiplatform.Endpoint('projects/703919146685/locations/us-central1/endpoints/5957366205034528768')


In [14]:
!gcloud endpoints services list

Listed 0 items.


In [15]:
# deploy model
# When you deploy a model to an endpoint, you associate physical (machine) resources with that model so 
# it can serve online predictions. Online predictions have low latency requirements. Providing resources to the model in advance reduces latency.

aiplatform.init(project=PROJECT_ID, location=LOCATION)

# model doesn't exist yet because CPU quotas error when I start a CustomJob
model = aiplatform.Model(model_name=IMAGE_NAME)

model.deploy(
    endpoint=endpoint,
    deployed_model_display_name=IMAGE_NAME + "-deploy",
    traffic_percentage="0.2",
    traffic_split="0.2",
    machine_type="n1-standard-4",
)


NotFound: 404 The Model does not exist.