# XGBoost

# Create the hyperparameter tuning project

In the project folder, create a `trainer` folder.

In [None]:
# -p keeps the cell re-runnable: no error when trainer/ already exists
! mkdir -p trainer

# Python fine tuning script

Then create the `task.py` file inside `trainer`; this is the script that trains XGBoost with the hyperparameters we want to tune.

In [1]:
%%writefile trainer/task.py
#first line creates the file in the trainer folder

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
import pandas as pd
from xgboost import XGBRegressor
from sklearn.ensemble import RandomForestRegressor
import tensorflow as tf
from tensorflow.keras.datasets import boston_housing
import argparse
import hypertune

def get_args_xgboost():
    """
        Parse the XGBoost hyperparameters given on the command line.

        Every option is mandatory: argparse exits with an error when one
        is missing or cannot be converted to its declared type.

        Returns:
            args (argparse.Namespace):
                learning_rate (float), n_estimators (int),
                max_depth (int) and subsample (float)
    """
    # (flag, type, help text) for each hyperparameter, in declaration order
    option_table = (
        ('--learning_rate', float, 'learning rate'),
        ('--n_estimators', int, 'n_estimators'),
        ('--max_depth', int, 'max_depth'),
        ('--subsample', float, 'subsample'),
    )

    cli = argparse.ArgumentParser()
    for flag, caster, description in option_table:
        cli.add_argument(flag, required=True, type=caster, help=description)

    return cli.parse_args()


def load_data():
    """
        Load the Boston housing dataset through Keras and wrap it in pandas objects.

        Returns:
            tuple: (x_train, y_train, x_test, y_test) where the features are
                pandas DataFrames with named columns and the targets are
                pandas Series
    """
    feature_names = [
        "CRIM", "ZN", "INDUS", "CHAS", "NOX", "RM", "AGE", "DIS", "RAD",
        "TAX", "PTRATIO", "B", "LSTAT"
    ]

    train_part, test_part = tf.keras.datasets.boston_housing.load_data(
        path="boston_housing.npz", test_split=0.2, seed=113
    )
    train_features, train_target = train_part
    test_features, test_target = test_part

    # Convert the raw datasets into pandas DataFrames / Series
    x_train_df = pd.DataFrame(train_features, columns=feature_names)
    y_train_series = pd.Series(train_target)
    x_test_df = pd.DataFrame(test_features, columns=feature_names)
    y_test_series = pd.Series(test_target)

    return x_train_df, y_train_series, x_test_df, y_test_series


def create_xgboost(n_estimators: int, max_depth: int, learning_rate: float, subsample: float):
    """
        Build an XGBRegressor configured with the given hyperparameters.

        The model is only instantiated here; it is not fitted.

        Args:
            n_estimators (int):
                forwarded to XGBRegressor(n_estimators=...)
            max_depth (int):
                forwarded to XGBRegressor(max_depth=...)
            learning_rate (float):
                forwarded to XGBRegressor(learning_rate=...)
            subsample (float):
                forwarded to XGBRegressor(subsample=...)

        Returns:
            XGBRegressor:
                the regressor carrying these hyperparameters
    """
    hyperparams = {
        "n_estimators": n_estimators,
        "max_depth": max_depth,
        "learning_rate": learning_rate,
        "subsample": subsample,
    }

    return XGBRegressor(**hyperparams)
    
    
def create_rfr(n_estimators: int, max_depth: int, min_samples_split: int, min_samples_leaf: int):
    """
        Build a RandomForestRegressor configured with the given hyperparameters.

        The model is only instantiated here; it is not fitted.

        Args:
            n_estimators (int):
                forwarded to RandomForestRegressor(n_estimators=...)
            max_depth (int):
                forwarded to RandomForestRegressor(max_depth=...)
            min_samples_split (int):
                forwarded to RandomForestRegressor(min_samples_split=...)
            min_samples_leaf (int):
                forwarded to RandomForestRegressor(min_samples_leaf=...)

        Returns:
            RandomForestRegressor:
                the regressor carrying these hyperparameters
    """
    forest_params = {
        "n_estimators": n_estimators,
        "max_depth": max_depth,
        "min_samples_split": min_samples_split,
        "min_samples_leaf": min_samples_leaf,
    }

    return RandomForestRegressor(**forest_params)



# Hyperparameters for this run, read from the command line
args_xgb = get_args_xgboost()

# The second split returned by load_data is used as the validation set
x_train, y_train, x_val, y_val = load_data()

# No tf.distribute strategy scope here: the model is an XGBRegressor, not a
# TensorFlow model, and this script defines no `strategy` object.
xgb = create_xgboost(n_estimators  = args_xgb.n_estimators,
                     max_depth     = args_xgb.max_depth,
                     learning_rate = args_xgb.learning_rate,
                     subsample     = args_xgb.subsample)

xgb.fit(x_train, y_train)

pred_xgb = xgb.predict(x_val)

hpt = hypertune.HyperTune()

# Report the validation R2 under the 'r2_score' tag
hpt.report_hyperparameter_tuning_metric(
    hyperparameter_metric_tag='r2_score',
    metric_value=r2_score(y_val, pred_xgb),)
    


Overwriting trainer/task.py


# Dockerfile

Then we will need to create a Dockerfile in order to create the container and send it to Google Cloud (Artifact Registry)

In [15]:
%%writefile Dockerfile

FROM gcr.io/deeplearning-platform-release/tf2-gpu.2-11

# Avoiding crash due to scikit-learn and xgboost installation
RUN apt-get update && apt-get install -y \
    build-essential \
    libatlas-base-dev \
    && rm -rf /var/lib/apt/lists/*

ENV APP_HOME=/app
WORKDIR $APP_HOME

# Copies the whole build context, including the trainer/ folder, into /app
COPY . ./

# requirements.txt must be present in the build context
RUN pip install -r requirements.txt

# ENTRYPOINT rather than CMD: the hyperparameter flags are passed as container
# arguments, which replace a CMD but are appended to an ENTRYPOINT.
# Runs trainer/task.py as a module from WORKDIR.
ENTRYPOINT ["python", "-m", "trainer.task"]

Overwriting Dockerfile


# Google Cloud 

Make sure the Google Cloud SDK is installed and that its commands (`gcloud`, `gsutil`) are available on your shell `PATH`.

### connect to your project

In [3]:
# Opens a browser for the OAuth flow and saves Application Default Credentials (ADC) locally
!gcloud auth application-default login

Your browser has been opened to visit:

    https://accounts.google.com/o/oauth2/auth?response_type=code&client_id=764086051850-6qr4p6gpi6hn506pt8ejuq83di341hur.apps.googleusercontent.com&redirect_uri=http%3A%2F%2Flocalhost%3A8085%2F&scope=openid+https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fuserinfo.email+https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fcloud-platform+https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fsqlservice.login&state=HNn13Spn42tingQCXKOC6KIarfVyM3&access_type=offline&code_challenge=GouxY7FVODQJvlTF_JC11BQkguB8LlxT9X0MNVy77LM&code_challenge_method=S256


Credentials saved to file: [/Users/avicenne/.config/gcloud/application_default_credentials.json]

These credentials will be used by any library that requests Application Default Credentials (ADC).

Quota project "trans-sunset-439207-f2" was added to ADC which can be used by Google client libraries for billing and quota. Note that some services may still bill the project owning the resource.


Updates are available for some Google C

In [8]:
# Google Cloud project used throughout the notebook
PROJECT_ID = "trans-sunset-439207-f2"
# Region used for the bucket and for Vertex AI
LOCATION = "us-central1"
# Name of the training image
IMAGE_NAME = "xgboost-hypertune"

# Locations derived from the values above
BUCKET_URI = "gs://" + PROJECT_ID + "-boston-house-pricing"
IMAGE_URI = "/".join(("gcr.io", PROJECT_ID, IMAGE_NAME))

In [9]:
# Select project
!gcloud config set project $PROJECT_ID

# Check if bucket already exist
!gsutil ls $BUCKET_URI

# If not create it
! gsutil mb -l {LOCATION} {BUCKET_URI}

Updated property [core/project].


### build Docker

In [14]:
# Build the image from the Dockerfile in the current directory and tag it with IMAGE_URI
# NOTE(review): on an ARM host (e.g. Apple Silicon) this may need `--platform linux/amd64` to run on Vertex AI — confirm
!docker build ./ -t $IMAGE_URI

[1A[1B[0G[?25l[+] Building 0.0s (0/0)  docker:desktop-linux
[?25h[1A[0G[?25l[+] Building 0.0s (0/1)                                    docker:desktop-linux
[?25h[1A[0G[?25l[+] Building 0.2s (1/2)                                    docker:desktop-linux
[34m => [internal] load build definition from Dockerfile                       0.1s
[0m[34m => => transferring dockerfile: 391B                                       0.0s
[0m => [internal] load metadata for gcr.io/deeplearning-platform-release/tf2  0.2s
[?25h[1A[1A[1A[1A[0G[?25l[+] Building 0.4s (1/2)                                    docker:desktop-linux
[34m => [internal] load build definition from Dockerfile                       0.1s
[0m[34m => => transferring dockerfile: 391B                                       0.0s
[0m => [internal] load metadata for gcr.io/deeplearning-platform-release/tf2  0.3s
[?25h[1A[1A[1A[1A[0G[?25l[+] Building 0.5s (1/2)                                    docker:desktop-li

In [None]:
# Push the tagged image to the registry referenced by IMAGE_URI
# NOTE(review): presumably requires `gcloud auth configure-docker` to have been run once — confirm
!docker push $IMAGE_URI

### Vertex AI hyperparameter tuning CustomJob

In [9]:
from google.cloud import aiplatform
from google.cloud.aiplatform import hyperparameter_tuning as hpt

In [25]:
aiplatform.init(project=PROJECT_ID, location=LOCATION, staging_bucket=BUCKET_URI)

# A single worker running the training container
worker_pool_specs = [
    {
        "machine_spec": {
            "machine_type": "n1-standard-4",
        },
        "replica_count": 1,
        # Same image that was built and pushed under IMAGE_URI
        "container_spec": {"image_uri": IMAGE_URI},
    }
]

# The key must match the metric tag reported by trainer/task.py
metric_spec = {"r2_score": "maximize"}

# Search space; each key is passed to the container as a --<name> argument.
# Scale names are lowercase ("linear", "log").
parameter_spec = {
    "learning_rate": hpt.DoubleParameterSpec(min=0.4, max=0.8, scale="log"),
    "n_estimators": hpt.IntegerParameterSpec(min=50, max=100, scale="linear"),
    "max_depth": hpt.IntegerParameterSpec(min=5, max=10, scale="linear"),
    "subsample": hpt.DoubleParameterSpec(min=0.4, max=0.8, scale="log"),
}

JOB_NAME = "house-pricing-hyperparam-job"

# Describes what each trial runs
my_custom_job = aiplatform.CustomJob(
    display_name=JOB_NAME,
    project=PROJECT_ID,
    worker_pool_specs=worker_pool_specs,
    staging_bucket=BUCKET_URI,
)

In [None]:
# Create and run HyperparameterTuningJob

# Wrap the CustomJob in a tuning job: at most 15 trials in total, 3 in parallel
hp_job = aiplatform.HyperparameterTuningJob(
    display_name=JOB_NAME,
    custom_job=my_custom_job,
    metric_spec=metric_spec,
    parameter_spec=parameter_spec,
    max_trial_count=15,
    parallel_trial_count=3,
)

# Launch the tuning job
hp_job.run()

### Get hyperparams

In [None]:
# `get` expects the job's resource name (or ID), not its display name, so use
# the resource name of the tuning job created in this notebook.
hpt_job = aiplatform.HyperparameterTuningJob.get(
        resource_name=hp_job.resource_name,
    )
