# Final Project: XGBoost Model Training and Deployment on AWS SageMaker

This project demonstrates an end-to-end machine learning pipeline using Amazon SageMaker to train, tune, and deploy an XGBoost model for a classification task.


In [None]:
import pandas as pd
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sagemaker.tuner import HyperparameterTuner, ContinuousParameter, IntegerParameter
from sagemaker.xgboost import XGBoost
from sagemaker.model import Model
import boto3


### Data Preprocessing

We start by loading the dataset and performing basic preprocessing: dropping unnecessary columns, reducing the NAICS code to its first two digits, and removing rows with missing values. The processed data is then saved to disk.


In [None]:
from sklearn.datasets import fetch_openml

# Download the SBA-Loans-Case-Data-Set from OpenML
# NOTE(review): `df` is reassigned from the local CSV a few lines below, so
# this download does not feed into the processed output. Confirm which of the
# two sources is the intended one.
data = fetch_openml(data_id=43539, as_frame=True)
df = data.data

# Inspect the first few rows
df.head()


# Load the raw data from disk and discard the columns that are not used later.
df = pd.read_csv('data/raw_data.csv')
df.drop(columns=['Selected', 'ChgOffDate', 'LoanNr_ChkDgt', 'Name'], inplace=True)

# Remove incomplete rows *before* converting NAICS: a missing NAICS value
# stringifies to 'nan', and its two-character prefix 'na' cannot be cast to
# int, so converting first would raise on any row with a missing code.
df.dropna(axis=0, inplace=True)

# Keep only the first two characters of the NAICS code, stored as an integer.
df['NAICS'] = df['NAICS'].astype(str).str[:2].astype(int)

# Persist the cleaned frame without the pandas index column.
df.to_csv('data/processed_data.csv', index=False)


### Model Training

Next, we split the data into training and validation sets, train an XGBoost model, and save the model for later use.


In [None]:
# Separate the feature columns from the 'Default' target column.
X = df.drop(columns=['Default'])
y = df['Default']

# Hold out 20% of the rows for validation; the fixed seed keeps the split
# reproducible between runs.
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=42)

# Early stopping is configured on the estimator rather than in fit(): the
# fit(early_stopping_rounds=...) keyword was deprecated in XGBoost 1.6 and
# removed in 2.0, where passing it raises a TypeError.
model = xgb.XGBClassifier(
    objective='binary:logistic',
    eval_metric='logloss',
    early_stopping_rounds=10,
)

# The validation set is the evaluation set that early stopping monitors.
model.fit(X_train, y_train, eval_set=[(X_valid, y_valid)])
model.save_model('xgb_model.json')


### Hyperparameter Tuning

We use Amazon SageMaker to perform hyperparameter tuning, optimizing the model's performance by searching for the best hyperparameters.


In [None]:
# Script-mode XGBoost estimator: each tuning job runs train.py on a single
# ml.m5.xlarge instance using the 1.3-1 framework container.
xgb_estimator = XGBoost(entry_point='train.py',
                        role='your-role-arn',
                        instance_count=1,
                        instance_type='ml.m5.xlarge',
                        framework_version='1.3-1',
                        py_version='py3')

# Search space explored by the tuner.
hyperparameter_ranges = {
    'eta': ContinuousParameter(0, 1),
    'min_child_weight': ContinuousParameter(1, 10),
    'max_depth': IntegerParameter(1, 10)
}

# Log loss is a lower-is-better metric, but HyperparameterTuner defaults to
# objective_type='Maximize'. Without the explicit 'Minimize' the tuner would
# steer towards, and report as best, the job with the highest log loss.
tuner = HyperparameterTuner(estimator=xgb_estimator,
                            objective_metric_name='validation:logloss',
                            objective_type='Minimize',
                            hyperparameter_ranges=hyperparameter_ranges,
                            max_jobs=10,
                            max_parallel_jobs=2)

# Launch the tuning job with the training and validation channels from S3.
tuner.fit({'train': 's3://your-bucket/train', 'validation': 's3://your-bucket/validation'})


### Model Deployment

After tuning the hyperparameters, we deploy the trained model to an AWS SageMaker endpoint to make it accessible for inference.


# `from sagemaker.x import ...` does not bind the top-level name `sagemaker`,
# so it is imported here; without this the image_uris lookup below raises a
# NameError.
import sagemaker

# Wrap the model artifact stored in S3 together with the XGBoost 1.3-1
# inference image for the current session's region.
model = Model(model_data='s3://your-bucket/xgb_model.tar.gz',
              role='your-role-arn',
              image_uri=sagemaker.image_uris.retrieve('xgboost', sagemaker.Session().boto_region_name, version="1.3-1"))

# Create the endpoint backed by one ml.m5.xlarge instance.
# NOTE(review): per the SageMaker SDK docs, Model.deploy() returns None unless
# a predictor_cls is supplied, so `predictor` may not be usable as a client —
# confirm before relying on it.
predictor = model.deploy(initial_instance_count=1, instance_type='ml.m5.xlarge', endpoint_name='your-endpoint-name')


### Rollback Testing

We test the deployment guardrails by sending test traffic to the endpoint in two rollback-test scenarios: a failed deployment and a successful deployment.


In [None]:
# Simulating a failed deployment scenario
def invoke_endpoint(endpoint_name, max_invocations=600, wait_interval_sec=1, should_raise_exp=False,
                    data_path="test_X_numeric.csv", runtime_client=None):
    """Send rows of a CSV file to a SageMaker endpoint as test traffic.

    Each non-empty line of ``data_path`` is sent as one ``text/csv`` request.
    A ``.`` is printed for every successful invocation and an ``E`` for every
    failed one, so progress is visible while the traffic is running.

    Args:
        endpoint_name: Name of the endpoint to invoke.
        max_invocations: Upper bound on the number of requests attempted.
            Failed requests count towards this limit.
        wait_interval_sec: Seconds to pause between consecutive requests.
        should_raise_exp: When true, re-raise the first invocation error
            instead of recording it and carrying on.
        data_path: File whose lines are used as request payloads.
        runtime_client: Object exposing ``invoke_endpoint(EndpointName=...,
            ContentType=..., Body=...)``. When omitted, a boto3
            ``sagemaker-runtime`` client is created.

    Raises:
        Exception: Whatever the client raised, only if ``should_raise_exp``
            is true.
    """
    # Imported locally because the notebook's import cell does not provide it.
    import time

    if runtime_client is None:
        runtime_client = boto3.client("sagemaker-runtime")

    print(f"Sending test traffic to the endpoint {endpoint_name}. \nPlease wait...")

    attempts = 0
    with open(data_path, "r") as f:
        for row in f:
            # Check the cap before sending so that exactly max_invocations
            # requests are attempted, never one more.
            if attempts >= max_invocations:
                break
            payload = row.rstrip("\n")
            if not payload:
                # A blank line carries no CSV record; skip it rather than
                # send an empty request body.
                continue
            if attempts:
                # Pace the traffic between requests only, so there is no
                # trailing sleep after the last one.
                time.sleep(wait_interval_sec)
            try:
                response = runtime_client.invoke_endpoint(
                    EndpointName=endpoint_name, ContentType="text/csv", Body=payload
                )
                response["Body"].read()
                print(".", end="", flush=True)
            except Exception:
                # Best-effort traffic generator: record the failure and keep
                # going unless the caller asked for errors to propagate.
                print("E", end="", flush=True)
                if should_raise_exp:
                    raise  # bare raise keeps the original traceback
            attempts += 1

    print("\nDone!")

# Drive test traffic at the canary endpoint; all other arguments keep their
# default values.
invoke_endpoint(
    endpoint_name="DEMO-Deployment-Guardrails-Canary-2024-04-15-18-01-04",
)


In [None]:
# Successful deployment scenario: send the same test traffic to the canary
# endpoint, again with the default invocation limit and pacing.
invoke_endpoint(
    endpoint_name="DEMO-Deployment-Guardrails-Canary-2024-04-15-18-01-04",
)


### Inference

Finally, we send a payload to the deployed endpoint to perform inference and obtain predictions.


# Create a SageMaker runtime client and send a single CSV payload to the
# deployed endpoint.
runtime = boto3.client('sagemaker-runtime')
response = runtime.invoke_endpoint(
    EndpointName='your-endpoint-name',
    ContentType='text/csv',
    Body='your-csv-data-here',
)

# Read the response body and print exactly what read() returns.
print(response['Body'].read())
