# K-Mer research with XGBoost
Uses k-mer counts as input to the XGBoost algorithm to predict MICs (minimum inhibitory concentrations).

In [1]:
# Install dependencies
import sys

!{sys.executable} -m pip install seaborn
!{sys.executable} -m pip install sagemaker==2.46.0

  from cryptography.utils import int_from_bytes
  from cryptography.utils import int_from_bytes
You should consider upgrading via the '/opt/conda/bin/python -m pip install --upgrade pip' command.[0m
  from cryptography.utils import int_from_bytes
  from cryptography.utils import int_from_bytes
Collecting sagemaker==2.46.0
  Using cached sagemaker-2.46.0-py2.py3-none-any.whl
Installing collected packages: sagemaker
  Attempting uninstall: sagemaker
    Found existing installation: sagemaker 2.49.1
    Uninstalling sagemaker-2.49.1:
      Successfully uninstalled sagemaker-2.49.1
Successfully installed sagemaker-2.46.0
You should consider upgrading via the '/opt/conda/bin/python -m pip install --upgrade pip' command.[0m


In [2]:
import sagemaker
import boto3
import pandas as pd

%matplotlib inline

import matplotlib.pyplot as plt
import matplotlib
import seaborn as sns

matplotlib.style.use("ggplot")

# AWS region of the current boto3 session; used to look up the estimator container image.
region = boto3.Session().region_name

# IAM role used to execute the SageMaker jobs started from this notebook.
iam_role = sagemaker.get_execution_role()

# Number of instances to train on
instance_count = 5

# Type of EC2 instance to train on
# (It is recommended to use m5 types rather than c5 types due to m5 having more memory since Machine Learning can be more Memory than CPU bound)
# (The training data is 56GB, so we need an instance that has at least that amount of memory to fit the whole dataset in memory at once)
# (XGBoost requires that the whole training dataset is in memory to train)
# (This instance has 64GB of memory)
instance_type = "ml.m5.4xlarge"

# Misspelled alias of `instance_type`, kept because the estimator cell of this
# notebook still reads the training instance type under this name.
isntance_type = instance_type

# Type of EC2 instance to deploy and test on
# (This does not need as much memory since we will only be running 1 sample through at a time)
deploy_instance_type = "ml.t2.medium"

# Bucket and key prefix where the training data lives and the output should go.
# Both values are placeholders and must be replaced before running the notebook.
bucket = "BUCKET FOR TRAINING DATA"
prefix = "PREFIX KEY FOR TRAINING DATA"

# Key prefix (inside `bucket`) under which all output files are written
output_path = f"{prefix}/output/"

In [3]:
# Echo the resolved execution context so a wrong role or region is noticed early.
# The role ARN embeds the AWS account id, so review this cell's output before
# sharing the notebook.
print(
    f"Role being used: {iam_role}",
    f"Region being used: {region}",
    sep="\n",
)

Role being used: arn:aws:iam::076069858788:role/JMI-Sagemaker-Notebook-Service-Role
Region being used: us-east-2


# Data
The data should already have been processed by the data flow job and exported to the S3 bucket.
Here we set up the S3 paths and content type that are passed to the training inputs.

In [4]:
# Set below variable to True if there is a new dataset and it will be loaded from S3, shuffle/split into training and validation, and reuploaded unzipped.
setup_train_files = False
if setup_train_files:
    print("Downloading zip file...")
    boto3.resource("s3").Bucket(bucket).download_file(f"{prefix}/train.zip", "train.zip")
    print("zip file downloaded")
    
    print("Unzipping file...")
    !unzip train.zip
    print("file unzipped")
    
    print("Splitting into 5 files")
    # Found from: https://stackoverflow.com/a/20622193/9659107 (combined with a comment from that post)
    !split -l$((`wc -l < train.libsvm`/5)) train.libsvm train- --verbose -da 1 --additional-suffix=".libsvm"
    print("Split done")
    
    print("Uploading files to S3")
    for i in range(5):
        print(f"Uploading training file {i}")
        boto3.client("s3").upload_file(f"train-{i}.libsvm", bucket, f"{prefix}/train-{i}.libsvm")
        print(f"Training file {i} uploaded")
        
    print("Training file split and upload successfully done")

In [5]:
# Both channels hold LIBSVM-formatted text.
train_content_type = "libsvm"

# The training location is a key *prefix*: it matches every train-<n>.libsvm
# shard. The validation location is a single object.
processed_training_input_s3_path = f"s3://{bucket}/{prefix}/train-"
validation_input_s3_path = f"s3://{bucket}/{prefix}/validation.libsvm"

# Training channel: every object under the prefix, distributed with
# "ShardedByS3Key" (see the SageMaker TrainingInput docs for the exact semantics).
train_input = sagemaker.inputs.TrainingInput(
    s3_data=processed_training_input_s3_path,
    distribution="ShardedByS3Key",
    content_type=train_content_type,
    s3_data_type="S3Prefix",
)

# Validation channel: distribution and data type are left at the library defaults.
validation_input = sagemaker.inputs.TrainingInput(
    s3_data=validation_input_s3_path,
    content_type=train_content_type,
)



# Estimator and Training
Now we create the XGBoost estimator and train it using the data and container from above.

In [6]:
# Look up the XGBoost container image for the current region.
# NOTE(review): "latest" does not pin a container version, and newer XGBoost
# releases rename the "reg:linear" objective to "reg:squarederror" -- confirm
# which version this tag resolves to before changing either value.
container = sagemaker.image_uris.retrieve("xgboost", region, "latest")

# Hyperparameters are passed to the container as strings.
hyperparameters = dict(
    max_depth="8",
    eta="0.125",
    subsample="1",
    objective="reg:linear",
    num_round="50",
    tree_method="approx",
)

# Estimator describing the training job: which image to run, on what hardware,
# with which role, and where the model artifacts are written.
estimator = sagemaker.estimator.Estimator(
    image_uri=container,
    role=iam_role,
    hyperparameters=hyperparameters,
    instance_type=isntance_type,
    instance_count=instance_count,
    volume_size=64,  # 64 GB (Needs to be larger than dataset since dataset will be stored in volume)
    base_job_name=f"{prefix}",
    output_path=f"s3://{bucket}/{output_path}",
)

In [7]:
%%time
estimator.fit({'train': train_input, "validation": validation_input})

2021-07-27 12:01:48 Starting - Starting the training job...
2021-07-27 12:01:50 Starting - Launching requested ML instancesProfilerReport-1627387308: InProgress
...
2021-07-27 12:02:40 Starting - Preparing the instances for training.........
2021-07-27 12:04:14 Downloading - Downloading input data..........................................
2021-07-27 12:11:17 Training - Downloading the training image..[32mArguments: train[0m
[33mArguments: train[0m
[33m[2021-07-27:12:11:32:INFO] Running distributed xgboost training.[0m
[34mArguments: train[0m
[34m[2021-07-27:12:11:32:INFO] Running distributed xgboost training.[0m
[32m[2021-07-27:12:11:32:INFO] Running distributed xgboost training.[0m
[34m[2021-07-27:12:11:36:INFO] Number of hosts: 5, master IP address: 10.0.222.248, host IP address: 10.0.222.248.[0m
[34m[2021-07-27:12:11:36:INFO] Finished Yarn configuration files setup.
[0m
[36mArguments: train[0m
[36m[2021-07-27:12:11:33:INFO] Running distributed xgboost training.[0

# Deploy
Now that the model has been trained, we can deploy it to an endpoint instance. This makes the model easier to look up later and lets us test against the endpoint if desired.

In [8]:
%%time
estimator_predictor = estimator.deploy(initial_instance_count=1, instance_type=deploy_instance_type)

----------------!CPU times: user 243 ms, sys: 16.8 ms, total: 260 ms
Wall time: 8min 2s
