In [53]:
import boto3
import re
import pandas as pd
import numpy as np
import os
import sagemaker
from sagemaker import get_execution_role
from sagemaker.inputs import TrainingInput
from sagemaker.serializers import CSVSerializer

In [54]:
# S3 bucket and key prefix under which the training data and the model output are stored
bucket, prefix = 'windturbinebucket', 'xgboost'

# IAM role of this notebook; it is handed to the SageMaker estimator further down
role = get_execution_role()

In [55]:
# Download data set
!wget https://samick-virginia.s3.amazonaws.com/xgboost/data/wind_turbine_training_data.csv

--2022-05-21 03:24:58--  https://samick-virginia.s3.amazonaws.com/xgboost/data/wind_turbine_training_data.csv
Resolving samick-virginia.s3.amazonaws.com (samick-virginia.s3.amazonaws.com)... 52.216.164.27
Connecting to samick-virginia.s3.amazonaws.com (samick-virginia.s3.amazonaws.com)|52.216.164.27|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 30337871 (29M) [text/csv]
Saving to: ‘wind_turbine_training_data.csv’


2022-05-21 03:24:59 (45.4 MB/s) - ‘wind_turbine_training_data.csv’ saved [30337871/30337871]



In [56]:
# Cap DataFrame previews at 6 rows, then load the downloaded CSV and preview it
pd.set_option('display.max_rows', 6)
dataset = pd.read_csv('wind_turbine_training_data.csv')
dataset

Unnamed: 0,turbine_id,wind_speed,RPM_blade,oil_temperature,oil_level,temperature,humidity,vibrations_frequency,pressure,wind_direction,breakdown
0,3,80,61,39,34,33,26,1,77,3,0
1,10,85,78,36,28,35,43,15,62,2,1
2,7,47,31,31,23,46,62,15,32,1,0
...,...,...,...,...,...,...,...,...,...,...,...
999997,4,42,75,25,31,42,35,5,67,2,0
999998,3,48,75,47,10,85,63,7,72,2,1
999999,10,45,60,37,8,39,35,12,64,4,1


In [57]:
# Reform
# Drop the turbine identifier and move the 'breakdown' column to the front
# (presumably because the training container takes the first CSV column as the
# target -- confirm against the SageMaker XGBoost input-format docs).
# errors='ignore' keeps this cell re-runnable: on a second run 'turbine_id' is
# already gone and a plain drop would raise KeyError. Re-running leaves the
# frame unchanged.
features = dataset.drop(columns=['turbine_id', 'breakdown'], errors='ignore')
dataset = pd.concat([dataset['breakdown'], features], axis=1)
dataset

Unnamed: 0,breakdown,wind_speed,RPM_blade,oil_temperature,oil_level,temperature,humidity,vibrations_frequency,pressure,wind_direction
0,0,80,61,39,34,33,26,1,77,3
1,1,85,78,36,28,35,43,15,62,2
2,0,47,31,31,23,46,62,15,32,1
...,...,...,...,...,...,...,...,...,...,...
999997,0,42,75,25,31,42,35,5,67,2
999998,1,48,75,47,10,85,63,7,72,2
999999,1,45,60,37,8,39,35,12,64,4


In [58]:
# Shuffle with a fixed seed, then split 70% / 20% / 10% into train / validation / test.
# Positional iloc slices yield the same three partitions as
# np.split(shuffled, [train_end, validation_end]) without relying on NumPy
# splitting a DataFrame, which goes through a deprecated pandas code path.
shuffled = dataset.sample(frac=1, random_state=1729)
n_rows = len(dataset)
train_end = int(0.7 * n_rows)
validation_end = int(0.9 * n_rows)

train_data = shuffled.iloc[:train_end]
validation_data = shuffled.iloc[train_end:validation_end]
test_data = shuffled.iloc[validation_end:]

# Written without header and index: the files contain only the data rows
train_data.to_csv('train.csv', header=False, index=False)
validation_data.to_csv('validation.csv', header=False, index=False)

In [59]:
# Upload to S3
# S3 object keys always use '/' as the separator, so they are built explicitly
# rather than with os.path.join, which follows the local OS path convention.
# One bucket handle is shared by both uploads.
s3_bucket = boto3.Session().resource('s3').Bucket(bucket)
s3_bucket.Object('{}/data/train/train.csv'.format(prefix)).upload_file('train.csv')
s3_bucket.Object('{}/data/validation/validation.csv'.format(prefix)).upload_file('validation.csv')

# Assign the data path for SageMaker
# Both channel prefixes end with '/', so each one only matches objects inside
# its own folder and not sibling keys that merely start with the same text.
s3_input_train = TrainingInput(s3_data='s3://{}/{}/data/train/'.format(bucket, prefix), content_type='csv')
s3_input_validation = TrainingInput(s3_data='s3://{}/{}/data/validation/'.format(bucket, prefix), content_type='csv')

In [60]:
# Containers
# Resolve the built-in XGBoost 1.2-1 image for whichever region this notebook
# runs in instead of hard-coding the us-east-1 registry. The result is still
# exposed as a {region: image_uri} mapping, so lookups of the form
# containers[boto3.Session().region_name] keep working.
region = boto3.Session().region_name
containers = {
    region: sagemaker.image_uris.retrieve('xgboost', region, version='1.2-1')
}

# Create the SageMaker session
sess = sagemaker.Session()

In [61]:
# Build the estimator: one ml.m5.xlarge training instance running the XGBoost
# image for the current region, with output stored under <prefix>/model in the bucket
training_image = containers[boto3.Session().region_name]
model_output_path = 's3://{}/{}/model'.format(bucket, prefix)

xgb = sagemaker.estimator.Estimator(
    image_uri=training_image,
    role=role,
    instance_count=1,
    instance_type='ml.m5.xlarge',
    output_path=model_output_path,
    sagemaker_session=sess,
)

In [62]:
# Set the booster hyperparameters, then start training on the train and validation channels
hyperparameters = {
    'eta': 0.1,
    'objective': 'binary:logistic',
    'num_round': 25,
}
xgb.set_hyperparameters(**hyperparameters)

data_channels = {'train': s3_input_train, 'validation': s3_input_validation}
xgb.fit(data_channels)

2022-05-21 03:25:07 Starting - Starting the training job...
2022-05-21 03:25:31 Starting - Preparing the instances for trainingProfilerReport-1653103507: InProgress
.........
2022-05-21 03:26:58 Downloading - Downloading input data...
2022-05-21 03:27:18 Training - Downloading the training image...
2022-05-21 03:28:03 Training - Training image download completed. Training in progress..[34m[2022-05-21 03:28:06.354 ip-10-2-70-60.ec2.internal:1 INFO utils.py:27] RULE_JOB_STOP_SIGNAL_FILENAME: None[0m
[34mINFO:sagemaker-containers:Imported framework sagemaker_xgboost_container.training[0m
[34mINFO:sagemaker-containers:Failed to parse hyperparameter objective value binary:logistic to Json.[0m
[34mReturning the value itself[0m
[34mINFO:sagemaker-containers:No GPUs detected (normal if no gpus installed)[0m
[34mINFO:sagemaker_xgboost_container.training:Running XGBoost Sagemaker in algorithm mode[0m
[34mINFO:root:Determined delimiter of CSV input is ','[0m
[34mINFO:root:Determine

In [63]:
# Deploy the trained model to a single-instance endpoint; requests are serialized as CSV
csv_serializer = CSVSerializer()
xgb_predictor = xgb.deploy(
    initial_instance_count=1,
    instance_type='ml.m5.xlarge',
    serializer=csv_serializer,
)

------!

In [64]:
# Show the name of the endpoint created by the deploy step
endpoint_name = xgb_predictor.endpoint_name
print(endpoint_name)

sagemaker-xgboost-2022-05-21-03-29-36-936
