# Lab 3.5 Machine Learning Pipeline

created by ACZ 202402

In [6]:
bucket='c109190a2572288l5884042t1w891377298075-labbucket-aybmua9kvngb'

In [7]:
import warnings, requests, zipfile, io
warnings.simplefilter('ignore')
import pandas as pd
from scipy.io import arff

import os
import boto3
import sagemaker
from sagemaker.image_uris import retrieve
from sklearn.model_selection import train_test_split

import appropriate modules

In [8]:
f_zip = 'http://archive.ics.uci.edu/ml/machine-learning-databases/00212/vertebral_column_data.zip'
r = requests.get(f_zip, stream=True)
Vertebral_zip = zipfile.ZipFile(io.BytesIO(r.content))
Vertebral_zip.extractall()

data = arff.loadarff('column_2C_weka.arff')
df = pd.DataFrame(data[0])

# replace categorical data with numerical values
class_mapper = {b'Abnormal':1,b'Normal':0}
df['class']=df['class'].replace(class_mapper)

# adjust label column into first column
cols = df.columns.tolist()
cols = cols[-1:] + cols[:-1]
df = df[cols]

# split dataset into 80/10/10 scenario
train, test_and_validate = train_test_split(df, test_size=0.2, random_state=42, stratify=df['class'])
test, validate = train_test_split(test_and_validate, test_size=0.5, random_state=42, stratify=test_and_validate['class'])

prefix='lab3'

train_file='vertebral_train.csv'
test_file='vertebral_test.csv'
validate_file='vertebral_validate.csv'

# crate upload to S3 function
s3_resource = boto3.Session().resource('s3')
def upload_s3_csv(filename, folder, dataframe):
    csv_buffer = io.StringIO()
    dataframe.to_csv(csv_buffer, header=False, index=False )
    s3_resource.Bucket(bucket).Object(os.path.join(prefix, folder, filename)).put(Body=csv_buffer.getvalue())

# uload to S3 database
upload_s3_csv(train_file, 'train', train)
upload_s3_csv(test_file, 'test', test)
upload_s3_csv(validate_file, 'validate', validate)

container = retrieve('xgboost',boto3.Session().region_name,'1.0-1')

hyperparams={"num_round":"42",
             "eval_metric": "auc",
             "objective": "binary:logistic"}

s3_output_location="s3://{}/{}/output/".format(bucket,prefix)
xgb_model=sagemaker.estimator.Estimator(container,
                                       sagemaker.get_execution_role(),
                                       instance_count=1,
                                       instance_type='ml.m4.xlarge',
                                       output_path=s3_output_location,
                                        hyperparameters=hyperparams,
                                        sagemaker_session=sagemaker.Session())

train_channel = sagemaker.inputs.TrainingInput(
    "s3://{}/{}/train/".format(bucket,prefix,train_file),
    content_type='text/csv')

validate_channel = sagemaker.inputs.TrainingInput(
    "s3://{}/{}/validate/".format(bucket,prefix,validate_file),
    content_type='text/csv')

# comparing train and validation data
data_channels = {'train': train_channel, 'validation': validate_channel}

xgb_model.fit(inputs=data_channels, logs=False)

print('ready for hosting!')

INFO:botocore.credentials:Found credentials from IAM Role: BaseNotebookInstanceEc2InstanceRole
INFO:sagemaker.image_uris:Defaulting to only available Python version: py3
INFO:sagemaker.image_uris:Defaulting to only supported image scope: cpu.
INFO:sagemaker:Creating training-job with name: sagemaker-xgboost-2024-02-20-02-18-53-211



2024-02-20 02:18:53 Starting - Starting the training job.
2024-02-20 02:19:07 Starting - Preparing the instances for training...............
2024-02-20 02:20:28 Downloading - Downloading input data........
2024-02-20 02:21:13 Downloading - Downloading the training image.......
2024-02-20 02:21:54 Training - Training image download completed. Training in progress....
2024-02-20 02:22:14 Uploading - Uploading generated training model..
2024-02-20 02:22:30 Completed - Training job completed
ready for hosting!


In [9]:
xgb_predictor = xgb_model.deploy(initial_instance_count=1,
                serializer = sagemaker.serializers.CSVSerializer(),
                instance_type='ml.m4.xlarge')

INFO:sagemaker:Creating model with name: sagemaker-xgboost-2024-02-20-02-23-39-848
INFO:sagemaker:Creating endpoint-config with name sagemaker-xgboost-2024-02-20-02-23-39-848
INFO:sagemaker:Creating endpoint with name sagemaker-xgboost-2024-02-20-02-23-39-848


-----!

hosting the model

In [10]:
test.shape

(31, 7)

review test data info

In [11]:
test.head(5)

Unnamed: 0,class,pelvic_incidence,pelvic_tilt,lumbar_lordosis_angle,sacral_slope,pelvic_radius,degree_spondylolisthesis
136,1,88.024499,39.844669,81.774473,48.17983,116.601538,56.766083
230,0,65.611802,23.137919,62.582179,42.473883,124.128001,-4.083298
134,1,52.204693,17.212673,78.094969,34.99202,136.972517,54.939134
130,1,50.066786,9.12034,32.168463,40.946446,99.712453,26.766697
47,1,41.352504,16.577364,30.706191,24.775141,113.266675,-4.497958


In [15]:
row = test.iloc[0:1,1:]
# exclude label column for first row
row.head()

Unnamed: 0,pelvic_incidence,pelvic_tilt,lumbar_lordosis_angle,sacral_slope,pelvic_radius,degree_spondylolisthesis
136,88.024499,39.844669,81.774473,48.17983,116.601538,56.766083


In [16]:
batch_X_csv_buffer = io.StringIO()
row.to_csv(batch_X_csv_buffer, header=False, index=False)
test_row = batch_X_csv_buffer.getvalue()
print(test_row)

88.0244989,39.84466878,81.77447308,48.17983012,116.6015376,56.76608323



switch dataframe to csv string buffer

In [17]:
xgb_predictor.predict(test_row)

b'0.9966071844100952'

test result with probability score between 0-1<br>
in this case, the test used is more likely to be classified as Abnormal (1)

In [18]:
test.head(5)

Unnamed: 0,class,pelvic_incidence,pelvic_tilt,lumbar_lordosis_angle,sacral_slope,pelvic_radius,degree_spondylolisthesis
136,1,88.024499,39.844669,81.774473,48.17983,116.601538,56.766083
230,0,65.611802,23.137919,62.582179,42.473883,124.128001,-4.083298
134,1,52.204693,17.212673,78.094969,34.99202,136.972517,54.939134
130,1,50.066786,9.12034,32.168463,40.946446,99.712453,26.766697
47,1,41.352504,16.577364,30.706191,24.775141,113.266675,-4.497958


the test label from first row is indeed Abnormal (1)

In [19]:
row2 = test.iloc[1:5,1:]
row2.head()
batch_X_csv_buffer = io.StringIO()
row2.to_csv(batch_X_csv_buffer, header=False, index=False)
test_row2 = batch_X_csv_buffer.getvalue()
print(test_row2)

65.61180231,23.13791922,62.58217893,42.47388309,124.1280012,-4.083298414
52.20469309,17.21267289,78.09496877,34.9920202,136.9725168,54.93913416
50.06678595,9.120340183,32.16846267,40.94644577,99.71245318,26.76669655
41.35250407,16.57736351,30.70619135,24.77514057,113.2666746,-4.497957556



In [20]:
xgb_predictor.predict(test_row2)

b'0.777283251285553,0.9946410655975342,0.993690013885498,0.9391394853591919'

2nd row is the only one with Normal label, hence has the lowest probability score between 4 dataset

In [21]:
xgb_predictor.delete_endpoint(delete_endpoint_config=True)

INFO:sagemaker:Deleting endpoint configuration with name: sagemaker-xgboost-2024-02-20-02-23-39-848
INFO:sagemaker:Deleting endpoint with name: sagemaker-xgboost-2024-02-20-02-23-39-848


termintating deployed model

In [22]:
batch_X = test.iloc[:,1:];
batch_X.head()

Unnamed: 0,pelvic_incidence,pelvic_tilt,lumbar_lordosis_angle,sacral_slope,pelvic_radius,degree_spondylolisthesis
136,88.024499,39.844669,81.774473,48.17983,116.601538,56.766083
230,65.611802,23.137919,62.582179,42.473883,124.128001,-4.083298
134,52.204693,17.212673,78.094969,34.99202,136.972517,54.939134
130,50.066786,9.12034,32.168463,40.946446,99.712453,26.766697
47,41.352504,16.577364,30.706191,24.775141,113.266675,-4.497958


In [23]:
batch_X_file='batch-in.csv'
upload_s3_csv(batch_X_file, 'batch-in', batch_X)

In [24]:
batch_output = "s3://{}/{}/batch-out/".format(bucket,prefix)
batch_input = "s3://{}/{}/batch-in/{}".format(bucket,prefix,batch_X_file)

xgb_transformer = xgb_model.transformer(instance_count=1,
                                       instance_type='ml.m4.xlarge',
                                       strategy='MultiRecord',
                                       assemble_with='Line',
                                       output_path=batch_output)

xgb_transformer.transform(data=batch_input,
                         data_type='S3Prefix',
                         content_type='text/csv',
                         split_type='Line')
xgb_transformer.wait()

INFO:sagemaker:Creating model with name: sagemaker-xgboost-2024-02-20-02-50-05-903
INFO:sagemaker:Creating transform job with name: sagemaker-xgboost-2024-02-20-02-50-06-563


.....................................[34m[2024-02-20:02:56:20:INFO] No GPUs detected (normal if no gpus installed)[0m
[34m[2024-02-20:02:56:20:INFO] No GPUs detected (normal if no gpus installed)[0m
[34m[2024-02-20:02:56:20:INFO] nginx config: [0m
[34mworker_processes auto;[0m
[34mdaemon off;[0m
[34mpid /tmp/nginx.pid;[0m
[34merror_log  /dev/stderr;[0m
[34mworker_rlimit_nofile 4096;[0m
[34mevents {
  worker_connections 2048;[0m
[34m}[0m
[34mhttp {
  include /etc/nginx/mime.types;
  default_type application/octet-stream;
  access_log /dev/stdout combined;
  upstream gunicorn {
    server unix:/tmp/gunicorn.sock;
  }
  server {
    listen 8080 deferred;
    client_max_body_size 0;
    keepalive_timeout 3;
    location ~ ^/(ping|invocations|execution-parameters) {
      proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
      proxy_set_header Host $http_host;
      proxy_redirect off;
      proxy_read_timeout 60s;
      proxy_pass http://gunicorn;
    }
    

In [25]:
s3 = boto3.client('s3')
obj = s3.get_object(Bucket=bucket, Key="{}/batch-out/{}".format(prefix,'batch-in.csv.out'))
target_predicted = pd.read_csv(io.BytesIO(obj['Body'].read()),sep=',',names=['class'])
target_predicted.head(5)

Unnamed: 0,class
0,0.996607
1,0.777283
2,0.994641
3,0.99369
4,0.939139


In [27]:
def binary_convert(x):
    threshold = 0.80
    if x > threshold:
        return 1
    else:
        return 0

target_predicted['binary'] = target_predicted['class'].apply(binary_convert)

print(target_predicted.head(10))
test.head(10)

      class  binary
0  0.996607       1
1  0.777283       0
2  0.994641       1
3  0.993690       1
4  0.939139       1
5  0.997396       1
6  0.991977       1
7  0.987518       1
8  0.993334       1
9  0.682776       0


Unnamed: 0,class,pelvic_incidence,pelvic_tilt,lumbar_lordosis_angle,sacral_slope,pelvic_radius,degree_spondylolisthesis
136,1,88.024499,39.844669,81.774473,48.17983,116.601538,56.766083
230,0,65.611802,23.137919,62.582179,42.473883,124.128001,-4.083298
134,1,52.204693,17.212673,78.094969,34.99202,136.972517,54.939134
130,1,50.066786,9.12034,32.168463,40.946446,99.712453,26.766697
47,1,41.352504,16.577364,30.706191,24.775141,113.266675,-4.497958
135,1,77.121344,30.349874,77.481083,46.77147,110.611148,82.093607
100,1,84.585607,30.361685,65.479486,54.223922,108.010218,25.118478
89,1,71.186811,23.896201,43.696665,47.29061,119.864938,27.283985
297,0,45.575482,18.759135,33.774143,26.816347,116.797007,3.13191
4,1,49.712859,9.652075,28.317406,40.060784,108.168725,7.918501


treshold used to determine if the predicted label is 0 or 1 based on its probability score

treshold value of 0.65 gives an accuracy of 80%<br>
treshold value of 0.80 gives an accuracy of 80%

In [28]:
# End