In [3]:
import warnings, requests, zipfile, io
warnings.simplefilter('ignore')
import pandas as pd
from scipy.io import arff
import boto3

In [4]:
zipzip = 'http://archive.ics.uci.edu/ml/machine-learning-databases/00212/vertebral_column_data.zip'
rr = requests.get(zipzip,stream=True)
Vertebra = zipfile.ZipFile(io.BytesIO(rr.content))
Vertebra.extractall()

In [5]:
data = arff.loadarff('column_2C_weka.arff')
df = pd.DataFrame(data[0])
df

Unnamed: 0,pelvic_incidence,pelvic_tilt,lumbar_lordosis_angle,sacral_slope,pelvic_radius,degree_spondylolisthesis,class
0,63.027817,22.552586,39.609117,40.475232,98.672917,-0.254400,b'Abnormal'
1,39.056951,10.060991,25.015378,28.995960,114.405425,4.564259,b'Abnormal'
2,68.832021,22.218482,50.092194,46.613539,105.985135,-3.530317,b'Abnormal'
3,69.297008,24.652878,44.311238,44.644130,101.868495,11.211523,b'Abnormal'
4,49.712859,9.652075,28.317406,40.060784,108.168725,7.918501,b'Abnormal'
...,...,...,...,...,...,...,...
305,47.903565,13.616688,36.000000,34.286877,117.449062,-4.245395,b'Normal'
306,53.936748,20.721496,29.220534,33.215251,114.365845,-0.421010,b'Normal'
307,61.446597,22.694968,46.170347,38.751628,125.670725,-2.707880,b'Normal'
308,45.252792,8.693157,41.583126,36.559635,118.545842,0.214750,b'Normal'


In [6]:
classmap = {b'Abnormal':1,b'Normal':0}
df['class'] = df['class'].replace(classmap)
df

Unnamed: 0,pelvic_incidence,pelvic_tilt,lumbar_lordosis_angle,sacral_slope,pelvic_radius,degree_spondylolisthesis,class
0,63.027817,22.552586,39.609117,40.475232,98.672917,-0.254400,1
1,39.056951,10.060991,25.015378,28.995960,114.405425,4.564259,1
2,68.832021,22.218482,50.092194,46.613539,105.985135,-3.530317,1
3,69.297008,24.652878,44.311238,44.644130,101.868495,11.211523,1
4,49.712859,9.652075,28.317406,40.060784,108.168725,7.918501,1
...,...,...,...,...,...,...,...
305,47.903565,13.616688,36.000000,34.286877,117.449062,-4.245395,0
306,53.936748,20.721496,29.220534,33.215251,114.365845,-0.421010,0
307,61.446597,22.694968,46.170347,38.751628,125.670725,-2.707880,0
308,45.252792,8.693157,41.583126,36.559635,118.545842,0.214750,0


In [7]:
df.shape

(310, 7)

In [8]:
df.columns

Index(['pelvic_incidence', 'pelvic_tilt', 'lumbar_lordosis_angle',
       'sacral_slope', 'pelvic_radius', 'degree_spondylolisthesis', 'class'],
      dtype='object')

In [10]:
cols = df.columns.tolist()
cols

['pelvic_incidence',
 'pelvic_tilt',
 'lumbar_lordosis_angle',
 'sacral_slope',
 'pelvic_radius',
 'degree_spondylolisthesis',
 'class']

In [11]:
cols = cols[-1:] + cols[:-1]
cols

['class',
 'pelvic_incidence',
 'pelvic_tilt',
 'lumbar_lordosis_angle',
 'sacral_slope',
 'pelvic_radius',
 'degree_spondylolisthesis']

In [12]:
df.columns

Index(['pelvic_incidence', 'pelvic_tilt', 'lumbar_lordosis_angle',
       'sacral_slope', 'pelvic_radius', 'degree_spondylolisthesis', 'class'],
      dtype='object')

In [13]:
from sklearn.model_selection import train_test_split
train, testval = train_test_split(df,test_size=0.2,random_state = 5,stratify=df['class'])
test,val = train_test_split(testval,test_size=0.5,random_state=30,stratify=testval['class'])

In [14]:
print(train.shape,test.shape,val.shape)

(248, 7) (31, 7) (31, 7)


In [15]:
train.value_counts()

pelvic_incidence  pelvic_tilt  lumbar_lordosis_angle  sacral_slope  pelvic_radius  degree_spondylolisthesis  class
26.147921         10.759454    14.000000              15.388468     125.203296     -10.093108                1        1
68.721910         49.431864    68.056012              19.290046     125.018517      54.691289                1        1
65.611802         23.137919    62.582179              42.473883     124.128001     -4.083298                 0        1
65.665347         10.540675    56.489135              55.124672     109.162777      53.932020                1        1
65.755679         9.832874     50.822895              55.922805     104.394959      39.307212                1        1
                                                                                                                     ..
50.825029         9.064729     56.300000              41.760300     78.999454       23.041524                1        1
50.912440         23.015169    47.000000     

In [16]:
test.value_counts()

pelvic_incidence  pelvic_tilt  lumbar_lordosis_angle  sacral_slope  pelvic_radius  degree_spondylolisthesis  class
33.041688         -0.324678    19.071075              33.366366     120.388611      9.354365                 0        1
67.289712          16.717514   51.000000              50.572198     137.591778      4.960344                 0        1
88.623908          29.089453   47.564262              59.534455     121.764780      51.805899                1        1
85.680950          38.650035   82.680977              47.030914     120.840707      61.959034                1        1
85.581710          30.457039   78.231379              55.124672     114.866049      68.376122                1        1
85.352315          15.844910   71.668660              69.507405     124.419787      76.020603                1        1
83.396606          34.310989   78.423293              49.085617     110.466516      49.672096                1        1
82.406524          29.276422   77.054565     

In [17]:
val.value_counts()

pelvic_incidence  pelvic_tilt  lumbar_lordosis_angle  sacral_slope  pelvic_radius  degree_spondylolisthesis  class
31.484218          7.826221    24.284818              23.657997     113.833145      4.393080                 1        1
58.828379          37.577873   125.742385             21.250506     135.629418      117.314683               1        1
84.998956          29.610098   83.352194              55.388858     126.912990      71.321175                1        1
84.974132          33.021175   60.859873              51.952957     125.659534      74.333409                1        1
81.754419          20.123466   70.560440              61.630954     119.425086      55.506889                1        1
80.074914          48.069531   52.403439              32.005383     110.709912      67.727316                1        1
74.005541          21.122402   57.379502              52.883139     120.205963      74.555166                1        1
72.955644          19.576971   61.007071     

In [18]:
bucket='c69403a1374438l3351577t1w030173793308-labbucket-1t9iagdjyv91g'

prefix='lab3'

train_file='vertebral_train.csv'
test_file='vertebral_test.csv'
validate_file='vertebral_validate.csv'

import os

s3_resource = boto3.Session().resource('s3')
def upload_s3_csv(filename, folder, dataframe):
    csv_buffer = io.StringIO()
    dataframe.to_csv(csv_buffer, header=False, index=False)
    s3_resource.Bucket(bucket).Object(os.path.join(prefix, folder, filename)).put(Body=csv_buffer.getvalue())

In [21]:
upload_s3_csv(train_file,'train',train)
upload_s3_csv(test_file,'test',test)
upload_s3_csv(validate_file,'validation',val)

In [22]:
import boto3
from sagemaker.image_uris import retrieve
container = retrieve('xgboost',boto3.Session().region_name,'1.0-1')

In [23]:
hyperparams = {
    "num_round":"42",
    "eval_metric":"auc",
    "objective":"binary:logistic"
}

In [24]:
import sagemaker
s3_output_location="s3://{}/{}/output/".format(bucket,prefix)
xgb_model=sagemaker.estimator.Estimator(container,
                                       sagemaker.get_execution_role(),
                                       instance_count=1,
                                       instance_type='ml.m4.xlarge',
                                       output_path=s3_output_location,
                                        hyperparameters=hyperparams,
                                        sagemaker_session=sagemaker.Session())

In [25]:
train_channel = sagemaker.inputs.TrainingInput(
    "s3://{}/{}/train/".format(bucket,prefix,train_file),
    content_type='text/csv')

validate_channel = sagemaker.inputs.TrainingInput(
    "s3://{}/{}/validate/".format(bucket,prefix,validate_file),
    content_type='text/csv')

data_channels = {'train': train_channel, 'validation': validate_channel}

In [26]:
xgb_model.fit(inputs=data_channels['train'],logs=False)

INFO:sagemaker:Creating training-job with name: sagemaker-xgboost-2023-01-01-16-42-40-631



2023-01-01 16:42:40 Starting - Starting the training job.....
2023-01-01 16:43:10 Starting - Preparing the instances for training....................
2023-01-01 16:44:57 Downloading - Downloading input data.......
2023-01-01 16:45:37 Training - Training image download completed. Training in progress....
2023-01-01 16:45:53 Uploading - Uploading generated training model
2023-01-01 16:45:59 Failed - Training job failed


UnexpectedStatusException: Error for Training job sagemaker-xgboost-2023-01-01-16-42-40-631: Failed. Reason: AlgorithmError: framework error: 
Traceback (most recent call last):
  File "/miniconda3/lib/python3.7/site-packages/sagemaker_containers/_trainer.py", line 84, in train
    entrypoint()
  File "/miniconda3/lib/python3.7/site-packages/sagemaker_xgboost_container/training.py", line 94, in main
    train(framework.training_env())
  File "/miniconda3/lib/python3.7/site-packages/sagemaker_xgboost_container/training.py", line 90, in train
    run_algorithm_mode()
  File "/miniconda3/lib/python3.7/site-packages/sagemaker_xgboost_container/training.py", line 56, in run_algorithm_mode
    train_path = os.environ[sm_env_constants.SM_CHANNEL_TRAIN]
  File "/miniconda3/lib/python3.7/os.py", line 681, in __getitem__
    raise KeyError(key) from None
KeyError: 'SM_CHANNEL_TRAIN'

'SM_CHANNEL_TRAIN', exit code: 1