In [1]:
import sagemaker
from sklearn.model_selection import train_test_split
import pandas as pd
import boto3

# Low-level SageMaker API client; used further down for describe_training_job
# and delete_endpoint.
sm_boto3 = boto3.client('sagemaker')
# High-level SageMaker SDK session; used further down to upload the CSVs to S3.
sess = sagemaker.Session()
# Region of the boto3 session that backs `sess`.
# NOTE(review): `region` is not referenced again in the visible cells.
region = sess.boto_session.region_name
# Bucket that receives the train/test CSV uploads.
bucket = 'ekagrasbucketforsagemaker'       # Should be unique
print(f'Using Bucket: {bucket}')

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/xdg-ubuntu/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/ekagra/.config/sagemaker/config.yaml
Using Bucket: ekagrasbucketforsagemaker


In [2]:
# Load the raw dataset from the local data/ directory and preview the first rows.
df = pd.read_csv('data/train.csv')
df.head()

Unnamed: 0,battery_power,blue,clock_speed,dual_sim,fc,four_g,int_memory,m_dep,mobile_wt,n_cores,...,px_height,px_width,ram,sc_h,sc_w,talk_time,three_g,touch_screen,wifi,price_range
0,842,0,2.2,0,1,0,7,0.6,188,2,...,20,756,2549,9,7,19,0,0,1,1
1,1021,1,0.5,1,0,1,53,0.7,136,3,...,905,1988,2631,17,3,7,1,1,0,2
2,563,1,0.5,1,2,1,41,0.9,145,5,...,1263,1716,2603,11,2,9,1,1,0,2
3,615,1,2.5,0,0,0,10,0.8,131,6,...,1216,1786,2769,16,8,11,1,0,0,2
4,1821,1,1.2,0,13,1,44,0.6,141,2,...,1208,1212,1411,8,2,15,1,1,0,1


In [3]:
# (rows, columns) of the loaded frame.
df.shape

(2000, 21)

In [4]:
# OPTIONAL: feature engineering would go here. For now only the class balance is
# checked, by showing the relative frequency of each price_range value.
df['price_range'].value_counts(normalize=True)

price_range
1    0.25
2    0.25
3    0.25
0    0.25
Name: proportion, dtype: float64

In [5]:
# Column names of the loaded frame.
df.columns

Index(['battery_power', 'blue', 'clock_speed', 'dual_sim', 'fc', 'four_g',
       'int_memory', 'm_dep', 'mobile_wt', 'n_cores', 'pc', 'px_height',
       'px_width', 'ram', 'sc_h', 'sc_w', 'talk_time', 'three_g',
       'touch_screen', 'wifi', 'price_range'],
      dtype='object')

In [6]:
# Fraction of missing values per column (0.0 means the column has no nulls).
df.isnull().mean()

battery_power    0.0
blue             0.0
clock_speed      0.0
dual_sim         0.0
fc               0.0
four_g           0.0
int_memory       0.0
m_dep            0.0
mobile_wt        0.0
n_cores          0.0
pc               0.0
px_height        0.0
px_width         0.0
ram              0.0
sc_h             0.0
sc_w             0.0
talk_time        0.0
three_g          0.0
touch_screen     0.0
wifi             0.0
price_range      0.0
dtype: float64

In [8]:
# Start from every column name; the label column is split off in the next cell.
features = list(df.columns)
features

['battery_power',
 'blue',
 'clock_speed',
 'dual_sim',
 'fc',
 'four_g',
 'int_memory',
 'm_dep',
 'mobile_wt',
 'n_cores',
 'pc',
 'px_height',
 'px_width',
 'ram',
 'sc_h',
 'sc_w',
 'talk_time',
 'three_g',
 'touch_screen',
 'wifi',
 'price_range']

In [9]:
# The label is the last column of df; every column before it is a feature.
# Both names are derived from df.columns instead of popping from `features`
# in place, so re-running this cell cannot strip an additional column.
label = df.columns[-1]
features = list(df.columns[:-1])
label

'price_range'

In [10]:
# Feature matrix (all columns listed in `features`) and target vector (`label` column).
x = df[features]
y = df[label]

In [11]:
# Preview the feature matrix.
x.head()

Unnamed: 0,battery_power,blue,clock_speed,dual_sim,fc,four_g,int_memory,m_dep,mobile_wt,n_cores,pc,px_height,px_width,ram,sc_h,sc_w,talk_time,three_g,touch_screen,wifi
0,842,0,2.2,0,1,0,7,0.6,188,2,2,20,756,2549,9,7,19,0,0,1
1,1021,1,0.5,1,0,1,53,0.7,136,3,6,905,1988,2631,17,3,7,1,1,0
2,563,1,0.5,1,2,1,41,0.9,145,5,6,1263,1716,2603,11,2,9,1,1,0
3,615,1,2.5,0,0,0,10,0.8,131,6,9,1216,1786,2769,16,8,11,1,0,0
4,1821,1,1.2,0,13,1,44,0.6,141,2,14,1208,1212,1411,8,2,15,1,1,0


In [12]:
# Preview the target values.
y.head()

0    1
1    2
2    2
3    2
4    1
Name: price_range, dtype: int64

In [13]:
# Absolute number of rows per target class.
y.value_counts()

price_range
1    500
2    500
3    500
0    500
Name: count, dtype: int64

In [14]:
# Hold out 15% of the rows for testing; random_state pins the shuffle so the
# split is the same on every run.
xtrain, xtest, ytrain, ytest = train_test_split(x, y, test_size=0.15, random_state=0)

In [15]:
# Report the dimensions of every split, one "<name>: <shape>" line each.
print('\n'.join(
    f'{name}: {part.shape}'
    for name, part in (
        ('xtrain', xtrain),
        ('ytrain', ytrain),
        ('xtest', xtest),
        ('ytest', ytest),
    )
))

xtrain: (1700, 20)
ytrain: (1700,)
xtest: (300, 20)
ytest: (300,)


In [16]:
# Re-attach the label to each split so the CSVs written later hold the feature
# columns followed by the label column. `assign` returns a new frame, so
# xtrain / xtest themselves are never modified and this cell is safe to re-run.
trainX = xtrain.assign(**{label: ytrain})
testX = xtest.assign(**{label: ytest})

In [17]:
# Show the dimensions of the combined train and test frames, one per line.
print(trainX.shape, testX.shape, sep='\n')

(1700, 21)
(300, 21)


In [19]:
# Persist both frames as CSV; index=False keeps the row index out of the files.
trainX.to_csv('data/train-V1.csv', index=False)
testX.to_csv('data/test-V1.csv', index=False)

In [20]:
# S3 key prefix under which both CSV files are stored.
sk_prefix = 'sagemaker/mobile_price_classification/sklearncontainer'

# Upload the train file first, then the test file; each call returns the S3 URI
# of the uploaded object.
trainpath, testpath = [
    sess.upload_data(path=local_csv, bucket=bucket, key_prefix=sk_prefix)
    for local_csv in ('data/train-V1.csv', 'data/test-V1.csv')
]

print(f'trainpath: {trainpath}')
print(f'testpath: {testpath}')

trainpath: s3://ekagrasbucketforsagemaker/sagemaker/mobile_price_classification/sklearncontainer/train-V1.csv
testpath: s3://ekagrasbucketforsagemaker/sagemaker/mobile_price_classification/sklearncontainer/test-V1.csv


In [29]:
%%writefile script.py
import argparse
import os
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, precision_score
import sklearn
import joblib
import boto3
import pathlib
import numpy as np
import pandas as pd
from io import StringIO

def model_fn(model_dir):
    """Load the persisted classifier.

    Args:
        model_dir: Directory that contains ``model.joblib``.

    Returns:
        The object deserialized from ``model.joblib``.
    """
    return joblib.load(os.path.join(model_dir, 'model.joblib'))


def resolve_channel_file(channel_dir, file_name):
    """Return the path of ``file_name`` inside ``channel_dir``.

    ``file_name`` may carry a directory prefix (the ``--train-file`` and
    ``--test-file`` defaults are ``data/...``). The joined path is preferred
    when it exists; otherwise a file with the same base name located directly
    in ``channel_dir`` is used. When neither exists the joined path is returned
    unchanged, so the caller's read fails with the usual "file not found" error.

    Args:
        channel_dir: Directory of the input channel.
        file_name: File name, optionally with a relative directory prefix.

    Returns:
        The path to read the file from.
    """
    joined = os.path.join(channel_dir, file_name)
    if os.path.exists(joined):
        return joined
    flat = os.path.join(channel_dir, os.path.basename(file_name))
    return flat if os.path.exists(flat) else joined


if __name__ == '__main__':

    parser = argparse.ArgumentParser()

    # hyperparameters sent by the client are passed as command-line arguments to the script.
    # NOTE: these three are parsed but never used by the RandomForest training below.
    parser.add_argument('--epochs', type=int, default=50)
    parser.add_argument('--batch-size', type=int, default=64)
    parser.add_argument('--learning-rate', type=float, default=0.05)

    # hyperparameters sent by client
    parser.add_argument('--n_estimators', type=int, default=100)
    parser.add_argument('--random_state', type=int, default=0)

    # Data, model, and output directories
    parser.add_argument('--output-data-dir', type=str, default=os.environ.get('SM_OUTPUT_DATA_DIR'))
    parser.add_argument('--model-dir', type=str, default=os.environ.get('SM_MODEL_DIR'))
    parser.add_argument('--train', type=str, default=os.environ.get('SM_CHANNEL_TRAIN'))
    parser.add_argument('--test', type=str, default=os.environ.get('SM_CHANNEL_TEST'))
    parser.add_argument('--train-file', type=str, default='data/train-V1.csv')
    parser.add_argument('--test-file', type=str, default='data/test-V1.csv')

    # parse_known_args ignores unrecognised flags instead of failing, so a
    # misspelled hyperparameter name silently falls back to the default above.
    args, _ = parser.parse_known_args()

    print(f'SKLearn Version: {sklearn.__version__}')
    print(f'Joblib Version: {joblib.__version__}')

    print('[INFO] Reading data\n')
    # The file-name defaults include a local "data/" prefix; resolve_channel_file
    # also accepts the files sitting directly in the channel directory.
    train_df = pd.read_csv(resolve_channel_file(args.train, args.train_file))
    test_df = pd.read_csv(resolve_channel_file(args.test, args.test_file))

    # The last column is the label; every column before it is a feature.
    features = list(train_df.columns)
    label = features.pop(-1)

    print('Building training and testing datasets\n')
    xtrain = train_df[features]
    ytrain = train_df[label]
    xtest = test_df[features]
    ytest = test_df[label]

    print('Training RandomForest Model...\n')
    model = RandomForestClassifier(n_estimators=args.n_estimators,
                                   random_state=args.random_state,
                                   verbose=1)
    model.fit(xtrain, ytrain)
    print()

    # Saved under the file name that model_fn loads.
    model_path = os.path.join(args.model_dir, 'model.joblib')
    joblib.dump(model, model_path)
    print(f'Model persisted at {model_path}')
    print()

    ypred_test = model.predict(xtest)
    test_acc = accuracy_score(ytest, ypred_test)
    test_rep = classification_report(ytest, ypred_test)

    print('\n---- METRICS RESULTS FOR TESTING DATA ----\n')
    print(f'Total rows are: {xtest.shape[0]}')
    print(f'[TESTING] Model accuracy is: {test_acc}')
    print(test_rep)

Overwriting script.py


In [46]:
from sagemaker.sklearn.estimator import SKLearn

# Version of the SageMaker scikit-learn container; reused when the model is created.
FRAMEWORK_VERSION = '0.23-1'

# Training job definition that runs script.py on a single spot instance.
sklearn_estimator = SKLearn(
    entry_point='script.py',
    role="arn:aws:iam::851725188949:role/SageMaker",
    instance_count=1,
    instance_type='ml.m5.large',
    framework_version=FRAMEWORK_VERSION,
    base_job_name='RF-custom-sklearn',
    # Keys must match the flags script.py declares (--n_estimators,
    # --random_state). The script parses with parse_known_args, so a
    # misspelled key is ignored and the script default is used instead.
    hyperparameters={
        "n_estimators": 100,
        "random_state": 0
    },
    use_spot_instances=True,
    # Time limits in seconds; max_wait also covers waiting for spot capacity.
    max_wait=7200,
    max_run=3600
)

In [47]:
# Launch the training job; wait=True blocks this cell until the job has finished.
sklearn_estimator.fit({"train": trainpath, "test": testpath}, wait=True)

INFO:sagemaker:Creating training-job with name: RF-custom-sklearn-2024-10-18-17-26-49-012


2024-10-18 17:26:51 Starting - Starting the training job...
2024-10-18 17:27:04 Starting - Preparing the instances for training...
2024-10-18 17:27:53 Downloading - Downloading the training image......
2024-10-18 17:28:44 Training - Training image download completed. Training in progress.
2024-10-18 17:28:44 Uploading - Uploading generated training model2024-10-18 17:28:39,040 sagemaker-containers INFO     Imported framework sagemaker_sklearn_container.training
2024-10-18 17:28:39,044 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)
2024-10-18 17:28:39,087 sagemaker_sklearn_container.training INFO     Invoking user training script.
2024-10-18 17:28:39,245 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)
2024-10-18 17:28:39,257 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)
2024-10-18 17:28:39,269 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)
2024-

In [48]:
# Block until the most recent training job is done, without streaming its logs.
sklearn_estimator.latest_training_job.wait(logs="None")

# Look up the S3 location of the model archive produced by that job.
job_description = sm_boto3.describe_training_job(
    TrainingJobName=sklearn_estimator.latest_training_job.name
)
artifact = job_description["ModelArtifacts"]["S3ModelArtifacts"]

print(f'Model artifact persisted at: {artifact}')


2024-10-18 17:28:57 Starting - Preparing the instances for training
2024-10-18 17:28:57 Downloading - Downloading the training image
2024-10-18 17:28:57 Training - Training image download completed. Training in progress.
2024-10-18 17:28:57 Uploading - Uploading generated training model
2024-10-18 17:28:57 Completed - Training job completed
Model artifact persisted at: s3://sagemaker-eu-central-1-851725188949/RF-custom-sklearn-2024-10-18-17-26-49-012/output/model.tar.gz


In [49]:
# S3 URI of the trained model archive.
artifact

's3://sagemaker-eu-central-1-851725188949/RF-custom-sklearn-2024-10-18-17-26-49-012/output/model.tar.gz'

In [51]:
from sagemaker.sklearn.model import SKLearnModel
from time import gmtime, strftime

# Model name: fixed prefix plus a UTC timestamp, so each run gets its own name.
model_name = strftime("Custom-sklearn-model-%Y-%m-%d-%H-%M-%S", gmtime())

# Wrap the trained artifact so it can be deployed; script.py supplies model_fn.
model = SKLearnModel(
    model_data=artifact,
    entry_point="script.py",
    framework_version=FRAMEWORK_VERSION,
    role="arn:aws:iam::851725188949:role/SageMaker",
    name=model_name,
)

In [52]:
# Show the model object that was just created.
model

<sagemaker.sklearn.model.SKLearnModel at 0x7f746eed6490>

In [53]:
# Endpoint deployment: the endpoint name is a fixed prefix plus a UTC timestamp.
endpoint_name = strftime("Custom-sklearn-model-%Y-%m-%d-%H-%M-%S", gmtime())
print(f'EndpointName: {endpoint_name}')

# Host the model on one instance; the returned predictor sends requests to it.
predictor = model.deploy(
    endpoint_name=endpoint_name,
    instance_type="ml.m4.xlarge",
    initial_instance_count=1,
)

EndpointName: Custom-sklearn-model-2024-10-18-17-41-37


INFO:sagemaker:Creating model with name: Custom-sklearn-model-2024-10-18-17-39-10
INFO:sagemaker:Creating endpoint-config with name Custom-sklearn-model-2024-10-18-17-41-37
INFO:sagemaker:Creating endpoint with name Custom-sklearn-model-2024-10-18-17-41-37


------!

In [55]:
# Name of the endpoint created above.
endpoint_name

'Custom-sklearn-model-2024-10-18-17-41-37'

In [56]:
# Send the feature values of the first two test rows to the endpoint as a list
# of lists. The payload is built once and reused; building it on a separate
# line without assigning it had no effect.
sample_rows = testX[features][0:2].values.tolist()
print(predictor.predict(sample_rows))

[3 0]


In [57]:
# Tear down the endpoint created above.
# NOTE(review): only the endpoint is deleted here; the model and endpoint-config
# that the deploy log shows being created are presumably still present. Confirm
# and remove them as well if they are no longer needed.
sm_boto3.delete_endpoint(EndpointName=endpoint_name)

{'ResponseMetadata': {'RequestId': '84d8b315-3ea0-48ed-90ee-c3e7916750fe',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amzn-requestid': '84d8b315-3ea0-48ed-90ee-c3e7916750fe',
   'content-type': 'application/x-amz-json-1.1',
   'date': 'Fri, 18 Oct 2024 17:47:29 GMT',
   'content-length': '0'},
  'RetryAttempts': 0}}