In [7]:
import sagemaker
from sklearn.model_selection import train_test_split
import boto3 #used to connect the S3 bucket
import pandas as pd

con_boto3 = boto3.client("sagemaker")
sm_session = sagemaker.Session()
region = sm_session.boto_session.region_name
bucket = 'mobsagemakerbuck' # S3 bucket that is created
print("Bucket being used - "+ bucket)


Bucket being used - mobsagemakerbuck


### Read the Data File

In [8]:
data = pd.read_csv('train.csv')

In [9]:
data.head()

Unnamed: 0,battery_power,blue,clock_speed,dual_sim,fc,four_g,int_memory,m_dep,mobile_wt,n_cores,...,px_height,px_width,ram,sc_h,sc_w,talk_time,three_g,touch_screen,wifi,price_range
0,842,0,2.2,0,1,0,7,0.6,188,2,...,20,756,2549,9,7,19,0,0,1,1
1,1021,1,0.5,1,0,1,53,0.7,136,3,...,905,1988,2631,17,3,7,1,1,0,2
2,563,1,0.5,1,2,1,41,0.9,145,5,...,1263,1716,2603,11,2,9,1,1,0,2
3,615,1,2.5,0,0,0,10,0.8,131,6,...,1216,1786,2769,16,8,11,1,0,0,2
4,1821,1,1.2,0,13,1,44,0.6,141,2,...,1208,1212,1411,8,2,15,1,1,0,1


In [10]:
data.shape

(2000, 21)

In [11]:
data['price_range'].value_counts(normalize=True)

1    0.25
2    0.25
3    0.25
0    0.25
Name: price_range, dtype: float64

In [12]:
data.columns

Index(['battery_power', 'blue', 'clock_speed', 'dual_sim', 'fc', 'four_g',
       'int_memory', 'm_dep', 'mobile_wt', 'n_cores', 'pc', 'px_height',
       'px_width', 'ram', 'sc_h', 'sc_w', 'talk_time', 'three_g',
       'touch_screen', 'wifi', 'price_range'],
      dtype='object')

In [13]:
#checking percentage of missing values in data
data.isnull().mean()*100

battery_power    0.0
blue             0.0
clock_speed      0.0
dual_sim         0.0
fc               0.0
four_g           0.0
int_memory       0.0
m_dep            0.0
mobile_wt        0.0
n_cores          0.0
pc               0.0
px_height        0.0
px_width         0.0
ram              0.0
sc_h             0.0
sc_w             0.0
talk_time        0.0
three_g          0.0
touch_screen     0.0
wifi             0.0
price_range      0.0
dtype: float64

In [14]:
features = list(data.columns)
features

['battery_power',
 'blue',
 'clock_speed',
 'dual_sim',
 'fc',
 'four_g',
 'int_memory',
 'm_dep',
 'mobile_wt',
 'n_cores',
 'pc',
 'px_height',
 'px_width',
 'ram',
 'sc_h',
 'sc_w',
 'talk_time',
 'three_g',
 'touch_screen',
 'wifi',
 'price_range']

In [15]:
target = features[-1]

In [16]:
x = data[features]
y = data[target]

In [17]:
x.head()

Unnamed: 0,battery_power,blue,clock_speed,dual_sim,fc,four_g,int_memory,m_dep,mobile_wt,n_cores,...,px_height,px_width,ram,sc_h,sc_w,talk_time,three_g,touch_screen,wifi,price_range
0,842,0,2.2,0,1,0,7,0.6,188,2,...,20,756,2549,9,7,19,0,0,1,1
1,1021,1,0.5,1,0,1,53,0.7,136,3,...,905,1988,2631,17,3,7,1,1,0,2
2,563,1,0.5,1,2,1,41,0.9,145,5,...,1263,1716,2603,11,2,9,1,1,0,2
3,615,1,2.5,0,0,0,10,0.8,131,6,...,1216,1786,2769,16,8,11,1,0,0,2
4,1821,1,1.2,0,13,1,44,0.6,141,2,...,1208,1212,1411,8,2,15,1,1,0,1


In [18]:
y.head()

0    1
1    2
2    2
3    2
4    1
Name: price_range, dtype: int64

In [19]:
X_train, X_test, y_train, y_test = train_test_split(x,y, test_size=0.15, random_state=0)

In [20]:
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(1700, 21)
(300, 21)
(1700,)
(300,)


In [21]:
trainX = pd.DataFrame(X_train)
trainX[target] = y_train
testX = pd.DataFrame(X_test)
testX[target] = y_test

In [22]:
print(trainX.shape)
print(testX.shape)

(1700, 21)
(300, 21)


In [23]:
trainX.head()

Unnamed: 0,battery_power,blue,clock_speed,dual_sim,fc,four_g,int_memory,m_dep,mobile_wt,n_cores,...,px_height,px_width,ram,sc_h,sc_w,talk_time,three_g,touch_screen,wifi,price_range
1452,1450,0,2.1,0,1,0,31,0.6,114,5,...,1573,1639,794,11,5,9,0,1,1,1
1044,1218,1,2.8,1,3,0,39,0.8,150,7,...,1122,1746,1667,10,0,12,0,0,0,1
1279,1602,0,0.6,0,12,0,58,0.4,170,1,...,1259,1746,3622,17,2,17,0,1,1,3
674,1034,0,2.6,1,2,1,45,0.3,190,3,...,182,1293,969,15,1,7,1,0,0,0
1200,530,0,2.4,0,1,0,32,0.3,88,6,...,48,1012,959,17,7,6,0,1,0,0


In [24]:
trainX.isnull().sum()

battery_power    0
blue             0
clock_speed      0
dual_sim         0
fc               0
four_g           0
int_memory       0
m_dep            0
mobile_wt        0
n_cores          0
pc               0
px_height        0
px_width         0
ram              0
sc_h             0
sc_w             0
talk_time        0
three_g          0
touch_screen     0
wifi             0
price_range      0
dtype: int64

In [25]:
testX.isnull().sum()

battery_power    0
blue             0
clock_speed      0
dual_sim         0
fc               0
four_g           0
int_memory       0
m_dep            0
mobile_wt        0
n_cores          0
pc               0
px_height        0
px_width         0
ram              0
sc_h             0
sc_w             0
talk_time        0
three_g          0
touch_screen     0
wifi             0
price_range      0
dtype: int64

In [26]:
trainX.to_csv("train-V-01.csv",index = False)
testX.to_csv("test-V-01.csv",index = False)

In [27]:
# upload the train and test file to S3
sk_prefix = "sagemaker/mobile_price_classification/sklearncontainer"
trainpath = sm_session.upload_data(path="train-V-01.csv", bucket=bucket, key_prefix=sk_prefix)
testpath = sm_session.upload_data(path="test-V-01.csv", bucket=bucket, key_prefix=sk_prefix)
print(trainpath)
print(testpath)

s3://mobsagemakerbuck/sagemaker/mobile_price_classification/sklearncontainer/train-V-01.csv
s3://mobsagemakerbuck/sagemaker/mobile_price_classification/sklearncontainer/test-V-01.csv


In [28]:
#create a script.py
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, precision_score
import sklearn
import joblib
import boto3
import pathlib
from io import StringIO
import argparse
import os
import numpy as np
import pandas as pd

def model_fn(model_dir):
    clf = joblib.load(os.path.join(model_dir, "model.joblib"))
    return clf

if __name__ =='__main__':
    
    
    print("[INFO] Extracting Arguments")
    parser = argparse.ArgumentParser()

    # hyperparameters sent by the client are passed as command-line arguments to the script.
    parser.add_argument('--n_estimators', type=int, default=100)
    parser.add_argument('--random_state', type=int, default=0)

    # input data and model directories
    parser.add_argument('--model-dir', type=str, default=os.environ.get('SM_MODEL_DIR'))
    parser.add_argument('--train', type=str, default=os.environ.get('SM_CHANNEL_TRAIN'))
    parser.add_argument('--test', type=str, default=os.environ.get('SM_CHANNEL_TEST'))
    parser.add_argument('--train-file', type=str, default="train-V-01.csv")
    parser.add_argument('--test-file', type=str, default="test-V-01.csv")

    args, _ = parser.parse_known_args()
    
    print("SKlearn Version: ", sklearn.__version__)
    print("Joblib Version: ", joblib.__version__)
    
    print("[INFO] Reading Data")
    print()
    train_data = pd.read_csv(os.path.join(args.train, args.train_file))
    test_data = pd.read_csv(os.path.join(args.test, args.test_file))
    
    features = list(train_data.columns)
    target = features[-1]
    
    print("Building Training and Testing Datasets")
    print()
    X_train = train_data[features]
    X_test = test_data[features]
    y_train = train_data[target]
    y_test = test_data[target]
    
    print("Column Order: ")
    print(features)
    print()
    
    print("Target Column: ", target)
    print()
    
    print("Data Shape: ")
    print()
    
    print("SHAPE OF TRAINING DATA (85%) --->")
    print(X_train.shape)
    print(X_test.shape)
    print(y_train.shape)
    print(y_test.shape)
    print()
    
    
    print("Random Forest Model Building --->")
    print()
    
    model = RandomForestClassifier(n_estimators=args.n_estimators, random_state=args.random_state, verbose=0)
    model.fit(X_train, y_train)
    print()
    
    model_path = os.path.join(args.model_dir, "model.joblib")
    joblib.dump(model, model_path)
    
    print("Model is stored at "+model_path)
    print()
    
    y_pred = model.predict(X_test)
    test_accuracy = accuracy_score(y_test, y_pred)
    test_report = classification_report(y_test, y_pred)
    
    print()
    print("Performance Metrics for the Testing Data --->")
    print()
    print("Total rows are: ", X_test.shape[0])
    print("Model Test Accuracy: ", test_accuracy)
    print("Model Test Report: ")
    print(test_report)

[INFO] Extracting Arguments
SKlearn Version:  1.2.1
Joblib Version:  1.1.1
[INFO] Reading Data



TypeError: expected str, bytes or os.PathLike object, not NoneType

In [29]:
from sagemaker.sklearn.estimator import SKLearn

FRAMEWORK_VERSION = "0.23-1"
sklearn_estimator = SKLearn(
    entry_point="script.py",
    role="arn:aws:iam::673358967999:role/service-role/AmazonSageMaker-ExecutionRole-20230708T012797",
    instance_count=1,
    instance_type="ml.m5.xlarge",
    framework_version=FRAMEWORK_VERSION,
    base_job_name="RF-custom-sklearn",
    hyperparameters={
        "n_estimators":100,
        "random_state":0,
    },
    use_spot_instances = True,
    max_wait = 7200,
    max_run = 3600
    
) 

In [30]:
# launch training job, with asynchronous call

sklearn_estimator.fit({"train": trainpath, "test":testpath}, wait=True)

INFO:sagemaker:Creating training-job with name: RF-custom-sklearn-2023-07-08-15-33-36-848


Using provided s3_resource
2023-07-08 15:33:37 Starting - Starting the training job...
2023-07-08 15:33:50 Starting - Preparing the instances for training.........
2023-07-08 15:35:29 Downloading - Downloading input data...
2023-07-08 15:35:59 Training - Downloading the training image..[34m2023-07-08 15:36:20,065 sagemaker-containers INFO     Imported framework sagemaker_sklearn_container.training[0m
[34m2023-07-08 15:36:20,069 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)[0m
[34m2023-07-08 15:36:20,115 sagemaker_sklearn_container.training INFO     Invoking user training script.[0m
[34m2023-07-08 15:36:20,284 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)[0m
[34m2023-07-08 15:36:20,296 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)[0m
[34m2023-07-08 15:36:20,308 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)[0m
[34m2023-07-08 15:3

In [31]:
sklearn_estimator.latest_training_job.wait(logs="None")
artifact = con_boto3.describe_training_job(
TrainingJobName=sklearn_estimator.latest_training_job.name)["ModelArtifacts"]["S3ModelArtifacts"]

print("Model artifact persisted at "+artifact)


2023-07-08 15:36:35 Starting - Preparing the instances for training
2023-07-08 15:36:35 Downloading - Downloading input data
2023-07-08 15:36:35 Training - Training image download completed. Training in progress.
2023-07-08 15:36:35 Uploading - Uploading generated training model
2023-07-08 15:36:35 Completed - Training job completed
Model artifact persisted at s3://sagemaker-eu-west-2-673358967999/RF-custom-sklearn-2023-07-08-15-33-36-848/output/model.tar.gz


In [35]:
from sagemaker.sklearn.model import SKLearnModel
from time import gmtime, strftime

model_name = "Custom-sklearn-model" + strftime("%Y-%m-%d-%H-%M-%S",gmtime())
model = SKLearnModel(
    name = model_name,
    model_data=artifact,
    role="arn:aws:iam::673358967999:role/service-role/AmazonSageMaker-ExecutionRole-20230708T012797",
    entry_point="script.py",
    framework_version=FRAMEWORK_VERSION
)

In [36]:
model

<sagemaker.sklearn.model.SKLearnModel at 0x15aca3ac0>

In [37]:
#endpoint deployment
endpoint_name = "Custom-sklearn-model" + strftime("%Y-%m-%d-%H-%M-%S",gmtime())
print("EndpointName={}".format(endpoint_name))

predictor = model.deploy(
    initial_instance_count=1,
    instance_type="ml.m4.xlarge",
    endpoint_name=endpoint_name,
)

EndpointName=Custom-sklearn-model2023-07-08-15-47-52


INFO:sagemaker:Creating model with name: Custom-sklearn-model2023-07-08-15-47-42
INFO:sagemaker:Creating endpoint-config with name Custom-sklearn-model2023-07-08-15-47-52
INFO:sagemaker:Creating endpoint with name Custom-sklearn-model2023-07-08-15-47-52


-----!

In [39]:
predictor

<sagemaker.sklearn.model.SKLearnPredictor at 0x15b21e980>

In [42]:
testX[features][0:2].values.tolist()

[[1454.0,
  1.0,
  0.5,
  1.0,
  1.0,
  0.0,
  34.0,
  0.7,
  83.0,
  4.0,
  3.0,
  250.0,
  1033.0,
  3419.0,
  7.0,
  5.0,
  5.0,
  1.0,
  1.0,
  0.0,
  3.0],
 [1092.0,
  1.0,
  0.5,
  1.0,
  10.0,
  0.0,
  11.0,
  0.5,
  167.0,
  3.0,
  14.0,
  468.0,
  571.0,
  737.0,
  14.0,
  4.0,
  11.0,
  0.0,
  1.0,
  0.0,
  0.0]]

In [44]:
#predicting output based on the predictor created
print(predictor.predict(testX[features][0:2].values.tolist()))

[3 0]


In [45]:
con_boto3.delete_endpoint(EndpointName=endpoint_name)

{'ResponseMetadata': {'RequestId': '629d28a6-5fee-4f08-a222-d69277448c22',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amzn-requestid': '629d28a6-5fee-4f08-a222-d69277448c22',
   'content-type': 'application/x-amz-json-1.1',
   'content-length': '0',
   'date': 'Sat, 08 Jul 2023 15:56:03 GMT'},
  'RetryAttempts': 0}}