# Setting up the Environment for SageMaker

In [4]:
import sagemaker
from sklearn.model_selection import train_test_split
import boto3
import pandas as pd
import io

In [None]:
# SageMaker session for this notebook; the AWS region is read from it
sess = sagemaker.Session()
region = sess.boto_session.region_name

# Low-level boto3 client for the SageMaker service
sm_boto3 = boto3.client("sagemaker")

# Location of the raw dataset in S3
bucket = 'Enter-Your-Bucket-Name'  # name of the S3 bucket you created
file_key = 'mob_price_classification_train.csv'  # object key (path) inside that bucket
print("Using bucket " + bucket)

Using bucket dapc-sagemaker-bucket


# Reading the Data from S3 Bucket

In [None]:
# Create an S3 object URI from the bucket and key configured in the setup cell,
# so the bucket name only has to be edited in one place.
s3_uri = f's3://{bucket}/{file_key}'
print("Using S3 URI:", s3_uri)

Using S3 URI: s3://dapc-sagemaker-bucket/mob_price_classification_train.csv


In [14]:
# Read the CSV behind the S3 URI into a DataFrame, then preview the first rows
df = pd.read_csv(filepath_or_buffer=s3_uri)
df.head()

Unnamed: 0,battery_power,blue,clock_speed,dual_sim,fc,four_g,int_memory,m_dep,mobile_wt,n_cores,...,px_height,px_width,ram,sc_h,sc_w,talk_time,three_g,touch_screen,wifi,price_range
0,842,0,2.2,0,1,0,7,0.6,188,2,...,20,756,2549,9,7,19,0,0,1,1
1,1021,1,0.5,1,0,1,53,0.7,136,3,...,905,1988,2631,17,3,7,1,1,0,2
2,563,1,0.5,1,2,1,41,0.9,145,5,...,1263,1716,2603,11,2,9,1,1,0,2
3,615,1,2.5,0,0,0,10,0.8,131,6,...,1216,1786,2769,16,8,11,1,0,0,2
4,1821,1,1.2,0,13,1,44,0.6,141,2,...,1208,1212,1411,8,2,15,1,1,0,1


# Pre-Processing the Data

In [19]:
# Collect every column name of the dataset (the label column is still included here)
features = df.columns.tolist()
features

['battery_power',
 'blue',
 'clock_speed',
 'dual_sim',
 'fc',
 'four_g',
 'int_memory',
 'm_dep',
 'mobile_wt',
 'n_cores',
 'pc',
 'px_height',
 'px_width',
 'ram',
 'sc_h',
 'sc_w',
 'talk_time',
 'three_g',
 'touch_screen',
 'wifi',
 'price_range']

In [20]:
# Identify the label column: it is the last column of the dataset.
# Both names are derived from df.columns rather than popped off `features`,
# so re-running this cell cannot remove a second column from the feature list.
label = df.columns[-1]
features = list(df.columns[:-1])
label

'price_range'

In [21]:
# Split the dataset column-wise: x holds the feature columns, y the label column
x, y = df[features], df[label]

In [22]:
# Report the dimensions of the feature frame
print("Shape of features dataframes", x.shape)
# Preview the features of the first five records
x.head(5)

Shape of features dataframes (2000, 20)


Unnamed: 0,battery_power,blue,clock_speed,dual_sim,fc,four_g,int_memory,m_dep,mobile_wt,n_cores,pc,px_height,px_width,ram,sc_h,sc_w,talk_time,three_g,touch_screen,wifi
0,842,0,2.2,0,1,0,7,0.6,188,2,2,20,756,2549,9,7,19,0,0,1
1,1021,1,0.5,1,0,1,53,0.7,136,3,6,905,1988,2631,17,3,7,1,1,0
2,563,1,0.5,1,2,1,41,0.9,145,5,6,1263,1716,2603,11,2,9,1,1,0
3,615,1,2.5,0,0,0,10,0.8,131,6,9,1216,1786,2769,16,8,11,1,0,0
4,1821,1,1.2,0,13,1,44,0.6,141,2,14,1208,1212,1411,8,2,15,1,1,0


In [23]:
# Preview the labels of the first five records
y.head(5)

0    1
1    2
2    2
3    2
4    1
Name: price_range, dtype: int64

# Performing the Train Test Split

In [24]:
# Hold out 15% of the rows for testing; the fixed random_state makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    x, y, test_size=0.15, random_state=0
)

# Print the shape of each split: train/test features first, then train/test labels
for split in (X_train, X_test, y_train, y_test):
    print(split.shape)

(1700, 20)
(300, 20)
(1700,)
(300,)


In [25]:
# Create Train and Test dataframes (features + label) to be stored for further use.
# DataFrame.assign returns a new frame, so the label column is appended without
# writing into X_train / X_test. pd.DataFrame(X_train) does not copy DataFrame
# input by default, so the previous version could end up modifying the split
# frames themselves; this form also keeps the cell safe to re-run.
trainX = X_train.assign(**{label: y_train})

testX = X_test.assign(**{label: y_test})

# Print shape of new dataframes
print("Shape of Train dataframe",trainX.shape)
print("Shape of Test dataframe",testX.shape)

Shape of Train dataframe (1700, 21)
Shape of Test dataframe (300, 21)


In [26]:
# Write both splits to local CSV files, leaving out the DataFrame index
for frame, csv_name in ((trainX, "train-V-1.csv"), (testX, "test-V-1.csv")):
    frame.to_csv(csv_name, index=False)

# Data Ingestion: Send the Train and Test CSV files to S3 bucket

In [27]:
# Display the name of the S3 bucket configured in the setup cell
bucket

'dapc-sagemaker-bucket'

In [28]:
# Upload both CSV files to S3; SageMaker takes its training data from there
sk_prefix = "sagemaker/mobile_price_classification/sklearncontainer"

# Train file first, then test file, both under the same key prefix
trainpath, testpath = (
    sess.upload_data(path=csv_name, bucket=bucket, key_prefix=sk_prefix)
    for csv_name in ("train-V-1.csv", "test-V-1.csv")
)

# Show the values returned by upload_data for each file
print(trainpath)
print(testpath)

s3://dapc-sagemaker-bucket/sagemaker/mobile_price_classification/sklearncontainer/train-V-1.csv
s3://dapc-sagemaker-bucket/sagemaker/mobile_price_classification/sklearncontainer/test-V-1.csv


# Creating a script.py based on the SageMaker documentation

In [29]:
%%writefile script.py

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, precision_score, recall_score, f1_score, roc_curve, auc
import sklearn
import joblib
import boto3
import pathlib
from io import StringIO 
import argparse
import joblib
import os
import numpy as np
import pandas as pd

# Loading the model    
def model_fn(model_dir):
    """Load the persisted classifier from ``<model_dir>/model.joblib``.

    Args:
        model_dir: Directory that contains the ``model.joblib`` artifact.

    Returns:
        The object deserialized by ``joblib.load``.
    """
    artifact_path = os.path.join(model_dir, "model.joblib")
    return joblib.load(artifact_path)

# Training entry point: this section runs only when script.py is executed directly
if __name__ == "__main__":

    print("[INFO] Extracting arguments")
    arg_parser = argparse.ArgumentParser()

    # Random Forest hyperparameters sent by the client arrive as command-line arguments
    for flag, fallback in (("--n_estimators", 100), ("--random_state", 0)):
        arg_parser.add_argument(flag, type=int, default=fallback)

    # Model and data directories; each defaults to the value of an environment variable
    for flag, env_name in (
        ("--model-dir", "SM_MODEL_DIR"),
        ("--train", "SM_CHANNEL_TRAIN"),
        ("--test", "SM_CHANNEL_TEST"),
    ):
        arg_parser.add_argument(flag, type=str, default=os.environ.get(env_name))

    # Names of the CSV files expected inside the train / test directories
    arg_parser.add_argument("--train-file", type=str, default="train-V-1.csv")
    arg_parser.add_argument("--test-file", type=str, default="test-V-1.csv")

    # parse_known_args ignores any extra arguments this script does not declare
    args, _ = arg_parser.parse_known_args()

    print("SKLearn Version: ", sklearn.__version__)
    print("Joblib Version: ", joblib.__version__)

    print("[INFO] Reading data")
    print()
    train_csv = os.path.join(args.train, args.train_file)
    train_df = pd.read_csv(train_csv)
    test_csv = os.path.join(args.test, args.test_file)
    test_df = pd.read_csv(test_csv)

    # The last column of the training file is the label; all columns before it are features
    all_columns = list(train_df.columns)
    features, label = all_columns[:-1], all_columns[-1]

    print("Building training and testing datasets")
    print()
    X_train, y_train = train_df[features], train_df[label]
    X_test, y_test = test_df[features], test_df[label]

    print('Column order: ')
    print(features)
    print()

    print("Label column is: ", label)
    print()

    print("Data Shape: ")
    print()
    print("---- SHAPE OF TRAINING DATA (85%) ----")
    print(X_train.shape)
    print(y_train.shape)
    print()
    print("---- SHAPE OF TESTING DATA (15%) ----")
    print(X_test.shape)
    print(y_test.shape)
    print()

    print("Training RandomForest Model.....")
    print()
    forest = RandomForestClassifier(
        n_estimators=args.n_estimators,
        random_state=args.random_state,
        verbose=3,
        n_jobs=-1,
    )
    forest.fit(X_train, y_train)
    print()

    # Persist the fitted model under the file name that model_fn loads
    model_path = os.path.join(args.model_dir, "model.joblib")
    joblib.dump(forest, model_path)
    print("Model persisted at " + model_path)
    print()

    # Evaluate the fitted model on the test split
    test_predictions = forest.predict(X_test)
    test_acc = accuracy_score(y_test, test_predictions)
    test_rep = classification_report(y_test, test_predictions)

    print()
    print("---- METRICS RESULTS FOR TESTING DATA ----")
    print()
    print("Total Rows are: ", X_test.shape[0])
    print('[TESTING] Model Accuracy is: ', test_acc)
    print('[TESTING] Testing Report: ')
    print(test_rep)

Writing script.py


# SageMaker: Utilizing script.py

In [33]:
# SageMaker's built-in scikit-learn estimator class
from sagemaker.sklearn.estimator import SKLearn

FRAMEWORK_VERSION = "0.23-1"

# All estimator settings collected in one mapping before the estimator is built
estimator_settings = dict(
    # training script written by the %%writefile cell above
    entry_point="script.py",
    # ARN of a SageMaker role (the ARN of a user does not work)
    role="arn:aws:iam:::role/DAPC-SageMaker",
    # number and type of instances used for the training job
    instance_count=1,
    instance_type="ml.m5.large",
    # framework version taken from the documentation, declared above
    framework_version=FRAMEWORK_VERSION,
    # base name given to the training job
    base_job_name="RF-custom-sklearn",
    # hyperparameters forwarded to the Random Forest classifier in script.py
    hyperparameters=dict(n_estimators=100, random_state=0),
    # spot training together with its wait / run limits
    use_spot_instances=True,
    max_wait=7200,
    max_run=3600,
)

sklearn_estimator = SKLearn(**estimator_settings)

# Training the model on Sagemaker

In [None]:
# Start training, passing the uploaded train and test CSV locations as the "train" and "test" inputs
sklearn_estimator.fit(inputs={"train": trainpath, "test": testpath}, wait=True)