In [1]:
import sagemaker
import sklearn
from sklearn.model_selection import train_test_split
import numpy as np
import pandas as pd
import boto3

In [3]:
# Create the SageMaker session first so the low-level client below can be
# bound to the same region instead of a hard-coded one.
sess = sagemaker.Session()
region = sess.boto_session.region_name

# Low-level SageMaker client, used later to describe the finished training job.
# NOTE(review): the lookup must run in the region where the job was created,
# which is presumably the session's region -- confirm for your account setup.
sm_boto3 = boto3.client("sagemaker", region_name=region)

# S3 bucket that receives the train/test CSV files.
bucket = 'mobilebbucketsagemaker'
print("Using bucket: ", bucket)

Using bucket:  mobilebbucketsagemaker


In [4]:
# Load the raw dataset from the notebook's working directory and preview
# its first five rows.
df = pd.read_csv(filepath_or_buffer="train.csv")
df.head(n=5)

Unnamed: 0,battery_power,blue,clock_speed,dual_sim,fc,four_g,int_memory,m_dep,mobile_wt,n_cores,...,px_height,px_width,ram,sc_h,sc_w,talk_time,three_g,touch_screen,wifi,price_range
0,842,0,2.2,0,1,0,7,0.6,188,2,...,20,756,2549,9,7,19,0,0,1,1
1,1021,1,0.5,1,0,1,53,0.7,136,3,...,905,1988,2631,17,3,7,1,1,0,2
2,563,1,0.5,1,2,1,41,0.9,145,5,...,1263,1716,2603,11,2,9,1,1,0,2
3,615,1,2.5,0,0,0,10,0.8,131,6,...,1216,1786,2769,16,8,11,1,0,0,2
4,1821,1,1.2,0,13,1,44,0.6,141,2,...,1208,1212,1411,8,2,15,1,1,0,1


In [5]:
# (rows, columns) of the loaded frame.
df.shape

(2000, 21)

In [6]:
# Number of missing values in each column (isna is the same check as isnull).
df.isna().sum()

battery_power    0
blue             0
clock_speed      0
dual_sim         0
fc               0
four_g           0
int_memory       0
m_dep            0
mobile_wt        0
n_cores          0
pc               0
px_height        0
px_width         0
ram              0
sc_h             0
sc_w             0
talk_time        0
three_g          0
touch_screen     0
wifi             0
price_range      0
dtype: int64

In [7]:
# Collect every column name of the frame as a plain Python list.
features = df.columns.tolist()
features

['battery_power',
 'blue',
 'clock_speed',
 'dual_sim',
 'fc',
 'four_g',
 'int_memory',
 'm_dep',
 'mobile_wt',
 'n_cores',
 'pc',
 'px_height',
 'px_width',
 'ram',
 'sc_h',
 'sc_w',
 'talk_time',
 'three_g',
 'touch_screen',
 'wifi',
 'price_range']

In [8]:
# The target is the last column of the frame; every other column is a feature.
# Both names are derived from df directly (instead of popping from the existing
# list) so that re-running this cell cannot strip a second column off
# `features` and silently change the label.
label = df.columns[-1]
features = list(df.columns[:-1])
label

'price_range'

In [9]:
# Split the frame into the feature matrix (x) and the target column (y).
x, y = df[features], df[label]

In [10]:
# Preview the first five rows of the feature matrix.
x.head(n=5)

Unnamed: 0,battery_power,blue,clock_speed,dual_sim,fc,four_g,int_memory,m_dep,mobile_wt,n_cores,pc,px_height,px_width,ram,sc_h,sc_w,talk_time,three_g,touch_screen,wifi
0,842,0,2.2,0,1,0,7,0.6,188,2,2,20,756,2549,9,7,19,0,0,1
1,1021,1,0.5,1,0,1,53,0.7,136,3,6,905,1988,2631,17,3,7,1,1,0
2,563,1,0.5,1,2,1,41,0.9,145,5,6,1263,1716,2603,11,2,9,1,1,0
3,615,1,2.5,0,0,0,10,0.8,131,6,9,1216,1786,2769,16,8,11,1,0,0
4,1821,1,1.2,0,13,1,44,0.6,141,2,14,1208,1212,1411,8,2,15,1,1,0


In [11]:
# Preview the first five target values.
y.head(n=5)

0    1
1    2
2    2
3    2
4    1
Name: price_range, dtype: int64

In [12]:
# Hold out 15% of the rows for testing; the fixed random_state keeps the
# split the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.15,
    random_state=0,
)

In [13]:
# Re-attach the label to each split so every CSV carries features + target.
# DataFrame.assign returns a new frame, leaving X_train / X_test untouched.
# The previous pd.DataFrame(X_train) wrapper does not copy a DataFrame input
# by default, so adding the label column to it could leak into the feature
# split it was built from.
trainX = X_train.assign(**{label: y_train})

testX = X_test.assign(**{label: y_test})

In [14]:
# Write both splits to the working directory as CSV, without the index column.
trainX.to_csv(path_or_buf="train-V-1.csv", index=False)
testX.to_csv(path_or_buf="test-V-1.csv", index=False)

In [16]:
# Key prefix under which both CSV files are stored in the bucket.
sk_predix = "sagemaker/mobile_price_classification/sklearncontainer"

# Upload the train file first, then the test file; upload_data returns the
# S3 location of each uploaded object.
trainpath, testpath = (
    sess.upload_data(path=csv_name, bucket=bucket, key_prefix=sk_predix)
    for csv_name in ("train-V-1.csv", "test-V-1.csv")
)

print(trainpath)
print(testpath)

s3://mobilebbucketsagemaker/sagemaker/mobile_price_classification/sklearncontainer/train-V-1.csv
s3://mobilebbucketsagemaker/sagemaker/mobile_price_classification/sklearncontainer/test-V-1.csv


In [29]:
%%writefile script.py

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score,confusion_matrix,precision_score,recall_score
import joblib
import boto3
import pathlib
from io import StringIO
import argparse
import os
import numpy as np
import pandas as pd

def model_fn(model_dir):
    """Load and return the classifier stored as ``model.joblib`` in ``model_dir``.

    NOTE(review): presumably called by the SageMaker scikit-learn serving
    container at model-load time -- confirm against the SDK documentation.
    """
    artifact_path = os.path.join(model_dir, "model.joblib")
    return joblib.load(artifact_path)

if __name__ =='__main__':
    # Training entry point: parse the job arguments, fit a random forest on
    # the train channel, persist it to the model directory and print the
    # accuracy on the test channel.

    parser = argparse.ArgumentParser()

    # hyperparameters sent by the client are passed as command-line arguments to the script.
    parser.add_argument('--n_estimators', type=int, default=100)
    parser.add_argument('--random_state', type=int, default=0)

    # input data and model directories (defaults come from the SM_* environment variables)
    parser.add_argument('--model-dir', type=str, default=os.environ['SM_MODEL_DIR'])
    parser.add_argument('--train', type=str, default=os.environ['SM_CHANNEL_TRAIN'])
    parser.add_argument('--test', type=str, default=os.environ['SM_CHANNEL_TEST'])
    # The CSV file names are literal defaults. They are NOT environment
    # variable names, so looking them up in os.environ raised KeyError.
    parser.add_argument('--train-file', type=str, default='train-V-1.csv')
    parser.add_argument('--test-file', type=str, default='test-V-1.csv')

    # parse_known_args ignores any extra arguments the script does not declare.
    args, _ = parser.parse_known_args()

    train_df = pd.read_csv(os.path.join(args.train, args.train_file))
    test_df = pd.read_csv(os.path.join(args.test, args.test_file))

    # The label is the last column of the training CSV; every column before
    # it is a feature. (The notebook's `df` does not exist inside this script.)
    features = list(train_df.columns)
    label = features.pop(-1)

    # Build the feature/target splits from the files that were just read.
    X_train = train_df[features]
    y_train = train_df[label]
    X_test = test_df[features]
    y_test = test_df[label]

    model = RandomForestClassifier(
        n_estimators=args.n_estimators,
        random_state=args.random_state,
        verbose=True,
    )
    model.fit(X_train, y_train)
    print()

    # Persist under the file name that model_fn loads.
    model_path = os.path.join(args.model_dir, "model.joblib")
    joblib.dump(model, model_path)
    print("Model persisted at" + model_path)
    print()

    # Evaluate on the held-out test file.
    y_pred = model.predict(X_test)
    acc = accuracy_score(y_test, y_pred)
    print("Accuracy", acc)

Writing script.py


In [30]:
from sagemaker.sklearn.estimator import SKLearn

# Version tag passed to the estimator as framework_version.
FRAMEWORK_VERSION = "0.23-1"

# Training-job definition: script.py on a single ml.m5.large spot instance.
# The hyperparameters mirror the --n_estimators / --random_state arguments
# that script.py declares.
sklearn_estimator = SKLearn(
    entry_point="script.py",
    role="arn:aws:iam::703301998095:role/Awssagemaker",
    framework_version=FRAMEWORK_VERSION,
    base_job_name="RF-custom-sklearn",
    instance_type="ml.m5.large",
    instance_count=1,
    hyperparameters=dict(n_estimators=100, random_state=0),
    use_spot_instances=True,
    max_run=3600,
    max_wait=7200,
)


In [None]:
# Launch the training job with the uploaded CSVs as the "train" and "test"
# channels, and block until it completes.
sklearn_estimator.fit(
    inputs={"train": trainpath, "test": testpath},
    wait=True,
)

In [None]:
# Wait on the most recent training job (with logs="None"), then read the S3
# location of its model artifact from the job description.
training_job = sklearn_estimator.latest_training_job
training_job.wait(logs="None")

job_description = sm_boto3.describe_training_job(TrainingJobName=training_job.name)
artifact = job_description["ModelArtifacts"]["S3ModelArtifacts"]
print(artifact)

In [None]:
from sagemaker.sklearn.model import SKLearnModel
from time import gmtime,strftime

# Model name = fixed prefix immediately followed by the current UTC timestamp.
model_name = f"Custom-sklearn-model{strftime('%Y-%m-%d-%H-%M-%S', gmtime())}"

# Wrap the trained artifact in a model object; script.py is passed again as
# the entry point and it defines model_fn for loading model.joblib.
model = SKLearnModel(
    name=model_name,
    model_data=artifact,
    role="arn:aws:iam::703301998095:role/Awssagemaker",
    entry_point="script.py",
    framework_version=FRAMEWORK_VERSION,
)

In [None]:
# Display the SKLearnModel object created in the previous cell.
model