In [None]:
import sagemaker
from sklearn.model_selection import train_test_split
import boto3 #used to connect to s3 bucket
import pandas as pd

# Low-level SageMaker API client; later cells use it to look up the training
# job's model artifact and to delete the endpoint.
sm_boto3 = boto3.client("sagemaker")

# SageMaker session; it also exposes the underlying boto3 session.
sess = sagemaker.Session()
region = sess.boto_session.region_name

# S3 bucket that receives the train/test CSV files.
bucket = "bmmlsagemakerbucket"
print(f"Bucket Used = {bucket}")

In [None]:
# Load the raw dataset. The last column is treated as the target label and
# every other column as a feature (script.py makes the same assumption).
df = pd.read_csv("mobile_price_range_data.csv")

feats = list(df.columns)
label = feats.pop(-1)

# Class balance of the target. Printed explicitly because a bare expression in
# the middle of a cell is evaluated but never displayed.
print(df[label].value_counts())

x = df[feats]
y = df[label]

# 85/15 split with a fixed seed so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(x, y, train_size=0.85, random_state=0)
print("X_train.shape", X_train.shape)
print("y_train.shape", y_train.shape)
print("X_test.shape", X_test.shape)
print("y_test.shape", y_test.shape)

# Re-attach the label as the last column of each split. assign() returns a new
# frame, so X_train / X_test are left untouched (wrapping them in
# pd.DataFrame(...) does not guarantee an independent copy).
trainX = X_train.assign(**{label: y_train})
testX = X_test.assign(**{label: y_test})

# Write both splits to local CSV files (no index column).
trainX.to_csv("train-V-1.csv", index=False)
testX.to_csv("test-V-1.csv", index=False)

# Upload the local CSV files to S3, reusing `sess` and `bucket` from the setup
# cell instead of creating a second session.
sk_prefix = "sagemaker/mobile_price_classification/sklearncontainer"

trainpath = sess.upload_data(path="train-V-1.csv", bucket=bucket, key_prefix=sk_prefix)
testpath = sess.upload_data(path="test-V-1.csv", bucket=bucket, key_prefix=sk_prefix)

print(trainpath)
print(testpath)

In [None]:
%%writefile script.py

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score,classification_report,confusion_matrix,precision_score
import sklearn
import pandas as pd
import numpy as np
import joblib
import boto3
import pathlib 
from io import  StringIO
import argparse
import os

# function to load model
def model_fn(model_dir):
    """Load and return the model stored as ``model.joblib`` inside ``model_dir``.

    NOTE(review): presumably called by the SageMaker serving container, since
    this script is also passed as the entry point of the deployed model -
    confirm against the SageMaker scikit-learn container documentation.
    """
    artifact_path = os.path.join(model_dir, "model.joblib")
    return joblib.load(artifact_path)

# SageMaker needs some default arguments; they are read below from the command line and the SM_* environment variables.

if __name__ == '__main__':
    # Training entry point: parse arguments, load the train/test CSV files,
    # fit a random forest, save it as model.joblib and report test metrics.

    parser = argparse.ArgumentParser()

    # Hyperparameters sent by the client are passed as command-line arguments to the script.
    parser.add_argument('--n_estimators', type=int, default=100)
    parser.add_argument('--random_state', type=int, default=0)

    # Data, model, and output directories; defaults come from the SM_* environment variables.
    parser.add_argument('--model-dir', type=str, default=os.environ.get('SM_MODEL_DIR'))
    parser.add_argument('--train', type=str, default=os.environ.get('SM_CHANNEL_TRAIN'))
    parser.add_argument('--test', type=str, default=os.environ.get('SM_CHANNEL_TEST'))
    parser.add_argument('--train_file', type=str, default="train-V-1.csv")
    parser.add_argument('--test_file', type=str, default="test-V-1.csv")

    # parse_known_args tolerates extra, unrecognised arguments instead of failing on them.
    args, _ = parser.parse_known_args()

    # Fail early with a readable message when a directory is neither passed on
    # the command line nor provided through its SM_* environment variable
    # (otherwise os.path.join(None, ...) raises an opaque TypeError later).
    required_dirs = {
        'model_dir': ('--model-dir', 'SM_MODEL_DIR'),
        'train': ('--train', 'SM_CHANNEL_TRAIN'),
        'test': ('--test', 'SM_CHANNEL_TEST'),
    }
    for dest, (flag, env_var) in required_dirs.items():
        if getattr(args, dest) is None:
            parser.error("{} was not given and {} is not set".format(flag, env_var))

    print("SKlearn version = ", sklearn.__version__)

    print("Joblib version = ", joblib.__version__)

    # Load the train and test CSV files from their channel directories.
    train_df = pd.read_csv(os.path.join(args.train, args.train_file))
    test_df = pd.read_csv(os.path.join(args.test, args.test_file))

    # The last column is the label; every other column is a feature.
    features = list(train_df.columns)
    label = features.pop(-1)

    print("Building training and testing datasets")

    X_train = train_df[features]
    y_train = train_df[label]
    X_test = test_df[features]
    y_test = test_df[label]
    print()
    print("X_train.shape", X_train.shape)
    print("y_train.shape", y_train.shape)
    print("X_test.shape", X_test.shape)
    print("y_test.shape", y_test.shape)

    print()
    model = RandomForestClassifier(
        n_estimators=args.n_estimators,
        random_state=args.random_state,
        verbose=True,
    )
    model.fit(X_train, y_train)

    print()

    # Persist the fitted model under the file name that model_fn loads.
    model_path = os.path.join(args.model_dir, "model.joblib")
    joblib.dump(model, model_path)
    print("Model saved at " + model_path)

    print()

    # Evaluate on the held-out test split.
    y_pred_test = model.predict(X_test)
    test_acc = accuracy_score(y_test, y_pred_test)
    test_classscore = classification_report(y_test, y_pred_test)

    print()

    print("Test Accuracy is", test_acc)

    print("Classification Report is", test_classscore)

In [None]:
from sagemaker.sklearn.estimator import SKLearn
import joblib
# scikit-learn framework version; the SKLearnModel built in a later cell reuses it.
FRAMEWORK_VERSION = "0.23-1"

# Estimator that runs script.py as the training entry point.
sklearn_estimator = SKLearn(
    entry_point="script.py",
    framework_version=FRAMEWORK_VERSION,
    # NOTE(review): placeholder ARN - replace with a real execution role before running.
    role="arn:aws:iam::your_arn_num:role/SageMaker",
    base_job_name="rf-scikit",
    instance_type="ml.m5.large",
    instance_count=1,
    # Match the --n_estimators / --random_state arguments parsed by script.py.
    hyperparameters={"n_estimators": 100, "random_state": 0},
    # Spot training, with limits on run time and on total wait time.
    use_spot_instances=True,
    max_run=3600,
    max_wait=7200,
)

In [None]:
# Launch the training job. wait=True makes this a blocking (synchronous) call
# that returns once the job has finished.
sklearn_estimator.fit({"train": trainpath, "test": testpath}, wait=True)

# Wait on the same job again with log streaming turned off; kept as a
# safeguard before the artifact lookup below.
sklearn_estimator.latest_training_job.wait(logs="None")

# Look up the S3 location of the trained model artifact for this job.
artifact = sm_boto3.describe_training_job(
    TrainingJobName=sklearn_estimator.latest_training_job.job_name
)["ModelArtifacts"]["S3ModelArtifacts"]

print("Model artifact persisted at " + artifact)

In [None]:
from sagemaker.sklearn.model import SKLearnModel

from time import gmtime, strftime

# Timestamped (UTC) model name, so every run gets a distinct name.
# NOTE(review): the original note says this is also the folder name used inside
# the S3 bucket - confirm.
model_name = f"Custom-SKLearn-model-{strftime('%Y-%m-%d-%H-%M-%S', gmtime())}"

# Wrap the trained artifact so it can be deployed; script.py is the entry point.
model = SKLearnModel(
    name=model_name,
    entry_point="script.py",
    framework_version=FRAMEWORK_VERSION,
    model_data=artifact,
    role="arn:aws:iam::arnnumber:role/SageMaker",
)

In [None]:
# Timestamped (UTC) endpoint name; the clean-up cell deletes the endpoint by this name.
endpoint_name = f"Custom-SKLearn-model-{strftime('%Y-%m-%d-%H-%M-%S', gmtime())}"
print(f"Endpoint name is {endpoint_name}")

# Deploy the model to one instance and keep the returned predictor for inference.
predictor = model.deploy(
    endpoint_name=endpoint_name,
    instance_type="ml.m5.large",
    initial_instance_count=1,
)

In [None]:
# First three test rows, feature columns only, as plain nested lists.
p = testX[feats].head(3).to_numpy().tolist()
print(predictor.predict(p))

In [None]:
# Clean up: delete the endpoint that was deployed under endpoint_name.
sm_boto3.delete_endpoint(EndpointName=endpoint_name)