# Create a REST API for a SageMaker Endpoint using Lambda & API Gateway

In this notebook:

* Deploy a Random Forest Regressor that predicts house prices, trained on the [California Housing](https://scikit-learn.org/stable/modules/generated/sklearn.datasets.fetch_california_housing.html) dataset available in scikit-learn.

This notebook is intended to be run in SageMaker Studio.

## Deploy the Sagemaker Endpoint

#### (i) Setup

In [2]:
# import libraries
import boto3
import pandas as pd
import numpy as np
import sagemaker
from sagemaker import get_execution_role
from sklearn.model_selection import train_test_split
from sklearn.datasets import fetch_california_housing

import datetime
import time
import tarfile

# Low-level SageMaker client, plus an SDK session from which the region is read
sm_boto3 = boto3.client("sagemaker")
sess = sagemaker.Session()
region = sess.boto_session.region_name

bucket = "bucket_name"  # Placeholder: set this to the name of your S3 bucket
print(f"Using bucket {bucket}")

#### (ii) Train and test split

In [4]:
# Load the California Housing dataset; later cells read `data.data`,
# `data.target` and `data.feature_names` from this object.
data = fetch_california_housing()

In [7]:
# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    data.data,
    data.target,
    test_size=0.2,
    random_state=13,
)

#### (iii) Create CSV Files for Train and Test Data

In [None]:
# Despite the names, each frame holds the feature columns plus a trailing
# "target" column, so one CSV per split carries both inputs and labels.
trainX = pd.DataFrame(X_train, columns=data.feature_names).assign(target=y_train)

testX = pd.DataFrame(X_test, columns=data.feature_names).assign(target=y_test)

In [None]:
# index=False keeps the row index out of the files; otherwise pandas writes it
# as an extra unnamed column that reads back as "Unnamed: 0".
# The training script selects columns by name, so dropping it is safe.
trainX.to_csv("california_housing_train.csv", index=False)
testX.to_csv("california_housing_test.csv", index=False)

#### (iv) Upload test and train data into S3 Bucket

In [None]:
sk_prefix = "sagemaker/california_housing/sklearncontainer"

# Both CSVs go to the same bucket and key prefix; only the local path differs.
upload_kwargs = {"bucket": bucket, "key_prefix": sk_prefix}
trainpath = sess.upload_data(path="california_housing_train.csv", **upload_kwargs)
testpath = sess.upload_data(path="california_housing_test.csv", **upload_kwargs)

#### (v) Create training script

In [None]:
%%writefile script.py

import argparse
import joblib
import os
import sklearn
import numpy as np
import pandas as pd
import boto3
import pathlib
from io import StringIO 
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score


# inference functions ---------------

def input_fn(request_body, request_content_type):
    """Deserialize an inference request body into a DataFrame.

    Args:
        request_body: Raw request payload. Expected to be CSV text with no
            header row; a bytes payload is decoded as UTF-8 first.
        request_content_type: Content type of the request. Only
            ``"text/csv"`` is accepted.

    Returns:
        A ``pandas.DataFrame`` with one row per CSV line on success.
        Otherwise a ``str`` error message, which ``predict_fn`` passes
        through unchanged instead of calling the model.
    """
    print(request_body)
    print(request_content_type)
    if request_content_type != "text/csv":
        return """Please use Content-Type = 'text/csv' and, send the request!!"""

    try:
        if isinstance(request_body, (bytes, bytearray)):
            request_body = request_body.decode("utf-8")
        return pd.read_csv(StringIO(request_body.strip()), header=None)
    except Exception as e:
        # Previously this branch fell through and returned None, which made
        # predict_fn call model.predict(None). Return an error string instead,
        # matching how the unsupported-content-type case is reported.
        print(e)
        return "Could not parse the request body as CSV: {}".format(e)
 
    
def model_fn(model_dir):
    """Load and return the estimator stored as ``model.joblib`` in ``model_dir``."""
    model_path = os.path.join(model_dir, "model.joblib")
    return joblib.load(model_path)

def predict_fn(input_data, model):
    """Run the model on deserialized input, or pass an error message through.

    A plain ``str`` input is treated as an error message produced upstream and
    is returned unchanged; anything else is handed to ``model.predict``.
    """
    if type(input_data) is str:
        return input_data

    result = model.predict(input_data)
    print(result)
    return result


if __name__ == "__main__":
    # Training entry point: parse arguments, fit a random forest on the training
    # CSV, persist it as model.joblib, then print metrics for the test CSV.
    print("[INFO] Extracting arguments")
    parser = argparse.ArgumentParser()

    # hyperparameters sent by the client are passed as command-line arguments to the script.
    parser.add_argument("--n-estimators", type=int, default=10)
    parser.add_argument("--min-samples-leaf", type=int, default=3)

    # Data, model, and output directories (defaults are read from SM_* environment variables)
    parser.add_argument("--model-dir", type=str, default=os.environ.get("SM_MODEL_DIR"))
    parser.add_argument("--train", type=str, default=os.environ.get("SM_CHANNEL_TRAIN"))
    parser.add_argument("--test", type=str, default=os.environ.get("SM_CHANNEL_TEST"))
    parser.add_argument("--train-file", type=str, default="california_housing_train.csv")
    parser.add_argument("--test-file", type=str, default="california_housing_test.csv")

    # The user must explicitly name the feature and target columns. Marking the
    # options as required makes argparse exit with a usage message when they are
    # missing, instead of the script failing later on `None.split()` or a
    # `None` column lookup.
    parser.add_argument(
        "--features", type=str, required=True
    )  # whitespace-separated feature column names
    parser.add_argument(
        "--target", type=str, required=True
    )  # name of the target column

    # parse_known_args ignores any extra arguments the script does not declare.
    args, _ = parser.parse_known_args()

    print("SKLearn Version: ", sklearn.__version__)
    print("Joblib Version: ", joblib.__version__)

    print("[INFO] Reading data")
    print()
    train_df = pd.read_csv(os.path.join(args.train, args.train_file))
    test_df = pd.read_csv(os.path.join(args.test, args.test_file))

    print("Building training and testing datasets")
    print()
    # Columns are selected by name, so any extra columns in the CSVs are ignored.
    feature_columns = args.features.split()
    X_train = train_df[feature_columns]
    X_test = test_df[feature_columns]
    y_train = train_df[args.target]
    y_test = test_df[args.target]

    print("Training Random Forest model")
    model = RandomForestRegressor(
        n_estimators=args.n_estimators, min_samples_leaf=args.min_samples_leaf, n_jobs=-1
    )

    model.fit(X_train, y_train)

    # model_fn loads this exact file name from the model directory at inference time.
    path = os.path.join(args.model_dir, "model.joblib")
    joblib.dump(model, path)
    print("Model persisted at " + path)
    print()

    y_pred_test = model.predict(X_test)
    test_mse = mean_squared_error(y_test,y_pred_test)
    test_r2 = r2_score(y_test,y_pred_test)

    print()
    print("---- METRICS RESULTS FOR TESTING DATA ----")
    print()
    print("Total Rows are: ", X_test.shape[0])
    print('[TESTING] Model Mean Squared Error: ', test_mse)
    print('[TESTING] Testing Model R^2 Score: ', test_r2)

#### (v) Run script inside this notebook

In [None]:
# Run the training script locally: it reads the train/test CSVs from the
# current directory and writes model.joblib there as well.
! python script.py --n-estimators 100 \
                   --min-samples-leaf 2 \
                   --model-dir ./ \
                   --train ./ \
                   --test ./ \
                   --features 'MedInc HouseAge AveRooms AveBedrms Population AveOccup Latitude Longitude' \
                   --target target

#### (vi) Train inside SageMaker container

In [None]:
from sagemaker.sklearn.estimator import SKLearn

FRAMEWORK_VERSION = "0.23-1"

# These are handed to script.py as command-line arguments.
training_hyperparameters = {
    "n-estimators": 100,
    "min-samples-leaf": 3,
    "features": "MedInc HouseAge AveRooms AveBedrms Population AveOccup Latitude Longitude",
    "target": "target",
}

sklearn_estimator = SKLearn(
    entry_point="script.py",
    role=get_execution_role(),
    instance_count=1,
    instance_type="ml.m5.xlarge",
    framework_version=FRAMEWORK_VERSION,
    base_job_name="RF-custom-sklearn",
    hyperparameters=training_hyperparameters,
    # Spot training: run for at most 3600 s, wait for at most 7200 s overall.
    use_spot_instances=True,
    max_wait=7200,
    max_run=3600,
)

In [None]:
# Channel names line up with the SM_CHANNEL_TRAIN / SM_CHANNEL_TEST defaults in script.py.
training_channels = {"train": trainpath, "test": testpath}
sklearn_estimator.fit(training_channels, wait=True)

#### (vii) Store Model Artifacts into the S3 Bucket

In [None]:
# Wait for the training job without streaming its logs.
sklearn_estimator.latest_training_job.wait(logs="None")

# Look up where the job stored its model artifact in S3.
training_job_name = sklearn_estimator.latest_training_job.name
job_description = sm_boto3.describe_training_job(TrainingJobName=training_job_name)
artifact = job_description["ModelArtifacts"]["S3ModelArtifacts"]

print(f"Model artifact persisted at {artifact}")

#### (viii) Deploy SageMaker Endpoint (API) for trained model and test it. 

In [None]:
from sagemaker.sklearn.model import SKLearnModel
from time import gmtime, strftime

# A UTC timestamp suffix gives each run a distinct model name.
model_timestamp = strftime("%Y-%m-%d-%H-%M-%S", gmtime())
model_name = f"Custom-sklearn-model-{model_timestamp}"

model = SKLearnModel(
    name=model_name,
    model_data=artifact,
    role=get_execution_role(),
    entry_point="script.py",
    framework_version=FRAMEWORK_VERSION,
)

In [None]:
# A UTC timestamp suffix gives each deployment a distinct endpoint name.
endpoint_timestamp = strftime("%Y-%m-%d-%H-%M-%S", gmtime())
endpoint_name = f"Custom-sklearn-model-{endpoint_timestamp}"
print(f"EndpointName={endpoint_name}")

predictor = model.deploy(
    initial_instance_count=1,
    instance_type="ml.m4.xlarge",
    endpoint_name=endpoint_name,
)

In [1]:
# Run this line to delete the endpoint
# sm_boto3.delete_endpoint(EndpointName=endpoint_name)

In [None]:

import requests

# Placeholder: presumably the invoke URL of the API that fronts the endpoint — replace before running.
url = "<api_url>"

# One CSV row of eight feature values.
# NOTE(review): assumed to follow the feature order used for training — confirm.
payload = "4.7, 27, 6.5, 1, 1000, 3, 37.54, -121.72"

# input_fn in script.py only accepts the text/csv content type.
headers = {
    'Content-Type': 'text/csv'
}

# Always pass a timeout: without one, requests can wait indefinitely on an
# unresponsive server and leave the notebook cell hanging.
response = requests.post(url, headers=headers, data=payload, timeout=30)

print(response.text)

