Star–galaxy classification using a custom scikit-learn script in SageMaker.

In [None]:
import sagemaker
from sklearn.model_selection import train_test_split
# boto3 is used to connect to the S3 bucket and the SageMaker API:
import boto3
import pandas as pd

In [None]:
# Build one boto3 session and derive the SageMaker session, the region name
# and the low-level SageMaker client from it.
boto_session = boto3.session.Session(region_name='us-east-1')
sess = sagemaker.Session(boto_session=boto_session)
region = sess.boto_session.region_name
sm_boto3 = boto3.client("sagemaker", region_name=region)

# Replace with the name of the S3 bucket created for this project.
bucket = 'name_of_bucket'
print('using S3 bucket ' + bucket)

In [None]:
# Load the raw dataset from the local CSV file and preview its first rows.
df = pd.read_csv('star_classification.csv')
df.head()

In [None]:
# (rows, columns) of the raw dataset.
df.shape 

In [None]:
# Relative frequency of each value in the 'class' column
# (normalize=True returns proportions instead of counts).
df['class'].value_counts(normalize=True)

In [None]:
# Column names available in the raw dataset.
df.columns

In [None]:
# Number of missing values per column.
df.isnull().sum()

In [None]:
# Columns that are left out of the modelling table.
excluded_columns = [
    "obj_ID",
    "run_ID",
    "rerun_ID",
    "cam_col",
    "field_ID",
    "spec_obj_ID",
    "plate",
    "MJD",
    "fiber_ID",
]
df1 = df.drop(columns=excluded_columns)
df1

In [None]:
# Split the table into the target Series and the remaining predictor columns.
target_column = 'class'
label = df1[target_column]
df1 = df1.drop(columns=[target_column])
features = df1.columns.tolist()
features

In [None]:
# Display the target Series extracted from the 'class' column.
label

In [None]:
# Feature matrix and target vector handed to the train/test split.
x, y = df1[features], label
x.head()

In [None]:
# Preview the first values of the target vector.
y.head()

In [None]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
split_options = {"test_size": 0.20, "random_state": 0}
X_train, X_test, y_train, y_test = train_test_split(x, y, **split_options)

In [None]:
# Print the shape of every piece of the split, in the order
# X_train, X_test, y_train, y_test.
for split_part in (X_train, X_test, y_train, y_test):
    print(split_part.shape)

In [None]:
# Re-attach the target as a trailing 'label' column; script.py treats the
# last column of each CSV as the label.
#
# assign() returns a new DataFrame, so X_train / X_test are never modified.
# The previous pd.DataFrame(X_train) wrapper does not copy DataFrame input by
# default, so adding a column to it could alias the original split frames.
trainX = X_train.assign(label=y_train)
testX = X_test.assign(label=y_test)

In [None]:
# Print the shapes of the combined train and test tables.
for labelled_frame in (trainX, testX):
    print(labelled_frame.shape)

In [None]:
# Preview the first rows of the training table (features plus 'label').
trainX.head()

In [None]:
# Preview the first rows of the test table (features plus 'label').
testX.head()

In [None]:
# Write both tables to local CSV files without the index column.
for labelled_frame, csv_name in ((trainX, "train-v-1.csv"), (testX, "test-v-1.csv")):
    labelled_frame.to_csv(csv_name, index=False)

In [None]:
# Send the data to S3; SageMaker takes its training data from there.
# Both CSVs are uploaded under the same key prefix, train first, then test.
sk_prefix = "sagemaker/star_galaxy_classification/sklearncontainer"
trainpath, testpath = (
    sess.upload_data(path=csv_name, bucket=bucket, key_prefix=sk_prefix)
    for csv_name in ("train-v-1.csv", "test-v-1.csv")
)
print(trainpath)
print(testpath)

In [None]:
%%writefile script.py 

import sklearn 
import joblib
import boto3
import os
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, precision_score
import argparse
import numpy as np
import pandas as pd

def model_fn(model_dir):
    """Load and return the classifier stored as ``model.joblib`` in *model_dir*.

    The name ``model_fn`` is the model-loading hook of the SageMaker
    scikit-learn serving container.
    """
    model_file = os.path.join(model_dir, "model.joblib")
    return joblib.load(model_file)

# Training entry point: parse hyperparameters and data locations, fit a random
# forest, persist it as model.joblib, and report metrics on the test set.
# The guard must compare against "__main__"; with any other string the body
# never executes when the file is run as a script.
if __name__ == "__main__":
    print("Extracting arguments... ")
    parser = argparse.ArgumentParser()

    # Hyperparameters sent by the client are passed as command-line arguments.
    # Flag names must match the keys of the estimator's `hyperparameters`
    # dict ("n_estimators", "random_state") and the attributes read below.
    parser.add_argument("--n_estimators", type=int, default=100)
    parser.add_argument("--random_state", type=int, default=0)
    # Verbosity level forwarded to RandomForestClassifier; 0 keeps it silent.
    parser.add_argument("--verbose", type=int, default=0)

    # Data, model and output directories; defaults come from the SM_*
    # environment variables.
    parser.add_argument("--model-dir", type=str, default=os.environ.get("SM_MODEL_DIR"))
    parser.add_argument("--train", type=str, default=os.environ.get("SM_CHANNEL_TRAIN"))
    parser.add_argument("--test", type=str, default=os.environ.get("SM_CHANNEL_TEST"))
    parser.add_argument("--train-file", type=str, default="train-v-1.csv")
    parser.add_argument("--test-file", type=str, default="test-v-1.csv")

    # parse_known_args ignores any extra arguments instead of failing on them.
    args, _ = parser.parse_known_args()

    print("SKLearn version:", sklearn.__version__)
    print("joblib version:", joblib.__version__)

    print("[INFO] Reading data")
    print()
    train_df = pd.read_csv(os.path.join(args.train, args.train_file))
    test_df = pd.read_csv(os.path.join(args.test, args.test_file))

    # The last column of the CSV is the label; every other column is a feature.
    features = list(train_df.columns)
    label = features.pop(-1)
    print("Building training and testing datasets")
    print()
    X_train = train_df[features]
    X_test = test_df[features]
    y_train = train_df[label]
    y_test = test_df[label]

    print("column oreder:")
    print(features)
    print()

    print("label column is:", label)
    print()

    print("data shape")
    print()
    print("-----shape of traning data(80%)-----") 
    print(X_train.shape)
    print(y_train.shape)
    print("-----shape of testing data(20%)-----")
    print(X_test.shape)
    print(y_test.shape)
    print()

    print("Training Random Forest Model......")
    print()
    model = RandomForestClassifier(
        n_estimators=args.n_estimators,
        random_state=args.random_state,
        verbose=args.verbose,
    )
    model.fit(X_train, y_train)
    print()

    # Persist under the file name that model_fn loads at inference time.
    model_path = os.path.join(args.model_dir, "model.joblib")
    joblib.dump(model, model_path)
    print("model persisted at " + model_path)
    print()

    y_pred_test = model.predict(X_test)
    test_acc = accuracy_score(y_test, y_pred_test)
    test_rep = classification_report(y_test, y_pred_test)

    print()
    print("---- metrics results for testing data ----")
    print()
    print("total rows are: ", X_test.shape[0])  
    print('[testing] model accuracy is: ', test_acc)
    print('[testing] testing report: ')
    print(test_rep)  

In [None]:
from sagemaker.sklearn.estimator import SKLearn
FRAMEWORK_VERSION = "0.23-1"

# Managed scikit-learn training job that runs script.py on one ml.m5.large
# spot instance.
sklearn_estimator = SKLearn(
    entry_point="script.py",
    # An IAM role is required. Resolve it from the current environment instead
    # of hard-coding an account-specific ARN; when running outside SageMaker,
    # pass an explicit role ARN here instead.
    role=sagemaker.get_execution_role(sess),
    # Use the same us-east-1 session as the S3 upload and the sm_boto3 client,
    # so the job is created in the region where it is later described.
    sagemaker_session=sess,
    instance_count=1,
    instance_type="ml.m5.large",
    framework_version=FRAMEWORK_VERSION,
    base_job_name="RF-custom-sklearn",
    # Keys are passed to script.py as --<key> command-line arguments.
    hyperparameters={
        "n_estimators": 100,
        "random_state": 0,
    },
    use_spot_instances=True,
    # Time limits in seconds: max_wait is the overall limit including waiting
    # for spot capacity, max_run the limit on the training run itself.
    max_wait=7200,
    max_run=3600,
)

In [None]:
# Launch the training job with the "train" and "test" channels pointing at the
# uploaded CSVs. wait = True makes this call block until the job finishes.
sklearn_estimator.fit({"train":trainpath, "test":testpath}, wait = True)
#sklearn_estimator.fit({"train":datapath}, wait=True)   

In [None]:
# Make sure the latest training job has finished (without streaming its logs),
# then look up where its model artifact was stored in S3.
training_job = sklearn_estimator.latest_training_job
training_job.wait(logs="None")

job_description = sm_boto3.describe_training_job(TrainingJobName=training_job.name)
artifact = job_description["ModelArtifacts"]["S3ModelArtifacts"]

print("model artifact persisted at " + artifact)

In [None]:
from sagemaker.sklearn.model import SKLearnModel
from time import gmtime, strftime

# Wrap the trained artifact in a deployable model. The following cells read
# `model_name` and call `new_model.deploy`, so both must be defined here.
model_name = "Custom-sklearn-model-" + strftime("%Y-%m-%d-%H-%M-%S", gmtime())
new_model = SKLearnModel(
    name=model_name,
    model_data=artifact,
    # Reuse the role and session of the training estimator rather than
    # hard-coding an account-specific role ARN.
    role=sklearn_estimator.role,
    sagemaker_session=sklearn_estimator.sagemaker_session,
    entry_point="script.py",
    framework_version=FRAMEWORK_VERSION,
)

In [None]:
# Display the model name; requires `model_name` to have been assigned by an
# earlier cell.
model_name

In [None]:
# Timestamp the endpoint name so that repeated deployments do not collide.
endpoint_name = "Custom-sklearn-model-" + strftime("%Y-%m-%d-%H-%M-%S", gmtime())
print("Endpoint_name = {}".format(endpoint_name))

# Deploy the model (`new_model`, created in an earlier cell) behind a
# real-time endpoint served by a single instance.
predictor = new_model.deploy(
    initial_instance_count=1,
    # Instance type identifiers are all lowercase; "ml.m4.Xlarge" is rejected.
    instance_type="ml.m4.xlarge",
    endpoint_name=endpoint_name,
)