# training_job with train/test data

In [None]:
import argparse
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
import joblib
import boto3
from io import StringIO
import logging

# Configure the root logger at INFO level so the logging.info() calls in this
# script are emitted.
logging.basicConfig(level=logging.INFO)

# reads train and test data from s3
def read_from_s3(bucket_name, file_name):
    """Fetch a CSV object from S3 and parse it into a pandas DataFrame.

    Args:
        bucket_name: Name of the S3 bucket holding the object.
        file_name: Key of the CSV object inside the bucket.

    Returns:
        The DataFrame produced by ``pd.read_csv`` from the object's content,
        which is decoded as UTF-8.
    """
    client = boto3.client("s3")
    response = client.get_object(Bucket=bucket_name, Key=file_name)
    # The whole object is read into memory and decoded before parsing.
    csv_text = response["Body"].read().decode("utf-8")
    buffer = StringIO(csv_text)
    return pd.read_csv(buffer)


def parse_training_args(argv=None):
    """Parse the command-line options of the training job.

    Args:
        argv: Argument list to parse. ``None`` (the default) makes argparse
            read ``sys.argv[1:]``.

    Returns:
        An ``argparse.Namespace`` with ``bucket_name``, ``xtrain_key``,
        ``xtest_key``, ``ytrain_key``, ``ytest_key`` and ``model_dir``.
    """
    import os  # local import: this script's import header does not include os

    parser = argparse.ArgumentParser()

    parser.add_argument("--bucket-name", type=str, required=True)
    parser.add_argument("--xtrain-key", type=str, required=True)
    parser.add_argument("--xtest-key", type=str, required=True)
    parser.add_argument("--ytrain-key", type=str, required=True)
    parser.add_argument("--ytest-key", type=str, required=True)
    # Optional output directory. Defaults to the SM_MODEL_DIR environment
    # variable when it is set, otherwise to the previously hard-coded path.
    parser.add_argument(
        "--model-dir",
        type=str,
        default=os.environ.get("SM_MODEL_DIR", "/opt/ml/model"),
    )

    return parser.parse_args(argv)


def main(argv=None):
    """Train a random forest on CSV data from S3 and save it with joblib.

    Reads the four train/test CSV objects from the given bucket, fits a
    ``RandomForestRegressor`` with default settings, logs its score on the
    test split and writes the fitted model to ``<model_dir>/model.joblib``.

    Args:
        argv: Optional argument list forwarded to ``parse_training_args``.
    """
    import os  # local import: this script's import header does not include os

    args = parse_training_args(argv)

    X_train = read_from_s3(args.bucket_name, args.xtrain_key)
    X_test = read_from_s3(args.bucket_name, args.xtest_key)
    y_train = read_from_s3(args.bucket_name, args.ytrain_key)
    y_test = read_from_s3(args.bucket_name, args.ytest_key)

    # read_csv always yields a DataFrame; a single-column target would reach
    # fit() as an (n, 1) frame and trigger scikit-learn's column-vector
    # warning. Squeezing only the column axis turns it into a Series and
    # leaves multi-column targets untouched.
    y_train = y_train.squeeze(axis="columns")
    y_test = y_test.squeeze(axis="columns")

    model = RandomForestRegressor()
    model.fit(X_train, y_train)

    # NOTE: for a regressor, score() is the R^2 coefficient of determination,
    # not a classification accuracy. The message text is kept unchanged in
    # case log consumers match on it.
    accuracy = model.score(X_test, y_test)
    logging.info("Accuracy: %.2f%%", accuracy * 100)

    # Make sure the target directory exists (e.g. when run locally).
    os.makedirs(args.model_dir, exist_ok=True)
    model_path = os.path.join(args.model_dir, "model.joblib")
    joblib.dump(model, model_path)
    logging.info("Model saved to %s", model_path)


if __name__ == "__main__":
    main()


# Deploy.py with train/test data

In [None]:
import sagemaker
from sagemaker.sklearn.model import SKLearnModel
import os

# Script entry point: wrap an existing model archive in an SKLearnModel and
# deploy it to a real-time SageMaker endpoint.
if __name__ == "__main__":
    sagemaker_session = sagemaker.Session()
    role = "arn:aws:iam::796717305864:role/bike-scrapper-sagemaker-role"

    # S3 URI of the model archive to serve (presumably the output of an
    # earlier training job -- verify before redeploying).
    model_data = 's3://sagemaker-eu-north-1-796717305864/sagemaker-scikit-learn-2023-11-07-11-16-50-623/output/model.tar.gz'
    # The inference script is resolved relative to the current working
    # directory, so the script must be launched from the matching location.
    inference_script_path = os.path.join(os.getcwd(), "bike_data_scraper", "sagemaker", "inference.py")

    # Pass the session created above to the model; previously it was
    # created but never used.
    sklearn_model = SKLearnModel(
        model_data=model_data,
        role=role,
        framework_version='0.23-1',
        py_version='py3',
        entry_point=inference_script_path,
        sagemaker_session=sagemaker_session,
    )

    # Deploy the model to an endpoint backed by a single ml.m5.2xlarge
    # instance (no volume-size option is set here).
    predictor = sklearn_model.deploy(
        instance_type="ml.m5.2xlarge",
        initial_instance_count=1,
        endpoint_name="random-forest-endpoint-1",
    )

    print(f"Endpoint successfully created \nName: {predictor.endpoint_name}")
