In [1]:
import boto3

# Client used for the single download below.
s3 = boto3.client("s3")

# S3 location of the CSV to fetch.
bucket_name = "cyrille-dscrap-bucket"
file_name = "processed/station_bikes/2023-09-10-2023-09-24/StationaryStations.csv"

try:
    # Write the object to a file named 'local-file-name' (relative path).
    s3.download_file(
        Bucket=bucket_name,
        Key=file_name,
        Filename="local-file-name",
    )
except Exception as e:
    # Best-effort cell: report the failure (e.g. expired credentials)
    # instead of letting the exception propagate.
    print(f"An error occurred: {e}")

An error occurred: The SSO session associated with this profile has expired or is otherwise invalid. To refresh this SSO session run aws sso login with the corresponding profile.


In [None]:
import datetime
import time
import tarfile

import boto3
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.datasets import fetch_california_housing



In [5]:
# Load the California housing dataset through scikit-learn's fetcher.
data = fetch_california_housing()
# Bare last expression so the notebook displays the feature names.
data.feature_names

['MedInc',
 'HouseAge',
 'AveRooms',
 'AveBedrms',
 'Population',
 'AveOccup',
 'Latitude',
 'Longitude']

In [None]:
import sagemaker
from sagemaker import get_execution_role
from sagemaker.sklearn.estimator import SKLearn
import os
import boto3

if __name__ == "__main__":
    # IAM role handed to the estimator, and a SageMaker session pinned to
    # the eu-north-1 region.
    role = "arn:aws:iam::796717305864:role/bike-scrapper-sagemaker-role"
    boto_session = boto3.Session(region_name="eu-north-1")
    sagemaker_session = sagemaker.Session(boto_session=boto_session)

    # Entry-point script, resolved relative to the current working directory.
    train_path = os.path.join(
        os.getcwd(), "bike_data_scraper", "sagemaker", "training_job.py"
    )

    # Bucket and object keys of the train/test splits.
    bucket_name = "sagemaker-eu-north-1-796717305864"
    xtrain_key, xtest_key = (
        "sagemaker/sklearncontainer/xtrain2.csv",
        "sagemaker/sklearncontainer/xtest2.csv",
    )
    ytrain_key, ytest_key = (
        "sagemaker/sklearncontainer/ytrain2.csv",
        "sagemaker/sklearncontainer/ytest2.csv",
    )

    # The bucket and keys are passed to the estimator as hyperparameters.
    hyperparameters = {
        "bucket-name": bucket_name,
        "xtrain-key": xtrain_key,
        "xtest-key": xtest_key,
        "ytrain-key": ytrain_key,
        "ytest-key": ytest_key,
    }

    sklearn = SKLearn(
        entry_point=train_path,
        framework_version="0.23-1",
        py_version="py3",
        role=role,
        instance_type="ml.m5.xlarge",
        sagemaker_session=sagemaker_session,
        hyperparameters=hyperparameters,
    )

    # Run the training job; no input channels are passed to fit().
    sklearn.fit()

    # Deploy the fitted estimator to a one-instance endpoint.
    predictor = sklearn.deploy(
        endpoint_name="random-forest-endpoint-1",
        initial_instance_count=1,
        instance_type="ml.m5.xlarge",
    )


# Instance type reference (vCPUs / memory in GiB):
# "ml.m5.xlarge" 4/16
# "ml.r5.8xlarge" 32/256
# "ml.m5.8xlarge" 32/128
# "ml.r5.24xlarge" 96/768


In [None]:
import argparse
import logging
import os
from io import StringIO

import boto3
import joblib
import pandas as pd
from sklearn.ensemble import RandomForestRegressor

# Configure the root logger so INFO-level messages logged below are emitted.
logging.basicConfig(level=logging.INFO)

def read_from_s3(bucket_name, file_name, s3_client=None):
    """Read a UTF-8 encoded CSV object from S3 into a pandas DataFrame.

    Args:
        bucket_name: Name of the S3 bucket that holds the object.
        file_name: Key of the CSV object inside the bucket.
        s3_client: Optional object exposing ``get_object(Bucket=..., Key=...)``.
            When omitted, a fresh ``boto3.client("s3")`` is created, exactly
            as before. Supplying one lets several reads share a single client
            and lets tests pass in a stub.

    Returns:
        The DataFrame produced by ``pd.read_csv`` with default options.

    Raises:
        UnicodeDecodeError: If the object body is not valid UTF-8.
        Exception: Any error raised by ``get_object`` is propagated unchanged.
    """
    if s3_client is None:
        s3_client = boto3.client("s3")
    obj = s3_client.get_object(Bucket=bucket_name, Key=file_name)
    # The whole body is read and decoded before parsing.
    csv_text = obj["Body"].read().decode("utf-8")
    return pd.read_csv(StringIO(csv_text))


def model_fn(model_dir):
    """Load the trained model from ``model_dir`` for inference.

    The SageMaker scikit-learn serving container calls a ``model_fn`` defined
    in the entry-point script to load the model. If this script is used as the
    entry point of an estimator that is later deployed, the endpoint needs this
    function to load ``model.joblib``.

    Args:
        model_dir: Directory containing the ``model.joblib`` artifact.

    Returns:
        The object deserialized from ``<model_dir>/model.joblib``.
    """
    return joblib.load(os.path.join(model_dir, "model.joblib"))


# Training entry point: parse the S3 locations, fit a random forest, report
# its R^2 on the test split and persist the fitted model.
if __name__ == "__main__":
    parser = argparse.ArgumentParser()

    # S3 bucket and object keys of the train/test splits (all required).
    parser.add_argument("--bucket-name", type=str, required=True)
    parser.add_argument("--xtrain-key", type=str, required=True)
    parser.add_argument("--xtest-key", type=str, required=True)
    parser.add_argument("--ytrain-key", type=str, required=True)
    parser.add_argument("--ytest-key", type=str, required=True)
    # Output directory for the model artifact. Reads SM_MODEL_DIR when it is
    # set and otherwise falls back to the previously hard-coded /opt/ml/model.
    parser.add_argument(
        "--model-dir",
        type=str,
        default=os.environ.get("SM_MODEL_DIR", "/opt/ml/model"),
    )

    args = parser.parse_args()

    X_train = read_from_s3(args.bucket_name, args.xtrain_key)
    X_test = read_from_s3(args.bucket_name, args.xtest_key)
    # If a target CSV has a single column, squeeze it to a 1-D Series so
    # scikit-learn receives a 1-D y instead of a column vector (which triggers
    # a DataConversionWarning). Multi-column targets are left unchanged.
    y_train = read_from_s3(args.bucket_name, args.ytrain_key).squeeze(axis="columns")
    y_test = read_from_s3(args.bucket_name, args.ytest_key).squeeze(axis="columns")

    model = RandomForestRegressor()

    model.fit(X_train, y_train)

    # For a regressor, score() is the coefficient of determination (R^2),
    # not a classification accuracy, so it is not reported as a percentage.
    r2_score = model.score(X_test, y_test)
    logging.info(f"R^2 on test set: {r2_score:.4f}")

    model_path = os.path.join(args.model_dir, "model.joblib")
    joblib.dump(model, model_path)
    logging.info(f"Model saved to {model_path}")
