In [1]:
!pip install boto3

Looking in indexes: https://pypi.org/simple, https://pip.repos.neuron.amazonaws.com


In [2]:
import boto3
import pandas as pd

# set up the S3 client
s3 = boto3.client('s3')

# specify the S3 bucket and file name
bucket_name = 'datasetsthesis2'
file_name = 'preprocessed_surge.csv'

# read the file from S3 into a Pandas DataFrame
obj = s3.get_object(Bucket=bucket_name, Key=file_name)
merged = pd.read_csv(obj['Body'])

In [3]:
merged

Unnamed: 0,Day,Month,Hour,passenger_count,trip_distance,total_amount,temp,feelslike,snow,windspeed,cloudcover,duration,surge_multiplier
0,1,1,0,1.495620,2.704969,15.959061,3.500000,0.500000,0.0,25.200000,37.900000,36.947059,1.25
1,1,1,1,1.567708,5.595313,24.376823,3.500000,0.500000,0.0,25.200000,37.900000,68.396875,1.00
2,1,1,2,1.305732,3.114076,18.019427,3.500000,0.500000,0.0,25.200000,37.900000,151.373355,1.00
3,1,1,3,1.566667,3.220810,17.525452,3.500000,0.500000,0.0,25.200000,37.900000,211.571984,1.25
4,1,1,4,1.542308,3.626808,18.821538,3.500000,0.500000,0.0,25.200000,37.900000,267.962692,1.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...
3256,31,3,19,1.333141,2.348576,16.367994,13.298386,13.297983,0.0,15.008069,69.360231,1170.938943,1.75
3257,31,3,20,1.325161,3.512310,18.635200,13.292774,13.290968,0.0,15.036129,69.221935,1229.775785,1.25
3258,31,3,21,1.379863,3.260870,18.066018,13.287185,13.283982,0.0,15.064073,69.084211,1294.837262,1.25
3259,31,3,22,1.438695,3.140484,17.483836,13.293701,13.292126,0.0,15.031496,69.244769,1355.028965,1.25


In [4]:
X = merged.drop(["surge_multiplier"], axis=1)
y = merged["surge_multiplier"]

In [5]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(merged.drop("surge_multiplier", axis=1), merged["surge_multiplier"], test_size=0.2)

In [6]:
# Get all the attributes (columns) from the "merged" dataframe
attributes = merged.columns.tolist()

# Print the list of attributes
print(attributes)

['Day', 'Month', 'Hour', 'passenger_count', 'trip_distance', 'total_amount', 'temp', 'feelslike', 'snow', 'windspeed', 'cloudcover', 'duration', 'surge_multiplier']


In [7]:
trainX = pd.DataFrame(X_train, columns=attributes)
trainX['target'] = y_train

testX = pd.DataFrame(X_test, columns=attributes)
testX['target'] = y_test

In [8]:
trainX.to_csv('surge_train.csv')
testX.to_csv('surge_test.csv')

# Data Ingestion

In [9]:
import datetime
import tarfile

import boto3 # AWS SDK for python. Provides low-level access to AWS services
from sagemaker import get_execution_role
import sagemaker

m_boto3 = boto3.client('sagemaker') 

sess = sagemaker.Session()

region = sess.boto_session.region_name

bucket = sess.default_bucket()  #  Bucket is a logical unit of storage in AWS S3

print('Using bucket ' + bucket)

Using bucket sagemaker-eu-north-1-378412049928


In [10]:
# send data to S3. SageMaker will take training data from s3
trainpath = sess.upload_data(
    path='surge_train.csv', bucket=bucket,
    key_prefix='sagemaker/sklearncontainer')

testpath = sess.upload_data(
    path='surge_test.csv', bucket=bucket,
    key_prefix='sagemaker/sklearncontainer')

# Prepare a Scikit-learn Training Script

In [11]:
pip install --upgrade scikit-learn

Looking in indexes: https://pypi.org/simple, https://pip.repos.neuron.amazonaws.com
Collecting scikit-learn
  Downloading scikit_learn-1.2.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (9.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.6/9.6 MB[0m [31m24.1 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Installing collected packages: scikit-learn
  Attempting uninstall: scikit-learn
    Found existing installation: scikit-learn 1.2.0
    Uninstalling scikit-learn-1.2.0:
      Successfully uninstalled scikit-learn-1.2.0
Successfully installed scikit-learn-1.2.2
Note: you may need to restart the kernel to use updated packages.


In [12]:
pip install joblib

Looking in indexes: https://pypi.org/simple, https://pip.repos.neuron.amazonaws.com
Note: you may need to restart the kernel to use updated packages.


In [23]:
%%writefile script.py

import argparse
import os
import joblib
import numpy as np
import pandas as pd
from sklearn.experimental import enable_hist_gradient_boosting
from sklearn.ensemble import RandomForestRegressor, HistGradientBoostingRegressor
from sklearn.impute import SimpleImputer
from sklearn.metrics import explained_variance_score, r2_score
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
import pickle

# inference functions ---------------
def model_fn(model_dir):
    with open(os.path.join(model_dir, 'model.pkl'), 'rb') as f:
        clf = pickle.load(f)
    return clf

if __name__ == '__main__':
    print('extracting arguments')
    parser = argparse.ArgumentParser()

    # hyperparameters sent by the client are passed as command-line arguments to the script.
    # to simplify the demo we don't use all sklearn RandomForest hyperparameters
    parser.add_argument('--n-estimators', type=int, default=100)
    parser.add_argument('--max_leaf_nodes', type=int, default=10)

    # Data, model, and output directories
    parser.add_argument('--model-dir', type=str, default=os.environ.get('SM_MODEL_DIR'))
    parser.add_argument('--train', type=str, default=os.environ.get('SM_CHANNEL_TRAIN'))
    parser.add_argument('--test', type=str, default=os.environ.get('SM_CHANNEL_TEST'))
    parser.add_argument('--train-file', type=str, default='surge_train.csv')
    parser.add_argument('--test-file', type=str, default='surge_test.csv')

    args, _ = parser.parse_known_args()

    print('reading data')
    train_df = pd.read_csv(os.path.join(args.train, args.train_file))
    test_df = pd.read_csv(os.path.join(args.test, args.test_file))
    #train_df = pd.read_csv('surge_train.csv')
    #test_df = pd.read_csv('surge_test.csv')
    
    print('building training and testing datasets')
   # numeric_transformer = Pipeline(steps=[('imputer', SimpleImputer(strategy='median'))])
    #preprocessor = ColumnTransformer(transformers=[('num', numeric_transformer, train_df.columns.tolist())])
    attributes = ['Day', 'Month', 'Hour', 'passenger_count', 'trip_distance', 'total_amount', 'temp', 'feelslike', 'snow', 'windspeed', 'cloudcover', 'duration']

    X_train =train_df[attributes]
    X_test = test_df[attributes]
    y_train = train_df['target']
    y_test = test_df['target']

    # train
    print('training model')
    model = RandomForestRegressor()
    model.fit(X_train, y_train)

    # persist model
    path = os.path.join(args.model_dir, "model.pkl")
    with open(path, 'wb') as f:
        pickle.dump(model, f)
    print('model persisted at ' + path)

    # print explained_variance_score
    print('validating model')
    predictions = model.predict(X_test)
    print("R2 score : %.2f" % r2_score(y_test, predictions))




Overwriting script.py


In [24]:
! python script.py --n-estimators 500 \
                   --max-leaf-nodes 16 \
                   --model-dir ./ \
                   --train ./ \
                   --test ./ \

extracting arguments
reading data
building training and testing datasets
training model
model persisted at ./model.pkl
validating model
R2 score : 0.88


In [25]:
pip install --upgrade sagemaker

Looking in indexes: https://pypi.org/simple, https://pip.repos.neuron.amazonaws.com
Collecting sagemaker
  Downloading sagemaker-2.145.0.tar.gz (714 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m714.1/714.1 kB[0m [31m7.7 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25h  Preparing metadata (setup.py) ... [?25ldone
Building wheels for collected packages: sagemaker
  Building wheel for sagemaker (setup.py) ... [?25ldone
[?25h  Created wheel for sagemaker: filename=sagemaker-2.145.0-py2.py3-none-any.whl size=959452 sha256=8bb9eeb7b32f21f7a6dfbeb4a31623d776c932a730276b2e20fef8a3b86a9f2d
  Stored in directory: /home/ec2-user/.cache/pip/wheels/49/f4/2c/48f91efe2535c9aad0fa36bc41c85dff48e722b67ca89771c0
Successfully built sagemaker
Installing collected packages: sagemaker
  Attempting uninstall: sagemaker
    Found existing installation: sagemaker 2.132.0
    Uninstalling sagemaker-2.132.0:
      Successfully uninstalled sagemaker-2.132.0
Successfully installed 

In [28]:
from sagemaker.sklearn.estimator import SKLearn

sklearn_estimator = SKLearn(
    entry_point='script.py',
    role=get_execution_role(),
    instance_count=1,
    instance_type='ml.c5.2xlarge',  # Updated instance type
    framework_version='0.23-1',
    base_job_name='rf-scikit',
    hyperparameters={'n-estimators': 500,
                     'max_leaf_nodes': 16})


In [29]:
sklearn_estimator.fit({'train':trainpath, 'test': testpath}, wait=False)

INFO:sagemaker:Creating training-job with name: rf-scikit-2023-04-07-18-47-47-849


In [30]:
sklearn_estimator.latest_training_job.wait(logs='None')
artifact = m_boto3.describe_training_job(
    TrainingJobName=sklearn_estimator.latest_training_job.name)['ModelArtifacts']['S3ModelArtifacts']

print('Model artifact persisted at ' + artifact)


2023-04-07 18:47:48 Starting - Starting the training job....
2023-04-07 18:48:16 Starting - Preparing the instances for training........
2023-04-07 18:49:03 Downloading - Downloading input data....
2023-04-07 18:49:28 Training - Downloading the training image
2023-04-07 18:49:34 Training - Training image download completed. Training in progress.....
2023-04-07 18:49:54 Uploading - Uploading generated training model.
2023-04-07 18:50:05 Completed - Training job completed
Model artifact persisted at s3://sagemaker-eu-north-1-378412049928/rf-scikit-2023-04-07-18-47-47-849/output/model.tar.gz


In [31]:
import boto3
sm_client = boto3.client('sagemaker')
endpoints = sm_client.list_endpoints()
endpoint_configs = sm_client.list_endpoint_configs()

In [32]:
from sagemaker.predictor import csv_serializer
predictor = sklearn_estimator.deploy(instance_type='ml.c5.2xlarge',initial_instance_count=1, serializer=csv_serializer)

INFO:sagemaker:Creating model with name: rf-scikit-2023-04-07-18-50-26-961
INFO:sagemaker:Creating endpoint-config with name rf-scikit-2023-04-07-18-50-26-961
INFO:sagemaker:Creating endpoint with name rf-scikit-2023-04-07-18-50-26-961


----!

In [33]:
attributes = ['Day', 'Month', 'Hour', 'passenger_count', 'trip_distance', 'total_amount', 'temp', 'feelslike', 'snow', 'windspeed', 'cloudcover', 'duration']

predictor.predict(testX[attributes].values)

See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.


array([1.0075, 1.9225, 1.2475, 1.1525, 1.7825, 1.8425, 1.5375, 1.745 ,
       1.4575, 1.19  , 1.1875, 1.03  , 1.575 , 1.5   , 1.4775, 1.42  ,
       1.1525, 1.775 , 1.5125, 1.6675, 1.7525, 1.7   , 1.6   , 1.3575,
       1.51  , 1.2225, 1.5925, 1.    , 1.3825, 1.    , 1.0075, 1.0025,
       1.935 , 1.4825, 1.7   , 1.965 , 1.9325, 1.025 , 1.0025, 1.48  ,
       1.8075, 1.5775, 1.4525, 1.9625, 1.9025, 1.4875, 1.02  , 1.885 ,
       1.075 , 1.9625, 1.1725, 1.225 , 1.125 , 1.5175, 1.545 , 1.36  ,
       1.385 , 1.    , 1.91  , 1.0025, 1.475 , 1.6   , 1.58  , 1.9   ,
       1.29  , 1.335 , 1.9375, 1.5625, 1.605 , 1.7925, 1.5875, 1.4675,
       1.92  , 1.795 , 1.6575, 1.5025, 1.8475, 1.1875, 1.0375, 1.215 ,
       1.575 , 1.5425, 1.01  , 1.6075, 1.46  , 1.    , 1.    , 1.01  ,
       1.    , 1.525 , 1.28  , 1.0775, 1.9425, 1.08  , 1.435 , 1.3725,
       1.2425, 1.0075, 1.9225, 1.6125, 1.6675, 1.8   , 1.5225, 1.565 ,
       1.1675, 1.7475, 1.25  , 1.4875, 1.015 , 1.8125, 1.405 , 1.305 ,
      

In [54]:
import boto3
import json

runtime = boto3.client("sagemaker-runtime")

endpoint_name = "rf-scikit-2023-04-07-18-50-26-961"
content_type = "application/json"
payload={
    "input": [1, 1, 0, 1.495619524, 2.704968711, 15.95906133, 3.5, 0.5, 0, 25.2, 37.9, 36.94705882]
}

try:
    response = runtime.invoke_endpoint(
        EndpointName=endpoint_name,
        ContentType=content_type,
        Body=json.dumps(payload)
    )
    print(response.predict(payload).decode('utf-8'))
except Exception as e:
    print("Error: {}".format(str(e)))


Error: An error occurred (ModelError) when calling the InvokeEndpoint operation: Received server error (500) from primary with message "<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 3.2 Final//EN">
<title>500 Internal Server Error</title>
<h1>Internal Server Error</h1>
<p>The server encountered an internal error and was unable to complete your request. Either the server is overloaded or there is an error in the application.</p>
". See https://eu-north-1.console.aws.amazon.com/cloudwatch/home?region=eu-north-1#logEventViewer:group=/aws/sagemaker/Endpoints/rf-scikit-2023-04-07-18-50-26-961 in account 378412049928 for more information.
