In [1]:
import numpy as np
from sagemaker import get_execution_role
import sagemaker
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler, OneHotEncoder
import datetime
import time
import tarfile
import boto3
import pandas as pd

# Low-level SageMaker API client (used later to describe the training job).
sm_boto3 = boto3.client("sagemaker")

# High-level SDK session; the region is read from its underlying boto session.
sess = sagemaker.Session()
region = sess.boto_session.region_name

# Bucket that receives the train/test CSVs uploaded by this notebook.
bucket = 'bucket-sagemaker-ml'

print(f"Using bucket {bucket}")
print(f"Current Region {region}")

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/ec2-user/.config/sagemaker/config.yaml
Using bucket bucket-sagemaker-ml
Current Region us-east-1


In [2]:
# Load the network simulation dataset from the notebook's working directory.
df = pd.read_csv("network_simulation_data_new.csv")

In [3]:
# Preview the first rows of the raw frame to check how the CSV was parsed.
df.head()

Unnamed: 0,Day_Type,Hour,Area,RU_id,RU_type,Capacity,RU_x,RU_y,Current_load,Load_Percentage,RU_Power
0,Weekday,0,Office,2339242788048,macro,200,500,1500,0.0,0.0,Idle
1,Weekday,0,Office,2339322711696,macro,200,500,3750,10.0,5.0,Reduced
2,Weekday,0,Office,2339341123408,macro,200,1250,500,5.0,2.5,Reduced
3,Weekday,0,Office,2339322764688,macro,200,1250,2500,21.0,10.5,Reduced
4,Weekday,0,Office,2339322764944,macro,200,2000,1500,24.0,12.0,Reduced


In [4]:
# (rows, columns) of the raw frame, before any encoding or column drops.
df.shape

(28080, 11)

In [5]:
# Instantiated here but not used by the encoding below, which relies on
# explicit dictionaries so the integer codes are fixed and readable.
label_encoder = LabelEncoder()

# Column -> {category: integer code}. Series.map() yields NaN for any category
# missing from its dictionary, so an unexpected value surfaces as a null.
category_codes = {
    'Day_Type': {'Weekday': 0, 'Weekend': 1},
    'Area': {'Office': 0, 'Residential': 1, 'Garden_Mall': 2},
    'RU_type': {'macro': 0, 'micro': 1},
    'RU_Power': {'Idle': 0, 'Reduced': 1, 'Full': 2},
}
for column_name, codes in category_codes.items():
    df[column_name] = df[column_name].map(codes)

# Drop the RU identifier and position columns; they are not used as features.
df = df.drop(columns=['RU_id', 'RU_x', 'RU_y'])
print(df.head())

   Day_Type  Hour  Area  RU_type  Capacity  Current_load  Load_Percentage  \
0         0     0     0        0       200           0.0              0.0   
1         0     0     0        0       200          10.0              5.0   
2         0     0     0        0       200           5.0              2.5   
3         0     0     0        0       200          21.0             10.5   
4         0     0     0        0       200          24.0             12.0   

   RU_Power  
0         0  
1         1  
2         1  
3         1  
4         1  


In [6]:
# Column names left after the drop; the last column is the one later used as the label.
df.columns

Index(['Day_Type', 'Hour', 'Area', 'RU_type', 'Capacity', 'Current_load',
       'Load_Percentage', 'RU_Power'],
      dtype='object')

In [7]:
# Shape after encoding and after dropping the RU_id / RU_x / RU_y columns.
df.shape

(28080, 8)

In [8]:
# Percentage of missing values per column. A non-zero value in an encoded
# column would mean .map() met a category that is absent from its dictionary.
df.isnull().mean() * 100

Day_Type           0.0
Hour               0.0
Area               0.0
RU_type            0.0
Capacity           0.0
Current_load       0.0
Load_Percentage    0.0
RU_Power           0.0
dtype: float64

In [9]:
# Start from every column name; the target is split off from this list in a later cell.
features = list(df.columns)
features

['Day_Type',
 'Hour',
 'Area',
 'RU_type',
 'Capacity',
 'Current_load',
 'Load_Percentage',
 'RU_Power']

In [10]:
# The target is the last column of the frame; every other column is a feature.
# Both names are derived from df.columns rather than popped from `features`,
# so re-running this cell gives the same result instead of removing one more
# real feature and silently changing the label.
label = df.columns[-1]
features = [column for column in df.columns if column != label]
print(label)

RU_Power


In [11]:
# The feature matrix is copied so later in-place edits to x do not write through to df.
x = df[features].copy()
# Target column (integer-encoded RU_Power).
y = df[label]

In [12]:
# Preview the feature matrix (the target column is no longer present).
x.head()

Unnamed: 0,Day_Type,Hour,Area,RU_type,Capacity,Current_load,Load_Percentage
0,0,0,0,0,200,0.0,0.0
1,0,0,0,0,200,10.0,5.0
2,0,0,0,0,200,5.0,2.5
3,0,0,0,0,200,21.0,10.5
4,0,0,0,0,200,24.0,12.0


In [13]:
# Preview the encoded target values.
y.head()

0    0
1    1
2    1
3    1
4    1
Name: RU_Power, dtype: int64

In [14]:
# (rows, feature columns) of the feature matrix.
x.shape

(28080, 7)

In [15]:
# Number of rows per target class, to see how balanced the classes are.
y.value_counts()

1    10804
2     9374
0     7902
Name: RU_Power, dtype: int64

In [16]:
# Jitter the two load features with Gaussian noise (mean 0, std 0.5).
# A dedicated seeded generator makes the perturbation, and every artifact
# derived from it, reproducible across kernel restarts.
noise_rng = np.random.default_rng(0)

# Recompute from the untouched df columns instead of using `+=` on x, so
# re-running this cell does not stack another layer of noise on the features.
x['Current_load'] = df['Current_load'] + noise_rng.normal(0, 0.5, size=len(x))
x['Load_Percentage'] = df['Load_Percentage'] + noise_rng.normal(0, 0.5, size=len(x))

In [17]:
# Hold out 20% of the rows for testing; the fixed random_state makes the shuffle repeatable.
X_train, X_test, y_train, y_test = train_test_split(x,y, test_size=0.2, random_state=0, shuffle=True)

In [18]:
# One shape per line: train features, test features, train target, test target.
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape, sep="\n")

(22464, 7)
(5616, 7)
(22464,)
(5616,)


In [21]:
# One-time setup: uncomment and run if imbalanced-learn is missing from the kernel.
# %pip install imbalanced-learn==0.12.4

Collecting imbalanced-learn
  Downloading imbalanced_learn-0.12.4-py3-none-any.whl.metadata (8.3 kB)
Downloading imbalanced_learn-0.12.4-py3-none-any.whl (258 kB)
Installing collected packages: imbalanced-learn
Successfully installed imbalanced-learn-0.12.4


In [19]:
from imblearn.over_sampling import SMOTE
# Oversample the training split only; the held-out split is left as it is.
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

# Rebuild each split as one frame with the label appended as the last column,
# which is the layout the training script reads back from CSV.
trainX = pd.DataFrame(X_train_resampled, columns=features).assign(
    **{label: y_train_resampled}
)
testX = pd.DataFrame(X_test, columns=features).assign(**{label: y_test})

In [20]:
# Shapes of the resampled training frame and the test frame (label column included).
print(trainX.shape, testX.shape, sep="\n")

(26037, 8)
(5616, 8)


In [21]:
# Spot-check the held-out rows, now with the label column appended.
testX.head()

Unnamed: 0,Day_Type,Hour,Area,RU_type,Capacity,Current_load,Load_Percentage,RU_Power
6692,0,3,0,1,150,0.596518,-0.037319,0
20271,0,15,1,1,150,8.294943,6.116588,1
24093,0,17,1,1,150,42.143056,28.387529,1
17755,0,23,1,0,200,146.673222,73.421599,2
11934,1,18,0,0,200,13.321953,6.59037,1


In [22]:
# Count NaNs per column in the training frame before it is written to CSV.
trainX.isnull().sum()

Day_Type           0
Hour               0
Area               0
RU_type            0
Capacity           0
Current_load       0
Load_Percentage    0
RU_Power           0
dtype: int64

In [23]:
# Count NaNs per column in the test frame before it is written to CSV.
testX.isnull().sum()

Day_Type           0
Hour               0
Area               0
RU_type            0
Capacity           0
Current_load       0
Load_Percentage    0
RU_Power           0
dtype: int64

In [24]:
# Write both splits without the index column, so the label stays the last
# column; script.py takes the final CSV column as the label.
trainX.to_csv("train-V-1.csv",index = False)
testX.to_csv("test-V-1.csv", index = False)

In [25]:
# Upload both CSVs to S3; the training job reads its input channels from there.
sk_prefix = "sagemaker/RU_Power_classification/sklearncontainer"

# upload_data returns the S3 URI of each uploaded object, in the order listed.
trainpath, testpath = (
    sess.upload_data(path=csv_name, bucket=bucket, key_prefix=sk_prefix)
    for csv_name in ("train-V-1.csv", "test-V-1.csv")
)

In [26]:
# S3 URI of the uploaded test CSV.
testpath

's3://bucket-sagemaker-ml/sagemaker/RU_Power_classification/sklearncontainer/test-V-1.csv'

In [27]:
# S3 URI of the uploaded training CSV.
trainpath

's3://bucket-sagemaker-ml/sagemaker/RU_Power_classification/sklearncontainer/train-V-1.csv'

In [28]:
%%writefile script.py


from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, precision_score, recall_score, f1_score, roc_curve, auc
import sklearn
import joblib
import boto3
import pathlib
from io import StringIO 
import argparse
import joblib
import os
import numpy as np
import pandas as pd 
    
def model_fn(model_dir):
    """Load the persisted classifier from ``model_dir``.

    Args:
        model_dir: Directory containing the ``model.joblib`` file written by
            the training block of this script.

    Returns:
        The deserialized estimator object.
    """
    # joblib.load unpickles the file, so only load artifacts from a trusted source.
    artifact_path = os.path.join(model_dir, "model.joblib")
    return joblib.load(artifact_path)
    
if __name__ == "__main__":
    # Training entry point: parse arguments, read the train/test CSVs, fit a
    # RandomForestClassifier, persist it as model.joblib and report test metrics.

    print("[INFO] Extracting arguments")
    parser = argparse.ArgumentParser()

    # hyperparameters sent by the client are passed as command-line arguments to the script.
    parser.add_argument("--n_estimators", type=int, default=100)
    parser.add_argument("--random_state", type=int, default=0)

    # Data, model, and output directories. Each default is read from an SM_*
    # environment variable and is None when that variable is not set.
    parser.add_argument("--model-dir", type=str, default=os.environ.get("SM_MODEL_DIR"))
    parser.add_argument("--train", type=str, default=os.environ.get("SM_CHANNEL_TRAIN"))
    parser.add_argument("--test", type=str, default=os.environ.get("SM_CHANNEL_TEST"))
    parser.add_argument("--train-file", type=str, default="train-V-1.csv")
    parser.add_argument("--test-file", type=str, default="test-V-1.csv")

    # Unknown arguments are tolerated rather than rejected.
    args, _ = parser.parse_known_args()

    # Fail fast with a readable message when a directory is neither passed on
    # the command line nor provided through its environment variable; otherwise
    # os.path.join(None, ...) below raises an opaque TypeError.
    missing_locations = [
        flag
        for flag, value in (
            ("--model-dir", args.model_dir),
            ("--train", args.train),
            ("--test", args.test),
        )
        if value is None
    ]
    if missing_locations:
        parser.error(
            "missing required location(s): "
            + ", ".join(missing_locations)
            + " (pass them explicitly or set SM_MODEL_DIR / SM_CHANNEL_TRAIN / SM_CHANNEL_TEST)"
        )

    print("SKLearn Version: ", sklearn.__version__)
    print("Joblib Version: ", joblib.__version__)

    print("[INFO] Reading data")
    print()
    train_df = pd.read_csv(os.path.join(args.train, args.train_file))
    test_df = pd.read_csv(os.path.join(args.test, args.test_file))

    # The last CSV column is the label; every column before it is a feature.
    features = list(train_df.columns)
    label = features.pop(-1)

    print("Building training and testing datasets")
    print()
    X_train = train_df[features]
    X_test = test_df[features]
    y_train = train_df[label]
    y_test = test_df[label]

    print('Column order: ')
    print(features)
    print()

    print("Label column is: ",label)
    print()

    # The percentages in the two headings below are fixed text; they are not
    # computed from the frames that were actually loaded.
    print("Data Shape: ")
    print()
    print("---- SHAPE OF TRAINING DATA (80%) ----")
    print(X_train.shape)
    print(y_train.shape)
    print()
    print("---- SHAPE OF TESTING DATA (20%) ----")
    print(X_test.shape)
    print(y_test.shape)
    print()

    print("Training RandomForest Model.....")
    print()
    # n_jobs=-1 uses all available cores; verbose=3 logs per-tree progress.
    model =  RandomForestClassifier(n_estimators=args.n_estimators, random_state=args.random_state, verbose = 3,n_jobs=-1)
    model.fit(X_train, y_train)
    print()

    # Persist under the file name that model_fn() loads.
    model_path = os.path.join(args.model_dir, "model.joblib")
    joblib.dump(model,model_path)
    print("Model persisted at " + model_path)
    print()

    # Evaluate on the held-out CSV.
    y_pred_test = model.predict(X_test)
    test_acc = accuracy_score(y_test,y_pred_test)
    test_rep = classification_report(y_test,y_pred_test)

    print()
    print("---- METRICS RESULTS FOR TESTING DATA ----")
    print()
    print("Total Rows are: ", X_test.shape[0])
    print('[TESTING] Model Accuracy is: ', test_acc*100)
    print('[TESTING] Testing Report: ')
    print(test_rep)

Writing script.py


In [29]:
# Local run of script.py before submitting the SageMaker job: it reads the two
# CSVs from the current directory and writes model.joblib back to it.
! python script.py --n_estimators 100 \
                   --random_state 0 \
                   --model-dir ./ \
                   --train ./ \
                   --test ./

[INFO] Extracting arguments
SKLearn Version:  1.2.1
Joblib Version:  1.4.2
[INFO] Reading data

Building training and testing datasets

Column order: 
['Day_Type', 'Hour', 'Area', 'RU_type', 'Capacity', 'Current_load', 'Load_Percentage']

Label column is:  RU_Power

Data Shape: 

---- SHAPE OF TRAINING DATA (80%) ----
(26037, 7)
(26037,)

---- SHAPE OF TESTING DATA (20%) ----
(5616, 7)
(5616,)

Training RandomForest Model.....

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 2 concurrent workers.
building tree 1 of 100
building tree 2 of 100
building tree 3 of 100
building tree 4 of 100
building tree 5 of 100
building tree 6 of 100
building tree 7 of 100
building tree 8 of 100
building tree 9 of 100
building tree 10 of 100
building tree 11 of 100
building tree 12 of 100
building tree 13 of 100
building tree 14 of 100
building tree 15 of 100
building tree 16 of 100
building tree 17 of 100
building tree 18 of 100
building tree 19 of 100
building tree 20 of 100
building tree 21

In [30]:
from sagemaker.sklearn.estimator import SKLearn

# Version of the scikit-learn framework container; also reused when the model is created.
FRAMEWORK_VERSION = "1.2-1"

# Estimator that runs script.py as a managed training job.
sklearn_estimator = SKLearn(
    entry_point="script.py",
    framework_version=FRAMEWORK_VERSION,
    role=get_execution_role(),
    base_job_name="custom-sklearn",
    instance_type="ml.m5.large",
    instance_count=1,
    # Passed to script.py as --n_estimators / --random_state.
    hyperparameters=dict(n_estimators=100, random_state=0),
    # Spot training: max_run limits the training time (seconds) and max_wait
    # the total time including waiting for spot capacity.
    use_spot_instances=True,
    max_run=3600,
    max_wait=7200,
)

In [31]:
# Launch the training job and block until it completes (wait=True).
# The two keys are the input channel names handed to the training script.
sklearn_estimator.fit(
    inputs={"train": trainpath, "test": testpath},
    wait=True,
)

INFO:sagemaker:Creating training-job with name: custom-sklearn-2024-11-28-12-34-20-279


2024-11-28 12:34:23 Starting - Starting the training job...
2024-11-28 12:34:38 Starting - Preparing the instances for training...
2024-11-28 12:35:03 Downloading - Downloading input data...
2024-11-28 12:35:28 Downloading - Downloading the training image...
2024-11-28 12:36:24 Training - Training image download completed. Training in progress...[34m2024-11-28 12:36:28,955 sagemaker-containers INFO     Imported framework sagemaker_sklearn_container.training[0m
[34m2024-11-28 12:36:28,959 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)[0m
[34m2024-11-28 12:36:28,961 sagemaker-training-toolkit INFO     No Neurons detected (normal if no neurons installed)[0m
[34m2024-11-28 12:36:28,976 sagemaker_sklearn_container.training INFO     Invoking user training script.[0m
[34m2024-11-28 12:36:29,156 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)[0m
[34m2024-11-28 12:36:29,159 sagemaker-training-toolkit INFO     No N


2024-11-28 12:36:47 Uploading - Uploading generated training model
2024-11-28 12:36:47 Completed - Training job completed
Training seconds: 104
Billable seconds: 34
Managed Spot Training savings: 67.3%


In [32]:
# Make sure the most recent job has finished, without printing its logs again.
sklearn_estimator.latest_training_job.wait(logs="None")

# Ask the SageMaker API where the trained model archive was stored.
training_job_name = sklearn_estimator.latest_training_job.name
training_job_info = sm_boto3.describe_training_job(TrainingJobName=training_job_name)
artifact = training_job_info["ModelArtifacts"]["S3ModelArtifacts"]

print("Model artifact persisted at " + artifact)


2024-11-28 12:36:47 Starting - Preparing the instances for training
2024-11-28 12:36:47 Downloading - Downloading the training image
2024-11-28 12:36:47 Training - Training image download completed. Training in progress.
2024-11-28 12:36:47 Uploading - Uploading generated training model
2024-11-28 12:36:47 Completed - Training job completed
Model artifact persisted at s3://sagemaker-us-east-1-423623834574/custom-sklearn-2024-11-28-12-34-20-279/output/model.tar.gz


In [33]:
from sagemaker.sklearn.model import SKLearnModel
from time import gmtime, strftime

# A UTC timestamp suffix keeps the model name unique across runs.
creation_stamp = strftime("%Y-%m-%d-%H-%M-%S", gmtime())
model_name = "Custom-sklearn-model-" + creation_stamp

# Wrap the trained artifact for hosting; script.py provides model_fn.
model = SKLearnModel(
    model_data=artifact,
    entry_point="script.py",
    framework_version=FRAMEWORK_VERSION,
    role=get_execution_role(),
    name=model_name,
)

In [34]:
# A UTC timestamp suffix keeps the endpoint name unique across runs.
deploy_stamp = strftime("%Y-%m-%d-%H-%M-%S", gmtime())
endpoint_name = f"Custom-sklearn-model-{deploy_stamp}"
print(f"EndpointName={endpoint_name}")

# NOTE(review): no endpoint cleanup is visible in this section - confirm the
# endpoint is deleted later in the notebook once it is no longer needed.
predictor = model.deploy(
    endpoint_name=endpoint_name,
    instance_type="ml.m4.xlarge",
    initial_instance_count=1,
)

EndpointName=Custom-sklearn-model-2024-11-28-12-38-00


INFO:sagemaker:Creating model with name: Custom-sklearn-model-2024-11-28-12-37-49
INFO:sagemaker:Creating endpoint-config with name Custom-sklearn-model-2024-11-28-12-38-00
INFO:sagemaker:Creating endpoint with name Custom-sklearn-model-2024-11-28-12-38-00


------!