In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from sklearn.model_selection import train_test_split as tts
import sagemaker
import boto3
from sagemaker import Session
import io
import sagemaker.amazon.common as smac
import os
from sagemaker.serializers import CSVSerializer
from sagemaker.deserializers import JSONDeserializer
from sklearn.metrics import mean_absolute_error, mean_squared_error
import math

In [None]:
'''Since we're working with the same dataset again, let's reuse the code we wrote earlier, this time with some modifications.'''

In [5]:
# Load the house-price records from the CSV file in the working directory.
df = pd.read_csv(filepath_or_buffer='house_prices.csv')

In [6]:
# todo CLEAN UP DATA --------------------

In [7]:
# Discard the columns that are not used further on.
# Re-assigning (rather than inplace=True) together with errors='ignore' keeps
# this cell idempotent: re-running it after the columns are already gone no
# longer raises KeyError.
df = df.drop(columns=['id', 'date', 'sqft_living15', 'sqft_lot15'], errors='ignore')

In [8]:
# todo ATTRIBUTES SELECTION --------------------

In [9]:
# Sequential (unshuffled) split: the first `train_rows` rows are used for
# training, every remaining row is held out for validation/testing.
train_rows = 15129
training = df.iloc[:train_rows, :]
test = df.iloc[train_rows:, :]

# Column 0 is used as the target and columns 1-16 as the features.
y_test = test.iloc[:, 0].to_numpy()
X_test = test.iloc[:, 1:17].to_numpy()

# Both splits are written without header and without index column.
for frame, csv_name in (
    (training, 'house_prices_train_xgboost.csv'),
    (test, 'house_prices_test_xgboost.csv'),
):
    frame.to_csv(csv_name, header=False, index=False)

In [18]:
# SageMaker session and the execution role used by the jobs below.
session = sagemaker.Session()
role = sagemaker.get_execution_role()

# Names used to lay out the data and the model artifacts in S3.
bucket = 'tests-aws-sagemaker'
model = 'house_prices_xgboost'
dataset = 'house_prices'
key_train = 'houses-train-data-xgboost'
key_test = 'houses-test-data-xgboost'

# Resulting locations:
#   train  -> 's3://tests-aws-sagemaker/house_prices/train/houses-train-data-xgboost'
#   test   -> 's3://tests-aws-sagemaker/house_prices/test/houses-test-data-xgboost'
#   output -> 's3://tests-aws-sagemaker/house_prices_xgboost/output'
s3_train_data = f's3://{bucket}/{dataset}/train/{key_train}'
s3_test_data = f's3://{bucket}/{dataset}/test/{key_test}'
output_location = f's3://{bucket}/{model}/output'

# URI of the XGBoost training image for the region of the current boto3 session.
container = sagemaker.image_uris.retrieve(
    framework='xgboost',
    region=boto3.Session().region_name,
    version='latest',
)

In [19]:
# Upload both CSV splits to S3.
# Object keys are joined with '/' explicitly: S3 keys always use forward
# slashes, whereas os.path.join follows the local platform's separator and
# could disagree with the 's3://.../train/...' and 's3://.../test/...' URIs
# that the training channels point at.
s3_bucket = boto3.Session().resource('s3').Bucket(bucket)
for local_csv, split, key in (
    ('house_prices_train_xgboost.csv', 'train', key_train),
    ('house_prices_test_xgboost.csv', 'test', key_test),
):
    with open(local_csv, 'rb') as f:
        s3_bucket.Object('/'.join((dataset, split, key))).upload_fileobj(f)

In [20]:
# todo TUNING --------------------

In [None]:
'''At this part we can start to set some parameters for the tuning, rounds, etc. I made a list for quick comprehension'''

In [21]:
# ETA = LEARNING RATE (STEP SIZE SHRINKAGE), HELPS PREVENT OVERFITTING
# ALPHA = L1 REGULARIZATION TERM ON THE WEIGHTS, HIGHER VALUES KEEP THE MODEL MORE CONSERVATIVE
# MIN_CHILD_WEIGHT = MINIMUM SUM OF INSTANCE WEIGHTS REQUIRED IN A CHILD NODE FOR THE DECISION TREE TO KEEP SPLITTING
# MAX_DEPTH = MAX DEPTH OF THE TREE; DEEPER TREES MAKE THE MODEL MORE COMPLEX AND MORE LIKELY TO OVERFIT
# NUM_ROUND = NUMBER OF ROUNDS TO RUN THE TRAINING
# RESOURCELIMITS = MAXIMUM NUMBER OF TRAINING JOBS AND HOW MANY OF THEM MAY RUN IN PARALLEL
# STRATEGY = BAYESIAN SEARCH FOR CHOOSING THE PARAMETER VALUES
# HYPERPARAMETERTUNINGJOBOBJECTIVE = WHICH METRIC WE WANT TO CHECK AND WHETHER THE GOAL IS TO MINIMIZE OR MAXIMIZE IT

In [22]:
# Configuration of the hyperparameter tuning job: the search space, the
# resource limits, the search strategy and the objective metric.
#
# All integer-valued ranges live in ONE "IntegerParameterRanges" list. A dict
# literal that repeats a key silently keeps only the last value, so listing
# max_depth and num_round under two separate "IntegerParameterRanges" keys
# would drop max_depth from the search space.
tuning_job_config = {
    "ParameterRanges": {
        "CategoricalParameterRanges": [],
        "ContinuousParameterRanges": [
            {"Name": "eta", "MinValue": "0", "MaxValue": "1"},
            {"Name": "alpha", "MinValue": "0", "MaxValue": "2"},
            {"Name": "min_child_weight", "MinValue": "1", "MaxValue": "10"},
        ],
        "IntegerParameterRanges": [
            {"Name": "max_depth", "MinValue": "1", "MaxValue": "10"},
            {"Name": "num_round", "MinValue": "50", "MaxValue": "300"},
        ],
    },
    # At most 9 training jobs in total, 3 of them running at the same time.
    "ResourceLimits": {
        "MaxNumberOfTrainingJobs": 9,
        "MaxParallelTrainingJobs": 3,
    },
    "Strategy": "Bayesian",
    # The tuner looks for the lowest RMSE on the validation channel.
    "HyperParameterTuningJobObjective": {
        "MetricName": "validation:rmse",
        "Type": "Minimize",
    },
}

In [None]:
'''And now we set some parameters for the training: paths, metrics, etc. Here's another list of parameters for quick comprehension.'''

In [23]:
# ALGORITHMSPECIFICATION = THE IMAGE WE WANT TO USE AND THE TRAINING INPUT MODE
# INPUTDATACONFIG = THE CHANNEL NAME, CONTENT TYPE AND S3 SOURCE OF THE DATA WE WILL BE USING
# OUTPUTDATACONFIG = THE S3 PATH WHERE WE WANT TO SAVE THE TRAINING OUTPUT
# RESOURCECONFIG = NUMBER OF INSTANCES, TYPE OF INSTANCE AND VOLUME SIZE
# ROLEARN = ROLE THAT GIVES THE TRAINING JOBS ACCESS TO THE RESOURCES
# STATICHYPERPARAMETERS = HYPERPARAMETERS KEPT FIXED DURING TUNING: EVAL_METRIC (RMSE), OBJECTIVE (REG:LINEAR FOR LINEAR REGRESSION), RATE_DROP (DROPOUT TO AVOID OVERFITTING, ONLY APPLIES TO THE DART BOOSTER) AND TWEEDIE_VARIANCE_POWER (CONTROLS THE VARIANCE, ONLY APPLIES TO THE REG:TWEEDIE OBJECTIVE)
# STOPPINGCONDITION = MAX TIME EACH TRAINING JOB IS ALLOWED TO RUN

In [24]:
# Definition shared by every training job the tuner launches: training image,
# input channels, output location, compute resources, role, the
# hyperparameters that stay fixed, and the runtime limit.
training_job_definition = {
    "AlgorithmSpecification": {
        "TrainingImage": container,
        "TrainingInputMode": "File",
    },
    # The "train" and "validation" channels differ only in name and S3 URI,
    # so both entries are generated from the same template.
    "InputDataConfig": [
        {
            "ChannelName": channel_name,
            "CompressionType": "None",
            "ContentType": "csv",
            "DataSource": {
                "S3DataSource": {
                    "S3DataDistributionType": "FullyReplicated",
                    "S3DataType": "S3Prefix",
                    "S3Uri": channel_uri,
                },
            },
        }
        for channel_name, channel_uri in (
            ("train", s3_train_data),
            ("validation", s3_test_data),
        )
    ],
    "OutputDataConfig": {
        "S3OutputPath": f"s3://{bucket}/{model}/output",
    },
    "ResourceConfig": {
        "InstanceCount": 2,
        "InstanceType": "ml.c4.2xlarge",
        "VolumeSizeInGB": 10,
    },
    "RoleArn": role,
    # Values passed unchanged to every training job (not searched by the tuner).
    "StaticHyperParameters": {
        "eval_metric": "rmse",
        "objective": "reg:linear",
        "rate_drop": "0.3",
        "tweedie_variance_power": "1.4",
    },
    "StoppingCondition": {
        "MaxRuntimeInSeconds": 43200,  # 12 hours
    },
}

In [None]:
'''Now we can start the tuning job and check which of the rounds had the best result. We'll take a look at that in the Amazon SageMaker console.'''

In [26]:
# Submit the hyperparameter tuning job through the low-level SageMaker client.
# NOTE: the job name is a fixed literal, so re-running this cell submits a job
# under the same name again.
smclient = boto3.client('sagemaker')
smclient.create_hyper_parameter_tuning_job(
    HyperParameterTuningJobName="xgboosttuninghouses2",
    HyperParameterTuningJobConfig=tuning_job_config,
    TrainingJobDefinition=training_job_definition,
)

{'HyperParameterTuningJobArn': 'arn:aws:sagemaker:us-west-2:217214575618:hyper-parameter-tuning-job/xgboosttuninghouses2',
 'ResponseMetadata': {'RequestId': '91c6d208-3e59-4066-b61c-712184e08894',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amzn-requestid': '91c6d208-3e59-4066-b61c-712184e08894',
   'content-type': 'application/x-amz-json-1.1',
   'content-length': '121',
   'date': 'Sat, 10 Sep 2022 21:49:53 GMT'},
  'RetryAttempts': 0}}

In [None]:
# PRINT 13

In [None]:
'''Now that we have the best parameters set up we can create the best model with the parameters that were chosen'''

In [27]:
# CREATION WITH BEST PARAMETERS

In [28]:
# Build an estimator on the XGBoost image; the model artifacts are written to
# `output_location`.
container = sagemaker.image_uris.retrieve(
    framework='xgboost',
    region=boto3.Session().region_name,
    version='latest',
)
xgboost_tuning = sagemaker.estimator.Estimator(
    image_uri=container,
    role=role,
    instance_count=1,
    instance_type='ml.m5.2xlarge',
    output_path=output_location,
    sagemaker_session=session,
)

# Hard-coded hyperparameter values.
# NOTE(review): presumably copied from the best training job of the tuning
# run - confirm against the tuning job results before reusing them.
best_hyperparameters = {
    'num_round': 215,
    'eta': 0.07545286994225804,
    'min_child_weight': 2.4061755279241996,
    'alpha': 1.5934054040797325,
    'tweedie_variance_power': 1.4,
    'rate_drop': 0.3,
}
xgboost_tuning.set_hyperparameters(**best_hyperparameters)

In [None]:
'''With everything ready, let's train the best parameter model'''

In [30]:
# Describe the two CSV inputs in S3 and train with them as the "train" and
# "validation" channels.
train_input = sagemaker.inputs.TrainingInput(
    s3_data=s3_train_data,
    content_type='csv',
    s3_data_type='S3Prefix',
)
test_input = sagemaker.inputs.TrainingInput(
    s3_data=s3_test_data,
    content_type='csv',
    s3_data_type='S3Prefix',
)
data_channels = dict(train=train_input, validation=test_input)
xgboost_tuning.fit(data_channels)

2022-09-10 22:22:33 Starting - Starting the training job...
2022-09-10 22:22:57 Starting - Preparing the instances for trainingProfilerReport-1662848553: InProgress
......
2022-09-10 22:23:57 Downloading - Downloading input data...
2022-09-10 22:24:27 Training - Downloading the training image.....[34mArguments: train[0m
[34m[2022-09-10:22:25:10:INFO] Running standalone xgboost training.[0m
[34m[2022-09-10:22:25:10:INFO] File size need to be processed in the node: 1.55mb. Available memory size in the node: 23858.36mb[0m
[34m[2022-09-10:22:25:10:INFO] Determined delimiter of CSV input is ','[0m
[34m[22:25:10] S3DistributionType set as FullyReplicated[0m
[34m[22:25:11] 15129x16 matrix with 242064 entries loaded from /opt/ml/input/data/train?format=csv&label_column=0&delimiter=,[0m
[34m[2022-09-10:22:25:11:INFO] Determined delimiter of CSV input is ','[0m
[34m[22:25:11] S3DistributionType set as FullyReplicated[0m
[34m[22:25:11] 6484x16 matrix with 103744 entries loaded fr

In [None]:
# PRINT 14

In [None]:
'''Let's deploy the model, convert the endpoint's binary response into a NumPy array, and check the results.'''

In [31]:
# Deploy the trained model behind an endpoint served by a single instance.
xgboost_regressor_tuning = xgboost_tuning.deploy(
    initial_instance_count=1,
    instance_type='ml.m4.xlarge',
)

-------!

In [32]:
# Send the test features to the endpoint as CSV.
xgboost_regressor_tuning.serializer = CSVSerializer()
raw_response = xgboost_regressor_tuning.predict(X_test)

# The response is decoded as UTF-8 text and split on commas; each piece is
# converted to a float32 prediction.
prediction_fields = raw_response.decode('utf-8').split(',')
predictions = np.array(prediction_fields).astype(np.float32)

In [33]:
# Error metrics of the endpoint predictions against the held-out targets.
mae = mean_absolute_error(y_test, predictions)
mse = mean_squared_error(y_test, predictions)
rmse = math.sqrt(mse)  # reuse the MSE computed above instead of recomputing it
# Every metric is printed as 'NAME: value' so no label runs into its number.
print(f'MAE: {mae} | MSE: {mse} | RMSE: {rmse}')

MAE: 67139.43489623496 | MSE:14512797938.180792 | RMSE120469.07461328318


In [None]:
'''We can compare the results and we'll find that the tuning helped our model a little. For better results we can try more rounds and test more of those parameters.
Keep in mind that the run without tuning had 100 rounds for test, while the tuning had only 9 training jobs. With this info we can crank up the number of tries on the tuning so we can reach
even better results.'''

In [None]:
'''Don't forget to delete your endpoints, S3 buckets and any other resources you created.'''

In [None]:
# xgboost_regressor_tuning.delete_endpoint()