# AutoML

In [1]:
import sagemaker
import boto3
import os
import numpy as np
from sagemaker.automl.automl import AutoML


# Key prefix under which every object of this project lives in the bucket.
prefix = 'real-estate'

# SageMaker session and the default bucket it reports.
sagemaker_session = sagemaker.Session()
bucket = sagemaker_session.default_bucket()

# Execution role, passed to the AutoML job definition.
role = sagemaker.get_execution_role()

In [2]:
# Local folder holding the CSV exports.
data_dir = 'data'

# S3 object keys always use '/' as the separator. Build them explicitly
# instead of with os.path.join, which would insert '\\' on Windows and
# produce keys that do not match the intended "folders" in the bucket.
train_key = f'{prefix}/train/train.csv'
test_key = f'{prefix}/test/test.csv'

In [None]:
# Disabled: uploads the local November / December CSVs from `data_dir` to
# `train_key` / `test_key` in the bucket.
# NOTE(review): presumably left commented out because the objects already
# exist in S3 — confirm, and re-enable when starting from an empty bucket.
# boto3.Session().resource('s3').Bucket(bucket).Object(train_key).upload_file(os.path.join(data_dir, 'clean_data_november.csv'))
# boto3.Session().resource('s3').Bucket(bucket).Object(test_key).upload_file(os.path.join(data_dir, 'clean_data_december.csv'))

In [29]:
# Disabled: definition of the AutoML regression job on 'eur_price'
# (objective MSE, at most 250 candidates).
# NOTE(review): the "Deploying model" section re-attaches to an existing job
# via AutoML.attach, so this cell is presumably kept only as a record of the
# training configuration — confirm.
# automl_job = AutoML(
#     role=role,
#     target_attribute_name='eur_price',
#     problem_type='Regression',
#     job_objective={'MetricName': 'MSE'},
#     max_candidates=250
# )

In [47]:
# Disabled: S3 URI of the training folder (s3://<bucket>/<prefix>/train),
# the input the commented-out fit call expects.
# s3_input_train = 's3://{}/{}/train'.format(bucket, prefix)

In [None]:
# Disabled: launches the AutoML job on the training data in S3.
# NOTE(review): needs the commented-out `automl_job` and `s3_input_train`
# cells above to be re-enabled first.
# automl_job.fit(inputs=s3_input_train)

.........

# Deploying the model

In [3]:
client = boto3.client('sagemaker')

# Ask explicitly for the most recently created *completed* job. Called with
# no arguments, list_auto_ml_jobs is not ordered by creation time (the API
# reference gives the job name as the default sort key) and is not filtered
# by status, so element [0] could be an older, failed or still-running job.
completed_jobs = client.list_auto_ml_jobs(
    StatusEquals='Completed',
    SortBy='CreationTime',
    SortOrder='Descending',
    MaxResults=1,
)['AutoMLJobSummaries']

# Fail with an actionable message instead of a bare IndexError.
if not completed_jobs:
    raise RuntimeError(
        'No completed AutoML job found in this account/region; '
        'run the AutoML training cells first.'
    )

automl_job_name = completed_jobs[0]['AutoMLJobName']
automl_job = AutoML.attach(auto_ml_job_name=automl_job_name, sagemaker_session=sagemaker_session)

# Building Predictor

In [4]:
from sagemaker.predictor import Predictor
from sagemaker.serializers import CSVSerializer
from sagemaker.deserializers import CSVDeserializer

In [5]:
# Deploy the attached AutoML job behind a single-instance endpoint that
# takes CSV requests and returns parsed CSV responses.
endpoint_settings = {
    'initial_instance_count': 1,
    'instance_type': 'ml.m4.xlarge',
    'predictor_cls': Predictor,
    'serializer': CSVSerializer(),
    'deserializer': CSVDeserializer(),
}
predictor = automl_job.deploy(**endpoint_settings)

----------------------!

# Testing the model

In [6]:
import pandas as pd
from sklearn.metrics import mean_squared_error
from helpers import *

In [7]:
# Read the held-out and the training CSV from the bucket (test first, then
# train, as before).
test_df, train_df = (
    get_df_from_s3(bucket, object_key) for object_key in (test_key, train_key)
)

In [8]:
# Send every column except the first as header-less CSV.
# NOTE(review): assumes column 0 of test_df is the target 'eur_price' and the
# remaining columns are the model features in training order — confirm.
test_features_csv = test_df.iloc[:, 1:].to_csv(sep=',', header=False, index=False)
prediction_rows = predictor.predict(test_features_csv)

# The CSV deserializer yields one list per response line. Flatten across all
# lines so every prediction is kept whether the endpoint answers with one
# comma-separated line or with one value per line; indexing [0] alone keeps
# only the first line.
predicted_prices = np.array([float(value) for row in prediction_rows for value in row])
benchmark_prices = np.array(get_benchmark_prices_list(test_df, train_df))
actual_prices = test_df['eur_price'].values

# Guard the metric cell below: a length mismatch means the response was not
# parsed as one prediction per test row.
if len(predicted_prices) != len(actual_prices):
    raise ValueError(
        f'Expected {len(actual_prices)} predictions but parsed {len(predicted_prices)} '
        'from the endpoint response'
    )

In [9]:
# Score both price estimates once and reuse the numbers for the comparison
# and the message (previously each MSE was recomputed inside the f-string).
model_mse = mean_squared_error(actual_prices, predicted_prices)
benchmark_mse = mean_squared_error(actual_prices, benchmark_prices)

if model_mse < benchmark_mse:
    print(f'The model has a MSE of {model_mse}, while the benchmark only obtains a MSE of {benchmark_mse}')
else:
    # Report the losing case too; a silent cell is indistinguishable from
    # one that was never run.
    print(f'The model has a MSE of {model_mse}, which does not beat the benchmark MSE of {benchmark_mse}')

The model has a MSE of 7050806698.9046135, while the benchmark only obtains a MSE of 45084664471.56901


In [13]:
# Tear down the hosted endpoint.
# NOTE(review): cells further down still call predictor.predict(...) and
# runtime.invoke_endpoint(EndpointName=predictor.endpoint_name, ...). In a
# top-to-bottom run they would execute after this deletion. Presumably this
# cell belongs at the very end of the notebook — confirm and move it.
predictor.delete_endpoint()

In [10]:
# Show the endpoint name; the same attribute is passed as EndpointName to
# invoke_endpoint further down.
predictor.endpoint_name

'sagemaker-sklearn-automl-2022-03-06-14-36-38-031'

In [9]:
# Column order applied to a request row before it is serialised to CSV for
# the endpoint. One name per line so additions and removals diff cleanly.
COLUMNS = [
    'rooms',
    'surface_area',
    'bathrooms',
    'partition_decomandat',
    'partition_other',
    'comfort_other',
    'comfort_1',
    'comfort_lux',
    'max_floor',
    'first_floor',
    'last_floor',
    'middle_floor',
    'bathrooms_ratio',
    'after_2010',
    'before_1941',
    'between_1941_1977',
    'between_1977_1990',
    'between_1990_2000',
    'between_2000_2010',
    'not_finished',
    'not_started',
    'concrete_building_structure',
    'other_building_structure',
    'unknown_building_structure',
    'has_balconies',
    'has_parking_spots_or_garages',
    'has_floor_heating',
    'only_district_heating',
    'building_with_video_surveillance',
    '1_mai_area',
    'agronomie_area',
    'aviatiei_area',
    'aviatorilor_area',
    'banu_manta_area',
    'chibrit_area',
    'domenii_area',
    'dristor_area',
    'stefan_cel_mare_area',
    'titulescu_area',
    'turda_area',
]

In [1]:
# One hand-written listing used to smoke-test the endpoint. Keys are the
# feature names; insertion order is kept as-is because later cells display
# req.values() / req.items() directly.
req = {
    'rooms': 4.0,
    'surface_area': 178.4,
    'bathrooms': 3.0,
    'partition_decomandat': 1.0,
    'partition_other': 0.0,
    'comfort_other': 0.0,
    'comfort_1': 1.0,
    'comfort_lux': 0.0,
    'max_floor': 12.0,
    'first_floor': 0.0,
    'last_floor': 0.0,
    'middle_floor': 1.0,
    'bathrooms_ratio': 0.75,
    'after_2010': 1.0,
    'before_1941': 0.0,
    'between_1941_1977': 0.0,
    'between_1977_1990': 0.0,
    'between_1990_2000': 0.0,
    'between_2000_2010': 0.0,
    'not_finished': 0.0,
    'not_started': 0.0,
    'concrete_building_structure': 1.0,
    'other_building_structure': 0.0,
    'unknown_building_structure': 0.0,
    'has_balconies': 0.0,
    'has_parking_spots_or_garages': 0.0,
    'has_floor_heating': 0.0,
    'only_district_heating': 1.0,
    'building_with_video_surveillance': 0.0,
    '1_mai_area': 0.0,
    'agronomie_area': 0.0,
    'aviatiei_area': 0.0,
    'aviatorilor_area': 0.0,
    'banu_manta_area': 0.0,
    'chibrit_area': 0.0,
    'domenii_area': 0.0,
    'dristor_area': 0.0,
    'stefan_cel_mare_area': 1.0,
    'titulescu_area': 0.0,
    'turda_area': 0.0,
}

In [13]:
# Score the single hand-written listing through the SDK predictor and show
# the first parsed CSV row of the response.
listing_csv = pd.DataFrame(req, index=[0])[COLUMNS].to_csv(sep=',', header=False, index=False)
predictor.predict(listing_csv)[0]

['523945.5625']

In [14]:
# Low-level runtime client, used below to call the endpoint without the
# SageMaker SDK predictor.
boto_session = boto3.Session()
runtime = boto_session.client('sagemaker-runtime')

In [15]:
# One-row frame for the sample listing, with columns arranged as in COLUMNS.
df_apartment = pd.DataFrame(req, index=[0]).loc[:, COLUMNS]

In [16]:
# Serialise the listing to header-less CSV and post it to the endpoint
# through the raw runtime API.
csv_payload = df_apartment.to_csv(sep=',', header=False, index=False)
response = runtime.invoke_endpoint(
    EndpointName=predictor.endpoint_name,
    ContentType='text/csv',
    Body=csv_payload,
)

In [36]:
# Inspect the raw invoke_endpoint response; the prediction itself sits in
# the 'Body' stream, which a later cell reads.
response

{'ResponseMetadata': {'RequestId': '75a19375-b158-426a-bcdb-f5855be6eb7f',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amzn-requestid': '75a19375-b158-426a-bcdb-f5855be6eb7f',
   'x-amzn-invoked-production-variant': 'AllTraffic',
   'date': 'Wed, 09 Feb 2022 08:29:45 GMT',
   'content-type': 'text/csv; charset=utf-8',
   'content-length': '12'},
  'RetryAttempts': 0},
 'ContentType': 'text/csv; charset=utf-8',
 'InvokedProductionVariant': 'AllTraffic',
 'Body': <botocore.response.StreamingBody at 0x7f59862bff28>}

In [17]:
# Pull the payload bytes out of the response and decode them to text.
body_bytes = response['Body'].read()
result = body_bytes.decode('utf-8')

In [18]:
# Predicted price rounded to the nearest whole number, shown as a string.
f'{round(float(result))}'

'523946'

In [19]:
# Show the one-row request frame. It was already built as `df_apartment`
# from the same `req` and COLUMNS, so reuse it instead of rebuilding it.
df_apartment

Unnamed: 0,rooms,surface_area,bathrooms,partition_decomandat,partition_other,comfort_other,comfort_1,comfort_lux,max_floor,first_floor,...,agronomie_area,aviatiei_area,aviatorilor_area,banu_manta_area,chibrit_area,domenii_area,dristor_area,stefan_cel_mare_area,titulescu_area,turda_area
0,4.0,178.4,3.0,1.0,0.0,0.0,1.0,0.0,12.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0


In [25]:
# Exploratory check: raw feature values of the sample listing, in the
# dict's insertion order.
req.values()

dict_values([4.0, 178.4, 3.0, 1.0, 0.0, 0.0, 1.0, 0.0, 12.0, 0.0, 0.0, 1.0, 0.75, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0])

In [27]:
# Exploratory check: the same listing as a two-column frame, one
# (feature name, value) pair per row.
pd.DataFrame(req.items())

Unnamed: 0,0,1
0,rooms,4.0
1,surface_area,178.4
2,bathrooms,3.0
3,partition_decomandat,1.0
4,partition_other,0.0
5,comfort_other,0.0
6,comfort_1,1.0
7,comfort_lux,0.0
8,max_floor,12.0
9,first_floor,0.0
