# AutoML

In [1]:
import sagemaker
import boto3
import os
import numpy as np
from sagemaker.automl.automl import AutoML


# IAM role of the current execution environment; handed to the AutoML job.
role = sagemaker.get_execution_role()

# A single shared session and its default S3 bucket, used by the cells below.
sagemaker_session = sagemaker.Session()
bucket = sagemaker_session.default_bucket()

# Key prefix under which this project's objects are stored in the bucket.
prefix = 'real-estate'

In [2]:
# Local folder containing the CSV files used by the (commented-out) upload cell.
data_dir = 'data'
# S3 object keys always use '/' as the separator, so build them explicitly
# instead of with os.path.join, which inserts the OS separator ('\' on Windows).
train_key = f'{prefix}/train/train.csv'
test_key = f'{prefix}/test/test.csv'

In [None]:
# Disabled code: uploads the local November CSV to `train_key` and the December
# CSV to `test_key` in `bucket`. Presumably left commented out because the files
# were already uploaded; uncomment to (re-)upload them.
# boto3.Session().resource('s3').Bucket(bucket).Object(train_key).upload_file(os.path.join(data_dir, 'clean_data_november.csv'))
# boto3.Session().resource('s3').Bucket(bucket).Object(test_key).upload_file(os.path.join(data_dir, 'clean_data_december.csv'))

In [29]:
# Disabled code: configures an AutoML regression job that predicts the
# 'eur_price' column, optimises MSE and explores at most 250 candidates.
# Uncomment to configure a new job instead of attaching to an existing one.
# automl_job = AutoML(
#     role=role,
#     target_attribute_name='eur_price',
#     problem_type='Regression',
#     job_objective={'MetricName': 'MSE'},
#     max_candidates=250
# )

In [47]:
# Disabled code: S3 URI of the training folder (s3://<bucket>/<prefix>/train),
# used as the `inputs` argument when fitting the AutoML job.
# s3_input_train = 's3://{}/{}/train'.format(bucket, prefix)

In [None]:
# Disabled code: starts the AutoML job on the training data at `s3_input_train`.
# Uncomment (together with the job configuration and input cells) to train again.
# automl_job.fit(inputs=s3_input_train)

.........

# Deploying model

In [3]:
# Attach to the most recently created AutoML job that finished successfully.
# Without the filter/sort arguments the first summary returned is not guaranteed
# to be the latest job, nor a completed one.
client = boto3.client('sagemaker')
completed_jobs = client.list_auto_ml_jobs(
    StatusEquals='Completed',
    SortBy='CreationTime',
    SortOrder='Descending',
    MaxResults=1,
)['AutoMLJobSummaries']
if not completed_jobs:
    # Fail with an actionable message instead of a bare IndexError.
    raise RuntimeError('No completed AutoML job found; run the AutoML job before deploying.')
automl_job_name = completed_jobs[0]['AutoMLJobName']
automl_job = AutoML.attach(auto_ml_job_name=automl_job_name, sagemaker_session=sagemaker_session)

# Building Predictor

In [4]:
from sagemaker.predictor import Predictor
from sagemaker.serializers import CSVSerializer
from sagemaker.deserializers import CSVDeserializer

In [5]:
# Endpoint settings gathered in one place: a single ml.m4.xlarge instance whose
# predictor serializes requests and deserializes responses as CSV.
endpoint_options = {
    'initial_instance_count': 1,
    'instance_type': 'ml.m4.xlarge',
    'predictor_cls': Predictor,
    'serializer': CSVSerializer(),
    'deserializer': CSVDeserializer(),
}

# Deploy the attached AutoML job and keep the resulting predictor for the test cells.
predictor = automl_job.deploy(**endpoint_options)

---------------------!

# Testing the model

In [6]:
import pandas as pd
from sklearn.metrics import mean_squared_error
from helpers import *

In [7]:
# Fetch the test and train CSVs from the bucket (test first, then train).
test_df, train_df = (
    get_df_from_s3(bucket, s3_key) for s3_key in (test_key, train_key)
)

In [8]:
# Feature payload: every column except the first, sent as header-less CSV.
# NOTE(review): assumes the first column is the 'eur_price' target — confirm.
test_features_csv = test_df.iloc[:, 1:].to_csv(sep=',', header=False, index=False)

# CSVDeserializer yields a list of rows (one list per CSV line). Flatten all rows
# so predictions are kept whether the endpoint returns them on a single line or
# one per line; taking only row [0] would silently drop the rest in the latter case.
response_rows = predictor.predict(test_features_csv)
predicted_prices = np.array([float(value) for row in response_rows for value in row])
assert len(predicted_prices) == len(test_df), (
    f'Expected {len(test_df)} predictions, got {len(predicted_prices)}.'
)

# Baseline predictions from the project helper, and the ground-truth prices.
benchmark_prices = np.array(get_benchmark_prices_list(test_df, train_df))
actual_prices = test_df['eur_price'].values

In [18]:
# Compute each RMSE once as sqrt(MSE). This avoids the `squared=False` argument,
# which newer scikit-learn releases deprecate, and gives the same value.
model_rmse = np.sqrt(mean_squared_error(actual_prices, predicted_prices))
benchmark_rmse = np.sqrt(mean_squared_error(actual_prices, benchmark_prices))

# Always report the outcome; previously nothing was printed when the model lost.
if model_rmse < benchmark_rmse:
    print(f'The model has a RMSE of {model_rmse}, while the benchmark only obtains a RMSE of {benchmark_rmse}.')
else:
    print(f'The model has a RMSE of {model_rmse}, which does not beat the benchmark RMSE of {benchmark_rmse}.')

The model has a RMSE of 83969.07725973855, while the benchmark only obtains a RMSE of 212331.49665456844.


The deployed model can be tried out through [this web form](https://quizzical-brattain-4ffddb.netlify.app/).

In [17]:
# Sample values that can be entered in the form to try the model out:

# {
#    "surface_area":50,
#    "rooms":2,
#    "bathrooms":2,
#    "max_floor":0,
#    "comfort":"comfort_1",
#    "floor_type":"first_floor",
#    "building_age":"before_1941",
#    "building_structure":"concrete_building_structure",
#    "neighborhood":"1_mai_area",
#    "decomandat":False,
#    "has_balconies":False,
#    "has_parking_spots_or_garages":False,
#    "has_floor_heating":False,
#    "only_district_heating":False,
#    "building_with_video_surveillance":False
# }

In [13]:
# Disabled code: deletes the endpoint behind `predictor`. Uncomment and run once
# the endpoint is no longer needed.
# predictor.delete_endpoint()