In [1]:

import pandas as pd
import numpy as np
from datetime import datetime

import boto3
from sagemaker import get_execution_role
import sagemaker.amazon.common as smac

In [2]:

role = get_execution_role()
bucket = 'ml-lab-ufo-elly'
prefix = 'ufo_dataset'
data_key = 'ufodata.csv'
data_location = 's3://{}/{}/{}'.format(bucket, prefix, data_key)

df = pd.read_csv(data_location, low_memory=False)

In [3]:
df.head()

Unnamed: 0,reportedTimestamp,eventDate,eventTime,shape,duration,witnesses,weather,firstName,lastName,latitude,longitude,sighting,physicalEvidence,contact,researchOutcome
0,1977-04-04T04:02:23.340Z,3/31/1977,23:46,circle,4,1,rain,Ila,Bashirian,47.329444,-122.578889,Y,N,N,explained
1,1982-11-22T02:06:32.019Z,11/15/1982,22:04,disk,4,1,partly cloudy,Eriberto,Runolfsson,52.664913,-1.034894,Y,Y,N,explained
2,1992-12-07T19:06:52.482Z,12/7/1992,19:01,circle,49,1,clear,Miller,Watsica,38.951667,-92.333889,Y,N,N,explained
3,2011-02-24T21:06:34.898Z,2/21/2011,20:56,disk,13,1,partly cloudy,Clifton,Bechtelar,41.496944,-71.367778,Y,N,N,explained
4,1991-03-09T16:18:45.501Z,3/9/1991,11:42,circle,17,1,mostly cloudy,Jayda,Ebert,47.606389,-122.330833,Y,N,N,explained


In [4]:
df.shape

(18000, 15)

# Step2: Cleaning. transforming, and preparing the data

In [5]:
df_geo = df[['latitude','longitude']]

In [6]:
df_geo.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18000 entries, 0 to 17999
Data columns (total 2 columns):
latitude     18000 non-null float64
longitude    18000 non-null float64
dtypes: float64(2)
memory usage: 281.3 KB


In [7]:
# transform the dataframe to numpy

In [8]:
data_train = df_geo.values.astype('float32')
data_train

array([[  47.329445, -122.57889 ],
       [  52.664913,   -1.034894],
       [  38.951668,  -92.333885],
       ...,
       [  36.86639 ,  -83.888885],
       [  35.385834,  -94.39833 ],
       [  29.883055,  -97.94111 ]], dtype=float32)

# step3: Create and train model

In [9]:
from sagemaker import KMeans

num_cluster = 10
output_location = 's3://'+ bucket +'/mdole-artifacts'

km = KMeans(role = role, 
           train_instance_count = 1,
           train_instance_type ='ml.c4.xlarge',
           output_path = output_location,
           k = num_cluster)

In [10]:
job_name = 'kmeans-geo-job-{}'.format(datetime.now().strftime("%Y%m%d%H%M"))
print('Here is the job name{}'.format(job_name))

Here is the job namekmeans-geo-job-202007072338


In [12]:
%%time
km.fit(km.record_set(data_train), job_name=job_name)

2020-07-07 23:39:13 Starting - Starting the training job...
2020-07-07 23:39:15 Starting - Launching requested ML instances......
2020-07-07 23:40:32 Starting - Preparing the instances for training......
2020-07-07 23:41:42 Downloading - Downloading input data...
2020-07-07 23:42:09 Training - Downloading the training image..[34mDocker entrypoint called with argument(s): train[0m
[34mRunning default environment configuration script[0m
[34m[07/07/2020 23:42:28 INFO 140639915898688] Reading default configuration from /opt/amazon/lib/python2.7/site-packages/algorithm/resources/default-input.json: {u'_enable_profiler': u'false', u'_tuning_objective_metric': u'', u'_num_gpus': u'auto', u'local_lloyd_num_trials': u'auto', u'_log_level': u'info', u'_kvstore': u'auto', u'local_lloyd_init_method': u'kmeans++', u'force_dense': u'true', u'epochs': u'1', u'init_method': u'random', u'local_lloyd_tol': u'0.0001', u'local_lloyd_max_iter': u'300', u'_disable_wait_to_read': u'false', u'extra_cente

In [17]:
model_key

'model-artifacts/kmeans-geo-job-202007072338/output/model.tar.gz'

In [18]:
bucket

'ml-lab-ufo-elly'

In [20]:
import os

model_key ='mdole-artifacts/'+ job_name + '/output/model.tar.gz'

boto3.resource('s3').Bucket(bucket).download_file(model_key, 'model.tar.gz')
os.system('tar -zxvf model.tar.gz')
os.system('unzip model_algo-1')

2304

In [21]:
!pip install mxnet

Collecting mxnet
[?25l  Downloading https://files.pythonhosted.org/packages/81/f5/d79b5b40735086ff1100c680703e0f3efc830fa455e268e9e96f3c857e93/mxnet-1.6.0-py2.py3-none-any.whl (68.7MB)
[K    100% |████████████████████████████████| 68.7MB 739kB/s eta 0:00:01
Collecting graphviz<0.9.0,>=0.8.1 (from mxnet)
  Downloading https://files.pythonhosted.org/packages/53/39/4ab213673844e0c004bed8a0781a0721a3f6bb23eb8854ee75c236428892/graphviz-0.8.4-py2.py3-none-any.whl
Collecting numpy<2.0.0,>1.16.0 (from mxnet)
[?25l  Downloading https://files.pythonhosted.org/packages/00/16/476826a84d545424084499763248abbbdc73d065168efed9aa71cdf2a7dc/numpy-1.19.0-cp36-cp36m-manylinux1_x86_64.whl (13.5MB)
[K    100% |████████████████████████████████| 13.5MB 4.0MB/s eta 0:00:01
Installing collected packages: graphviz, numpy, mxnet
  Found existing installation: numpy 1.14.3
    Uninstalling numpy-1.14.3:
      Successfully uninstalled numpy-1.14.3
Successfully installed graphviz-0.8.4 mxnet-1.6.0 numpy-1.19.0


In [22]:
import mxnet as mx
Kmeans_model_params = mx.ndarray.load('model_algo-1')

In [24]:
cluster_centroids_kmeans = pd.DataFrame(Kmeans_model_params[0].asnumpy())
cluster_centroids_kmeans.columns=df_geo.columns
cluster_centroids_kmeans

Unnamed: 0,latitude,longitude
0,35.263401,-118.315224
1,45.534275,20.146706
2,41.298523,-87.222313
3,-6.646294,121.190048
4,35.381603,-97.987976
5,47.733742,-122.601151
6,52.427032,-1.940575
7,41.415337,-74.884354
8,31.535656,-82.130348
9,-23.400616,-11.324432


In [25]:
from io import StringIO

In [29]:
csv_buffer = StringIO()
cluster_centroids_kmeans.to_csv(csv_buffer, index = False)
s3_resource = boto3.resource('s3')
s3_resource.Object(bucket, 'results/ten_locations_kmeans.csv').put(Body=csv_buffer.getvalue())

{'ResponseMetadata': {'RequestId': '07ECFC5B642294C7',
  'HostId': 'YygLvCRT/J4AZUTEzriktCLl5t3eQp2eFtxqFyho/5TGGH7kJESlzDc3TJpNbDw3//4QdMznkTg=',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amz-id-2': 'YygLvCRT/J4AZUTEzriktCLl5t3eQp2eFtxqFyho/5TGGH7kJESlzDc3TJpNbDw3//4QdMznkTg=',
   'x-amz-request-id': '07ECFC5B642294C7',
   'date': 'Wed, 08 Jul 2020 00:00:10 GMT',
   'etag': '"04bdaf2b43f4cb94cd115e1595e73101"',
   'content-length': '0',
   'server': 'AmazonS3'},
  'RetryAttempts': 0},
 'ETag': '"04bdaf2b43f4cb94cd115e1595e73101"'}