# Step1: Uploading ufo data from S3

In [1]:

import pandas as pd
import numpy as np
from datetime import datetime

import boto3
from sagemaker import get_execution_role
import sagemaker.amazon.common as smac

In [2]:

role = get_execution_role()
bucket = 'ml-lab-ufo-elly'
prefix = 'ufo_dataset'
data_key = 'ufodata.csv'
data_location = 's3://{}/{}/{}'.format(bucket, prefix, data_key)

df = pd.read_csv(data_location, low_memory=False)

In [3]:

role = get_execution_role()
bucket = 'ml-lab-ufo-elly'
prefix = 'ufo_dataset'
data_key = 'ufodata.csv'
data_location = 's3://{}/{}/{}'.format(bucket, prefix, data_key)

df = pd.read_csv(data_location, low_memory=False)

In [4]:
df.head()

Unnamed: 0,reportedTimestamp,eventDate,eventTime,shape,duration,witnesses,weather,firstName,lastName,latitude,longitude,sighting,physicalEvidence,contact,researchOutcome
0,1977-04-04T04:02:23.340Z,3/31/1977,23:46,circle,4,1,rain,Ila,Bashirian,47.329444,-122.578889,Y,N,N,explained
1,1982-11-22T02:06:32.019Z,11/15/1982,22:04,disk,4,1,partly cloudy,Eriberto,Runolfsson,52.664913,-1.034894,Y,Y,N,explained
2,1992-12-07T19:06:52.482Z,12/7/1992,19:01,circle,49,1,clear,Miller,Watsica,38.951667,-92.333889,Y,N,N,explained
3,2011-02-24T21:06:34.898Z,2/21/2011,20:56,disk,13,1,partly cloudy,Clifton,Bechtelar,41.496944,-71.367778,Y,N,N,explained
4,1991-03-09T16:18:45.501Z,3/9/1991,11:42,circle,17,1,mostly cloudy,Jayda,Ebert,47.606389,-122.330833,Y,N,N,explained


In [5]:
df.shape

(18000, 15)

# Step2: Cleaning. transforming, and preparing the data

In [6]:
df_geo = df[['latitude','longitude']]

In [7]:
df_geo.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18000 entries, 0 to 17999
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   latitude   18000 non-null  float64
 1   longitude  18000 non-null  float64
dtypes: float64(2)
memory usage: 281.4 KB


In [8]:
# transform the dataframe to numpy

In [9]:
data_train = df_geo.values.astype('float32')
data_train

array([[  47.329445, -122.57889 ],
       [  52.664913,   -1.034894],
       [  38.951668,  -92.333885],
       ...,
       [  36.86639 ,  -83.888885],
       [  35.385834,  -94.39833 ],
       [  29.883055,  -97.94111 ]], dtype=float32)

# step3: Create and train model

In [10]:
from sagemaker import KMeans

num_cluster = 10
output_location = 's3://'+ bucket +'/mdole-artifacts'

km = KMeans(role = role, 
           train_instance_count = 1,
           train_instance_type ='ml.c4.xlarge',
           output_path = output_location,
           k = num_cluster)

In [11]:
job_name = 'kmeans-geo-job-{}'.format(datetime.now().strftime("%Y%m%d%H%M"))
print('Here is the job name{}'.format(job_name))

Here is the job namekmeans-geo-job-202008100344


In [12]:
%%time
km.fit(km.record_set(data_train), job_name=job_name)

's3_input' class will be renamed to 'TrainingInput' in SageMaker Python SDK v2.
'get_image_uri' method will be deprecated in favor of 'ImageURIProvider' class in SageMaker Python SDK v2.


2020-08-10 03:44:43 Starting - Starting the training job...
2020-08-10 03:44:45 Starting - Launching requested ML instances.........
2020-08-10 03:46:29 Starting - Preparing the instances for training......
2020-08-10 03:47:28 Downloading - Downloading input data...
2020-08-10 03:48:01 Training - Downloading the training image..[34mDocker entrypoint called with argument(s): train[0m
[34mRunning default environment configuration script[0m
[34m[08/10/2020 03:48:24 INFO 139947675027264] Reading default configuration from /opt/amazon/lib/python2.7/site-packages/algorithm/resources/default-input.json: {u'_enable_profiler': u'false', u'_tuning_objective_metric': u'', u'_num_gpus': u'auto', u'local_lloyd_num_trials': u'auto', u'_log_level': u'info', u'_kvstore': u'auto', u'local_lloyd_init_method': u'kmeans++', u'force_dense': u'true', u'epochs': u'1', u'init_method': u'random', u'local_lloyd_tol': u'0.0001', u'local_lloyd_max_iter': u'300', u'_disable_wait_to_read': u'false', u'extra_ce

In [14]:
bucket

'ml-lab-ufo-elly'

In [15]:
import os

model_key ='mdole-artifacts/'+ job_name + '/output/model.tar.gz'

boto3.resource('s3').Bucket(bucket).download_file(model_key, 'model.tar.gz')
os.system('tar -zxvf model.tar.gz')
os.system('unzip model_algo-1')

2304

In [16]:
!pip install mxnet

Collecting mxnet
  Downloading mxnet-1.6.0-py2.py3-none-any.whl (68.7 MB)
[K     |████████████████████████████████| 68.7 MB 74 kB/s s eta 0:00:01
[?25hCollecting graphviz<0.9.0,>=0.8.1
  Downloading graphviz-0.8.4-py2.py3-none-any.whl (16 kB)
Installing collected packages: graphviz, mxnet
Successfully installed graphviz-0.8.4 mxnet-1.6.0
You should consider upgrading via the '/home/ec2-user/anaconda3/envs/python3/bin/python -m pip install --upgrade pip' command.[0m


In [17]:
import mxnet as mx
Kmeans_model_params = mx.ndarray.load('model_algo-1')

In [18]:
cluster_centroids_kmeans = pd.DataFrame(Kmeans_model_params[0].asnumpy())
cluster_centroids_kmeans.columns=df_geo.columns
cluster_centroids_kmeans

Unnamed: 0,latitude,longitude
0,34.653027,-99.118713
1,23.332932,28.863106
2,40.512184,-87.938286
3,-32.081528,146.835007
4,47.496132,-121.987595
5,35.16293,-118.618706
6,51.975769,-0.152199
7,30.63656,-81.771118
8,41.355423,-75.137962
9,17.875875,95.769745


In [19]:
from io import StringIO

In [20]:
csv_buffer = StringIO()
cluster_centroids_kmeans.to_csv(csv_buffer, index = False)
s3_resource = boto3.resource('s3')
s3_resource.Object(bucket, 'results/ten_locations_kmeans.csv').put(Body=csv_buffer.getvalue())

{'ResponseMetadata': {'RequestId': 'DC868748CF81CB2B',
  'HostId': 'msi4MQ2LTo5VHiQOvjlXr/F30MZW/fd9xXp4v5zX+nthkjKN8/UqS+6hNcVe1kbV4GOC9JxyqRI=',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amz-id-2': 'msi4MQ2LTo5VHiQOvjlXr/F30MZW/fd9xXp4v5zX+nthkjKN8/UqS+6hNcVe1kbV4GOC9JxyqRI=',
   'x-amz-request-id': 'DC868748CF81CB2B',
   'date': 'Mon, 10 Aug 2020 04:04:34 GMT',
   'etag': '"e34ac6f9ec5cd68495acaa44003b3552"',
   'content-length': '0',
   'server': 'AmazonS3'},
  'RetryAttempts': 0},
 'ETag': '"e34ac6f9ec5cd68495acaa44003b3552"'}