In [1]:
import os
import math
import sagemaker
from sagemaker.tensorflow import TensorFlow

# Shared setup for the training jobs launched from this notebook.
sagemaker_session = sagemaker.Session()
role = sagemaker.get_execution_role()

# S3 bucket holding the datasets; interpolated into the input URIs passed
# to fit() in the cells below.
bucket = 'calvinandpogs-ee148'

# Echo the configuration and the working directory as a quick sanity check.
print(f"{bucket} {role}")
print(os.getcwd())

calvinandpogs-ee148 arn:aws:iam::652516965730:role/service-role/AmazonSageMaker-ExecutionRole-20210513T011299
/home/ec2-user/SageMaker/atrw/clustering


# ATRW Clustering

In [5]:
# Configure the ATRW clustering job: clustering.py (taken from this
# directory) on a single ml.g4dn.xlarge instance, with framework_version
# 2.2 and py_version py37.
estimator = TensorFlow(
    entry_point='clustering.py',
    source_dir='./',
    role=role,
    instance_count=1,
    instance_type="ml.g4dn.xlarge",
    framework_version="2.2",
    py_version="py37",
    hyperparameters={
        # Built from `bucket`, like the input URI given to fit(), so that
        # changing the bucket in one place keeps inputs and outputs together.
        # NOTE(review): presumably the S3 prefix clustering.py writes its
        # results to -- confirm against the script.
        'output-s3': f's3://{bucket}/atrw/detection/annotations/clusters/vgg16places/pca-kmeans',
    },
)

In [None]:
# Launch the ATRW job; the training images are passed as the 'images' input.
estimator.fit(
    {'images': f's3://{bucket}/atrw/detection/train/'}
)

# LilaBC Clustering

## Full dataset

In [2]:
# Number of clusters requested for the full LilaBC label set.
total_clusters = 61

# Same job configuration as the ATRW run, but with an explicit cluster count.
estimator = TensorFlow(
    entry_point='clustering.py',
    source_dir='./',
    role=role,
    instance_count=1,
    instance_type="ml.g4dn.xlarge",
    framework_version="2.2",
    py_version="py37",
    hyperparameters={
        # Built from `bucket`, like the input URI given to fit(), so that
        # changing the bucket in one place keeps inputs and outputs together.
        'output-s3': f's3://{bucket}/lilabc/clusters/full/',
        'num-clusters': total_clusters,
    },
)

In [None]:
# Launch the full-dataset LilaBC job; the images are passed as the 'images' input.
estimator.fit(
    {'images': f's3://{bucket}/lilabc/images/labels-full/'}
)

## Fractional subsets

In [None]:
%%capture --no-stderr

total_clusters = 60        # To match the script-mistake
sets = [16, 8, 4, 2]
# sets = [16]

for set in sets:
    estimator = TensorFlow(entry_point='clustering.py',
                            source_dir='./',
                            role=role,
                            instance_count=1,
                            instance_type="ml.g4dn.xlarge",
                            framework_version="2.2",
                            py_version="py37",
                            hyperparameters={
                                'output-s3': f's3://calvinandpogs-ee148/lilabc/clusters/frac{set}/',
                                'num-clusters': math.ceil(float(total_clusters) / set),
                                'model-dataset': "places",
                                'dim-reduc': "umap",
                                'clustering-algo': "spectral"
                            })
    estimator.fit({'images': f's3://{bucket}/lilabc/images/labels-{set}/'})

## Experiments

In [5]:
# %%capture --no-stderr

# (subset label, num-clusters) pairs. The label selects the `labels-<label>`
# image prefix; the alternatives below can be swapped in for the active line.
# sets = [("full", 61), (16, 4), (8, 8), (4, 15), (2, 30)]
# sets = [(16, 4)]
# sets = [(8, 8), (4, 15)]
sets = [("full", 61)]

# (model-dataset, dim-reduc, clustering-algo) combinations to run; only the
# uncommented entries are launched.
params = [
    # ("imagenet", "pca", "kmeans"),
    # ("places", "pca", "kmeans"),
    # ("places", "umap", "kmeans"),
    # ("places", "umap", "spectral"),
    # ("imagenet", "umap", "spectral"),
    ("places", "pca", "spectral"),
]

# The loop variable is named `subset`, not `set`: it stays in the kernel
# namespace after the cell finishes, and rebinding `set` would break every
# later use of the builtin.
for subset, num_clusters in sets:
    for dataset, reduc, algo in params:
        # One output prefix per parameter combination and subset.
        exp_name = f"{dataset}_{reduc}_{algo}"
        estimator = TensorFlow(
            entry_point='clustering.py',
            source_dir='./',
            role=role,
            instance_count=1,
            instance_type="ml.g4dn.xlarge",
            framework_version="2.2",
            py_version="py37",
            hyperparameters={
                # Built from `bucket` so outputs follow the same bucket as the inputs.
                'output-s3': f's3://{bucket}/lilabc/clusters/experiments/{exp_name}/clust-{subset}/',
                'num-clusters': num_clusters,
                'model-dataset': dataset,
                'dim-reduc': reduc,
                'clustering-algo': algo,
            },
        )
        estimator.fit({'images': f's3://{bucket}/lilabc/images/labels-{subset}/'})

2021-06-05 22:31:35 Starting - Starting the training job...
2021-06-05 22:31:58 Starting - Launching requested ML instancesProfilerReport-1622932294: InProgress
......
2021-06-05 22:32:58 Starting - Preparing the instances for training......
2021-06-05 22:33:59 Downloading - Downloading input data......
2021-06-05 22:34:59 Training - Downloading the training image...
2021-06-05 22:35:21 Training - Training image download completed. Training in progress.[34m2021-06-05 22:35:21.900221: W tensorflow/core/profiler/internal/smprofiler_timeline.cc:425] Initializing the SageMaker Profiler.[0m
[34m2021-06-05 22:35:21.903952: W tensorflow/core/profiler/internal/smprofiler_timeline.cc:106] SageMaker Profiler is not enabled. The timeline writer thread will not be started, future recorded events will be dropped.[0m
[34m2021-06-05 22:35:22.010325: W tensorflow/core/profiler/internal/smprofiler_timeline.cc:425] Initializing the SageMaker Profiler.[0m
[34m2021-06-05 22:35:24,563 sagemaker-trai