In [None]:
import sagemaker
from sagemaker.tensorflow import TensorFlow
from sagemaker.estimator import Estimator

use_spot_instances = True
max_run = 3600  # Limit on the training run, in seconds

# max_wait is only given a value when spot instances are requested.
if use_spot_instances:
    max_wait = 7200
else:
    max_wait = None

# Build the request for a SageMaker Training job. This mirrors what the web console
# form does; entry_point specifies which training script the job runs.
# No Docker image (image_uri) is passed: the SDK looks one up from the framework
# version and instance type, and that image already has the packages the script needs.
# Author's notes:
#   - framework_version 2.14 was the most recent TensorFlow version available here.
#   - Checkpointing does not appear to work yet, most likely because the Keras model
#     has no custom callbacks that write checkpoints.
#   - Some of these settings (e.g. output_path) reach the training script as
#     environment variables.
resnet = TensorFlow(
    entry_point="KerasModel.py",
    role=sagemaker.get_execution_role(),
    instance_count=1,
    instance_type='ml.g4dn.xlarge',
    framework_version="2.14",
    py_version="py310",
    output_path='s3://summer2024-sagemaker-data-bucket/modelArtifacts/TestModelArtifact/',
    checkpoint_s3_uri='s3://summer2024-sagemaker-data-bucket/checkpoints/',
    use_spot_instances=use_spot_instances,
    max_run=max_run,
    max_wait=max_wait,
)

# fit() is the call that actually allocates an instance and starts the training job.
# Each channel below is handed to the training script as an argument (roughly like
# running the script with `--training s3://...`), and the referenced files are
# copied into a directory local to the training job.
input_channels = {
    'training': 's3://summer2024-sagemaker-data-bucket/train/images/',
    'labels': 's3://summer2024-sagemaker-data-bucket/train/train_lst.lst',
}
resnet.fit(input_channels)

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/ec2-user/.config/sagemaker/config.yaml


INFO:sagemaker.image_uris:image_uri is not presented, retrieving image_uri based on instance_type, framework etc.
INFO:sagemaker:Creating training-job with name: tensorflow-training-2024-08-19-20-58-59-112


2024-08-19 20:58:59 Starting - Starting the training job...
2024-08-19 20:59:15 Starting - Preparing the instances for training...
2024-08-19 20:59:46 Downloading - Downloading input data......
2024-08-19 21:00:37 Downloading - Downloading the training image.....................
  "cipher": algorithms.TripleDES,[0m
  "class": algorithms.TripleDES,[0m
[34m2024-08-19 21:04:14.876873: I tensorflow/core/util/port.cc:111] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.[0m
[34m2024-08-19 21:04:14.922981: E tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:9342] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered[0m
[34m2024-08-19 21:04:14.923028: E tensorflow/compiler/xla/stream_executor/cuda/cuda_fft.cc:609] Unable to register cuFFT

In [9]:
#This will locally load the images for using in a job
!mkdir /tmp/reviews/
!aws s3 cp s3://amazon-reviews-pds/tsv/amazon_reviews_us_Digital_Video_Download_v1_00.tsv.gz /tmp/reviews/

mkdir: cannot create directory ‘/tmp/reviews/’: File exists
fatal error: An error occurred (403) when calling the HeadObject operation: Forbidden


In [10]:
# Derive a batch-transform handle from the trained estimator; results of any
# transform job it runs are written under this S3 prefix.
transform_output_uri = "s3://summer2024-sagemaker-data-bucket/Outputs/TestResnetOutput/"
resnet_transformer = resnet.transformer(
    instance_count=1,
    instance_type="ml.g4dn.xlarge",
    output_path=transform_output_uri,
)

INFO:sagemaker.tensorflow.model:image_uri is not presented, retrieving image_uri based on instance_type, framework etc.
INFO:sagemaker:Creating model with name: tensorflow-training-2024-08-12-18-13-16-879


In [17]:
from sagemaker.tensorflow import TensorFlowModel
# Wrap an already-trained artifact (from the 2024-08-12 training job) as a SageMaker
# TensorFlow model object, without re-running training.
model = TensorFlowModel(
    model_data='s3://summer2024-sagemaker-data-bucket/modelArtifacts/TestModelArtifact/tensorflow-training-2024-08-12-18-01-55-986/output/model.tar.gz',
    role=sagemaker.get_execution_role(),
    framework_version="2.14",
)

In [24]:
import tarfile
import boto3
import tensorflow as tf
import pandas as pd

# Bucket and prefix shared by the cells below.
s3_bucket_name = 'summer2024-sagemaker-data-bucket'
s3_image_path = 'train/images/'

s3 = boto3.client("s3")

# Pull the trained model archive produced by the 2024-08-19 training job into the
# working directory as temp.tar.gz.
model_artifact_key = "modelArtifacts/TestModelArtifact/tensorflow-training-2024-08-19-20-58-59-112/output/model.tar.gz"
s3.download_file(s3_bucket_name, model_artifact_key, "temp.tar.gz")

In [11]:
# Unpack the downloaded model archive into the current working directory.
# The context manager closes the archive even if extraction raises part-way through.
# NOTE(review): extractall() trusts the member paths inside the archive; that is fine
# for an artifact produced by our own training job, but do not reuse this on
# archives from an untrusted source.
with tarfile.open("temp.tar.gz", "r:gz") as tar:
    tar.extractall()

In [None]:
#The tar.gz file is a compressed version of a Keras saved model object, so loading it
#with load_model gives back a model with all the usual Keras functionality.
saved_model_dir = '000001'
new_model = tf.keras.models.load_model(saved_model_dir)

In [13]:
#This downloads all the images to be able to test the model locally
import os

#Local folder the images are written to; created up front because download_file
#does not create missing directories. You should probably delete this folder once done.
local_image_dir = "images"
os.makedirs(local_image_dir, exist_ok=True)

#The paginator follows S3's continuation tokens for us, replacing the hand-rolled
#StartAfter loop. StartAfter=s3_image_path still skips the prefix's own key.
paginator = s3.get_paginator("list_objects_v2")
pages = paginator.paginate(
    Bucket=s3_bucket_name,
    Prefix=s3_image_path,
    StartAfter=s3_image_path)

for page in pages:
    #Progress indicator: True while more pages remain, False on the last page
    print(page['IsTruncated'])

    for content in page.get('Contents', []):
        key = content["Key"]
        #Keys ending in "/" are folder placeholders with no file name to save under
        if key.endswith("/"):
            continue
        #Files are flattened into one folder by base name, so two keys with the same
        #base name under different sub-prefixes would overwrite each other.
        s3.download_file(s3_bucket_name, key, os.path.join(local_image_dir, key.split("/")[-1]))

True
True
True
True
True
True
True
True
True
True
True
True
True
True
False


In [None]:
# Fetch the tab-separated label listing and reduce it to a plain Python list of class
# labels ordered by file name.
# NOTE(review): the ordering must match the order in which the image loader enumerates
# the files under images/ - confirm the 'File' column holds bare file names.
s3.download_file(s3_bucket_name, "train/train_lst.lst", "ground_truth.csv")
training_labels = (
    pd.read_csv("ground_truth.csv", sep="\t", names=["index", "Classification", "File"])
    .sort_values(by=['File'])["Classification"]
    .to_numpy()
    .tolist()
)

In [None]:
# Build an unbatched dataset of (image, label) pairs from the local images/ folder,
# taking the labels from the explicit training_labels list.
# shuffle=False is required: the loader shuffles by default, and later cells compare
# model.predict() output row-by-row against training_labels, which is only valid when
# the dataset yields the files in the same sorted order as the labels.
x_train = tf.keras.preprocessing.image_dataset_from_directory(
    "images/",
    labels = training_labels,
    image_size = (192,100),
    label_mode = "binary",
    color_mode = "rgb",
    batch_size = None,
    shuffle = False)

In [None]:
# Apply the ResNetV2 input preprocessing to every image, batch by 50, and evaluate
# the loaded model on the result.
eval_batches = x_train.map(
    lambda image, label: (tf.keras.applications.resnet_v2.preprocess_input(image), label)
).batch(50)
new_model.evaluate(eval_batches)

2024-08-19 21:57:35.440360: W external/local_tsl/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 12592800 exceeds 10% of free system memory.
2024-08-19 21:57:35.451475: W external/local_tsl/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 61440000 exceeds 10% of free system memory.
2024-08-19 21:57:35.690832: W external/local_tsl/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 65228800 exceeds 10% of free system memory.
2024-08-19 21:57:35.731526: W external/local_tsl/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 15360000 exceeds 10% of free system memory.
2024-08-19 21:57:35.747016: W external/local_tsl/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 15360000 exceeds 10% of free system memory.




[5.456387519836426, 0.6462622880935669]

In [None]:
# Run the loaded model over the same preprocessed images (batches of 50) and keep the
# raw outputs for the threshold analysis below.
predict_batches = x_train.map(
    lambda image, label: (tf.keras.applications.resnet_v2.preprocess_input(image), label)
).batch(50)
preds = new_model.predict(predict_batches)

In [None]:
# Count how many model outputs lie above the ad-hoc cut-off of -0.8.
above_cutoff = preds > -0.8
above_cutoff.sum()

In [None]:
# Sweep decision thresholds 0.000 ... 0.999 and record precision, recall and F1 at each.
#
# preds is a NumPy array and training_labels a plain list, so both are turned into
# aligned pandas Series first; comparing a list with `== 1` would be a single scalar
# comparison rather than an element-wise one.
# Assumes preds rows are in the same order as training_labels (i.e. the dataset fed to
# predict() was not shuffled) - TODO confirm.
# NOTE(review): the sweep covers (0, 1), which presumes the model outputs
# probabilities; the -0.8 cut-off tried in an earlier cell suggests the outputs may be
# unbounded scores - confirm before trusting the curve.
scores = pd.Series(preds.ravel())
truth = pd.Series(training_labels)
assert len(scores) == len(truth), (
    f"{len(scores)} predictions but {len(truth)} labels - expected one score per image"
)

is_positive = truth == 1
is_negative = truth == 0

# Collect one record per threshold and build the frame once, instead of growing it
# with pd.concat on every iteration.
records = []
for step in range(0, 1000):
    threshold = step / 1000
    predicted_positive = scores > threshold

    TP = int((predicted_positive & is_positive).sum())
    FP = int((predicted_positive & is_negative).sum())
    FN = int((~predicted_positive & is_positive).sum())

    # Empty denominators are defined as 0 so the table never contains NaN.
    precision = TP/(TP+FP) if (TP + FP) != 0 else 0
    recall = TP/(TP+FN) if (TP + FN) != 0 else 0
    f1 = (2*precision*recall)/(precision+recall) if (precision+recall) != 0 else 0

    # "Threshold" stores the cut-off that was actually applied (step / 1000).
    records.append({"Threshold": threshold, "Precision": precision, "Recall": recall, "F1": f1})

F1DF = pd.DataFrame(records, columns = ["Threshold", "Precision", "Recall", "F1"])

In [None]:
# Plot recall (y) against precision (x) over the swept thresholds.
F1DF.plot(
    x = "Precision",
    y = "Recall",
)

In [9]:
#This code creates a BatchTransformJob but it is currently giving an error
#I am unsure what the problem is
#It will attempt to classify everything but will error on each one individually
#NOTE(review): a likely cause is that the stock TensorFlow serving container does not
#accept "application/x-image" payloads without a custom input handler - confirm.
#
#A bare `break` outside a loop is a SyntaxError, so the cell could not even be parsed.
#Use an explicit opt-in flag instead, so a full notebook run skips the known-failing job.
RUN_BATCH_TRANSFORM = False

if RUN_BATCH_TRANSFORM:
    resnet_transformer.transform("s3://summer2024-sagemaker-data-bucket/train/images/",content_type = "application/x-image", split_type="None")
    resnet_transformer.wait()

NameError: name 'asdklfj' is not defined