# FPE Rank Model

## Set Up

In [8]:
import boto3
import time
import json
import os
import pandas as pd
import sagemaker
from sagemaker.pytorch import PyTorch
from sagemaker.inputs import TrainingInput
from io import StringIO

# job parameters
root_dir = f"/mnt/d/fpe/rank"
station_id = 29
#dataset = "FLOW-20240327"
model = "RANK-FLOW-20240402"

# aws parameters
AWS_PROFILE="conte-prod"
AWS_REGION="us-west-2"
JOB_ROLE_ARN="arn:aws:iam::694155575325:role/fpe-prod-sagemaker-execution-role"

In [9]:
# create sessions

def timestamp():
    return time.strftime("%Y%m%d-%H%M%S")

def get_batch_creds(session, role_arn):
    sts = session.client("sts")
    response = sts.assume_role(
        RoleArn=role_arn,
        RoleSessionName=f"fpe-sagemaker-session--{timestamp()}"
    )
    return response['Credentials']

session = boto3.Session(profile_name=AWS_PROFILE, region_name=AWS_REGION)
s3 = session.client("s3")

creds = get_batch_creds(session, JOB_ROLE_ARN)
sm_boto_session = boto3.Session(
    aws_access_key_id=creds['AccessKeyId'],
    aws_secret_access_key=creds['SecretAccessKey'],
    aws_session_token=creds['SessionToken'],
    region_name=AWS_REGION
)

sm_session = sagemaker.Session(boto_session = sm_boto_session)

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/jdwalker/.config/sagemaker/config.yaml


In [10]:
# create paths and keys

job_name = f"fpe-rank-{station_id}-{model}-{timestamp()}"

# local paths
#dataset_dir = f"{root_dir}/{station_name}/{variable}/{dataset_version}"
model_dir = f"{root_dir}/{station_id}/models/{model}"
if not os.path.exists(model_dir):
    raise Exception(f"model_dir not found ({model_dir})")

model_bucket = "usgs-chs-conte-prod-fpe-models"
storage_bucket = "usgs-chs-conte-prod-fpe-storage"

model_key = f"rank/{station_id}/models/{model}"
input_key = f"{model_key}/input"
jobs_key = f"{model_key}/jobs"
checkpoints_key = f"{model_key}/checkpoints"
transform_key = f"{model_key}/transform"

s3_input_path = f"s3://{model_bucket}/{input_key}"
s3_output_path = f"s3://{model_bucket}/{jobs_key}"
s3_checkpoint_path = f"s3://{model_bucket}/{checkpoints_key}"
s3_transform_path = f"s3://{model_bucket}/{transform_key}"
(job_name, s3_input_path, s3_output_path, s3_checkpoint_path, s3_transform_path)

('fpe-rank-29-RANK-FLOW-20240328-20240402-085814',
 's3://usgs-chs-conte-prod-fpe-models/rank/29/models/RANK-FLOW-20240328/input',
 's3://usgs-chs-conte-prod-fpe-models/rank/29/models/RANK-FLOW-20240328/jobs',
 's3://usgs-chs-conte-prod-fpe-models/rank/29/models/RANK-FLOW-20240328/checkpoints',
 's3://usgs-chs-conte-prod-fpe-models/rank/29/models/RANK-FLOW-20240328/transform')

## Upload Input to S3

In [12]:
# upload input files to s3

for subdir, dirs, files in os.walk(f"{model_dir}/input"):
    for file in files:
        s3_key = f"{input_key}/{file}"
        print(f"uploading: {file} -> {s3_key}")
        s3.upload_file(Filename=os.path.join(subdir, file), Bucket=model_bucket, Key=s3_key)

uploading: annotations.csv -> rank/29/models/RANK-FLOW-20240328/input/annotations.csv
uploading: images-months.png -> rank/29/models/RANK-FLOW-20240328/input/images-months.png
uploading: images-ts.png -> rank/29/models/RANK-FLOW-20240328/input/images-ts.png
uploading: images-values-cfd.png -> rank/29/models/RANK-FLOW-20240328/input/images-values-cfd.png
uploading: images-values-hist.png -> rank/29/models/RANK-FLOW-20240328/input/images-values-hist.png
uploading: images-values-ts.png -> rank/29/models/RANK-FLOW-20240328/input/images-values-ts.png
uploading: images.csv -> rank/29/models/RANK-FLOW-20240328/input/images.csv
uploading: manifest.json -> rank/29/models/RANK-FLOW-20240328/input/manifest.json
uploading: pairs-timestamps.png -> rank/29/models/RANK-FLOW-20240328/input/pairs-timestamps.png
uploading: pairs-values-delta.png -> rank/29/models/RANK-FLOW-20240328/input/pairs-values-delta.png
uploading: pairs-values.png -> rank/29/models/RANK-FLOW-20240328/input/pairs-values.png
upload

## Train Model

### Run Training Job

In [9]:
estimator = PyTorch(
    entry_point="train.py",
    source_dir="src",
    py_version="py39",
    framework_version="1.13.1",
    role="arn:aws:iam::694155575325:role/fpe-prod-sagemaker-execution-role",
    instance_count=1,
    instance_type="ml.p3.2xlarge",
    volume_size=100,
    hyperparameters={
        "epochs": 20
    },
    output_path=s3_output_path,
    checkpoint_s3_uri=s3_checkpoint_path,
    code_location=s3_output_path,
    disable_output_compression=False,
    sagemaker_session=sm_session
)

In [13]:
input_images = TrainingInput(
    s3_data = f"{s3_input_path}/manifest.json",
    s3_data_type = "ManifestFile",
    input_mode = "File"
)
f"{s3_input_path}/manifest.json"

's3://usgs-chs-conte-prod-fpe-models/rank/29/models/RANK-FLOW-20240328/input/manifest.json'

In [16]:
estimator.fit({ "images": input_images, "values": s3_input_path }, job_name=job_name, wait=False)

Using provided s3_resource


INFO:sagemaker.image_uris:image_uri is not presented, retrieving image_uri based on instance_type, framework etc.
INFO:sagemaker:Creating training-job with name: fpe-rank-WESTB0-20240328-215259


### Stop Training Job

In [13]:
# sm_session.stop_training_job("pytorch-training-2024-03-29-01-49-35-117")

### Download Training Job Output

In [33]:
print(f"job: {job_name}")

output_dir = f"{model_dir}/output"
if not os.path.exists(output_dir):
    os.makedirs(output_dir)
output_file = f"{output_dir}/output.tar.gz"

output_key = f"{jobs_key}/{job_name}/output/output.tar.gz"
print(f"downloading: s3://{model_bucket}/{output_key} -> {output_file}")
s3.download_file(Bucket=model_bucket, Key=output_key, Filename=output_file)

print(f"extracting: {output_file} -> {output_dir}")
!tar -xzvf "{output_file}" -C "{output_dir}"
output_file

job: fpe-rank-WESTB0-20240328-215259
downloading: s3://usgs-chs-conte-prod-fpe-models/rank/WESTB0/FLOW_CFS/20240327/models/20240328/jobs/fpe-rank-WESTB0-20240328-215259/output/output.tar.gz -> /mnt/d/fpe/rank/West Brook 0_01171100/FLOW_CFS/20240327/models/20240328/output/output.tar.gz
extracting: /mnt/d/fpe/rank/West Brook 0_01171100/FLOW_CFS/20240327/models/20240328/output/output.tar.gz -> /mnt/d/fpe/rank/West Brook 0_01171100/FLOW_CFS/20240327/models/20240328/output
tar: Ignoring unknown extended header keyword 'LIBARCHIVE.creationtime'
args.json
tar: Ignoring unknown extended header keyword 'LIBARCHIVE.creationtime'
metrics.csv


'/mnt/d/fpe/rank/West Brook 0_01171100/FLOW_CFS/20240327/models/20240328/output/output.tar.gz'

## Batch Transform

In [14]:
transform_images_file = f"{model_dir}/input/images.csv"
transform_images = pd.read_csv(transform_images_file)
print(f"# images: {len(transform_images)}")

# images: 30565


In [15]:
transform_images_key = f"{transform_key}/images.csv"
print(f"uploading: {transform_images_file} -> {transform_images_key}")
s3.upload_file(Filename=transform_images_file, Bucket=model_bucket, Key=transform_images_key)

uploading: /mnt/d/fpe/rank/29/models/RANK-FLOW-20240328/input/images.csv -> rank/29/models/RANK-FLOW-20240328/transform/images.csv


In [16]:
transform_images

Unnamed: 0,split,image_id,timestamp,filename,url,value
0,test-in,441411,2022-02-01T18:00:00Z,imagesets/e8d465f6-5784-4231-967f-9000428e9748...,https://usgs-chs-conte-prod-fpe-storage.s3.ama...,4.71
1,test-in,441412,2022-02-01T18:15:00Z,imagesets/e8d465f6-5784-4231-967f-9000428e9748...,https://usgs-chs-conte-prod-fpe-storage.s3.ama...,4.71
2,train,441413,2022-02-01T18:30:00Z,imagesets/e8d465f6-5784-4231-967f-9000428e9748...,https://usgs-chs-conte-prod-fpe-storage.s3.ama...,4.71
3,test-in,441414,2022-02-01T18:45:00Z,imagesets/e8d465f6-5784-4231-967f-9000428e9748...,https://usgs-chs-conte-prod-fpe-storage.s3.ama...,4.71
4,train,441415,2022-02-01T18:59:59Z,imagesets/e8d465f6-5784-4231-967f-9000428e9748...,https://usgs-chs-conte-prod-fpe-storage.s3.ama...,4.71
...,...,...,...,...,...,...
30560,test-out,4532559,2024-03-07T16:00:00Z,imagesets/4cafbbd0-66b4-49da-a5b3-ff0d8e93e4cf...,https://usgs-chs-conte-prod-fpe-storage.s3.ama...,208.00
30561,test-out,4532560,2024-03-07T16:15:00Z,imagesets/4cafbbd0-66b4-49da-a5b3-ff0d8e93e4cf...,https://usgs-chs-conte-prod-fpe-storage.s3.ama...,204.00
30562,test-out,4532561,2024-03-07T16:30:00Z,imagesets/4cafbbd0-66b4-49da-a5b3-ff0d8e93e4cf...,https://usgs-chs-conte-prod-fpe-storage.s3.ama...,204.00
30563,test-out,4532562,2024-03-07T16:45:00Z,imagesets/4cafbbd0-66b4-49da-a5b3-ff0d8e93e4cf...,https://usgs-chs-conte-prod-fpe-storage.s3.ama...,197.00


In [17]:
manifest = transform_images['filename'].to_list()
manifest.insert(0, {"prefix": f"s3://{storage_bucket}/"})

manifest_key = f"{transform_key}/manifest.json"
body = json.dumps(manifest)
print(f"uploading transform manifest: {manifest_key} (n = {len(manifest) - 1})")
s3.put_object(Bucket=model_bucket, Key=manifest_key, Body=body)

uploading transform manifest: rank/29/models/RANK-FLOW-20240328/transform/manifest.json (n = 30565)


{'ResponseMetadata': {'RequestId': 'KZXTZF070Z7DZERK',
  'HostId': 'V0CQt82tSxb1rpqgvzQ06DDStcMCuM37M2HjKLeU8ASshm2u+SHzA71BFVOkZLgrXf/uAWeaZnU=',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amz-id-2': 'V0CQt82tSxb1rpqgvzQ06DDStcMCuM37M2HjKLeU8ASshm2u+SHzA71BFVOkZLgrXf/uAWeaZnU=',
   'x-amz-request-id': 'KZXTZF070Z7DZERK',
   'date': 'Tue, 02 Apr 2024 13:02:43 GMT',
   'x-amz-server-side-encryption': 'AES256',
   'etag': '"92a5c7a56257809a321e90dbb4d8adc3"',
   'server': 'AmazonS3',
   'content-length': '0'},
  'RetryAttempts': 0},
 'ETag': '"92a5c7a56257809a321e90dbb4d8adc3"',
 'ServerSideEncryption': 'AES256'}

In [18]:
model_path = f"{s3_output_path}/{job_name}/output/model.tar.gz"
model_path

's3://usgs-chs-conte-prod-fpe-models/rank/29/models/RANK-FLOW-20240328/jobs/fpe-rank-29-RANK-FLOW-20240328-20240402-085814/output/model.tar.gz'

In [47]:
from sagemaker.pytorch.model import PyTorchModel

pytorch_model = PyTorchModel(
    model_data=model_path,
    role="arn:aws:iam::694155575325:role/fpe-prod-sagemaker-execution-role",
    py_version="py38",
    framework_version="1.12",
    source_dir="src/",
    entry_point="transform.py",
    sagemaker_session = sm_session
)

In [48]:
transformer = pytorch_model.transformer(
    instance_count=1,
    instance_type="ml.c5.xlarge",
    output_path=s3_transform_path
)
(s3_transform_path)

INFO:sagemaker:Repacking model artifact (s3://usgs-chs-conte-prod-fpe-models/rank/WESTB0/FLOW_CFS/20240327/models/20240328/jobs/fpe-rank-WESTB0-20240328-215259/output/model.tar.gz), script artifact (src/), and dependencies ([]) into single tar.gz file located at s3://sagemaker-us-west-2-694155575325/pytorch-inference-2024-03-29-13-32-49-601/model.tar.gz. This may take some time depending on model size...
INFO:sagemaker:Creating model with name: pytorch-inference-2024-03-29-13-33-43-682


's3://usgs-chs-conte-prod-fpe-models/rank/WESTB0/FLOW_CFS/20240327/models/20240328/transform'

In [50]:
transformer.transform(
    data=f"{s3_transform_path}/manifest.json",
    data_type="ManifestFile",
    content_type="image/jpg",
    job_name=job_name,
    wait=False,
)

INFO:sagemaker:Creating transform job with name: fpe-rank-WESTB0-20240328-215259-transform


In [132]:
sm_session.stop_transform_job(f"{job_name}")

INFO:sagemaker:Stopping transform job: pytorch-inference-2023-09-29-00-44-57-671


### Post-process Batch Transform

In [64]:
session

Session(region_name=None)

In [62]:
def process_transform_output (session, transform_key, images_file, images_key, job_size = 5000):
    lambda_client = session.client("lambda")
    df = pd.read_csv(images_file)
    skip = 0
    while skip < len(df):
        payload = {
            "action": "process_transform_output",
            "bucket_name": model_bucket,
            "data_file": images_key,
            "data_prefix": transform_key,
            "output_prefix": transform_key,
            "n": job_size,
            "skip": skip
        }
        print(f"invoke: skip={skip}, n={job_size} ({skip} to {skip + job_size - 1})")
        lambda_client.invoke(
            FunctionName="fpe-prod-lambda-models",
            InvocationType="Event",
            Payload=json.dumps(payload)
        )
        skip += job_size
    return df

In [70]:
(transform_key, transform_images_file, transform_images_key)

('rank/WESTB0/FLOW_CFS/20240327/models/20240328/transform',
 '/mnt/d/fpe/rank/West Brook 0_01171100/FLOW_CFS/20240327/models/20240328/input/images.csv',
 'rank/WESTB0/FLOW_CFS/20240327/models/20240328/transform/images.csv')

In [71]:
process_transform_output(session, transform_key, transform_images_file, transform_images_key)

invoke: skip=0, n=5000 (0 to 4999)
invoke: skip=5000, n=5000 (5000 to 9999)
invoke: skip=10000, n=5000 (10000 to 14999)
invoke: skip=15000, n=5000 (15000 to 19999)
invoke: skip=20000, n=5000 (20000 to 24999)
invoke: skip=25000, n=5000 (25000 to 29999)
invoke: skip=30000, n=5000 (30000 to 34999)


Unnamed: 0,split,image_id,timestamp,filename,url,value
0,test-in,441411,2022-02-01T18:00:00Z,imagesets/e8d465f6-5784-4231-967f-9000428e9748...,https://usgs-chs-conte-prod-fpe-storage.s3.ama...,4.71
1,test-in,441412,2022-02-01T18:15:00Z,imagesets/e8d465f6-5784-4231-967f-9000428e9748...,https://usgs-chs-conte-prod-fpe-storage.s3.ama...,4.71
2,train,441413,2022-02-01T18:30:00Z,imagesets/e8d465f6-5784-4231-967f-9000428e9748...,https://usgs-chs-conte-prod-fpe-storage.s3.ama...,4.71
3,test-in,441414,2022-02-01T18:45:00Z,imagesets/e8d465f6-5784-4231-967f-9000428e9748...,https://usgs-chs-conte-prod-fpe-storage.s3.ama...,4.71
4,train,441415,2022-02-01T18:59:59Z,imagesets/e8d465f6-5784-4231-967f-9000428e9748...,https://usgs-chs-conte-prod-fpe-storage.s3.ama...,4.71
...,...,...,...,...,...,...
30560,test-out,4532559,2024-03-07T16:00:00Z,imagesets/4cafbbd0-66b4-49da-a5b3-ff0d8e93e4cf...,https://usgs-chs-conte-prod-fpe-storage.s3.ama...,208.00
30561,test-out,4532560,2024-03-07T16:15:00Z,imagesets/4cafbbd0-66b4-49da-a5b3-ff0d8e93e4cf...,https://usgs-chs-conte-prod-fpe-storage.s3.ama...,204.00
30562,test-out,4532561,2024-03-07T16:30:00Z,imagesets/4cafbbd0-66b4-49da-a5b3-ff0d8e93e4cf...,https://usgs-chs-conte-prod-fpe-storage.s3.ama...,204.00
30563,test-out,4532562,2024-03-07T16:45:00Z,imagesets/4cafbbd0-66b4-49da-a5b3-ff0d8e93e4cf...,https://usgs-chs-conte-prod-fpe-storage.s3.ama...,197.00


In [75]:
def combine_predictions (session, transform_key, images_file, transform_path, job_size = 5000):
    s3 = session.client("s3")
    df = pd.read_csv(images_file)
    keys = [f"{transform_key}/predictions-{skip:05d}-{(skip + job_size - 1):05d}.csv" for skip in range(0, len(df), job_size)]

    dfs = []
    for key in keys:
        print(key)
        csv_obj = s3.get_object(Bucket=model_bucket, Key=key)
        csv_data = csv_obj['Body'].read().decode('utf-8')
        dfs.append(pd.read_csv(StringIO(csv_data)))

    df = pd.concat(dfs, ignore_index=True)

    csv_buffer = StringIO()
    df.to_csv(csv_buffer, index=False)
    output_key = f"{transform_key}/predictions.csv"
    s3.put_object(Body=csv_buffer.getvalue(), Bucket=model_bucket, Key=output_key)
    if not os.path.exists(transform_path):
        os.makedirs(transform_path)
    df.to_csv(f"{transform_path}/predictions.csv", index=False)
    df


In [78]:
(transform_key, transform_images_file, transform_path)

('rank/WESTB0/FLOW_CFS/20240327/models/20240328/transform',
 '/mnt/d/fpe/rank/West Brook 0_01171100/FLOW_CFS/20240327/models/20240328/input/images.csv',
 's3://usgs-chs-conte-prod-fpe-models/rank/WESTB0/FLOW_CFS/20240327/models/20240328/transform')

In [79]:
combine_predictions(session, transform_key, transform_images_file, f"{model_dir}/transform")

rank/WESTB0/FLOW_CFS/20240327/models/20240328/transform/predictions-00000-04999.csv
rank/WESTB0/FLOW_CFS/20240327/models/20240328/transform/predictions-05000-09999.csv
rank/WESTB0/FLOW_CFS/20240327/models/20240328/transform/predictions-10000-14999.csv
rank/WESTB0/FLOW_CFS/20240327/models/20240328/transform/predictions-15000-19999.csv
rank/WESTB0/FLOW_CFS/20240327/models/20240328/transform/predictions-20000-24999.csv
rank/WESTB0/FLOW_CFS/20240327/models/20240328/transform/predictions-25000-29999.csv
rank/WESTB0/FLOW_CFS/20240327/models/20240328/transform/predictions-30000-34999.csv
