In [2]:
import boto3, botocore
from botocore.exceptions import ClientError
import os, time, json, io, zipfile
from datetime import date

from misc import load_from_yaml, save_to_yaml
import iam, s3, lf, rds, vpc, ec2

In [3]:
# Account and region are read from the environment so nothing account-specific
# is hardcoded in the notebook.
#   - AWS_ACCOUNT_ID_ROOT is mandatory: a missing variable raises KeyError here.
#   - AWS_DEFAULT_REGION is optional: REGION is None when it is unset.
ACCOUNT_ID = os.environ["AWS_ACCOUNT_ID_ROOT"]
REGION = os.environ.get("AWS_DEFAULT_REGION")
# Optional: uncomment to use a named credentials profile instead of the default chain.
# boto3.setup_default_session(profile_name="AMominNJ")

In [4]:
# --- Compute / networking -----------------------------------------------------
# The EC2 client and resource are the only handles given REGION explicitly.
ec2_client = boto3.client("ec2", region_name=REGION)
ec2_resource = boto3.resource("ec2", region_name=REGION)

# --- Identity, storage and databases ------------------------------------------
# Everything below is created without an explicit region_name.
sts_client = boto3.client("sts")
rds_client = boto3.client("rds")
iam_client = boto3.client("iam")
s3_client = boto3.client("s3")

# --- Data lake / ETL ----------------------------------------------------------
glue_client = boto3.client("glue")
lakeformation_client = boto3.client("lakeformation")

# --- Orchestration and serverless ---------------------------------------------
stepfunctions_client = boto3.client("stepfunctions")
apigateway_client = boto3.client("apigateway")
lsn_client = boto3.client("lambda")  # name kept as-is; this is the Lambda client
events_client = boto3.client("events")


# [Glue](https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/glue.html)

- [AWS Glue ETL scripts in PySpark](https://docs.aws.amazon.com/glue/latest/dg/aws-glue-programming-python.html)
- [Using interactive sessions with Microsoft Visual Studio Code](https://docs.aws.amazon.com/glue/latest/dg/interactive-sessions-vscode.html)

##### Glue API

In [None]:
# [method for method in dir(glue_client) if not method.startswith("_")]

###### Triggers


1. `batch_get_triggers()`
1. `create_trigger()`
1. `delete_trigger()`
1. `get_trigger()`
1. `get_triggers()`
1. `list_triggers()`
1. `start_trigger()`
1. `stop_trigger()`
1. `update_trigger()`


###### Crawlers



1. `batch_get_crawlers()`
1. `create_crawler()`
1. `delete_crawler()`
1. `get_crawler()`
1. `get_crawler_metrics()`
1. `get_crawlers()`
1. `list_crawlers()`
1. `start_crawler()`
1. `start_crawler_schedule()`
1. `stop_crawler()`
1. `stop_crawler_schedule()`
1. `update_crawler()`
1. `update_crawler_schedule()`


###### Jobs


1. `batch_get_jobs()`
1. `batch_stop_job_run()`
1. `create_job()`
1. `delete_job()`
1. `get_job()`
1. `get_job_bookmark()`
1. `get_job_run()`
1. `get_job_runs()`
1. `get_jobs()`
1. `list_jobs()`
1. `reset_job_bookmark()`
1. `start_job_run()`
1. `update_job()`
1. `update_job_from_source_control()`
1. `update_source_control_from_job()`


###### Workflows


1. `batch_get_workflows()`
1. `create_workflow()`
1. `delete_workflow()`
1. `get_workflow()`
1. `get_workflow_run()`
1. `get_workflow_run_properties()`
1. `get_workflow_runs()`
1. `list_workflows()`
1. `put_workflow_run_properties()`
1. `resume_workflow_run()`
1. `start_workflow_run()`
1. `stop_workflow_run()`
1. `update_workflow()`

### Workflow

In [None]:
# Create the (initially empty) Glue workflow that the triggers below attach to.
workflow_name = "MyGlueWorkflow"

# Request parameters are assembled first so the API call stays on one line.
create_workflow_args = {
    "Name": workflow_name,
    "Description": "Workflow to crawl S3 data and run a job.",
}
response = glue_client.create_workflow(**create_workflow_args)
print(f"Workflow '{workflow_name}' created successfully.")


- Add Workflow Triggers: Add triggers to orchestrate the crawler and job sequentially.

In [None]:
# On-demand trigger attached to the workflow; its single action runs the crawler.
# NOTE(review): `crawler_name` is not defined in the cells shown here — confirm
# it is set by an earlier cell before running this one.
trigger_name_crawler = "TriggerCrawler"

crawler_action = {"CrawlerName": crawler_name}
response = glue_client.create_trigger(
    Name=trigger_name_crawler,
    WorkflowName=workflow_name,
    Type="ON_DEMAND",
    Actions=[crawler_action],
)
print(f"Trigger '{trigger_name_crawler}' created successfully.")


In [None]:
# Conditional trigger for the job: fires once the crawler has finished with
# state SUCCEEDED, so crawler and job run one after the other in the workflow.
# NOTE(review): `job_name` and `crawler_name` are not defined in the cells shown
# here — confirm they are set by an earlier cell before running this one.
trigger_name_job = "TriggerJob"

response = glue_client.create_trigger(
    Name=trigger_name_job,
    Type="CONDITIONAL",
    WorkflowName=workflow_name,
    # CONDITIONAL triggers are created deactivated unless StartOnCreation is
    # true; without it the job would never start after the crawler succeeds
    # (a separate start_trigger() call would be needed).
    StartOnCreation=True,
    Actions=[{
        "JobName": job_name
    }],
    Predicate={
        "Conditions": [{
            "LogicalOperator": "EQUALS",
            "CrawlerName": crawler_name,
            "CrawlState": "SUCCEEDED"
        }]
    }
)
print(f"Trigger '{trigger_name_job}' created successfully.")


- Start and Monitor Workflow Execution

In [None]:
# Kick off a run of the workflow and remember its RunId for the status polling.
start_run_args = {"Name": workflow_name}
response = glue_client.start_workflow_run(**start_run_args)
workflow_run_id = response["RunId"]
print(f"Workflow started successfully with RunId: {workflow_run_id}")

In [None]:
# Poll the workflow run every 10 seconds until it reaches a terminal state.
# Glue reports a failed workflow run with status 'ERROR' (not 'FAILED'), so it
# must be in the terminal set or the loop never exits on failure. 'FAILED' is
# kept only so the previously accepted value still ends the loop.
terminal_statuses = {'COMPLETED', 'STOPPED', 'ERROR', 'FAILED'}

while True:
    response = glue_client.get_workflow_run(
        Name=workflow_name,
        RunId=workflow_run_id
    )
    status = response['Run']['Status']
    print(f"Workflow status: {status}")
    if status in terminal_statuses:
        break
    time.sleep(10)
