## [AWS Tutorial: YouTube Tutorials on AWS Glue](https://www.youtube.com/playlist?list=PLO95rE9ahzRsdzmZ_ZT-3uOn1Nh2eEpWB)

-   Documentation: [awsglue](https://github.com/awslabs/aws-glue-libs/tree/master/awsglue)

#### [AWS Tutorials - Data Quality Check in AWS Glue ETL Pipeline](https://www.youtube.com/watch?v=44PbyHE57aM&t=111s)

![data_quality_check](./data_quality_check.png)

###### Reusable Workflow Configuration

```json
{
  "Comment": "Data Quality Check Workflow",
  "StartAt": "StartProfileJob",
  "States": {
    "StartProfileJob": {
      "Type": "Task",
      "Resource": "arn:aws:states:::databrew:startJobRun.sync",
      "Parameters": {
        "Name.$": "$.profilejobname"
      },
      "Next": "CheckDQOutput"
    },
    "CheckDQOutput": {
      "Type": "Task",
      "Resource": "arn:aws:states:::lambda:invoke",
      "OutputPath": "$.Payload",
      "Parameters": {
        "Payload.$": "$",
        "FunctionName": "<ARN_DQ_CHECK_LAMBDA_FUNCTION>"
      },
      "Retry": [
        {
          "ErrorEquals": [
            "Lambda.ServiceException",
            "Lambda.AWSLambdaException",
            "Lambda.SdkClientException"
          ],
          "IntervalSeconds": 2,
          "MaxAttempts": 6,
          "BackoffRate": 2
        }
      ],
      "Next": "Choice"
    },
    "Choice": {
      "Type": "Choice",
      "Choices": [
        {
          "Not": {
            "Variable": "$.dqstatus",
            "StringEquals": "SUCCEEDED"
          },
          "Next": "NotifyDQFail"
        }
      ],
      "Default": "Pass"
    },
    "NotifyDQFail": {
      "Type": "Task",
      "Resource": "arn:aws:states:::sns:publish",
      "Parameters": {
        "Message.$": "$",
        "TopicArn": "<ARN_SNS_TOPIC_FOR_NOTIFICATION>"
      },
      "Next": "Fail"
    },
    "Fail": {
      "Type": "Fail",
      "Error": "Data Quality Check Failed"
    },
    "Pass": {
      "Type": "Pass",
      "End": true
    }
  }
}
```

###### Lambda Code to Check DQ Rules Result

```python
import json
import boto3

def lambda_handler(event, context):
    """Summarize the data-quality validation report written by a job run.

    The handler scans ``event["Outputs"]`` for the entry whose S3 key contains
    ``"dq-validation"``, downloads that object, parses it as JSON and reports
    the ruleset status found in its ``"rulesetResults"`` list.

    Args:
        event: Dict with a ``"JobName"`` entry and an ``"Outputs"`` list; each
            output provides ``["Location"]["Bucket"]`` and
            ``["Location"]["Key"]``.
        context: Lambda context object; not used.

    Returns:
        Dict with ``statusCode`` (always 200), ``dqstatus``, ``ruleset`` and
        ``jobname``. ``dqstatus`` and ``ruleset`` describe the first ruleset
        whose status is not ``"SUCCEEDED"``; when every ruleset succeeded they
        describe the last one. Both are empty strings when the report lists
        no rulesets.

    Raises:
        ValueError: If no output key contains ``"dq-validation"``.
    """
    jobname = event["JobName"]

    # Bucket and key are taken from the same output entry: pairing the report
    # key with the bucket of an unrelated output would address the wrong
    # object. When several entries match, the last one wins.
    bucketname = ""
    filename = ""
    for output in event["Outputs"]:
        location = output["Location"]
        if "dq-validation" in location["Key"]:
            bucketname = location["Bucket"]
            filename = location["Key"]

    # Fail with a clear message instead of sending S3 a request with an
    # empty key.
    if not filename:
        raise ValueError(
            f"No 'dq-validation' output found for job {jobname!r}"
        )

    s3 = boto3.resource('s3')
    content_object = s3.Object(bucketname, filename)
    file_content = content_object.get()['Body'].read().decode('utf-8')
    profilejson = json.loads(file_content)

    ruleset = ""
    status = ""
    for rs in profilejson["rulesetResults"]:
        ruleset = rs["name"]
        status = rs["status"]
        # Stop at the first ruleset that did not succeed so a later
        # successful ruleset cannot hide the failure.
        if status != "SUCCEEDED":
            break

    return {
        'statusCode': 200,
        'dqstatus': status,
        'ruleset': ruleset,
        'jobname': jobname,
    }
```

###### ETL Pipeline Workflow Configuration


```json
{
  "Comment": "ETL Pipeline",
  "StartAt": "Glue StartJobRun",
  "States": {
    "Glue StartJobRun": {
      "Type": "Task",
      "Resource": "arn:aws:states:::glue:startJobRun.sync",
      "Parameters": {
        "JobName": "<GLUE_JOB_HANDLING_INGESTION>"
      },
      "Next": "DQWorkflowCall"
    },
    "DQWorkflowCall": {
      "Type": "Task",
      "Resource": "arn:aws:states:::states:startExecution.sync:2",
      "Parameters": {
        "StateMachineArn": "<ARN_REUSABLE_WORKFLOW>",
        "Input": {
          "profilejobname": "<DATABREW_DATA_PROFILE_JOB_NAME>",
          "AWS_STEP_FUNCTIONS_STARTED_BY_EXECUTION_ID.$": "$$.Execution.Id"
        }
      },
      "End": true
    }
  }
}
```

#### [AWS Tutorials - Using Concurrent AWS Glue Jobs](https://www.youtube.com/watch?v=oqeiadeVEGI&list=PLO95rE9ahzRsdzmZ_ZT-3uOn1Nh2eEpWB&index=17)

###### Code

```python
#========== ingestionjob code ===========

import sys

from awsglue.transforms import *
from awsglue.utils import getResolvedOptions
from pyspark.context import SparkContext
from awsglue.context import GlueContext
from awsglue.job import Job

# Job parameters: 'tablename' selects the catalog table to read and
# 'destination' is the S3 path to write to, so the same job definition can be
# started several times with different arguments.
args = getResolvedOptions(sys.argv, ['JOB_NAME', 'tablename', 'destination'])

glueContext = GlueContext(SparkContext.getOrCreate())

# Read the requested table from the "dojodb" catalog database.
sourceDF = glueContext.create_dynamic_frame.from_catalog(
    database="dojodb",
    table_name=args['tablename'],
    redshift_tmp_dir="s3://dojo-dataset/scripts/",
)

# Write the frame to the destination path as CSV.
glueContext.write_dynamic_frame.from_options(
    sourceDF,
    connection_type="s3",
    connection_options={"path": args['destination']},
    format="csv",
)

```

###### CLI to Run Jobs

```bash
aws glue start-job-run --job-name ingestionjob --arguments '{"--tablename":"postgres_public_customers","--destination":"s3://dojo-dataset/customers"}'

aws glue start-job-run --job-name ingestionjob --arguments '{"--tablename":"postgres_public_employees","--destination":"s3://dojo-dataset/employees"}'
```

#### [ETL | AWS Glue | AWS S3 | ETL Job | Detect and remediate personal identifiable information PII](https://www.youtube.com/watch?v=4ux-byYTZnA)

#### [AWS Tutorials - Using External Libraries in AWS Glue Job](https://www.youtube.com/watch?v=8_F5nrVjOII&list=PLO95rE9ahzRsdzmZ_ZT-3uOn1Nh2eEpWB&index=20)

##### Code

```python
# cleansinglib.py
from awsglue.transforms import *

def renamefield(df, oldname, newname):
    """Apply the ``RenameField`` transform to ``df`` and return the result.

    ``oldname`` and ``newname`` are passed through to ``RenameField.apply``
    unchanged, in that order.
    """
    return RenameField.apply(df, oldname, newname)
```
---

```python
# datalib.py
def readdata(db, tbl, gc):
    """Load table ``tbl`` of catalog database ``db`` through ``gc``.

    ``gc`` must expose ``create_dynamic_frame.from_catalog``; whatever that
    call returns is handed back to the caller.
    """
    catalog_args = {
        "database": db,
        "table_name": tbl,
        "redshift_tmp_dir": "s3://dojo-dataset/script/",
    }
    return gc.create_dynamic_frame.from_catalog(**catalog_args)

def writedata(df, folder, format, gc):
    """Write ``df`` under ``s3://dojo-dataset/<folder>`` in the given format.

    ``gc`` must expose ``write_dynamic_frame.from_options``. Nothing is
    returned.
    """
    target_path = "s3://dojo-dataset/" + folder
    s3_options = {"path": target_path}
    gc.write_dynamic_frame.from_options(
        df,
        connection_type="s3",
        connection_options=s3_options,
        format=format,
    )
```
---

```python
# gluejobcode1.py
import sys
from awsglue.transforms import *
from awsglue.utils import getResolvedOptions
from pyspark.context import SparkContext
from awsglue.context import GlueContext
from awsglue.job import Job

import cleansinglib
import datalib

# Glue context built on the current (or a newly created) Spark context.
glue_ctx = GlueContext(SparkContext.getOrCreate())

# Read the order details table, rename "amount" to "salesvalue", then write
# the result to the "output/orderdetails" folder as CSV.
order_details = cleansinglib.renamefield(
    datalib.readdata("sourcedb", "srcpostgres_public_orderdetails", glue_ctx),
    "amount",
    "salesvalue",
)
datalib.writedata(order_details, "output/orderdetails", "csv", glue_ctx)
```
---

```python
# gluejobcode2.py
import sys
from awsglue.transforms import *
from awsglue.utils import getResolvedOptions
from pyspark.context import SparkContext
from awsglue.context import GlueContext
from awsglue.job import Job

import cleansinglib
import datalib

# Glue context built on the current (or a newly created) Spark context.
glue_ctx = GlueContext(SparkContext.getOrCreate())

# Read the customers table and write it to the "output/customers" folder
# as JSON.
customers = datalib.readdata("sourcedb", "srcpostgres_public_customers", glue_ctx)
datalib.writedata(customers, "output/customers", "json", glue_ctx)
```

#### [AWS Tutorials - Using Glue Job ETL from REST API Source to Amazon S3 Bucket Destination](https://www.youtube.com/watch?v=f5Coh7C7V7I&t=153s)

#### [Build a Spark pipeline to analyze streaming data using AWS Glue, Apache Hudi, S3 and Athena](https://www.youtube.com/watch?v=uJI6B4MPmoM&t=193s)