### Let's load the basic Spark and Glue libraries.

In [1]:
import boto3
import botocore
import sys
from pyspark.sql import functions as F
from pyspark.sql import types as T
from pyspark.sql import SparkSession
from pyspark.sql.window import Window
from pyspark.sql import SparkSession
from awsglue.context import GlueContext
from pyspark.context import SparkContext
from awsglue.utils import getResolvedOptions
from awsglue.job import Job

Starting Spark application


ID,YARN Application ID,Kind,State,Spark UI,Driver log,User,Current session?
0,application_1648483062748_0001,pyspark,idle,Link,Link,,✔


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

SparkSession available as 'spark'.


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

## Let's initialize a Spark context and a `spark` session variable for our big data processing

In [2]:
# Reuse the notebook's existing SparkContext if there is one, otherwise create it.
sc = SparkContext.getOrCreate()
# Wrap the SparkContext in a GlueContext.
glueContext = GlueContext(sc)
# getOrCreate() returns the already-running session when one exists.
# NOTE(review): in that case the appName given here presumably does not rename
# the running application — confirm if the name matters.
spark = SparkSession.builder.appName("index_create").getOrCreate()
# NOTE(review): `job` is not referenced by any later cell visible in this notebook.
job = Job(glueContext)
# Ask Spark to use dynamic partition-overwrite mode for overwrite writes.
# NOTE(review): none of the writes visible in this notebook are partitioned,
# so this setting likely has no effect here — confirm before relying on it.
spark.conf.set("spark.sql.sources.partitionOverwriteMode","DYNAMIC")
# NOTE(review): get_most_recent_s3_object builds its own client; this module-level
# client is not referenced by any later cell visible in this notebook.
s3_client = boto3.client('s3')

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

### Let's download the data

``unzip data_prep_component.zip -d data_prep_component``

``!aws s3 cp data_prep_component/ s3://datascience-ml-workshop-prep/data_prep_component/ --recursive``

### Below are some of the utility functions that we will be using

In [39]:
def load_latest(spark, bucket_name, source, upsert_or_delete):
    """Read the most recently modified batch under ``<source>/<upsert_or_delete>/``.

    The S3 location is resolved with ``get_most_recent_s3_object`` and then read
    as a tab-separated CSV with a header row.

    Args:
        spark: SparkSession used for the read.
        bucket_name: Name of the S3 bucket to search.
        source: Key prefix that contains the batch sub-folders.
        upsert_or_delete: Sub-folder name under ``source`` (the notebook passes
            'upserts' or 'deletes').

    Returns:
        The loaded DataFrame with the '_c0' column dropped.
    """
    # Trailing empty element yields the closing "/" of the prefix.
    batch_prefix = "/".join([source, upsert_or_delete, ""])
    path = get_most_recent_s3_object(bucket_name, batch_prefix)
    print("Currently Reading", path)
    # '_c0' is presumably an unnamed index column written by the exporter.
    return spark.read.csv(path, header=True, sep='\t').drop('_c0')

def get_most_recent_s3_object(bucket_name,prefix):
    """Return the S3 "folder" URI that holds the most recently modified object.

    Every object under ``prefix`` in ``bucket_name`` is listed (following
    pagination). The object with the greatest ``LastModified`` wins, and the
    final path segment of its key is stripped to get the containing folder.

    Args:
        bucket_name: Name of the S3 bucket to list.
        prefix: Key prefix to restrict the listing to.

    Returns:
        A string of the form ``s3://<bucket_name>/<folder of newest key>``.

    Raises:
        ValueError: If no object exists under ``prefix``. The original code
            raised the same type from ``max()`` with an unhelpful message.
    """
    s3 = boto3.client('s3')
    paginator = s3.get_paginator("list_objects_v2")
    newest = None
    for page in paginator.paginate(Bucket=bucket_name, Prefix=prefix):
        # Pages without matching keys carry no "Contents" entry.
        for obj in page.get("Contents", []):
            # Strict ">" keeps the first object seen when timestamps tie.
            if newest is None or obj['LastModified'] > newest['LastModified']:
                newest = obj
    if newest is None:
        raise ValueError(
            "No objects found under s3://{}/{}".format(bucket_name, prefix)
        )
    # Drop the file name; what remains is the folder holding the newest batch.
    latest_data_path = newest['Key'].rpartition("/")[0]
    return 's3://' + bucket_name + '/' + latest_data_path

def process_incrememtal_upserts(spark,delta_upserts, processed_data):
    """Merge a batch of upserts into the processed data, one row per key.

    The two frames are unioned by column name and, for every 'dp_unique_key',
    only the row with the greatest 'updated_date' is kept. When an existing row
    and an incoming row share the same 'updated_date', the incoming row wins.

    The original used ``dense_rank``, which assigns rank 1 to every row tied on
    'updated_date' and therefore left duplicate keys in the output.
    ``row_number`` guarantees exactly one surviving row per key.

    NOTE(review): in this notebook both frames are read with ``spark.read.csv``
    without a schema, so 'updated_date' is compared as a string — confirm the
    date format sorts chronologically.

    Args:
        spark: Unused; kept so the signature matches the other helpers.
        delta_upserts: DataFrame of new or changed rows.
        processed_data: DataFrame of previously processed rows, with the same
            columns as ``delta_upserts``.

    Returns:
        DataFrame with the columns of ``processed_data`` and one row per key.
    """
    # Helper columns; the double underscore avoids clashing with data columns.
    source_flag = '__is_delta'
    row_rank = '__upsert_row_number'

    merged = processed_data.withColumn(source_flag, F.lit(0)).unionByName(
        delta_upserts.withColumn(source_flag, F.lit(1))
    )
    # Newest date first; on equal dates the delta row (flag 1) sorts first.
    w = Window.partitionBy('dp_unique_key').orderBy(
        F.desc('updated_date'), F.desc(source_flag)
    )
    ranked = merged.withColumn(row_rank, F.row_number().over(w))
    final_upsert_data = ranked.filter(F.col(row_rank) == 1).drop(row_rank, source_flag)
    return final_upsert_data

def process_first_upserts(spark,delta_upserts):
    """Return the first batch of upserts unchanged.

    On the first run there is no processed data to merge with, so the incoming
    batch is already the result.

    Args:
        spark: Unused; kept so the signature matches the other helpers.
        delta_upserts: The incoming upsert batch.

    Returns:
        ``delta_upserts`` itself (the same object).
    """
    first_batch = delta_upserts
    return first_batch
    

def process_incrememtal_deletes(spark, delta_deletes, processed_data):
    """Remove from the processed data every key listed in a delete batch.

    Args:
        spark: Unused; kept so the signature matches the other helpers.
        delta_deletes: DataFrame of rows to delete, matched on 'dp_unique_key'.
        processed_data: DataFrame of previously processed rows.

    Returns:
        ``processed_data`` left-anti-joined with ``delta_deletes`` on
        'dp_unique_key', or ``None`` when the delete batch has no rows (callers
        use ``None`` to skip rewriting the output).
    """
    # Nothing to delete: signal "no change" to the caller.
    if delta_deletes.count() <= 0:
        return None

    # left_anti keeps only processed rows whose key is absent from the deletes.
    surviving_rows = processed_data.join(delta_deletes, 'dp_unique_key', 'left_anti')
    return surviving_rows
    
def write_files(data, bucket_name, destination, script_type='processed_data'):
    """Write ``data`` to S3 as tab-separated CSV via a temporary staging copy.

    The frame is first written to ``<destination>/tmp/<script_type>_tmp``, read
    back from there, and only then written to ``<destination>/<script_type>``.
    Presumably the staging hop exists because ``data`` may itself be derived
    from the final location, which cannot be overwritten while being read.

    Uses the notebook-level ``spark`` session for the read-back.

    Args:
        data: DataFrame to persist.
        bucket_name: Target S3 bucket.
        destination: Key prefix inside the bucket.
        script_type: Name of the output folder under ``destination``.
    """
    base_uri = "s3://" + bucket_name + "/" + destination
    staging_uri = base_uri + "/tmp/" + script_type + "_tmp"
    final_uri = base_uri + "/" + script_type
    csv_options = dict(header=True, sep='\t')

    data.write.mode("overwrite").csv(staging_uri, **csv_options)
    staged = spark.read.csv(staging_uri, **csv_options)
    staged.write.mode("overwrite").csv(final_uri, **csv_options)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

## Define some useful variables

In [9]:
# S3 bucket used for both the incoming batches and the written output.
bucket_name = 'datascience-ml-workshop-prep'
# Key prefix under which load_latest looks for the 'upserts' / 'deletes' batches.
source = 'data_prep_component'
# Key prefix under which write_files stores its output.
destination = 'labeling_data_component/data_prep_output'
# Default run mode; a later cell switches it to "first" when the processed
# data cannot be read.
run = "incremental"

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

### Step 1: Let's load the DP data that we received today, 21st of March, and process the upserts

In [11]:
# Imported here so the cell is self-contained.
from pyspark.sql.utils import AnalysisException

try:
    # Previously processed output; absent on the very first run.
    processed_data = spark.read.csv("s3://"+bucket_name+ "/"+destination+"/processed_data/", header=True, sep='\t')
except AnalysisException:
    # Spark raises AnalysisException when the path does not exist. Only that
    # case means "first run". Any other failure (permissions, connectivity,
    # interrupt) must surface: treating it as a first run would later replace
    # the processed data with just the delta.
    run="first"
# Most recent upsert batch delivered under <source>/upserts/.
delta_upserts = load_latest(spark, bucket_name, source,  'upserts')

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

Currently Reading s3://datascience-ml-workshop-prep/data_prep_component/upserts/03-21-2022

In [12]:
# Choose the merge strategy based on whether processed data already existed.
if run=="first":
    print("processing_first_run")
    # No previous output: the batch itself is the result.
    final_data = process_first_upserts(spark, delta_upserts)
else:
    print("processing_incremental_run")
    # Merge the batch into the existing output, keeping the newest row per key.
    final_data = process_incrememtal_upserts(spark,delta_upserts,processed_data)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

processing_first_run

In [13]:
# Sanity check: row count of the incoming batch versus the merged result.
print("Count of 1st batch upserts:", delta_upserts.count())
print("Count After 1st batch upserts is processed:", final_data.count())

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

Count of 1st batch upserts: 72004
Count After 1st batch upserts is processed: 72004

In [16]:
# Persist the merged result under <destination>/processed_data (the default script_type).
write_files(final_data, bucket_name, destination)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

### Step 2: Let's now process the deletes

In [18]:
# Most recent delete batch delivered under <source>/deletes/.
delta_deletes = load_latest(spark, bucket_name, source,  'deletes')

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

Currently Reading s3://datascience-ml-workshop-prep/data_prep_component/deletes/03-21-2022

In [20]:
# Re-read the output written by the upsert step so deletes apply to the latest state.
processed_data = spark.read.csv("s3://"+bucket_name+ "/"+destination+"/processed_data", header=True, sep='\t')
data_post_delete_processing = process_incrememtal_deletes(spark, delta_deletes, processed_data)
# None means the delete batch was empty, so the stored output is left untouched.
if data_post_delete_processing is not None:
        write_files(data_post_delete_processing, bucket_name, destination)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [22]:
# Re-read the stored output and report counts after the delete step.
processed_data = spark.read.csv("s3://"+bucket_name+ "/"+destination+"/processed_data", header=True, sep='\t')
print("Count of 1st batch Delete:", delta_deletes.count())
print("Count After 1st batch upserts & Deletes are processed:", processed_data.count())

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

Count of 1st batch Delete: 0
Count After 1st batch upserts & Deletes are processed: 72004

### Imagine it is now tomorrow, 22nd March, and we receive another batch of upserts and deletes. Let's process it.

### But before that, let's mimic the Data Platform API by running the command below in the terminal.

``bash run_DP_API.sh``

### Now let's do the same again so that the new data gets processed

### Let's process the upserts first

In [29]:
# Imported here so the cell is self-contained.
from pyspark.sql.utils import AnalysisException

# Same configuration as the first pass, repeated so this cell runs on its own.
bucket_name = 'datascience-ml-workshop-prep'
source = 'data_prep_component'
destination = 'labeling_data_component/data_prep_output'
run = "incremental"
try:
    # Output of the previous day's run.
    processed_data = spark.read.csv("s3://"+bucket_name+ "/"+destination+"/processed_data/", header=True, sep='\t')
except AnalysisException:
    # Spark raises AnalysisException when the path does not exist. Only that
    # case means "first run". Any other failure must surface, otherwise the
    # write at the end of this cell would replace the processed data with
    # just the delta.
    run="first"
# Most recent upsert batch delivered under <source>/upserts/.
delta_upserts = load_latest(spark, bucket_name, source,  'upserts')
if run=="first":
    print("processing_first_run")
    final_data = process_first_upserts(spark, delta_upserts)
else:
    print("processing_incremental_run")
    # Merge the batch into the existing output, keeping the newest row per key.
    final_data = process_incrememtal_upserts(spark,delta_upserts,processed_data)

print("Count of 2nd batch upserts:", delta_upserts.count())
print("Count After 2nd batch upserts is processed:", final_data.count())
# Persist the merged result under <destination>/processed_data.
write_files(final_data, bucket_name, destination)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

Currently Reading s3://datascience-ml-workshop-prep/data_prep_component/upserts/03-22-2022
processing_incremental_run
Count of 2nd batch upserts: 2690
Count After 2nd batch upserts is processed: 74686

In [28]:
# (previous total + rows in the new batch) - total after the merge
# = rows in the batch that replaced an existing key instead of adding one.
# The literals are copied from the counts printed by the earlier cells.
(72004+2690)-74686

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

8

### These are the 8 records that were updated out of the 2690 rows in the new batch (2682 inserts + 8 updates)

### Let's process the deletes now.

In [32]:
# Most recent delete batch delivered under <source>/deletes/.
delta_deletes = load_latest(spark, bucket_name, source,  'deletes')
# Re-read the output written by the upsert step so deletes apply to the latest state.
processed_data = spark.read.csv("s3://"+bucket_name+ "/"+destination+"/processed_data", header=True, sep='\t')
data_post_delete_processing = process_incrememtal_deletes(spark, delta_deletes, processed_data)
# None means the delete batch was empty, so the stored output is left untouched.
if data_post_delete_processing is not None:
    write_files(data_post_delete_processing, bucket_name, destination)
# Re-read the stored output and report counts after the delete step.
processed_data = spark.read.csv("s3://"+bucket_name+ "/"+destination+"/processed_data", header=True, sep='\t')
print("Count of 2nd batch Delete:", delta_deletes.count())
print("Count After 2nd batch upserts & Deletes are processed:", processed_data.count())

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

Currently Reading s3://datascience-ml-workshop-prep/data_prep_component/deletes/03-22-2022
Count of 2nd batch Delete: 4
Count After 2nd batch upserts & Deletes are processed: 74682

### These are the 4 deletes that were processed (74686 - 4 = 74682)

## Enrichment Script

In [41]:
import boto3
import botocore
import sys
from pyspark.sql import functions as F
from pyspark.sql import types as T
from pyspark.sql import SparkSession
from pyspark.sql.window import Window
from pyspark.sql import SparkSession
from awsglue.context import GlueContext
from pyspark.context import SparkContext
from awsglue.utils import getResolvedOptions
from awsglue.job import Job

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [34]:
# Mapping file read with the default CSV separator and a header row.
# NOTE(review): relies on bucket_name and source set by earlier cells.
data_part_2 = spark.read.csv('s3://'+bucket_name +'/'+ source + '/id_entity_mapper.csv',header=True )

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [35]:
# Processed data produced by the upsert/delete steps (tab-separated, with header).
data_part_1 = spark.read.csv("s3://"+bucket_name+ "/"+destination+"/processed_data", header=True, sep='\t')

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [36]:
# Inner join on both keys: rows without a match in the mapping file are dropped.
# '_c0' is presumably an unnamed index column from the mapping CSV — TODO confirm.
joined_data = data_part_1.join(data_part_2, ['Id','dp_unique_key'], 'inner').drop('_c0')

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [37]:
# Row count after the join; compare with the processed-data count to see how
# many rows had no match in the mapping file.
joined_data.count()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

74680

In [40]:
# Persist the enriched result under <destination>/enriched_data.
write_files(joined_data, bucket_name, destination, 'enriched_data')

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…