In [2]:
import sys
import pyspark
from awsglue.transforms import *
from awsglue.utils import getResolvedOptions
from pyspark.context import SparkContext,SparkConf
from awsglue.context import GlueContext
from pyspark.sql import Window,SparkSession
from pyspark.sql.types import StringType,StructType,StructField,ArrayType,IntegerType
from pyspark.sql.functions import struct,arrays_zip, explode,col,collect_list,when,expr,row_number,concat_ws,array_distinct,lit,trim,when,array_contains,filter,size,from_json,to_json
from awsglue import DynamicFrame
from awsglue.job import Job

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [3]:
# Session bootstrap for the notebook: later cells rely on `glueContext` (and may rely on
# `sc`, `spark` and `job`) being defined here.
# Reuse the already-running SparkContext if there is one, otherwise create it.
sc = SparkContext.getOrCreate()
# GlueContext wraps the SparkContext and is the entry point for creating DynamicFrames.
glueContext = GlueContext(sc)
# SparkSession exposed by the GlueContext.
spark = glueContext.spark_session
# NOTE(review): the Job object is only constructed; no job.init(...) / job.commit() call
# appears in the visible cells — confirm whether that is intentional.
job = Job(glueContext)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [4]:
import boto3, json
from awsglue import DynamicFrame
from pyspark.sql import Window
from pyspark.sql.types import *
from pyspark.sql.functions import col, row_number, col, row_number

def get_dyf_from_s3(glueContext, source_config, format='csv'):
    """Read gzip-compressed files under an S3 prefix into a Glue DynamicFrame.

    The prefix read is ``source_config['s3_path'] + source_config['source'] + "/"``
    and it is traversed recursively. The optional keys ``groupFiles``,
    ``groupSize`` and ``useS3ListImplementation`` are copied into the connection
    options when they are present in ``source_config``.

    Args:
        glueContext: object exposing ``create_dynamic_frame.from_options``.
        source_config: mapping that contains at least ``s3_path`` and ``source``.
        format: format name handed to the reader unchanged (default ``'csv'``).

    Returns:
        The value returned by ``create_dynamic_frame.from_options``.
    """
    s3_prefix = source_config['s3_path'] + source_config['source'] + "/"
    connection_options = {"paths": [s3_prefix], "recurse": True, "compression": "gzip"}

    # Forward optional reader-tuning settings only when the caller supplied them.
    for optional_key in ('groupFiles', 'groupSize', 'useS3ListImplementation'):
        if optional_key in source_config:
            connection_options[optional_key] = source_config[optional_key]

    # "withHeader" is sent for every value of `format`, not only for CSV.
    reader = glueContext.create_dynamic_frame
    return reader.from_options(
        connection_type="s3",
        connection_options=connection_options,
        format=format,
        format_options={"withHeader": True},
    )

def read_json_from_s3(bucket_name, path):
    """Download an S3 object and parse its UTF-8 text as JSON.

    Args:
        bucket_name: name of the bucket holding the object.
        path: key of the object inside the bucket.

    Returns:
        The Python value produced by ``json.loads`` for the object's content.
    """
    s3_object = boto3.resource('s3').Object(bucket_name, path)
    raw_bytes = s3_object.get()['Body'].read()
    return json.loads(raw_bytes.decode('utf-8'))


def _resolve_mappings(dyf_schema, mappings):
    """Build apply_mapping tuples, replacing a "Dynamic" source type with the type found in dyf_schema."""
    resolved = []
    for mapping in mappings:
        if mapping[1] == "Dynamic":
            # Keep whatever type the source field already has.
            source_type = dyf_schema.getField(mapping[0]).dataType.typeName()
            resolved.append((mapping[0], source_type, mapping[2], mapping[3]))
        else:
            resolved.append(tuple(mapping))
    return resolved


def transform(glueContext, source_config, config, source_inclussions = None):
    """Load, remap and optionally number the rows of each configured element.

    For every selected entry of ``config`` the source data is read, the entry's
    column mappings are applied and, when ``autoincrement_column`` is truthy, a
    1-based row number ordered by ``autoincrement_order_by`` is added under that
    column name.

    NOTE(review): ``el_config["source"]` is not consulted; every element is read
    from the location described by ``source_config`` (see ``get_dyf_from_s3``).
    Confirm whether a per-element source was intended.

    Args:
        glueContext: Glue context used for reading and for building DynamicFrames.
        source_config: mapping with a ``type`` key; only ``'s3'`` is implemented.
        config: mapping of element name -> element settings. Each element needs
            ``destination``, ``mappings`` and ``autoincrement_column`` (plus
            ``autoincrement_order_by`` when ``autoincrement_column`` is truthy).
        source_inclussions: container of element names to process. ``None``
            (the default) means no filtering: every element is processed.

    Returns:
        dict mapping element name -> ``{'table': destination, 'data': DynamicFrame}``.

    Raises:
        ValueError: if ``source_config['type']`` is not ``'s3'``.
    """
    dyf_map = {}
    for el_name, el_config in config.items():
        # The default None used to crash on the membership test; treat it as "include all".
        if source_inclussions is not None and el_name not in source_inclussions:
            continue

        destination = el_config["destination"]
        mappings = el_config["mappings"]
        autoincrement_column = el_config["autoincrement_column"]

        # Fail with a clear message instead of hitting an unbound `dyf` further down.
        if source_config['type'] != 's3':
            raise ValueError(
                f"Unsupported source type {source_config['type']!r}; only 's3' is implemented"
            )
        dyf = get_dyf_from_s3(glueContext, source_config)

        # Apply column renaming and data type transformations
        dyf_t = dyf.apply_mapping(_resolve_mappings(dyf.schema(), mappings))

        if autoincrement_column:
            # The window has no partitionBy, so the numbering is global over the whole frame.
            window_spec_id = Window.orderBy(el_config["autoincrement_order_by"])
            df_t = dyf_t.toDF().withColumn(autoincrement_column, row_number().over(window_spec_id))
            dyf_t = DynamicFrame.fromDF(df_t, glueContext, "dyf_t")

        dyf_map[el_name] = {'table': destination, 'data': dyf_t}
    return dyf_map

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [11]:
# Load the Parquet files under the zt024_attr_test/data/ prefix into a DynamicFrame.
# "recurse" makes the reader descend into sub-prefixes of that folder.
# No format_options are passed: "withHeader" is a CSV reader option and does not
# apply to Parquet input.
datasource_mat_plant_attr = glueContext.create_dynamic_frame.from_options(
    connection_type="s3",
    connection_options={
        "paths": ["s3://delta-glue-tests-2/laive_test/test_deltalake/zt024_attr_test/data/"],
        "recurse": True,
    },
    format="parquet",
)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [7]:
# Keep the DataFrame itself: DataFrame.show() only prints and returns None, so
# assigning its result would leave `readcsv` bound to None.
readcsv = datasource_mat_plant_attr.toDF()
# Preview up to 100 rows without truncating long cell values.
readcsv.show(100, truncate=False)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+-----+------------------+------------+-----+-----+----------+----------+---------+--------------+--------------+
|EKGRP|EKNAM             |EKTEL       |LDEST|TELFX|TEL_NUMBER|TEL_EXTENS|SMTP_ADDR|ODQ_CHANGEMODE|ODQ_ENTITYCNTR|
+-----+------------------+------------+-----+-----+----------+----------+---------+--------------+--------------+
|G23  |Valor_Servicios   |            |     |     |          |          |         |              |0             |
|G03  |Servicios         |            |     |     |          |          |         |              |0             |
|001  |Einkäufer 1       |06227/341285|LP01 |     |          |          |         |              |0             |
|G03  |Servicios         |            |     |     |          |          |         |              |0             |
|001  |Einkäufer 1       |06227/341285|LP01 |     |          |          |         |              |0             |
|G04  |Activo            |            |     |     |          |          |         |     