In [7]:
import sys
from pyspark.sql import functions as F
from pyspark.sql.functions import * 
from awsglue.job import Job
from awsglue.transforms import *
from awsglue.utils import getResolvedOptions
from pyspark.context import SparkContext
from awsglue.context import GlueContext

sc = SparkContext.getOrCreate()
gc = GlueContext(sc)



FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [19]:
# #CREATE DATAFRAME WITH NULL VALUES
# source_df = gc.create_data_frame_from_catalog( 
#     database = "invoices-db", 
#     table_name = "trucape_invoices", 
# )

source_dyf = gc.create_dynamic_frame.from_options(
    's3',
    {
        "paths": [
            's3://sol-dev-source/TruCape-Invoices'
        ],
        "recurse" : True
    },
    "csv",
    transformation_ctx = "source_dyf",
    headerText = True)



FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [13]:
# source_df = DropNullFields.apply(frame = source_dyf)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

null_fields []

In [20]:
source_df = source_dyf.toDF()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [21]:
source_df.show()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+------------------+------+-------------------+------------+--------+-------------+------------+--------+----------+
|              col0|  col1|               col2|        col3|    col4|         col5|        col6|    col7|      col8|
+------------------+------+-------------------+------------+--------+-------------+------------+--------+----------+
|External Reference|Number|       Counterparty| Amount Due |Currency|Cost Currency|Costing Rate|Due Date|Issue Date|
|           SC25648|138099|         JSC TANDER|  21,124.80 |     USD|          ZAR|     15.5654|   7-Apr|    21-Jan|
|           SC25646|138127|HORIZON FRESH FRUIT|  17,100.00 |     USD|          ZAR|     15.5142|  17-Mar|    21-Jan|
|           SC25645|138129|HORIZON FRESH FRUIT|  17,100.00 |     USD|          ZAR|     15.5142|  17-Mar|    21-Jan|
+------------------+------+-------------------+------------+--------+-------------+------------+--------+----------+

In [None]:
# (b) Create array with desired columns
old_columns = source_df.schema.names
new_columns = [
    field.lower().replace(" ", "_").replace(".", "_") for field in old_columns
]

# (c) Overwrite and persist `new_columns`
df = reduce(
    lambda df, idx: df.withColumnRenamed(old_columns[idx], new_columns[idx]),
    range(len(old_columns)),
    df,
)

# (d) Convert back to DynamicFrame
datasource = datasource.fromDF(df, glue_context, "datasource")

# Write DynamicFrame as Parquet
datasink = glue_context.write_dynamic_frame_from_options(
    frame=datasource,
    connection_type="s3",
    connection_options={"path": "s3://path/to/target/prefix/"},
    format="parquet",
)

In [23]:

# SET DECIMAL PLACES
costing_rate="Costing Rate"
format_costing_rate = format_number(round(costing_rate ,4), 4)
amount_due="amount due"
format_amount_due=format_number(round(amount_due ,2), 2)

source_df = source_df.withColumn(costing_rate, format_costing_rate) \
# amount due gives null value - not sure why



FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [24]:
source_df.show()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+------------------+------+-------------------+------------+--------+-------------+-------+--------+----------+
|              col0|  col1|               col2|        col3|    col4|         col5|   col6|    col7|      col8|
+------------------+------+-------------------+------------+--------+-------------+-------+--------+----------+
|External Reference|Number|       Counterparty| Amount Due |Currency|Cost Currency|   null|Due Date|Issue Date|
|           SC25648|138099|         JSC TANDER|  21,124.80 |     USD|          ZAR|15.5654|   7-Apr|    21-Jan|
|           SC25646|138127|HORIZON FRESH FRUIT|  17,100.00 |     USD|          ZAR|15.5142|  17-Mar|    21-Jan|
|           SC25645|138129|HORIZON FRESH FRUIT|  17,100.00 |     USD|          ZAR|15.5142|  17-Mar|    21-Jan|
+------------------+------+-------------------+------------+--------+-------------+-------+--------+----------+

In [None]:
# ADD YEAR TO DATE COLUMNS
due_date="due date"
issue_date="issue date"
year = F.lit("-2022")
due_date_add_year = F.concat(col(due_date),year)
issue_date_add_year = F.concat(col(issue_date),year)
                               
source_df = source_df.withColumn(due_date, due_date_add_year) \
    .withColumn(issue_date, issue_date_add_year)
    


In [None]:
# FORMAT DATES
source_df = source_df.select(
    col("external reference"), \
    col("number"), \
    col("counterparty"), \
    col("amount due"), \
    col("currency"), \
    col("cost currency"), \
    col("costing rate"), \
    to_date(F.col("due date"), "d-MMM-yyy").alias("due date"), \
    to_date(F.col("issue date"), "d-MMM-yyy").alias("issue date")) \
    
source_df.write.csv(path='s3://sol-dev-output/TruCape-Invoices', mode='overwrite')