In [1]:
from pyspark.sql import SparkSession
# Start (or reuse) a local Spark session that runs with two worker threads.
# No JDBC driver or any other extra jar is configured on this builder.
spark = (
    SparkSession.builder
    .master('local[2]')
    .appName('datasource-api')
    .getOrCreate()
)


SLF4J: Class path contains multiple SLF4J bindings.
SLF4J: Found binding in [jar:file:/home/glue_user/spark/jars/log4j-slf4j-impl-2.17.2.jar!/org/slf4j/impl/StaticLoggerBinder.class]
SLF4J: Found binding in [jar:file:/home/glue_user/spark/jars/slf4j-reload4j-1.7.36.jar!/org/slf4j/impl/StaticLoggerBinder.class]
SLF4J: Found binding in [jar:file:/home/glue_user/aws-glue-libs/jars/log4j-slf4j-impl-2.17.2.jar!/org/slf4j/impl/StaticLoggerBinder.class]
SLF4J: Found binding in [jar:file:/home/glue_user/aws-glue-libs/jars/slf4j-reload4j-1.7.36.jar!/org/slf4j/impl/StaticLoggerBinder.class]
SLF4J: See http://www.slf4j.org/codes.html#multiple_bindings for an explanation.
SLF4J: Actual binding is of type [org.apache.logging.slf4j.Log4jLoggerFactory]
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


### Common Read Options

- **`header`**: (CSV only) Whether the first line of the file is a header row (`True` or `False`).
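- **`inferSchema`**: (CSV only) Automatically infer column data types from the data, at the cost of an extra pass over the file. JSON sources infer their schema by default when none is supplied.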
- **`path`**: Specifies the file path.
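- **`mode`**: (CSV, JSON) How malformed records are handled:
    - `PERMISSIVE` (default): nulls are inserted for fields that could not be parsed correctly; if the schema contains the column named by `columnNameOfCorruptRecord`, the raw malformed line is stored there
    - `DROPMALFORMED`: drops lines that contain fields that could not be parsed
    - `FAILFAST`: aborts the read as soon as any malformed data is found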


### Identify corrupted records in a dataset

In [2]:


# Schema for the sales files, written as a DDL string of "name type" pairs.
# The trailing `_corrupt_record` string column matches the
# `columnNameOfCorruptRecord` option passed by the reads in this notebook; in the
# displayed output it is null for rows that parsed and holds the raw line otherwise.
sales_schema = '''
SalesOrderNumber string,
SalesOrderLineNumber int,
OrderDate string,
CustomerName string,
EmailAddress string,
Item string,
Quantity int,
UnitPrice double ,
TaxAmount double ,
_corrupt_record string
'''

In [3]:
#spark.read.csv("data/supply_sales_07012024.txt")

In [23]:
from pyspark.sql import functions as F


# Load the sales text file as CSV using the explicit schema. PERMISSIVE mode keeps
# every line and routes the raw text of unparseable lines to `_corrupt_record`.
df = (
    spark.read
    .format("csv")
    .schema(sales_schema)
    .option("header", True)
    .option("mode", "PERMISSIVE")
    .option("columnNameOfCorruptRecord", "_corrupt_record")
    .load("data/supply_sales_07012024.txt")
)
df.printSchema()


root
 |-- SalesOrderNumber: string (nullable = true)
 |-- SalesOrderLineNumber: integer (nullable = true)
 |-- OrderDate: string (nullable = true)
 |-- CustomerName: string (nullable = true)
 |-- EmailAddress: string (nullable = true)
 |-- Item: string (nullable = true)
 |-- Quantity: integer (nullable = true)
 |-- UnitPrice: double (nullable = true)
 |-- TaxAmount: double (nullable = true)
 |-- _corrupt_record: string (nullable = true)



In [24]:
# List the rows flagged as corrupt, showing the order number next to the raw line.
(
    df.where(F.col("_corrupt_record").isNotNull())
    .select("SalesOrderNumber", "_corrupt_record")
    .show(truncate=False)
)


+----------------+---------------------------------------------------------------------------------------------------------+
|SalesOrderNumber|_corrupt_record                                                                                          |
+----------------+---------------------------------------------------------------------------------------------------------+
|SO45346         |SO45346,1,7/1/2024,Dylan, Harris,dylan43@adventure-works.com,"Road-150 Red, 56",1,3578.27,286.2616       |
|SO45353         |SO45353,1,                                                                                               |
|SO45354         |SO45354,15/1/2024,Armando Dominguez,armando13@adventure-works.com,"Road-650 Black, 48",1,699.0982,55.9279|
+----------------+---------------------------------------------------------------------------------------------------------+



In [20]:
# Display, untruncated, the rows whose corrupt-record column is null.
(
    df.where(F.col("_corrupt_record").isNull())
    .show(truncate=False)
)

+----------------+--------------------+---------+-----------------+------------------------------+------------------+--------+---------+---------+---------------+
|SalesOrderNumber|SalesOrderLineNumber|OrderDate|CustomerName     |EmailAddress                  |Item              |Quantity|UnitPrice|TaxAmount|_corrupt_record|
+----------------+--------------------+---------+-----------------+------------------------------+------------------+--------+---------+---------+---------------+
|SO45347         |1                   |7/1/2024 |Clarence Raji    |clarence35@adventure-works.com|Road-650 Black, 52|1       |699.0982 |55.9279  |null           |
|SO45345         |1                   |7/1/2024 |Bonnie Yuan      |bonnie12@adventure-works.com  |Road-150 Red, 52  |1       |3578.27  |286.2616 |null           |
|SO45348         |1                   |7/1/2024 |Leah Guo         |leah14@adventure-works.com    |Road-150 Red, 44  |1       |3578.27  |286.2616 |null           |
|SO45349         |1   

In [30]:

# Valid values for `mode`: PERMISSIVE, DROPMALFORMED, FAILFAST.
# This file uses ":" as its field separator; multiLine=True allows a quoted
# field to span more than one physical line.
df = spark.read.csv(
    "data/supply_sales_06012024.dat",
    schema=sales_schema,
    sep=":",
    header=True,
    mode="PERMISSIVE",
    columnNameOfCorruptRecord='_corrupt_record',
    quote='"',
    escape='\\',
    multiLine=True,
)
df.printSchema()

# Show the rows whose corrupt-record column is null.
df.where(F.col("_corrupt_record").isNull()).show()


root
 |-- SalesOrderNumber: string (nullable = true)
 |-- SalesOrderLineNumber: integer (nullable = true)
 |-- OrderDate: string (nullable = true)
 |-- CustomerName: string (nullable = true)
 |-- EmailAddress: string (nullable = true)
 |-- Item: string (nullable = true)
 |-- Quantity: integer (nullable = true)
 |-- UnitPrice: double (nullable = true)
 |-- TaxAmount: double (nullable = true)
 |-- _corrupt_record: string (nullable = true)

+----------------+--------------------+---------+-----------------+--------------------+--------------------+--------+---------+---------+---------------+
|SalesOrderNumber|SalesOrderLineNumber|OrderDate|     CustomerName|        EmailAddress|                Item|Quantity|UnitPrice|TaxAmount|_corrupt_record|
+----------------+--------------------+---------+-----------------+--------------------+--------------------+--------+---------+---------+---------------+
|         SO45345|                   1| 4/1/2024|      Bonnie Yuan|bonnie12@adventur...|    R

In [31]:
# Display, untruncated, every column of the rows flagged as corrupt.
(
    df.where(F.col("_corrupt_record").isNotNull())
    .show(truncate=False)
)

+----------------+--------------------+-----------------+-----------------------------+------------------------------+------------------+--------+---------+---------+-----------------------------------------------------------------------------------------------------------+
|SalesOrderNumber|SalesOrderLineNumber|OrderDate        |CustomerName                 |EmailAddress                  |Item              |Quantity|UnitPrice|TaxAmount|_corrupt_record                                                                                            |
+----------------+--------------------+-----------------+-----------------------------+------------------------------+------------------+--------+---------+---------+-----------------------------------------------------------------------------------------------------------+
|SO45347         |1                   |4/1/2024         |Clarence Raji                |clarence35@adventure-works.com|Road-650 Black, 52|1       |699.0982 |null     |SO45347:1

In [29]:
# Display, untruncated, the rows whose corrupt-record column is null.
(
    df.where(F.col("_corrupt_record").isNull())
    .show(truncate=False)
)

+----------------+--------------------+---------+-----------------+------------------------------+--------------------------------------------+--------+---------+---------+---------------+
|SalesOrderNumber|SalesOrderLineNumber|OrderDate|CustomerName     |EmailAddress                  |Item                                        |Quantity|UnitPrice|TaxAmount|_corrupt_record|
+----------------+--------------------+---------+-----------------+------------------------------+--------------------------------------------+--------+---------+---------+---------------+
|SO45347         |1                   |4/1/2024 |Clarence Raji    |clarence35@adventure-works.com|Road-650 Black, 52                          |1       |699.0982 |55.9279  |null           |
|SO45345         |1                   |4/1/2024 |Bonnie Yuan      |bonnie12@adventure-works.com  |Road-150 Red, 52                            |1       |3578.27  |286.2616 |null           |
|SO45348         |1                   |4/1/2024 |Leah G