<a href="https://colab.research.google.com/github/DenysNunes/data-examples/blob/main/spark/intermediate/validation_by_schema.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Validation by Schema

This notebook is an example of how to validate a DataFrame against an existing table schema.

In [1]:
!pip install -q pyspark==3.1.1

from pyspark.sql import SparkSession
from pyspark.sql.types import Row, StringType
from pyspark.sql.functions import lit
from datetime import date
import random

# Build (or reuse) a local Spark session that uses every available core,
# Kryo serialization, and Hive support so tables can be saved by name.
spark = (
    SparkSession.builder
    .master('local[*]')
    .appName("New Session Example")
    .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
    .enableHiveSupport()
    .getOrCreate()
)

def get_random():
  """Return a random integer salary: a multiple of 100 in the range [2000, 5000)."""
  lowest, upper_bound, increment = 2000, 5000, 100
  return random.randrange(lowest, upper_bound, increment)

[K     |████████████████████████████████| 212.3 MB 14 kB/s 
[K     |████████████████████████████████| 198 kB 57.8 MB/s 
[?25h  Building wheel for pyspark (setup.py) ... [?25l[?25hdone


Creating the reference table with the correct schema

In [2]:
# Seed rows for the reference table. Each tuple is (id, name, hire_date);
# the salary is drawn at random for every row, in listing order.
raw_rows = [
    Row(id=emp_id, name=emp_name, salary=get_random(), hire_date=hired_on)
    for emp_id, emp_name, hired_on in [
        (1, 'John', date(2020, 1, 1)),
        (2, 'Joana', date(2020, 1, 1)),
        (3, 'Maria', date(2020, 1, 2)),
        (4, 'Sandra', date(2020, 1, 2)),
        (5, 'Ben', date(2020, 1, 3)),
        (6, 'Carl', date(2020, 1, 3)),
        (7, 'Joseph', date(2020, 1, 4)),
        (8, 'Oliver', date(2020, 1, 4)),
    ]
]

# Start from a clean slate so the notebook can be re-run.
spark.sql("drop table if exists tb_parquet_salaries")

# Persist the rows as a table stored under an explicit path.
df = spark.createDataFrame(raw_rows)
df.write.saveAsTable(
    name='tb_parquet_salaries',
    mode='overwrite',
    path='/tmp/schema-validation/tables/tb_parquet_salaries/',
)

Generating a CSV file that contains corrupt rows


In [3]:
import csv
import os

output = '/tmp/schema-validation/input/'

# makedirs(..., exist_ok=True) creates any missing parent directories and is
# a no-op when the directory already exists, so this cell is safe to re-run
# and does not rely on another cell having created the parent directory.
os.makedirs(output, exist_ok=True)

header = ['id', 'name', 'salary', 'hire_date']
# The second and third rows are deliberately malformed (too few fields) so
# that reading the file with the table schema flags them as bad records.
data = [
    [9, 'Phillip', get_random(), '2020-03-03'],
    ['failed schema'],
    [11, 'Mark', get_random()],
    [12, 'Andersen', get_random(), '2020-03-05']
]

# The csv module requires its file object to be opened with newline='';
# otherwise the "\r\n" terminators the writer emits are translated a second
# time on platforms that use "\r\n" line endings, leaving blank lines
# between records.
with open(os.path.join(output, 'salary.csv'), 'w', encoding='UTF8', newline='') as f:
    writer = csv.writer(f)
    writer.writerow(header)
    writer.writerows(data)

In [4]:
# Print the generated CSV file to check its contents, including the two malformed rows.
!cat /tmp/schema-validation/input/salary.csv

id,name,salary,hire_date
9,Phillip,2900,2020-03-03
failed schema
11,Mark,4100
12,Andersen,3400,2020-03-05


In [5]:
# Borrow the schema of the reference table and extend it with a nullable
# string column that receives the raw text of any line that fails to parse.
df_schema = (
    spark.sql("select * from tb_parquet_salaries limit 1")
    .withColumn("bad_record", lit(None).cast(StringType()))
)

# Read the CSV directory in permissive mode: malformed lines are kept, with
# their original text stored in `bad_record` instead of aborting the read.
# cache() is required: Spark (2.3+) rejects queries on raw CSV input that
# reference only the corrupt-record column, which a later cell does; caching
# the parsed result lifts that restriction.
df_source = (
    spark.read
    .option("mode", "permissive")
    .option("sep", ",")
    .option("header", "true")
    .option("columnNameOfCorruptRecord", "bad_record")
    .csv(output, schema=df_schema.schema)
    .cache()
)

df_source.show()

+----+--------+------+----------+-------------+
|  id|    name|salary| hire_date|   bad_record|
+----+--------+------+----------+-------------+
|   9| Phillip|  2900|2020-03-03|         null|
|null|    null|  null|      null|failed schema|
|  11|    Mark|  4100|      null| 11,Mark,4100|
|  12|Andersen|  3400|2020-03-05|         null|
+----+--------+------+----------+-------------+



DataFrame with the lines that failed schema validation

In [6]:
# Lines that failed to parse are the ones whose raw text was captured in `bad_record`.
(
    df_source
    .select("bad_record")
    .where("bad_record is not NULL")
    .show()
)

+-------------+
|   bad_record|
+-------------+
|failed schema|
| 11,Mark,4100|
+-------------+



DataFrame with the lines that passed schema validation

In [7]:
# Keep only the table's own columns for rows where no bad record was captured.
df_cleared = (
    df_source
    .select("id", "name", "salary", "hire_date")
    .where("bad_record is NULL")
)

df_cleared.show()

+---+--------+------+----------+
| id|    name|salary| hire_date|
+---+--------+------+----------+
|  9| Phillip|  2900|2020-03-03|
| 12|Andersen|  3400|2020-03-05|
+---+--------+------+----------+



Saving Correct DF

In [10]:
# Append the validated rows to the reference table.
# Note: re-running this cell appends the same rows again.
df_cleared.write.mode('append').saveAsTable('tb_parquet_salaries')

Querying the table to confirm that only the valid rows were appended

In [13]:
# Display the table ordered by id: the seed rows followed by the appended valid rows.
(
    spark.table("tb_parquet_salaries")
    .orderBy("id")
    .show()
)

+---+--------+------+----------+
| id|    name|salary| hire_date|
+---+--------+------+----------+
|  1|    John|  4700|2020-01-01|
|  2|   Joana|  4800|2020-01-01|
|  3|   Maria|  2100|2020-01-02|
|  4|  Sandra|  2000|2020-01-02|
|  5|     Ben|  4000|2020-01-03|
|  6|    Carl|  4900|2020-01-03|
|  7|  Joseph|  3200|2020-01-04|
|  8|  Oliver|  4000|2020-01-04|
|  9| Phillip|  2900|2020-03-03|
| 12|Andersen|  3400|2020-03-05|
+---+--------+------+----------+

