In [14]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField
import pyspark.sql.types as T
import pyspark.sql.functions as F
from datetime import date, datetime
from decimal import Decimal
from dateutil.parser import parse

spark = SparkSession.builder.master('local').appName('app').getOrCreate()
sc = spark.sparkContext

In [2]:
COMMON_SCHEMA = StructType([StructField("trade_dt", T.DateType(), True),
                            StructField("rec_type", T.StringType(), False),
                            StructField("event_tm", T.TimestampType(), False),
                            StructField("event_seq_nb", T.IntegerType(), False),
                            StructField("trade_pr", T.DecimalType(10, 2), True)
                           ])

QUOTE_COLUMNS = ['trade_dt', 'rec_type', 'event_tm', 'event_seq_nb', 'trade_pr']

In [13]:
date(2000,1,1)

datetime.date(2000, 1, 1)

In [3]:
COMMON_SCHEMA[7].jsonValue()

IndexError: StructType index out of range

In [16]:
r = ["2020-08-05,abc,2020-08-05 09:30:00,1,12.34", "2020-08-05,abc,2020-08-05 09:30:0,1,12.34"]

In [17]:
rdd = sc.parallelize(r)

In [18]:
def parse_csv(line: str):
    record = line.split(',')
    
    translated_record = {c : record[i] for i, c in enumerate(QUOTE_COLUMNS)}        
    return get_common_event(translated_record, COMMON_SCHEMA)
    

In [19]:
parsed = rdd.map(parse_csv)

In [20]:
parsed.collect()

[[datetime.date(2020, 8, 5),
  'abc',
  datetime.datetime(2020, 8, 5, 9, 30),
  1,
  Decimal('12.34')],
 [datetime.date(2020, 8, 5),
  'abc',
  datetime.datetime(2020, 8, 5, 9, 30),
  1,
  Decimal('12.34')]]

In [27]:
df = parsed.toDF(COMMON_SCHEMA)

In [28]:
df.printSchema()

root
 |-- trade_dt: date (nullable = true)
 |-- rec_type: string (nullable = false)
 |-- event_tm: timestamp (nullable = false)
 |-- event_seq_nb: integer (nullable = false)
 |-- trade_pr: decimal(10,2) (nullable = true)



In [53]:
rdd = sc.parallelize([[date(2021,6,14), 'abc', datetime(2020, 8, 6, 9, 38, 8, 93000), 1, Decimal(12.34)],
                      [date(2021,6,15), 'def', datetime(2020, 8, 6, 9, 38, 8, 93000), 1, Decimal(12.34)]])

In [6]:
rdd.take(1)

[['2020-08-05,abc,2020-08-05 09:30:00,1,12.34']]

In [13]:
df = rdd.toDF()

In [17]:
df.printSchema()

root
 |-- _1: date (nullable = true)
 |-- _2: string (nullable = true)



In [15]:
def get_common_event(record: dict, struct_type : StructType):
    """ Validated event.  NonNull fields must be populated; conversions to correct data formats must succeed"""
    # for each field in
    # get the type
    # convert the string to type
    common_event_row = []
    for struct_field in struct_type:
        
        d = struct_field.jsonValue()
        field_name = d['name']
        field_type = d['type']
        field_nullable = d['nullable']
        
        # check nullable up front
        
        if field_type == 'date':
            converted_field = date.fromisoformat(record[field_name].strip())
                                                                  
        elif field_type == 'timestamp':
            converted_field = parse(record[field_name].strip())
            # converted_field = datetime.fromisoformat(record[field_name].strip())
        
        elif field_type == 'integer':
            converted_field = int(record[field_name].strip())
            
        elif field_type.startswith('decimal'):
            converted_field = Decimal(record[field_name].strip())
            
        elif field_type == 'string':
            converted_field = record[field_name].strip()
            
        common_event_row.append(converted_field)    
        
    return common_event_row


In [25]:
date.fromisoformat("2021-10-11")

ValueError: Invalid isoformat string: '2021-10-11 '

In [31]:
datetime(2020, 8, 6, 9, 38, 8, 93000)

datetime.datetime(2020, 8, 6, 9, 38, 8, 93000)

In [35]:
int("-77")

-77

In [None]:
# 2020-08-05 09:30:00