In [0]:
# Schema for the orders data set, written as a DDL-style string:
# one "<column name> <type>" entry per line.
schema = """
    order_id INT,
    order_date TIMESTAMP,
    order_customer_id INT,
    order_status STRING
"""

# The same column names as a plain list, for use with DataFrame.toDF(*columns).
columns = [
    'order_id',
    'order_date',
    'order_customer_id',
    'order_status',
]

In [0]:
# READING CSV FILES
# Three variations of the same read. None of the results is assigned, so in a
# notebook only the last expression of the cell is rendered.

# Base: path only, no options
spark.read.csv('/public/retail_db/orders')

# 2 ways to specify options

# 1) As keyword arguments of the .csv() method.
#    header='false' because the other cells of this notebook treat the orders
#    files as headerless (column names come from `schema` or toDF(*columns));
#    header='true' would turn the first order record into column names.
spark.read.csv('/public/retail_db/orders', sep=',', header='false')

# 2) Using .options() on the reader before calling .csv();
#    toDF(*columns) then applies our own column names.
#    Parentheses are used for the multi-line chain instead of backslashes,
#    which break as soon as anything (even a space) follows them.
(
    spark.read
    .options(inferSchema='true', sep=',')
    .csv('/public/retail_db/orders')
    .toDF(*columns)
)

In [0]:
# Simplest CSV read: only the path is given, no schema and no options.
# Notice in the output that the column names were not chosen by us and
# that every column is typed as a string.
orders = spark.read.csv(path='/public/retail_db/orders')

orders.show(10)       # preview the first 10 rows
orders.printSchema()  # print column names and data types

In [0]:
# Passing a schema sets both the column names and their data types.
# Two spellings of the same read are shown; the second assignment simply
# replaces the result of the first.

# 1) via the reader's .schema() method
orders = (
    spark.read
    .schema(schema)
    .csv('/public/retail_db/orders')
)

# 2) via the schema= keyword argument of .csv()
orders = spark.read.csv(
    '/public/retail_db/orders',
    schema=schema,
)

orders.show(5)        # preview the first 5 rows
orders.printSchema()  # confirm the names and types from `schema` were applied

In [0]:
# Alternatively, let Spark infer the schema from the data itself.
# toDF(*columns) is chained on afterwards to apply our own column names.

columns = [
    'order_id',
    'order_date',
    'order_customer_id',
    'order_status',
]

# 1) inferSchema set on the reader with .option()
orders = (
    spark.read
    .option('inferSchema', 'true')
    .csv('/public/retail_db/orders')
    .toDF(*columns)
)

# 2) inferSchema passed as a keyword argument of .csv();
#    this assignment replaces the result of the first one.
orders = spark.read.csv(
    '/public/retail_db/orders',
    inferSchema='true',
).toDF(*columns)

orders.show(5)        # preview the first 5 rows
orders.printSchema()  # inspect the resulting column names and types

In [0]:
# The field separator can be stated explicitly through the sep= argument.
orders = spark.read.csv(
    '/public/retail_db/orders',
    sep=',',
)