In [0]:
# DDL-style schema string for the orders data: one "name TYPE" pair per column.
schema = """
    order_id INT,
    order_date TIMESTAMP,
    order_customer_id INT,
    order_status STRING
"""

# The same column names as a plain list, for use with DataFrame.toDF(*columns).
columns = [
    'order_id',
    'order_date',
    'order_customer_id',
    'order_status',
]

In [0]:
# READING CSV FILES
# Syntax reference only: none of the DataFrames built in this cell is kept.

# Base: no options at all
spark.read.csv('/public/retail_db/orders')

# 2 ways to specify options

# 1) As keyword arguments of the .csv() method.
#    The orders files have no header line, so header must stay 'false':
#    header='true' would turn the first order into column names and drop it
#    from the data.
spark.read.csv('/public/retail_db/orders', sep=',', header='false')

# 2) Using .options() on the reader before calling .csv().
#    toDF(*columns) then replaces the auto-generated column names.
(
    spark.read
    .options(inferSchema='true', sep=',')
    .csv('/public/retail_db/orders')
    .toDF(*columns)
)

In [0]:
# Simplest possible CSV read: just the path, no options.
# Note that Spark auto-generates the column names (_c0, _c1, ...)
# and that every column comes back as a string.

orders = spark.read.csv('/public/retail_db/orders')

orders.show(10)
orders.printSchema()

+---+--------------------+-----+---------------+
|_c0|                 _c1|  _c2|            _c3|
+---+--------------------+-----+---------------+
|  1|2013-07-25 00:00:...|11599|         CLOSED|
|  2|2013-07-25 00:00:...|  256|PENDING_PAYMENT|
|  3|2013-07-25 00:00:...|12111|       COMPLETE|
|  4|2013-07-25 00:00:...| 8827|         CLOSED|
|  5|2013-07-25 00:00:...|11318|       COMPLETE|
|  6|2013-07-25 00:00:...| 7130|       COMPLETE|
|  7|2013-07-25 00:00:...| 4530|       COMPLETE|
|  8|2013-07-25 00:00:...| 2911|     PROCESSING|
|  9|2013-07-25 00:00:...| 5657|PENDING_PAYMENT|
| 10|2013-07-25 00:00:...| 5648|PENDING_PAYMENT|
+---+--------------------+-----+---------------+
only showing top 10 rows

root
 |-- _c0: string (nullable = true)
 |-- _c1: string (nullable = true)
 |-- _c2: string (nullable = true)
 |-- _c3: string (nullable = true)



In [0]:
# An explicit schema gives the columns both their names and their types.
# Two ways to pass it are shown; the second assignment simply replaces the first.

# 1) Through the reader's .schema() method
orders = (
    spark.read
    .schema(schema)
    .csv('/public/retail_db/orders')
)

# 2) As the schema= keyword argument of .csv()
orders = spark.read.csv('/public/retail_db/orders', schema=schema)

orders.show(5)
orders.printSchema()

+--------+-------------------+-----------------+---------------+
|order_id|         order_date|order_customer_id|   order_status|
+--------+-------------------+-----------------+---------------+
|       1|2013-07-25 00:00:00|            11599|         CLOSED|
|       2|2013-07-25 00:00:00|              256|PENDING_PAYMENT|
|       3|2013-07-25 00:00:00|            12111|       COMPLETE|
|       4|2013-07-25 00:00:00|             8827|         CLOSED|
|       5|2013-07-25 00:00:00|            11318|       COMPLETE|
+--------+-------------------+-----------------+---------------+
only showing top 5 rows

root
 |-- order_id: integer (nullable = true)
 |-- order_date: timestamp (nullable = true)
 |-- order_customer_id: integer (nullable = true)
 |-- order_status: string (nullable = true)



In [0]:
# Alternatively, let Spark work out the column types from the data (inferSchema).
# The column names are still applied separately, through toDF(*columns).

columns = [
    'order_id',
    'order_date',
    'order_customer_id',
    'order_status',
]

# 1) inferSchema set with .option() on the reader
orders = (
    spark.read
    .option('inferSchema', 'true')
    .csv('/public/retail_db/orders')
    .toDF(*columns)
)

# 2) inferSchema passed as a keyword argument of .csv(); replaces the result above
orders = spark.read.csv('/public/retail_db/orders', inferSchema='true').toDF(*columns)

orders.show(5)
orders.printSchema()

+--------+-------------------+-----------------+---------------+
|order_id|         order_date|order_customer_id|   order_status|
+--------+-------------------+-----------------+---------------+
|       1|2013-07-25 00:00:00|            11599|         CLOSED|
|       2|2013-07-25 00:00:00|              256|PENDING_PAYMENT|
|       3|2013-07-25 00:00:00|            12111|       COMPLETE|
|       4|2013-07-25 00:00:00|             8827|         CLOSED|
|       5|2013-07-25 00:00:00|            11318|       COMPLETE|
+--------+-------------------+-----------------+---------------+
only showing top 5 rows

root
 |-- order_id: integer (nullable = true)
 |-- order_date: timestamp (nullable = true)
 |-- order_customer_id: integer (nullable = true)
 |-- order_status: string (nullable = true)



In [0]:
# The field separator can be set explicitly with sep=
# (',' is also Spark's default for CSV).

orders = spark.read.csv(
    '/public/retail_db/orders',
    sep=',',
)