In [1]:
from pyspark.sql import SparkSession

# Spark entry points: obtain a SparkSession configured with master 'local'
# (getOrCreate hands back an already-running session if there is one),
# then keep a handle on its underlying SparkContext.
spark = (
    SparkSession.builder
    .master('local')
    .getOrCreate()
)
sc = spark.sparkContext

In [2]:
# Define schema
# Import only the type classes used below instead of the whole module namespace,
# so it stays obvious where each name comes from.
from pyspark.sql.types import DateType, IntegerType, StructField, StructType

# Explicit schema for the purchase data: four integer columns followed by a
# date column. Every field is declared nullable (third StructField argument).
schema = StructType([
    StructField("purchase_id", IntegerType(), True),
    StructField("user_id", IntegerType(), True),
    StructField("product_id", IntegerType(), True),
    StructField("quantity", IntegerType(), True),
    StructField("timestamp", DateType(), True),
])

In [3]:
# Extract: load the raw purchase CSV data, treating the first line as a header
# and applying the explicit schema rather than letting Spark infer types.
purchase = (
    spark.read
    .format("csv")
    .option("header", True)
    .schema(schema)
    .load("/hdfs/raw/purchase")
)

# Expose the DataFrame to Spark SQL under the view name "purchase".
purchase.createOrReplaceTempView("purchase")

# Inspect the column types and preview the rows.
purchase.printSchema()
purchase.show()

root
 |-- purchase_id: integer (nullable = true)
 |-- user_id: integer (nullable = true)
 |-- product_id: integer (nullable = true)
 |-- quantity: integer (nullable = true)
 |-- timestamp: date (nullable = true)

+-----------+-------+----------+--------+----------+
|purchase_id|user_id|product_id|quantity| timestamp|
+-----------+-------+----------+--------+----------+
|          1|      1|         1|      10|2019-02-01|
|          2|      2|         2|      20|2019-02-01|
+-----------+-------+----------+--------+----------+



In [4]:
# Transform: enrich purchases with DAY / MONTH / YEAR columns derived from
# the `timestamp` date column. The query reads from the "purchase" temp view
# and selects the five purchase columns plus the three derived ones.
date_parts_query = """
SELECT 
    purchase_id,
    user_id,
    product_id,
    quantity,
    timestamp,
    
    DAY(timestamp) as day,
    MONTH(timestamp) as month,
    YEAR(timestamp) as year
    
FROM purchase

"""
processed_purchase = spark.sql(date_parts_query)

# Inspect the resulting schema and rows.
processed_purchase.printSchema()
processed_purchase.show()

root
 |-- purchase_id: integer (nullable = true)
 |-- user_id: integer (nullable = true)
 |-- product_id: integer (nullable = true)
 |-- quantity: integer (nullable = true)
 |-- timestamp: date (nullable = true)
 |-- day: integer (nullable = true)
 |-- month: integer (nullable = true)
 |-- year: integer (nullable = true)

+-----------+-------+----------+--------+----------+---+-----+----+
|purchase_id|user_id|product_id|quantity| timestamp|day|month|year|
+-----------+-------+----------+--------+----------+---+-----+----+
|          1|      1|         1|      10|2019-02-01|  1|    2|2019|
|          2|      2|         2|      20|2019-02-01|  1|    2|2019|
+-----------+-------+----------+--------+----------+---+-----+----+



In [5]:
# Load: persist the enriched purchases in parquet format under the processed path.
# NOTE(review): write mode is "append", so re-running this cell adds the same
# rows to the target again — confirm that incremental appends are intended.
(
    processed_purchase.write
    .format("parquet")
    .mode("append")
    .save("/hdfs/processed/purchase/")
)