In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructField, StructType, StringType, IntegerType, DateType, DoubleType
from pyspark.sql import functions as F
import psycopg2

In [2]:
spark = SparkSession.builder.appName("test_data_processing")\
    .config("spark.jars", "/opt/bitnami/spark/jars/postgresql-42.5.0.jar")\
    .enableHiveSupport().getOrCreate()


schema = StructType([
    StructField("customer_id", StringType(), True),
    StructField("customer_first_name", StringType(), True),
    StructField("customer_last_name", StringType(), True),
    StructField("category_name", StringType(), True),
    StructField("product_name", StringType(), True),
    StructField("customer_segment", StringType(), True),
    StructField("customer_city", StringType(), True),
    StructField("customer_state", StringType(), True),
    StructField("customer_country", StringType(), True),
    StructField("customer_region", StringType(), True),
    StructField("delivery_status", StringType(), True),
    StructField("order_date", StringType(), True),
    StructField("order_id", StringType(), True),
    StructField("ship_date", StringType(), True),
    StructField("shipping_type", StringType(), True),
    StructField("days_for_shipment_scheduled", IntegerType(), True),
    StructField("days_for_shipment_real", IntegerType(), True),
    StructField("order_item_discount", DoubleType(), True),
    StructField("sales_per_order", DoubleType(), True),
    StructField("order_quantity", IntegerType(), True),
    StructField("profit_per_order", DoubleType(), True)
])

In [3]:
df = spark.read.option("header",True).schema(schema).csv("./data/ecommerce/Ecommerce_data.csv")

In [4]:
# df.printSchema()
df = df.withColumn("order_date", F.to_date("order_date", "yyyy-MM-dd"))
df = df.withColumn("ship_date", F.to_date("ship_date", "yyyy-MM-dd"))
df = df.withColumn("customer_name", F.concat_ws(",","customer_first_name","customer_last_name"))

In [5]:
df_new = df.drop("profit_per_order", "customer_first_name", "customer_last_name" )

In [6]:
df_new.printSchema()

root
 |-- customer_id: string (nullable = true)
 |-- category_name: string (nullable = true)
 |-- product_name: string (nullable = true)
 |-- customer_segment: string (nullable = true)
 |-- customer_city: string (nullable = true)
 |-- customer_state: string (nullable = true)
 |-- customer_country: string (nullable = true)
 |-- customer_region: string (nullable = true)
 |-- delivery_status: string (nullable = true)
 |-- order_date: date (nullable = true)
 |-- order_id: string (nullable = true)
 |-- ship_date: date (nullable = true)
 |-- shipping_type: string (nullable = true)
 |-- days_for_shipment_scheduled: integer (nullable = true)
 |-- days_for_shipment_real: integer (nullable = true)
 |-- order_item_discount: double (nullable = true)
 |-- sales_per_order: double (nullable = true)
 |-- order_quantity: integer (nullable = true)
 |-- customer_name: string (nullable = false)



In [7]:
df_new.write.csv("./output/ecommerce_transformed.csv")

AnalysisException: [PATH_ALREADY_EXISTS] Path file:/home/jovyan/work/output/ecommerce_transformed.csv already exists. Set mode as "overwrite" to overwrite the existing path.

In [8]:
df_pd = df_new.toPandas()

In [9]:
db_url = "jdbc:postgresql://postgres:5432/airflow"
db_properties = {"user": "airflow", "password": "airflow", "driver": "org.postgresql.Driver"}
df_new.write.format("jdbc") \
    .option("url", db_url) \
    .option("dbtable", "public.ecommerce") \
    .option("user", db_properties["user"]) \
    .option("password", db_properties["password"]) \
    .option("driver", db_properties["driver"]) \
    .mode("append") \
    .save()