In [1]:
import findspark
findspark.init()

import pyspark
from pyspark.sql import *
from pyspark.sql import functions as f
from lib.logger import Log4j

# Build (or reuse) a Spark session for this notebook using the local[3] master.
spark = (
    SparkSession.builder
    .master("local[3]")
    .appName("Misc Transformations")
    .getOrCreate()
)

# Project logger (lib.logger.Log4j) constructed from the session above.
logger = Log4j(spark)

In [2]:
# Order lines as (order_id, prod_id, unit_price, qty) tuples.
orders_list = [
    ("01", "02", 350, 1),
    ("01", "04", 580, 1),
    ("01", "07", 320, 2),
    ("02", "03", 450, 1),
    ("02", "06", 220, 1),
    ("03", "01", 195, 1),
    ("04", "09", 270, 3),
    ("04", "08", 410, 2),
    ("05", "02", 350, 1),
]

In [3]:
# Product catalogue as (prod_id, prod_name, list_price, qty) tuples.
product_list = [
    ("01", "Scroll Mouse", 250, 20),
    ("02", "Optical Mouse", 350, 20),
    ("03", "Wireless Mouse", 450, 50),
    ("04", "Wireless Keyboard", 580, 50),
    ("05", "Standard Keyboard", 360, 10),
    ("06", "16 GB Flash Storage", 240, 100),
    ("07", "32 GB Flash Storage", 320, 50),
    ("08", "64 GB Flash Storage", 430, 25),
]

In [4]:
# Turn the order tuples into a DataFrame, naming the four columns, and preview it.
order_df = (
    spark.createDataFrame(orders_list)
    .toDF("order_id", "prod_id", "unit_price", "qty")
)
order_df.show()

+--------+-------+----------+---+
|order_id|prod_id|unit_price|qty|
+--------+-------+----------+---+
|      01|     02|       350|  1|
|      01|     04|       580|  1|
|      01|     07|       320|  2|
|      02|     03|       450|  1|
|      02|     06|       220|  1|
|      03|     01|       195|  1|
|      04|     09|       270|  3|
|      04|     08|       410|  2|
|      05|     02|       350|  1|
+--------+-------+----------+---+



In [5]:
# Turn the product tuples into a DataFrame, naming the four columns, and preview it.
product_df = (
    spark.createDataFrame(product_list)
    .toDF("prod_id", "prod_name", "list_price", "qty")
)
product_df.show()

+-------+-------------------+----------+---+
|prod_id|          prod_name|list_price|qty|
+-------+-------------------+----------+---+
|     01|       Scroll Mouse|       250| 20|
|     02|      Optical Mouse|       350| 20|
|     03|     Wireless Mouse|       450| 50|
|     04|  Wireless Keyboard|       580| 50|
|     05|  Standard Keyboard|       360| 10|
|     06|16 GB Flash Storage|       240|100|
|     07|32 GB Flash Storage|       320| 50|
|     08|64 GB Flash Storage|       430| 25|
+-------+-------------------+----------+---+



In [6]:
# Equi-join condition on prod_id; both frames have a column with that name,
# so each side is referenced through its own DataFrame.
join_expr = order_df["prod_id"] == product_df["prod_id"]

Full Outer Join

In [7]:
# Full outer join: unmatched rows from either side are kept and padded with nulls.
order_df.join(product_df, on=join_expr, how="outer").show()

+--------+-------+----------+----+-------+-------------------+----------+----+
|order_id|prod_id|unit_price| qty|prod_id|          prod_name|list_price| qty|
+--------+-------+----------+----+-------+-------------------+----------+----+
|      01|     07|       320|   2|     07|32 GB Flash Storage|       320|  50|
|      03|     01|       195|   1|     01|       Scroll Mouse|       250|  20|
|      04|     09|       270|   3|   null|               null|      null|null|
|    null|   null|      null|null|     05|  Standard Keyboard|       360|  10|
|      04|     08|       410|   2|     08|64 GB Flash Storage|       430|  25|
|      02|     03|       450|   1|     03|     Wireless Mouse|       450|  50|
|      01|     02|       350|   1|     02|      Optical Mouse|       350|  20|
|      05|     02|       350|   1|     02|      Optical Mouse|       350|  20|
|      02|     06|       220|   1|     06|16 GB Flash Storage|       240| 100|
|      01|     04|       580|   1|     04|  Wireless Keyboard|       580|  50|
+--------+-------+----------+----+-------+-------------------+----------+----+

Left Outer Join

In [9]:
# Rename the product-side qty so only the order-side column is called "qty".
product_renamed_df = product_df.withColumnRenamed("qty", "reorder_qty")

# Left outer join: every order line is kept; lines without a matching product
# get nulls in the product columns. The product-side prod_id is dropped so the
# later select("prod_id") is unambiguous.
(
    order_df.join(product_renamed_df, on=join_expr, how="left")
    .drop(product_renamed_df.prod_id)
    .select("order_id", "prod_id", "prod_name", "unit_price", "list_price", "qty")
    .sort("order_id")
    .show()
)

+--------+-------+-------------------+----------+----------+---+
|order_id|prod_id|          prod_name|unit_price|list_price|qty|
+--------+-------+-------------------+----------+----------+---+
|      01|     07|32 GB Flash Storage|       320|       320|  2|
|      01|     02|      Optical Mouse|       350|       350|  1|
|      01|     04|  Wireless Keyboard|       580|       580|  1|
|      02|     03|     Wireless Mouse|       450|       450|  1|
|      02|     06|16 GB Flash Storage|       220|       240|  1|
|      03|     01|       Scroll Mouse|       195|       250|  1|
|      04|     08|64 GB Flash Storage|       410|       430|  2|
|      04|     09|               null|       270|      null|  3|
|      05|     02|      Optical Mouse|       350|       350|  1|
+--------+-------+-------------------+----------+----------+---+



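The left outer join leaves two null values, in prod_name and list_price, on the order line for prod_id 09, which has no matching product.
Can we replace them with prod_id and unit_price respectively?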

In [10]:
# Same left outer join as before, but fill the nulls left by the unmatched
# order line: fall back to prod_id when prod_name is null and to unit_price
# when list_price is null.
# Uses the `f` alias (pyspark.sql.functions) imported in the first cell rather
# than adding another import in the middle of the notebook.
(
    order_df.join(product_renamed_df, on=join_expr, how="left")
    .drop(product_renamed_df.prod_id)
    .select("order_id", "prod_id", "prod_name", "unit_price", "list_price", "qty")
    .withColumn("prod_name", f.expr("coalesce(prod_name, prod_id)"))
    .withColumn("list_price", f.expr("coalesce(list_price, unit_price)"))
    .sort("order_id")
    .show()
)

+--------+-------+-------------------+----------+----------+---+
|order_id|prod_id|          prod_name|unit_price|list_price|qty|
+--------+-------+-------------------+----------+----------+---+
|      01|     04|  Wireless Keyboard|       580|       580|  1|
|      01|     07|32 GB Flash Storage|       320|       320|  2|
|      01|     02|      Optical Mouse|       350|       350|  1|
|      02|     03|     Wireless Mouse|       450|       450|  1|
|      02|     06|16 GB Flash Storage|       220|       240|  1|
|      03|     01|       Scroll Mouse|       195|       250|  1|
|      04|     09|                 09|       270|       270|  3|
|      04|     08|64 GB Flash Storage|       410|       430|  2|
|      05|     02|      Optical Mouse|       350|       350|  1|
+--------+-------+-------------------+----------+----------+---+



In [11]:
# Shut down the Spark session created in the first cell now that the examples are done.
spark.stop()