In [9]:
from pyspark.sql import SparkSession
# Get (or reuse) a session attached to the Spark master at spark-master:7077.
spark = (
    SparkSession.builder
    .appName("SparkSQLExample")
    .master("spark://spark-master:7077")
    .getOrCreate()
)

# Read the retail CSV: the first row supplies the column names and the column
# types are inferred from the data; then coalesce the result to 5 partitions.
df = (
    spark.read.format("csv")
    .option("header", "true")
    .option("inferSchema", "true")
    .load("/home/jovyan/data/online-retail-dataset.csv")
    .coalesce(5)
)

# Mark the DataFrame for caching. Being the last expression of the cell, the
# returned DataFrame's schema repr is what the notebook displays.
df.cache()

DataFrame[InvoiceNo: string, StockCode: string, Description: string, Quantity: int, InvoiceDate: string, UnitPrice: double, CustomerID: int, Country: string]

In [10]:
# Expose the DataFrame to Spark SQL under the name "dfTable" so the cells
# below can query it with spark.sql(...).
df.createOrReplaceTempView("dfTable")

In [11]:
# Total number of rows in the dfTable view.
spark.sql(
    "SELECT COUNT(*) FROM dfTable"
).show()

+--------+
|count(1)|
+--------+
|  541909|
+--------+



In [12]:
# Approximate number of distinct stock codes; the second argument (0.1) is the
# maximum relative standard deviation allowed for the estimate.
spark.sql(
    "SELECT approx_count_distinct(StockCode, 0.1) FROM dfTable"
).show()

+--------------------------------+
|approx_count_distinct(StockCode)|
+--------------------------------+
|                            3364|
+--------------------------------+



In [13]:
# Smallest, largest and total Quantity over the whole view.
spark.sql(
    "SELECT min(Quantity), max(Quantity), sum(Quantity) FROM dfTable"
).show()

+-------------+-------------+-------------+
|min(Quantity)|max(Quantity)|sum(Quantity)|
+-------------+-------------+-------------+
|       -80995|        80995|      5176450|
+-------------+-------------+-------------+



In [14]:
# Number of rows for every (InvoiceNo, CustomerId) combination.
# show() prints only the first 20 result rows.
spark.sql(
    "SELECT count(*), InvoiceNo, CustomerId FROM dfTable GROUP BY InvoiceNo, CustomerId"
).show()

+--------+---------+----------+
|count(1)|InvoiceNo|CustomerId|
+--------+---------+----------+
|      76|   536846|     14573|
|      12|   537026|     12395|
|       5|   537883|     14437|
|      12|   538068|     17978|
|       7|   538279|     14952|
|      10|   538800|     16458|
|      12|   538942|     17346|
|       1|  C539947|     13854|
|      16|   540096|     13253|
|      27|   540530|     14755|
|      19|   541225|     14099|
|       4|   541978|     13551|
|      16|   542093|     17677|
|      63|   543188|     12567|
|      19|   543590|     17377|
|       1|  C543757|     13115|
|       1|  C544318|     12989|
|       1|   544578|     12365|
|      20|   545165|     16339|
|      30|   545289|     14732|
+--------+---------+----------+
only showing top 20 rows



In [15]:
from pyspark.sql.functions import col, to_date
# Add a `date` column parsed from the InvoiceDate strings.
# A single "M" accepts both one- and two-digit months, whereas "MM" is rejected
# for one-digit months by stricter date parsers; this matters now that the
# null filter below forces the column to be evaluated.
# NOTE(review): assumes InvoiceDate looks like "12/1/2010 8:26" - confirm against the CSV.
dfWithDate = df.withColumn("date", to_date(col("InvoiceDate"), "M/d/yyyy H:mm"))

# Remove rows that contain a null in any column before aggregating.
# DataFrame.drop() with no arguments drops no columns and no rows, so it left
# the nulls in place; na.drop() is the row filter that was intended.
# GROUPING SETS marks aggregated-away columns with NULL, so genuine nulls in
# the grouping keys would be indistinguishable from subtotal rows.
dfNoNull = dfWithDate.na.drop()
dfNoNull.createOrReplaceTempView("dfNoNull")

# Total quantity per (customer, stock code) pair, highest customer id first.
# The SQL text is split across adjacent literals only for readability.
spark.sql(
    "SELECT CustomerId, stockCode, sum(Quantity) FROM dfNoNull "
    "GROUP BY customerId, stockCode GROUPING SETS((customerId, stockCode)) "
    "ORDER BY CustomerId DESC, stockCode DESC"
).show()

+----------+---------+-------------+
|customerId|stockCode|sum(Quantity)|
+----------+---------+-------------+
|     18287|    85173|           48|
|     18287|   85040A|           48|
|     18287|   85039B|          120|
|     18287|   85039A|           96|
|     18287|    84920|            4|
|     18287|    84584|            6|
|     18287|   84507C|            6|
|     18287|   72351B|           24|
|     18287|   72351A|           24|
|     18287|   72349B|           60|
|     18287|    47422|           24|
|     18287|    47421|           48|
|     18287|    35967|           36|
|     18287|    23445|           20|
|     18287|    23378|           24|
|     18287|    23376|           48|
|     18287|    23310|           36|
|     18287|    23274|           12|
|     18287|    23272|           12|
|     18287|    23269|           36|
+----------+---------+-------------+
only showing top 20 rows



In [16]:
# Stop the SparkSession (and its underlying SparkContext) now that all
# queries above have run.
spark.stop()