In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
spark = SparkSession. \
    builder.appName("DF basic"). \
    config("spark.sql.warehouse.dir", "C:/data/warehouse"). \
    enableHiveSupport(). \
    getOrCreate()

In [2]:
# Hàm này dùng để tạo dataframe trong spark
orders_df = spark.read.csv("C:/data/orders_sh.csv", header=True, inferSchema=True)

In [3]:
orders_df.show(5)

+--------+----------+-----------+---------------+
|order_id|order_date|customer_id|   order_status|
+--------+----------+-----------+---------------+
|       1|   00:00.0|      11599|         CLOSED|
|       2|   00:00.0|        256|PENDING_PAYMENT|
|       3|   00:00.0|      12111|       COMPLETE|
|       4|   00:00.0|       8827|         CLOSED|
|       5|   00:00.0|      11318|       COMPLETE|
+--------+----------+-----------+---------------+
only showing top 5 rows



In [4]:
orders_df.printSchema()

root
 |-- order_id: integer (nullable = true)
 |-- order_date: string (nullable = true)
 |-- customer_id: integer (nullable = true)
 |-- order_status: string (nullable = true)



In [3]:
orders_df.createOrReplaceTempView("orders_tmp")

In [6]:
spark.sql("SELECT * FROM orders_tmp").show(5)

+--------+----------+-----------+---------------+
|order_id|order_date|customer_id|   order_status|
+--------+----------+-----------+---------------+
|       1|   00:00.0|      11599|         CLOSED|
|       2|   00:00.0|        256|PENDING_PAYMENT|
|       3|   00:00.0|      12111|       COMPLETE|
|       4|   00:00.0|       8827|         CLOSED|
|       5|   00:00.0|      11318|       COMPLETE|
+--------+----------+-----------+---------------+
only showing top 5 rows



In [7]:
orders_df.groupBy("order_status").count().show()

+---------------+-----+
|   order_status|count|
+---------------+-----+
|PENDING_PAYMENT|15030|
|       COMPLETE|22899|
|        ON_HOLD| 3798|
| PAYMENT_REVIEW|  729|
|     PROCESSING| 8275|
|         CLOSED| 7556|
|SUSPECTED_FRAUD| 1558|
|        PENDING| 7610|
|       CANCELED| 1428|
+---------------+-----+



In [8]:
spark.sql("select order_status, count(*) from orders_tmp group by order_status").show()

+---------------+--------+
|   order_status|count(1)|
+---------------+--------+
|PENDING_PAYMENT|   15030|
|       COMPLETE|   22899|
|        ON_HOLD|    3798|
| PAYMENT_REVIEW|     729|
|     PROCESSING|    8275|
|         CLOSED|    7556|
|SUSPECTED_FRAUD|    1558|
|        PENDING|    7610|
|       CANCELED|    1428|
+---------------+--------+



In [4]:
top_10_df = orders_df.groupBy("customer_id").count().sort("count", ascending=False).limit(10)

In [5]:
top_10_df.show()

+-----------+-----+
|customer_id|count|
+-----------+-----+
|       5897|   16|
|      12431|   16|
|        569|   16|
|       6316|   16|
|      12284|   15|
|       5624|   15|
|       4320|   15|
|       5283|   15|
|        221|   15|
|       5654|   15|
+-----------+-----+



In [6]:
spark.sql("SELECT customer_id, count(*) FROM orders_tmp GROUP BY customer_id ORDER BY count(*) DESC LIMIT 10").show()

+-----------+--------+
|customer_id|count(1)|
+-----------+--------+
|       5897|      16|
|      12431|      16|
|        569|      16|
|       6316|      16|
|      12284|      15|
|       5624|      15|
|       4320|      15|
|       5283|      15|
|        221|      15|
|       5654|      15|
+-----------+--------+



In [7]:
orders_df.select("customer_id").distinct().count()

12405

In [6]:
spark.sql("select count(distinct customer_id) as nb_customer from orders_tmp").show()

+-----------+
|nb_customer|
+-----------+
|      12405|
+-----------+



In [8]:
top1_customer = orders_df.where("order_status = 'CLOSED'").groupBy("customer_id").count().sort("count", ascending=False).limit(1)

In [9]:
top1_customer.show()

+-----------+-----+
|customer_id|count|
+-----------+-----+
|       1833|    6|
+-----------+-----+



In [10]:
spark.sql("SELECT customer_id, count(*) from orders_tmp WHERE order_status = 'CLOSED' GROUP BY customer_id ORDER BY count(*) DESC LIMIT 1").show()

+-----------+--------+
|customer_id|count(1)|
+-----------+--------+
|       1833|       6|
+-----------+--------+

