In [4]:
from pyspark.sql import SparkSession


In [5]:
spark = SparkSession.builder.appName("dataFrame").getOrCreate()

SLF4J: Class path contains multiple SLF4J bindings.
SLF4J: Found binding in [jar:file:/home/glue_user/spark/jars/log4j-slf4j-impl-2.17.2.jar!/org/slf4j/impl/StaticLoggerBinder.class]
SLF4J: Found binding in [jar:file:/home/glue_user/spark/jars/slf4j-reload4j-1.7.36.jar!/org/slf4j/impl/StaticLoggerBinder.class]
SLF4J: Found binding in [jar:file:/home/glue_user/aws-glue-libs/jars/log4j-slf4j-impl-2.17.2.jar!/org/slf4j/impl/StaticLoggerBinder.class]
SLF4J: Found binding in [jar:file:/home/glue_user/aws-glue-libs/jars/slf4j-reload4j-1.7.36.jar!/org/slf4j/impl/StaticLoggerBinder.class]
SLF4J: See http://www.slf4j.org/codes.html#multiple_bindings for an explanation.
SLF4J: Actual binding is of type [org.apache.logging.slf4j.Log4jLoggerFactory]
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


In [6]:
spark

In [12]:
columns = ['product','region','amount']
frameData = [
        ('pen','north','4'),
        ('car','north','40'),
        ('pen','north','3'),
        ('toy','north','76'),
        ('book','north','7'),
        ('car','north','11'),
        ('pen','south','5'),
        ('car','south','90'),
        ('pen','south','31'),
        ('toy','south','23'),
        ('book','south','30'),
        ('car','south','8'),
        ('pen','west','125'),
        ('car','west','1'),
        ('pen','west','3'),
        ('toy','west','38'),
        ('book','west','17'),
        ('car','west','23'),
        ('pen','east','56'),
        ('car','east','9'),
        ('pen','east','3'),
        ('toy','east','213'),
        ('book','east','20'),
        ('car','east','81'), 
]

In [13]:
df = spark.createDataFrame(frameData,columns)

In [14]:
df.head()
df.count()

24

In [16]:
df.createOrReplaceTempView('tableDataFrame')

In [22]:
spark.sql(
    """
       with topProduct as (
            select product,region,sum(amount) as sales_data
            from tableDataFrame
            group by region,product
       ),
       rankProduct as(
           select product,region,sales_data, rank() over(partition by region order by sales_data desc) as rank_sales
           from topProduct
       )
       select product,region,sales_data,rank_sales from rankProduct where rank_sales <= 3
    """
).show(12)

+-------+------+----------+----------+
|product|region|sales_data|rank_sales|
+-------+------+----------+----------+
|    toy|  east|     213.0|         1|
|    car|  east|      90.0|         2|
|    pen|  east|      59.0|         3|
|    toy| north|      76.0|         1|
|    car| north|      51.0|         2|
|    pen| north|       7.0|         3|
|   book| north|       7.0|         3|
|    car| south|      98.0|         1|
|    pen| south|      36.0|         2|
|   book| south|      30.0|         3|
|    pen|  west|     128.0|         1|
|    toy|  west|      38.0|         2|
+-------+------+----------+----------+
only showing top 12 rows



In [27]:
from pyspark.sql.functions import sum,col,rank
from pyspark.sql.window import Window

In [31]:
sales_datas = [
    ("North", "Laptop", 5000),
    ("North", "Phone", 7000),
    ("North", "Tablet", 3500),
    ("North", "Monitor", 1200),
    ("North", "Keyboard", 750),
    ("North", "Mouse", 600),
    ("South", "Laptop", 6000),
    ("South", "Phone", 8400),
    ("South", "Tablet", 4500),
    ("South", "Monitor", 2000),
    ("South", "Keyboard", 1000),
    ("South", "Mouse", 540),
    ("East", "Laptop", 7000),
    ("East", "Phone", 9000),
    ("East", "Tablet", 4000),
    ("East", "Monitor", 2000),
    ("East", "Keyboard", 700),
    ("East", "Mouse", 660),
]

In [32]:
colums_df = ["region","product","amount"]

In [33]:
df_frame = spark.createDataFrame(sales_datas,colums_df)

In [34]:
df_frame.show()

+------+--------+------+
|region| product|amount|
+------+--------+------+
| North|  Laptop|  5000|
| North|   Phone|  7000|
| North|  Tablet|  3500|
| North| Monitor|  1200|
| North|Keyboard|   750|
| North|   Mouse|   600|
| South|  Laptop|  6000|
| South|   Phone|  8400|
| South|  Tablet|  4500|
| South| Monitor|  2000|
| South|Keyboard|  1000|
| South|   Mouse|   540|
|  East|  Laptop|  7000|
|  East|   Phone|  9000|
|  East|  Tablet|  4000|
|  East| Monitor|  2000|
|  East|Keyboard|   700|
|  East|   Mouse|   660|
+------+--------+------+



In [35]:
df_group = df_frame.groupBy("product","region").agg(sum("amount").alias("total_sales"))

In [36]:
df_group.show()

+--------+------+-----------+
| product|region|total_sales|
+--------+------+-----------+
|   Phone| North|       7000|
|  Laptop| North|       5000|
|  Tablet| North|       3500|
| Monitor| North|       1200|
|Keyboard| North|        750|
|   Phone| South|       8400|
|   Mouse| North|        600|
|  Laptop| South|       6000|
|   Mouse| South|        540|
| Monitor| South|       2000|
|Keyboard| South|       1000|
|  Tablet| South|       4500|
|  Tablet|  East|       4000|
| Monitor|  East|       2000|
|   Mouse|  East|        660|
|   Phone|  East|       9000|
|  Laptop|  East|       7000|
|Keyboard|  East|        700|
+--------+------+-----------+



In [37]:
windowSpefic = Window.partitionBy("region").orderBy(col("total_sales").desc())

In [38]:
rank_product = df_group.withColumn("rank_pr",rank().over(windowSpefic))

In [39]:
rank_product.show()



+--------+------+-----------+-------+
| product|region|total_sales|rank_pr|
+--------+------+-----------+-------+
|   Phone|  East|       9000|      1|
|  Laptop|  East|       7000|      2|
|  Tablet|  East|       4000|      3|
| Monitor|  East|       2000|      4|
|Keyboard|  East|        700|      5|
|   Mouse|  East|        660|      6|
|   Phone| North|       7000|      1|
|  Laptop| North|       5000|      2|
|  Tablet| North|       3500|      3|
| Monitor| North|       1200|      4|
|Keyboard| North|        750|      5|
|   Mouse| North|        600|      6|
|   Phone| South|       8400|      1|
|  Laptop| South|       6000|      2|
|  Tablet| South|       4500|      3|
| Monitor| South|       2000|      4|
|Keyboard| South|       1000|      5|
|   Mouse| South|        540|      6|
+--------+------+-----------+-------+



                                                                                

In [42]:
top3pr = rank_product.filter(col("rank_pr") <=3)

In [43]:
top3pr.show()

+-------+------+-----------+-------+
|product|region|total_sales|rank_pr|
+-------+------+-----------+-------+
|  Phone|  East|       9000|      1|
| Laptop|  East|       7000|      2|
| Tablet|  East|       4000|      3|
|  Phone| North|       7000|      1|
| Laptop| North|       5000|      2|
| Tablet| North|       3500|      3|
|  Phone| South|       8400|      1|
| Laptop| South|       6000|      2|
| Tablet| South|       4500|      3|
+-------+------+-----------+-------+



25/03/17 15:38:47 WARN HeartbeatReceiver: Removing executor driver with no recent heartbeats: 148526 ms exceeds timeout 120000 ms
25/03/17 15:38:50 WARN SparkContext: Killing executors is not supported by current scheduler.
