In [1]:
from pyspark.sql import Row, SparkSession
from pyspark.sql import functions as F

In [2]:
# Create (or reuse, via getOrCreate) the SparkSession for this notebook:
# application name "test", master "local[*]".
spark = (
    SparkSession.builder
    .appName("test")
    .master("local[*]")
    .getOrCreate()
)

In [4]:
# Load the training CSV into a DataFrame with the `header` and `inferSchema`
# reader options switched on.
df = (
    spark.read
    .format("csv")
    .option("header", True)
    .option("inferSchema", True)
    .load("../DataDummy/data/DataBricks_Training.csv")
)

In [5]:
# Register the DataFrame as the temp view "test" so the spark.sql(...) and
# spark.catalog calls in the following cells can refer to it by name.
df.createOrReplaceTempView("test")

In [7]:
# Cache the "test" view by issuing the Spark SQL `CACHE TABLE` statement.
spark.sql("CACHE TABLE test")

DataFrame[]

In [8]:
# Ask the catalog whether the "test" view is currently cached.
spark.catalog.isCached("test")

True

In [9]:
# Clear the cache by issuing the Spark SQL `CLEAR CACHE` statement.
spark.sql("CLEAR CACHE")

DataFrame[]

In [10]:
# Re-check the cache status of "test" now that CLEAR CACHE has been run.
spark.catalog.isCached("test")

False

In [12]:
#STORAGELEVEL
from pyspark import StorageLevel

In [13]:
# Cache the "test" view again, this time through the catalog API rather than
# the SQL `CACHE TABLE` statement used earlier.
spark.catalog.cacheTable("test")

In [14]:
# Query every column of the "test" view and print the result with show().
(
    spark
    .sql("SELECT * FROM test")
    .show()
)

+---+-------------------+-----+----------+-----------+--------+
| ID|               Name|  Fee|     Venue|       Date|Duration|
+---+-------------------+-----+----------+-----------+--------+
|  1|          HE Hadoop| 9000|    Mumbai|01-Aug-2018|       2|
|  2|           HE Spark| 7000|   Kolkata|04-Aug-2018|       3|
|  3|        HE SparkSQL| 6000| Hyderabad|07-Aug-2018|       4|
|  4|     HE Spark Graph| 6000|   Chennai|10-Aug-2018|       5|
|  5|HE Machine Learning|10000|    London|13-Aug-2018|       2|
|  6|    HE Data Science|12000|Washington|16-Aug-2018|       3|
|  7|            HE Java| 6000|    Navada|19-Aug-2018|       4|
|  8|           HE Scala| 6000|   Newyork|22-Aug-2018|       5|
|  9|          HE Python| 6000|    Sydney|25-Aug-2018|       2|
| 10|            HE Unix| 7000| Singapore|28-Aug-2018|       3|
| 11|             HE C++| 5000|      Pune|31-Aug-2018|       4|
| 12|  HE Data Structure| 4000| New Delhi|03-Sep-2018|       5|
| 13|     HE Web Service| 6000|  Yokoham

In [15]:
# Clear the cache again via the Spark SQL `CLEAR CACHE` statement.
spark.sql("CLEAR CACHE")

DataFrame[]

In [17]:
# Sum of fees per venue, expressed in Spark SQL against the "test" view.
(
    spark
    .sql("SELECT SUM(FEE) as FEE, VENUE FROM TEST GROUP BY VENUE")
    .show()
)

+-----+----------+
|  FEE|     VENUE|
+-----+----------+
|46000| Singapore|
|37000| Frankfurt|
|50000|   Beijing|
|34000|    Navada|
|44000|   Chennai|
|29000|    Berlin|
|49000|    London|
|36000|    Mumbai|
|42000|    Sydney|
|37000|   Kolkata|
|46000|Washington|
|44000|   Newyork|
|51000| Hong Kong|
|46000|  Yokohama|
|36000|      Pune|
|39000| New Delhi|
|43000|     Osaka|
|36000|  Shenzhen|
|41000| Hyderabad|
|41000|     Dubai|
+-----+----------+
only showing top 20 rows



In [18]:
from pyspark.sql.functions import sum

In [19]:
# Grand total of the `fee` column over the whole DataFrame, shown as a
# single-row result with the column named "total_fee".
# The aggregate is referenced as F.sum (pyspark.sql.functions) so this cell
# does not depend on a bare `sum` name that shadows Python's builtin sum.
df.agg(F.sum("fee").alias("total_fee")).show()

+---------+
|total_fee|
+---------+
|   855000|
+---------+



In [20]:
# Total fee per venue (DataFrame API counterpart of the SQL GROUP BY query).
# F.sum is used instead of a bare `sum` so the cell does not rely on the
# builtin-shadowing import made in a separate cell.
(
    df.groupBy("venue")
    .agg(F.sum("fee").alias("total_fee"))
    .show()
)

+----------+---------+
|     venue|total_fee|
+----------+---------+
| Singapore|    46000|
| Frankfurt|    37000|
|   Beijing|    50000|
|    Navada|    34000|
|   Chennai|    44000|
|    Berlin|    29000|
|    London|    49000|
|    Mumbai|    36000|
|    Sydney|    42000|
|   Kolkata|    37000|
|Washington|    46000|
|   Newyork|    44000|
| Hong Kong|    51000|
|  Yokohama|    46000|
|      Pune|    36000|
| New Delhi|    39000|
|     Osaka|    43000|
|  Shenzhen|    36000|
| Hyderabad|    41000|
|     Dubai|    41000|
+----------+---------+
only showing top 20 rows



In [21]:
from pyspark.sql.functions import avg

In [23]:
# Average fee per venue, shown with the result column named "average".
# F.avg comes from the pyspark.sql.functions namespace imported at the top,
# so the cell does not depend on an import scattered further down the notebook.
(
    df.groupBy("venue")
    .agg(F.avg("fee").alias("average"))
    .show()
)

+----------+-------+
|     venue|average|
+----------+-------+
| Singapore| 9200.0|
| Frankfurt| 9250.0|
|   Beijing|10000.0|
|    Navada| 6800.0|
|   Chennai| 8800.0|
|    Berlin| 7250.0|
|    London| 9800.0|
|    Mumbai| 7200.0|
|    Sydney| 8400.0|
|   Kolkata| 7400.0|
|Washington| 9200.0|
|   Newyork| 8800.0|
| Hong Kong|10200.0|
|  Yokohama| 9200.0|
|      Pune| 7200.0|
| New Delhi| 7800.0|
|     Osaka| 8600.0|
|  Shenzhen| 9000.0|
| Hyderabad| 8200.0|
|     Dubai|10250.0|
+----------+-------+
only showing top 20 rows



In [27]:
from pyspark.sql.functions import max, min, count, avg

In [29]:
# For each distinct fee value, show: the summed fee ("total"), the max and min
# venue name, the average fee, and the count of venue values in that group.
# All aggregates are taken from the F namespace (pyspark.sql.functions) so the
# cell never resolves `sum`, `max` or `min` to Python's builtins and does not
# rely on imports spread across earlier cells. Only "total" is aliased; the
# other columns keep their default names, as before.
(
    df.groupBy("fee")
    .agg(
        F.sum("fee").alias("total"),
        F.max("venue"),
        F.min("venue"),
        F.avg("fee"),
        F.count("venue"),
    )
    .show()
)

+-----+------+----------+----------+--------+------------+
|  fee| total|max(venue)|min(venue)|avg(fee)|count(venue)|
+-----+------+----------+----------+--------+------------+
| 4000|  8000|     Osaka| New Delhi|  4000.0|           2|
|12000|228000|  Yokohama|   Beijing| 12000.0|          19|
| 5000|  5000|      Pune|      Pune|  5000.0|           1|
|10000|150000|  Yokohama|    Berlin| 10000.0|          15|
|15000| 30000|     Osaka| Hong Kong| 15000.0|           2|
| 6000|114000|  Yokohama|    Berlin|  6000.0|          19|
| 9000| 36000|    Mumbai|     Dubai|  9000.0|           4|
| 8000|144000|  Yokohama|   Chennai|  8000.0|          18|
| 7000|140000|Washington|   Beijing|  7000.0|          20|
+-----+------+----------+----------+--------+------------+

