In [1]:
from pyspark.sql import SparkSession

In [2]:
spark = SparkSession.builder.appName("test").master("local[*]").getOrCreate()

In [3]:
from pyspark.sql.functions import explode

In [6]:
json_df = spark.read.json("../DataDummy/data/HE_TRAINING.json")
json_df.show()

+--------------------+------+
|                City|Hadoop|
+--------------------+------+
| [Mumbai, Hyderabad]|  6000|
|[NewYork, Washing...|  7000|
|    [Sidney, London]|  8000|
|   [Kolkata, Jaipur]|  9000|
+--------------------+------+



In [7]:
json_df.withColumn("City", explode("City")).show()

+----------+------+
|      City|Hadoop|
+----------+------+
|    Mumbai|  6000|
| Hyderabad|  6000|
|   NewYork|  7000|
|Washington|  7000|
|    Sidney|  8000|
|    London|  8000|
|   Kolkata|  9000|
|    Jaipur|  9000|
+----------+------+



# Date Format

In [25]:
from pyspark.sql.functions import date_format, current_timestamp

In [23]:
spark.range(1).select("id", date_format(current_timestamp(), "dd-MM-yyyy hh:mm:ss").alias("date_time")).show()

+---+-------------------+
| id|          date_time|
+---+-------------------+
|  0|29-01-2020 04:42:45|
+---+-------------------+



In [29]:
from pyspark.sql.functions import unix_timestamp

In [30]:
spark.range(1).select("id", unix_timestamp().alias("date_time")).show()

+---+----------+
| id| date_time|
+---+----------+
|  0|1580334355|
+---+----------+



In [31]:
from pyspark.sql.functions import lit, to_date

In [34]:
spark.range(1).select(to_date(lit("2020-01-29")).alias("date")).show()

+----------+
|      date|
+----------+
|2020-01-29|
+----------+



In [36]:
from pyspark.sql import Row 

dates = spark.sparkContext.parallelize(
    [ Row(1, "2018-01-01", 20000), 
     Row(1, "2018-01-02", 23000), 
     Row(1, "2018-01-03", 90000), 
     Row(1, "2018-01-04", 55000), 
     Row(1, "2018-01-05", 20000), 
     Row(1, "2018-01-06", 23000), 
     Row(1, "2018-01-07", 90000), 
     Row(1, "2018-01-08", 55000), 
     Row(2, "2018-01-01", 80000), 
     Row(2, "2018-01-02", 90000), 
     Row(2, "2018-01-03", 100000), 
     Row(2, "2018-01-04", 80000), 
     Row(2, "2018-01-05", 90000), 
     Row(2, "2018-01-06", 100000), 
     Row(2, "2018-01-07", 80000), 
     Row(2, "2018-01-08", 90000) ])

In [38]:
date_df = spark.createDataFrame(dates, ["id", "date", "fee"])
date_df.printSchema()

root
 |-- id: long (nullable = true)
 |-- date: string (nullable = true)
 |-- fee: long (nullable = true)



In [39]:
from pyspark.sql.functions import col

In [41]:
date_df = date_df.withColumn("date_cast", col("date").cast("date"))

date_df.printSchema()
date_df.show()

root
 |-- id: long (nullable = true)
 |-- date: string (nullable = true)
 |-- fee: long (nullable = true)
 |-- date_cast: date (nullable = true)

+---+----------+------+----------+
| id|      date|   fee| date_cast|
+---+----------+------+----------+
|  1|2018-01-01| 20000|2018-01-01|
|  1|2018-01-02| 23000|2018-01-02|
|  1|2018-01-03| 90000|2018-01-03|
|  1|2018-01-04| 55000|2018-01-04|
|  1|2018-01-05| 20000|2018-01-05|
|  1|2018-01-06| 23000|2018-01-06|
|  1|2018-01-07| 90000|2018-01-07|
|  1|2018-01-08| 55000|2018-01-08|
|  2|2018-01-01| 80000|2018-01-01|
|  2|2018-01-02| 90000|2018-01-02|
|  2|2018-01-03|100000|2018-01-03|
|  2|2018-01-04| 80000|2018-01-04|
|  2|2018-01-05| 90000|2018-01-05|
|  2|2018-01-06|100000|2018-01-06|
|  2|2018-01-07| 80000|2018-01-07|
|  2|2018-01-08| 90000|2018-01-08|
+---+----------+------+----------+



In [42]:
from pyspark.sql.functions import window,sum 

In [43]:
# calculating the total fee every day across courses

totalFeeEveryDay = date_df.groupBy(window("date_cast", "1 days")).agg(sum("fee").alias("total_fee"))\
.select("window.start", "window.end", "total_fee") 

totalFeeEveryDay.orderBy("start").show()

+-------------------+-------------------+---------+
|              start|                end|total_fee|
+-------------------+-------------------+---------+
|2017-12-31 19:00:00|2018-01-01 19:00:00|   100000|
|2018-01-01 19:00:00|2018-01-02 19:00:00|   113000|
|2018-01-02 19:00:00|2018-01-03 19:00:00|   190000|
|2018-01-03 19:00:00|2018-01-04 19:00:00|   135000|
|2018-01-04 19:00:00|2018-01-05 19:00:00|   110000|
|2018-01-05 19:00:00|2018-01-06 19:00:00|   123000|
|2018-01-06 19:00:00|2018-01-07 19:00:00|   170000|
|2018-01-07 19:00:00|2018-01-08 19:00:00|   145000|
+-------------------+-------------------+---------+



In [45]:
#Total fee collected in every two day
date_df.groupBy(window("date_cast", "2 days")).agg(sum("fee").alias("total_fee"))\
.select("window.start", "window.end", "total_fee").show()

+-------------------+-------------------+---------+
|              start|                end|total_fee|
+-------------------+-------------------+---------+
|2018-01-04 19:00:00|2018-01-06 19:00:00|   233000|
|2018-01-06 19:00:00|2018-01-08 19:00:00|   315000|
|2018-01-02 19:00:00|2018-01-04 19:00:00|   325000|
|2017-12-31 19:00:00|2018-01-02 19:00:00|   213000|
+-------------------+-------------------+---------+

