# Assignment 2 - Spark Dataframes
***Note***: All the dataset files were stored in the same folder as this notebook.

In [1]:
import os
import pyspark
import sys

os.environ['PYSPARK_PYTHON'] = sys.executable
os.environ['PYSPARK_DRIVER_PYTHON'] = sys.executable
conf = pyspark.SparkConf()
sc = pyspark.SparkContext(conf=conf)
spark = pyspark.sql.SparkSession(sc)
spark

## 1. 15 Points
**Datafile**: BreadBasket_DMV.csv

**Solve**: What is the most popular (most sold) between the 8:00AM and 8:59AM for each day?

Example output (not actual solution)

    2016-10-30, Pastry

    2016-10-31, Coffee
     :
     :

### Approach:
1. Import `BreadBasket_DMS.csv` into a dataframe
2. Extract dates in `YYYY-MM-DD` format from the `Date` column and times in `hh:mm:ss` format from the `Time` column
3. Filter the data by `Time` in the range of `08:00:00` and `08:59:00` inclusive and remove rows with `None` in the `Item` column
4. Group the data by `Date` and `Item`, aggregate the `sum` of `Transaction` for each `Item` aliased as `Total` and, sort by `Date` and `Total`
5. Group the data by `Date` and return the last `Item` and last `Total`

In [2]:
# 1. Import BreadBasket_DMS.csv into a dataframe
q1 = spark.read.option("header", True).option("InferSchema", True).csv("BreadBasket_DMS.csv")

# 2. Extract dates in `YYYY-MM-DD` format from the `Date` column and times in `hh:mm:ss` format from the `Time` column
from pyspark.sql.functions import col, to_date, date_format
q1 = q1.withColumn("Date", to_date(col("Date"), "YYYY-MM-DD"))
q1 = q1.withColumn("Time", date_format(col("Time"),"hh:mm:ss"))

# 3. Filter the data by `Time` in the range of `08:00:00` and `08:59:00` inclusive and remove rows with `None` in the `Item` column
q1 = q1.filter((col("Time") <= "08:59:00") & (col("Time") >= "08:00:00") & (col("Item") != "NONE"))

# 4. Group the data by `Date` and `Item`, aggregate the `sum` of `Transaction` for each `Item` aliased as `Total` and, sort by `Date` and `Total`
from pyspark.sql.functions import sum
q1 = q1.groupBy("Date","Item").agg(sum("Transaction").alias("Total")).sort("Date","Total")

# 5. Group the data by `Date` and return the last `Item` and last `Total`
from pyspark.sql.functions import last
q1 = q1.groupBy("Date").agg(last("Item").alias("Most Popular Iteam"),last("Total").alias("Total Transactions"))

# Display results
q1.show()

+----------+------------------+------------------+
|      Date|Most Popular Iteam|Total Transactions|
+----------+------------------+------------------+
|2016-10-31|             Bread|               165|
|2016-11-01|               Tea|               542|
|2016-11-02|            Coffee|              2064|
|2016-11-03|            Coffee|              1382|
|2016-11-04|            Coffee|               883|
|2016-11-05|             Bread|              3164|
|2016-11-07|            Coffee|               739|
|2016-11-08|             Bread|               816|
|2016-11-09|             Bread|               890|
|2016-11-10|            Coffee|              1879|
|2016-11-11|             Bread|              6067|
|2016-11-12|         Medialuna|              1104|
|2016-11-14|         Medialuna|              2555|
|2016-11-15|  Keeping It Local|              1343|
|2016-11-16|             Bread|              1409|
|2016-11-17|          Siblings|              2953|
|2016-11-18|            Coffee|