# Assignment 2 - Spark Dataframes
***Note***: All the dataset files were stored in the same folder as this notebook.

In [1]:
import os
import pyspark
import sys

os.environ['PYSPARK_PYTHON'] = sys.executable
os.environ['PYSPARK_DRIVER_PYTHON'] = sys.executable
conf = pyspark.SparkConf()
sc = pyspark.SparkContext(conf=conf)
spark = pyspark.sql.SparkSession(sc)
spark

## 1. 15 Points
**Datafile**: BreadBasket_DMV.csv

**Solve**: What is the most popular (most sold) between the 8:00AM and 8:59AM for each day?

Example output (not actual solution)

    2016-10-30, Pastry

    2016-10-31, Coffee
     :
     :

### Approach:
1. Import `BreadBasket_DMS.csv` into a dataframe
2. Extract dates in `YYYY-MM-DD` format from the `Date` column and times in `hh:mm:ss` format from the `Time` column
3. Filter the data by `Time` in the range of `08:00:00` and `08:59:00` inclusive and remove rows with `None` in the `Item` column
4. Group the data by `Date` and `Item`, aggregate the `sum` of `Transaction` for each `Item` aliased as `Total` and, sort by `Date` and `Total`
5. Group the data by `Date` and return the last `Item` and last `Total`

In [26]:
# 1. Import BreadBasket_DMS.csv into a dataframe and `filter` out rows with `NONE` in the `Item` column
BreadBasket_DMV = spark.read.option("header", True).option("InferSchema", True).csv("BreadBasket_DMS.csv")
BreadBasket_DMV = BreadBasket_DMV.filter(col("Item") != "NONE")

# 2. Extract dates in `YYYY-MM-DD` format from the `Date` column and times in `hh:mm:ss` format from the `Time` column
from pyspark.sql.functions import col, to_date, date_format
BreadBasket_DMV = BreadBasket_DMV.withColumn("Date", to_date(col("Date"), "YYYY-MM-DD"))
BreadBasket_DMV = BreadBasket_DMV.withColumn("Time", date_format(col("Time"),"hh:mm:ss"))

# 3. Filter the data by `Time` in the range of `08:00:00` and `08:59:00` inclusive 
q1 = BreadBasket_DMV
q1 = q1.filter((col("Time") <= "08:59:00") & (col("Time") >= "08:00:00"))

# 4. Group the data by `Date` and `Item`, aggregate the `sum` of `Transaction` for each `Item` aliased as `Total` and, sort by `Date` and `Total`
from pyspark.sql.functions import sum
q1 = q1.groupBy("Date","Item").agg(sum("Transaction").alias("Total")).sort("Date","Total")

# 5. Group the data by `Date` and return the last `Item` and last `Total`
from pyspark.sql.functions import last
q1 = q1.groupBy("Date").agg(last("Item").alias("Most Popular Iteam"),last("Total").alias("Total Transactions"))

# Display results
q1.show()

+----------+------------------+------------------+
|      Date|Most Popular Iteam|Total Transactions|
+----------+------------------+------------------+
|2016-10-31|             Bread|               165|
|2016-11-01|               Tea|               542|
|2016-11-02|            Coffee|              2064|
|2016-11-03|            Coffee|              1382|
|2016-11-04|            Coffee|               883|
|2016-11-05|             Bread|              3164|
|2016-11-07|            Coffee|               739|
|2016-11-08|             Bread|               816|
|2016-11-09|             Bread|               890|
|2016-11-10|            Coffee|              1879|
|2016-11-11|             Bread|              6067|
|2016-11-12|         Medialuna|              1104|
|2016-11-14|         Medialuna|              2555|
|2016-11-15|  Keeping It Local|              1343|
|2016-11-16|             Bread|              1409|
|2016-11-17|          Siblings|              2953|
|2016-11-18|            Coffee|

## 2. 15 Points
**Datafile**: BreadBasket_DMV.csv

**Solve**: What is the most common item bought along with “Brownie”? (items bought in the same transaction)

### Approach:
To qualify as an item bought in the same transaction, we will consider items bought at the same time as "Brownie
"
1. Import `BreadBasket_DMS.csv` into a dataframe (See Q1)
2. Extract dates in `YYYY-MM-DD` format from the `Date` column and times in `hh:mm:ss` format from the `Time` column (See Q1)
3. Make list of Brownie Trans and Dates and Time
4. Filter Brownie from list
5. Join both

In [53]:
brown_trans = BreadBasket_DMV.filter(col("Item") == "Brownie").select(col("Date").alias("D"),col("Time").alias("T"),"Item").sort("D","T")
brown_trans.toPandas()



Unnamed: 0,D,T,Item
0,2016-11-03,01:02:37,Brownie
1,2016-11-03,01:19:57,Brownie
2,2016-11-03,02:26:27,Brownie
3,2016-11-03,03:55:46,Brownie
4,2016-11-03,04:06:19,Brownie
...,...,...,...
374,2017-04-08,02:24:42,Brownie
375,2017-04-08,03:40:26,Brownie
376,2017-04-08,04:14:28,Brownie
377,2017-04-08,10:33:51,Brownie


In [54]:
b = BreadBasket_DMV.filter(col("Item") != "Brownie")
b.toPandas()

Unnamed: 0,Date,Time,Transaction,Item
0,2016-10-30,09:58:11,1,Bread
1,2016-10-30,10:05:34,2,Scandinavian
2,2016-10-30,10:05:34,2,Scandinavian
3,2016-10-30,10:07:57,3,Hot chocolate
4,2016-10-30,10:07:57,3,Jam
...,...,...,...,...
20123,2017-04-09,02:32:58,9682,Coffee
20124,2017-04-09,02:32:58,9682,Tea
20125,2017-04-09,02:57:06,9683,Coffee
20126,2017-04-09,02:57:06,9683,Pastry


In [57]:
JoinExpression = (b["Date"] == brown_trans["D"]) & (b["Time"] == brown_trans["T"])
q2 = b.join(brown_trans,JoinExpression, "left_semi").sort("Date","Time")
q2.toPandas()

Unnamed: 0,Date,Time,Transaction,Item
0,2016-11-03,01:02:37,391,Sandwich
1,2016-11-03,01:02:37,391,Coffee
2,2016-11-03,01:19:57,392,Pastry
3,2016-11-03,01:19:57,392,Focaccia
4,2016-11-03,01:19:57,392,Farm House
...,...,...,...,...
833,2017-04-08,10:33:51,9574,Coffee
834,2017-04-08,10:33:51,9574,Spanish Brunch
835,2017-04-08,10:33:51,9574,The Nomad
836,2017-04-08,10:33:51,9574,Cookies


In [None]:
q2.groupBy("Date","Time").

In [28]:
brown_trans = BreadBasket_DMV.filter(col("Item") == "Brownie").select(col("Date").alias("D"),col("Time").alias("T")).sort("D","T")
brown_trans.count()

379

In [29]:
q2.filter(col("Item") != "Brownie").groupBy("Date","Time").max("Transaction").toPandas()

Unnamed: 0,Date,Time,max(Transaction)
0,2016-11-23,01:55:58,1917
1,2017-02-25,12:22:52,7178
2,2016-11-04,07:02:45,522
3,2017-02-22,02:53:41,7018
4,2016-11-05,09:26:27,539
...,...,...,...
340,2016-11-11,04:20:18,1090
341,2016-11-12,10:23:48,1121
342,2016-11-24,11:05:57,1962
343,2016-12-10,10:48:51,2912


In [30]:
1240-379

861

In [52]:
from pyspark.sql.functions import count, desc
brown_trans.sort("D","T").show(379)


+----------+--------+-------+
|         D|       T|   Item|
+----------+--------+-------+
|2016-11-03|01:02:37|Brownie|
|2016-11-03|01:19:57|Brownie|
|2016-11-03|02:26:27|Brownie|
|2016-11-03|03:55:46|Brownie|
|2016-11-03|04:06:19|Brownie|
|2016-11-03|10:39:12|Brownie|
|2016-11-03|12:05:25|Brownie|
|2016-11-03|12:07:24|Brownie|
|2016-11-04|01:04:40|Brownie|
|2016-11-04|01:15:24|Brownie|
|2016-11-04|02:17:39|Brownie|
|2016-11-04|02:51:22|Brownie|
|2016-11-04|03:35:06|Brownie|
|2016-11-04|03:39:57|Brownie|
|2016-11-04|03:41:06|Brownie|
|2016-11-04|03:51:24|Brownie|
|2016-11-04|04:14:01|Brownie|
|2016-11-04|05:58:43|Brownie|
|2016-11-04|07:02:45|Brownie|
|2016-11-04|09:55:18|Brownie|
|2016-11-04|10:25:24|Brownie|
|2016-11-04|11:12:22|Brownie|
|2016-11-04|11:17:35|Brownie|
|2016-11-04|11:33:11|Brownie|
|2016-11-04|11:36:07|Brownie|
|2016-11-05|02:03:14|Brownie|
|2016-11-05|02:12:41|Brownie|
|2016-11-05|03:14:47|Brownie|
|2016-11-05|03:17:20|Brownie|
|2016-11-05|03:24:18|Brownie|
|2016-11-0