# Import optimus and pandas

In [1]:
from optimus import Optimus
from pyspark.sql.functions import *
import pandas as pd
import numpy as np
# Create the Optimus entry point; its `spark` attribute is used below to
# build Spark DataFrames.
op = Optimus()

# Read the data and transform to Spark dataframe

In [2]:
# Download the tab-separated Chipotle orders file into a pandas DataFrame.
url = 'https://raw.githubusercontent.com/justmarkham/DAT8/master/data/chipotle.tsv'
chipo_pd = pd.read_csv(url, sep='\t')

# Force the text-like columns to plain strings before handing the frame to Spark.
text_columns = ['item_name', 'choice_description', 'item_price']
chipo_pd[text_columns] = chipo_pd[text_columns].astype(str)

In [3]:
# Convert the pandas frame into a Spark DataFrame via the Optimus-held session.
chipo = op.spark.createDataFrame(data=chipo_pd)

# See the data

In [4]:
# Render 10 rows of the Spark DataFrame with Optimus' table view.
chipo.table(10)

order_id  1 (bigint)  nullable,quantity  2 (bigint)  nullable,item_name  3 (string)  nullable,choice_description  4 (string)  nullable,item_price  5 (string)  nullable
1,1,Chips⸱and⸱Fresh⸱Tomato⸱Salsa,,$2.39⸱
1,1,Izze,[Clementine],$3.39⸱
1,1,Nantucket⸱Nectar,[Apple],$3.39⸱
1,1,Chips⸱and⸱Tomatillo-Green⸱Chili⸱Salsa,,$2.39⸱
2,2,Chicken⸱Bowl,"[Tomatillo-Red⸱Chili⸱Salsa⸱(Hot),⸱[Black⸱Beans,⸱Rice,⸱Cheese,⸱Sour⸱Cream]]",$16.98⸱
3,1,Chicken⸱Bowl,"[Fresh⸱Tomato⸱Salsa⸱(Mild),⸱[Rice,⸱Cheese,⸱Sour⸱Cream,⸱Guacamole,⸱Lettuce]]",$10.98⸱
3,1,Side⸱of⸱Chips,,$1.69⸱
4,1,Steak⸱Burrito,"[Tomatillo⸱Red⸱Chili⸱Salsa,⸱[Fajita⸱Vegetables,⸱Black⸱Beans,⸱Pinto⸱Beans,⸱Cheese,⸱Sour⸱Cream,⸱Guacamole,⸱Lettuce]]",$11.75⸱
4,1,Steak⸱Soft⸱Tacos,"[Tomatillo⸱Green⸱Chili⸱Salsa,⸱[Pinto⸱Beans,⸱Cheese,⸱Sour⸱Cream,⸱Lettuce]]",$9.25⸱
5,1,Steak⸱Burrito,"[Fresh⸱Tomato⸱Salsa,⸱[Rice,⸱Black⸱Beans,⸱Pinto⸱Beans,⸱Cheese,⸱Sour⸱Cream,⸱Lettuce]]",$9.25⸱


# How many items are more expensive than $10?

In [6]:
# Parse the price: strip the leading "$" and cast to float. A regex is used
# instead of a fixed substr offset so this cell is safe to re-run: once the
# column is already numeric there is no "$" to strip and the values are left
# unchanged (a positional substr would chop off the first digit instead).
chipo = chipo.withColumn(
    "item_price",
    regexp_replace(chipo.item_price, r"^\$", "").cast("float"),
)
# Delete the duplicates in item_name and quantity.
# NOTE(review): this keeps one row per (item_name, quantity) without controlling
# which one; the preview above shows the same item at different prices
# (e.g. Steak Burrito at $11.75 and $9.25), so the surviving price may vary —
# confirm this is acceptable for the question being asked.
chipo_filtered = chipo.rows.drop_duplicates(['item_name', 'quantity'])
# Select only the products with quantity equal to 1
chipo_one_prod = chipo_filtered.where("quantity = 1")
# Select products that are more expensive than $10
chipo_one_prod.where("item_price > 10").show()

+--------+--------+--------------------+--------------------+----------+
|order_id|quantity|           item_name|  choice_description|item_price|
+--------+--------+--------------------+--------------------+----------+
|      75|       1|Barbacoa Crispy T...|[Tomatillo Red Ch...|     11.75|
|      83|       1|   Veggie Salad Bowl|[Fresh Tomato Sal...|     11.25|
|     468|       1| Carnitas Salad Bowl|[Fresh Tomato Sal...|     11.89|
|      19|       1|       Barbacoa Bowl|[Roasted Chili Co...|     11.75|
|       4|       1|       Steak Burrito|[Tomatillo Red Ch...|     11.75|
|      26|       1|      Veggie Burrito|[Tomatillo Red Ch...|     11.25|
|     109|       1|       Chicken Salad|[Roasted Chili Co...|     10.98|
|     304|       1|   Veggie Soft Tacos|[Tomatillo Red Ch...|     11.25|
|     501|       1| Barbacoa Salad Bowl|[Fresh Tomato Sal...|     11.89|
|       3|       1|        Chicken Bowl|[Fresh Tomato Sal...|     10.98|
|      28|       1|         Veggie Bowl|[Fresh Toma

# What is the price of each item?

In [8]:
# Keep the single-quantity rows of the de-duplicated data
chipo_one_prod = chipo_filtered.filter(chipo_filtered["quantity"] == 1)
# Project down to the item name and its price
price_per_item = chipo_one_prod.select('item_name', 'item_price')

In [13]:
# Most expensive items first
price_per_item.orderBy(price_per_item["item_price"].desc()).table()

item_name  1 (string)  nullable,item_price  2 (float)  nullable
Carnitas⸱Salad⸱Bowl,11.890000343322754
Barbacoa⸱Salad⸱Bowl,11.890000343322754
Steak⸱Salad⸱Bowl,11.890000343322754
Steak⸱Burrito,11.75
Barbacoa⸱Crispy⸱Tacos,11.75
Barbacoa⸱Bowl,11.75
Veggie⸱Salad⸱Bowl,11.25
Veggie⸱Burrito,11.25
Veggie⸱Soft⸱Tacos,11.25
Veggie⸱Bowl,11.25


# Sort by the name of the item

In [15]:
# Ascending order by item name
chipo.orderBy("item_name").table()

order_id  1 (bigint)  nullable,quantity  2 (bigint)  nullable,item_name  3 (string)  nullable,choice_description  4 (string)  nullable,item_price  5 (float)  nullable
776,1,6⸱Pack⸱Soft⸱Drink,[Coke],6.489999771118164
306,1,6⸱Pack⸱Soft⸱Drink,[Coke],6.489999771118164
422,1,6⸱Pack⸱Soft⸱Drink,[Sprite],6.489999771118164
182,1,6⸱Pack⸱Soft⸱Drink,[Diet⸱Coke],6.489999771118164
784,1,6⸱Pack⸱Soft⸱Drink,[Diet⸱Coke],6.489999771118164
754,1,6⸱Pack⸱Soft⸱Drink,[Diet⸱Coke],6.489999771118164
264,1,6⸱Pack⸱Soft⸱Drink,[Diet⸱Coke],6.489999771118164
708,1,6⸱Pack⸱Soft⸱Drink,[Coke],6.489999771118164
709,1,6⸱Pack⸱Soft⸱Drink,[Diet⸱Coke],6.489999771118164
363,1,6⸱Pack⸱Soft⸱Drink,[Coke],6.489999771118164


# What was the quantity of the most expensive item ordered?

In [16]:
# Show the single row with the highest item_price; its quantity answers the question
chipo.orderBy(chipo["item_price"].desc()).table(1)

order_id  1 (bigint)  nullable,quantity  2 (bigint)  nullable,item_name  3 (string)  nullable,choice_description  4 (string)  nullable,item_price  5 (float)  nullable
1443,15,Chips⸱and⸱Fresh⸱Tomato⸱Salsa,,44.25


# How many times was a Veggie Salad Bowl ordered?

In [18]:
# Rows whose item is exactly "Veggie Salad Bowl", then count them
chipo_salad = chipo.filter(chipo["item_name"] == "Veggie Salad Bowl")
chipo_salad.count()

18

# How many times did people order more than one Canned Soda?

In [20]:
# Order lines for "Canned Soda" with a quantity above one.
# (Despite its name, this variable holds Canned Soda rows; the name is kept unchanged.)
is_canned_soda = chipo["item_name"] == "Canned Soda"
has_multiple = chipo["quantity"] > 1
chipo_drink_steak_bowl = chipo.filter(is_canned_soda & has_multiple)
chipo_drink_steak_bowl.count()

20