# Import optimus and pandas

In [110]:
from optimus import Optimus
from pyspark.sql.functions import *
import pandas as pd
import numpy as np
op = Optimus()

# Read the data

In [21]:
# Location of the tab-separated Chipotle orders file.
url = 'https://raw.githubusercontent.com/justmarkham/DAT8/master/data/chipotle.tsv'

# pandas fetches and parses the file; fields are tab-delimited.
chipo_pd = pd.read_csv(filepath_or_buffer=url, delimiter='\t')

In [25]:
chipo_pd.dtypes

order_id               int64
quantity               int64
item_name             object
choice_description    object
item_price            object
dtype: object

### We need to do this to be able to read the data from Spark

In [28]:
# Cast the three object-typed columns to plain strings in one assignment.
# Side effect worth knowing: missing values become the literal string "nan".
text_columns = ['item_name', 'choice_description', 'item_price']
chipo_pd[text_columns] = chipo_pd[text_columns].astype(str)

# Transform data from Pandas to Optimus (Spark)

In [30]:
# Build the Spark-side dataframe from the pandas frame via `op.spark`
# (presumably the SparkSession held by Optimus -- TODO confirm).
chipo = op.spark.createDataFrame(chipo_pd)

#  See the first 10 entries

In [33]:
chipo.table(10)

order_id  1 (bigint)  nullable,quantity  2 (bigint)  nullable,item_name  3 (string)  nullable,choice_description  4 (string)  nullable,item_price  5 (string)  nullable
1,1,Chips⸱and⸱Fresh⸱Tomato⸱Salsa,,$2.39⸱
1,1,Izze,[Clementine],$3.39⸱
1,1,Nantucket⸱Nectar,[Apple],$3.39⸱
1,1,Chips⸱and⸱Tomatillo-Green⸱Chili⸱Salsa,,$2.39⸱
2,2,Chicken⸱Bowl,"[Tomatillo-Red⸱Chili⸱Salsa⸱(Hot),⸱[Black⸱Beans,⸱Rice,⸱Cheese,⸱Sour⸱Cream]]",$16.98⸱
3,1,Chicken⸱Bowl,"[Fresh⸱Tomato⸱Salsa⸱(Mild),⸱[Rice,⸱Cheese,⸱Sour⸱Cream,⸱Guacamole,⸱Lettuce]]",$10.98⸱
3,1,Side⸱of⸱Chips,,$1.69⸱
4,1,Steak⸱Burrito,"[Tomatillo⸱Red⸱Chili⸱Salsa,⸱[Fajita⸱Vegetables,⸱Black⸱Beans,⸱Pinto⸱Beans,⸱Cheese,⸱Sour⸱Cream,⸱Guacamole,⸱Lettuce]]",$11.75⸱
4,1,Steak⸱Soft⸱Tacos,"[Tomatillo⸱Green⸱Chili⸱Salsa,⸱[Pinto⸱Beans,⸱Cheese,⸱Sour⸱Cream,⸱Lettuce]]",$9.25⸱
5,1,Steak⸱Burrito,"[Fresh⸱Tomato⸱Salsa,⸱[Rice,⸱Black⸱Beans,⸱Pinto⸱Beans,⸱Cheese,⸱Sour⸱Cream,⸱Lettuce]]",$9.25⸱


# What is the number of observations and columns in the dataset?

In [34]:
chipo.count()

4622

In [36]:
op.profiler.dataset_info(chipo)

{'cols_count': 5,
 'rows_count': 4622,
 'missing_count': '0.0%',
 'size': '-1 Bytes'}

# Print the name of all the columns

In [37]:
chipo.columns

['order_id', 'quantity', 'item_name', 'choice_description', 'item_price']

**NOTE: Spark dataframes are not indexed.**

# Which was the most-ordered item and how many items were ordered?

In [61]:
# Total quantity per item. Optimus' cols.rename turns the generated
# "sum(quantity)" column back into "quantity".
quantity_by_item = (
    chipo.groupby("item_name")
         .sum("quantity")
         .cols.rename("sum(quantity)", "quantity")
)

# Largest total first; display only the top row.
quantity_by_item.sort(desc("quantity")).table(1)

item_name  1 (string)  nullable,quantity  2 (bigint)  nullable
Chicken⸱Bowl,761


# What was the most ordered item in the choice_description column?

In [65]:
# Total quantity per choice_description value, with the aggregate column
# renamed from "sum(quantity)" back to "quantity".
quantity_by_choice = (
    chipo.groupby("choice_description")
         .sum("quantity")
         .cols.rename("sum(quantity)", "quantity")
)

# Largest total first; display only the top row.
quantity_by_choice.sort(desc("quantity")).table(1)

choice_description  1 (string)  nullable,quantity  2 (bigint)  nullable
,1382


Here we have a problem: it shows that nan was the most-ordered item in `choice_description`. Let's solve that:

In [77]:
# First transform the "nan" strings in choice_description into real nulls,
# then drop only the rows where that column is null. Restricting dropna to
# this column means a null in an unrelated column cannot remove an order
# line from the ranking.
(chipo.cols.replace("choice_description", "nan")
      .dropna(subset=["choice_description"])
      .groupby("choice_description")
      .sum("quantity")
      .cols.rename("sum(quantity)", "quantity")
      .sort(desc("quantity"))
      .table(1))

choice_description  1 (string)  nullable,quantity  2 (bigint)  nullable
[Diet⸱Coke],159


# How many items were ordered in total?

In [80]:
chipo.cols.sum("quantity")

4972

# Turn the item price into a float

In [81]:
chipo.dtypes

[('order_id', 'bigint'),
 ('quantity', 'bigint'),
 ('item_name', 'string'),
 ('choice_description', 'string'),
 ('item_price', 'string')]

In [88]:
## Let's see the format of the price
chipo.table(1)

order_id  1 (bigint)  nullable,quantity  2 (bigint)  nullable,item_name  3 (string)  nullable,choice_description  4 (string)  nullable,item_price  5 (string)  nullable
1,1,Chips⸱and⸱Fresh⸱Tomato⸱Salsa,,$2.39⸱


In [94]:
# Strip the leading "$" by pattern rather than by position, trim the padding
# around the number, and cast. Unlike substr(2, 10), this is safe to re-run:
# once item_price is numeric there is no "$" to remove, so the first digit
# is never cut off.
# NOTE(review): the "float" cast yields values such as 2.390000104904175;
# consider "double" or a decimal type if exact prices matter.
chipo = chipo.withColumn(
    "item_price",
    trim(regexp_replace(chipo["item_price"], r"^\$", "")).cast("float"),
)

In [96]:
# Let's see our data now
chipo.table(2)

order_id  1 (bigint)  nullable,quantity  2 (bigint)  nullable,item_name  3 (string)  nullable,choice_description  4 (string)  nullable,item_price  5 (float)  nullable
1,1,Chips⸱and⸱Fresh⸱Tomato⸱Salsa,,2.390000104904175
1,1,Izze,[Clementine],3.390000104904175


# How much was the revenue for the period in the dataset?

In [111]:
# cols.mul multiplies the listed columns row by row; the sum below reads the
# product from the "mul" column.
line_totals = chipo.cols.mul(columns=["quantity", "item_price"])

# cols.sum adds up every value of the given column.
revenue = line_totals.cols.sum("mul")

print('Revenue was: $' + str(np.round(revenue, 2)))

Revenue was: $39237.02


# How many orders were made in the period?

In [117]:
chipo.select("order_id").distinct().count()

1834

# What is the average revenue amount per order?

In [132]:
# Revenue of each order line (quantity x item_price); the product column
# "mul" is renamed to "revenue".
line_revenue = (
    chipo.cols.mul(columns=["quantity", "item_price"])
         .cols.rename("mul", "revenue")
)

# Add the lines up per order and restore the column name after aggregation.
order_revenue = (
    line_revenue.groupby("order_id")
                .sum("revenue")
                .cols.rename("sum(revenue)", "revenue")
)

# Mean of the per-order totals.
order_revenue.cols.mean("revenue")

21.39423

# How many different items are sold?

In [133]:
chipo.select("item_name").distinct().count()

50