# Ex2 - Getting and Knowing your Data

--> dataset and materials: https://github.com/Alw1tz

### Step 1. Import the necessary libraries

In [2]:
import numpy as np 
import scipy.stats as stats
import pandas as pd
import matplotlib.pyplot as plt
import os

### Step 2. Import the dataset from this [address](https://raw.githubusercontent.com/justmarkham/DAT8/master/data/chipotle.tsv). 

### Step 3. Assign it to a variable called chipo.

In [3]:
# Remote location of the tab-separated Chipotle orders file.
url = 'https://raw.githubusercontent.com/justmarkham/DAT8/master/data/chipotle.tsv'

# read_table uses a tab delimiter by default, so no explicit separator is needed.
chipo = pd.read_table(url)

### Step 4. See the first 10 entries

In [4]:
# Preview the first ten rows to get a feel for the columns and their values.
chipo.head(10)

Unnamed: 0,order_id,quantity,item_name,choice_description,item_price
0,1,1,Chips and Fresh Tomato Salsa,,$2.39
1,1,1,Izze,[Clementine],$3.39
2,1,1,Nantucket Nectar,[Apple],$3.39
3,1,1,Chips and Tomatillo-Green Chili Salsa,,$2.39
4,2,2,Chicken Bowl,"[Tomatillo-Red Chili Salsa (Hot), [Black Beans...",$16.98
5,3,1,Chicken Bowl,"[Fresh Tomato Salsa (Mild), [Rice, Cheese, Sou...",$10.98
6,3,1,Side of Chips,,$1.69
7,4,1,Steak Burrito,"[Tomatillo Red Chili Salsa, [Fajita Vegetables...",$11.75
8,4,1,Steak Soft Tacos,"[Tomatillo Green Chili Salsa, [Pinto Beans, Ch...",$9.25
9,5,1,Steak Burrito,"[Fresh Tomato Salsa, [Rice, Black Beans, Pinto...",$9.25


### Step 5. What is the number of observations in the dataset?

In [5]:
# Solution 1: len() of a DataFrame is its number of rows
print(len(chipo)) 

# Solution 1.1: shape is a (rows, columns) tuple, so shape[0] is the row count
chipo.shape[0]


4622


4622

In [10]:
# Solution 2: info() prints a summary of the frame, including the number of
# entries in the index and the non-null count of every column
chipo.info()


order_id               int64
quantity               int64
item_name             object
choice_description    object
item_price            object
dtype: object

### Step 6. What is the number of columns in the dataset?

In [12]:
# shape is a (rows, columns) tuple, so shape[1] is the number of columns
chipo.shape[1]

5

### Step 7. Print the name of all the columns.

In [22]:
# 1st form: only the column labels.
# print() is needed because a cell auto-displays just its last expression;
# without it this line would produce no visible output.
print(chipo.columns)

# 2nd form: the labels together with each column's dtype
chipo.dtypes

order_id               int64
quantity               int64
item_name             object
choice_description    object
item_price            object
dtype: object

### Step 8. How is the dataset indexed?

In [23]:
# The index holds the row labels of the frame.
chipo.index

RangeIndex(start=0, stop=4622, step=1)

### Step 9. Which was the most-ordered item? 

In [73]:
# Total the numeric columns per item, then rank the items by total quantity.
# `x` keeps the whole ranking so other positions can be read from it as well.
x = (
    chipo
    .groupby('item_name')
    .sum(numeric_only=True)
    .sort_values(['quantity'], ascending=False)
)

# The first row of the ranking is the most-ordered item
x.head(1)


Unnamed: 0_level_0,order_id,quantity
item_name,Unnamed: 1_level_1,Unnamed: 2_level_1
Chicken Bowl,713926,761


### Step 9.1. Which was the most-ordered item and which was the 4th? (only the name)

In [68]:
# `x` is sorted by total quantity in descending order, so position 0 of its
# index is the most-ordered item name ...
most_ordered = x.index[0]
print(f'El item que más se ordenó fue: {most_ordered}')

# ... and position 3 is the 4th most-ordered item name
fourth_most_ordered = x.index[3]
print(f'El 4to item que más se ordenó fue: {fourth_most_ordered}')

El item que más se ordenó fue: Chicken Bowl
El 4to item que más se ordenó fue: Steak Burrito


In [None]:
# Visualization of the per-item totals held in `x`.
# Each chart gets its own axes: when no `ax` is given, DataFrame.boxplot()
# draws on the current axes, which after hist() is the last histogram panel,
# so the box plot would be painted over a histogram.

# Bar chart of the column means
fig_bar, ax_bar = plt.subplots()
x.mean().plot(kind='bar', ax=ax_bar, title='Mean of per-item totals')

# One histogram per column; hist() creates its own figure
x.hist()

# Box plot of every column on a dedicated figure
fig_box, ax_box = plt.subplots()
x.boxplot(ax=ax_box)
ax_box.set_title('Spread of per-item totals')

plt.show()

### Step 10. For the most-ordered item, how many items were ordered?

In [72]:
# Total quantity per item; the largest total belongs to the most-ordered item.
# A dedicated name is used so that `x` (the item ranking built in Step 9) is
# not replaced by a different kind of object, which would break a re-run of
# the cells that read `x.index[...]` or plot `x`.
quantity_per_item = chipo.groupby('item_name')['quantity'].sum()
quantity_per_item.max()

761

### Step 11. What was the most ordered item in the choice_description column?

In [89]:
# Total the numeric columns per choice description and rank by quantity.
# The ranking is computed once and reused for both answers.
z = (
    chipo
    .groupby('choice_description')
    .sum(numeric_only=True)
    .sort_values(['quantity'], ascending=False)
)

# A specific answer: just the description text.
# print() is required because only a cell's last expression is auto-displayed.
print(z.index[0])

# General answer: the top row together with its totals
z.head(1)

Unnamed: 0_level_0,order_id,quantity
choice_description,Unnamed: 1_level_1,Unnamed: 2_level_1
[Diet Coke],123455,159


### Step 12. How many items were ordered in total?

In [90]:
# `quantity` is a single integer column, so a plain sum() is all that is needed.
# numeric_only is a DataFrame-oriented option; passing it to Series.sum() adds
# nothing here and is rejected by older pandas releases.
total_items = chipo['quantity'].sum()
total_items

4972

### Step 13. Turn the item price into a float

#### Step 13.a. Check the item price type

In [98]:
# check the type
# NOTE(review): the dtype listing earlier in the notebook reports item_price as
# object ('$'-prefixed strings); float64 shows up here only if the conversion
# cell below was executed before this one - confirm with a fresh kernel run.
chipo.item_price.dtype

dtype('float64')

#### Step 13.b. Create a lambda function and change the type of item price

In [95]:
def zipper(x):
    """Convert one price entry to a float.

    Strings such as '$2.39 ' have surrounding whitespace and the leading '$'
    removed before conversion, instead of blindly dropping the first and last
    character (which would cut off a digit when there is no trailing space).
    Values that are already numeric are passed through float(), so the cell
    can be re-run without raising.
    """
    if isinstance(x, str):
        return float(x.strip().lstrip('$'))
    return float(x)


# manual conversion, applied element by element
chipo.item_price = chipo.item_price.apply(zipper)

#### Step 13.b2. Functional programming -> Easy way

In [101]:
# easy way to do this: vectorised string clean-up followed by a cast.
# A bare astype(float) raises on the raw '$2.39'-style strings; going through
# str first also keeps the cell re-runnable once the column is already float.
chipo['item_price'] = (
    chipo['item_price']
    .astype(str)
    .str.strip()
    .str.lstrip('$')
    .astype(float)
)

#### Step 13.c. Check the item price type

In [103]:
# Confirm the conversion: the column should now report a float dtype.
chipo.item_price.dtype

dtype('float64')

### Step 14. How much was the revenue for the period in the dataset?

In [109]:
# equation: quantity * item_price for every row, summed over the whole period
# NOTE(review): the preview shows a quantity-2 row priced at $16.98, which
# suggests item_price may already be a line total - confirm before relying on
# this figure.
line_totals = chipo['quantity'].mul(chipo['item_price'])
revenue = line_totals.sum()

# Report the total rounded to two decimals
print(f'Las ganacias son: ${np.round(revenue, 2)}')


Las ganacias son: $39237.02


### Step 15. How many orders were made in the period?

In [114]:
# nunique() counts the distinct order ids directly (missing values are
# ignored), without first building the per-id frequency table that
# value_counts() creates.
orders = chipo['order_id'].nunique()
orders

1834

### Step 16. What is the average revenue amount per order?

In [119]:
# Solution 1
# Store the revenue of each row as a new column on chipo
chipo['revenue'] = chipo['quantity'] * chipo['item_price']

# Total revenue of every order, then the mean across orders
revenue_per_order = chipo.groupby('order_id')['revenue'].sum()
average_revenue = revenue_per_order.mean()

# Rounded to four decimals for display
final = np.round(average_revenue, 4)
final


21.3942

In [122]:
# Solution 2: easy way
# Revenue is computed inline with assign(), so this cell does not depend on a
# `revenue` column having been added to chipo beforehand.
easy_final = (
    chipo
    .assign(revenue=chipo['quantity'] * chipo['item_price'])
    .groupby('order_id')['revenue']
    .sum()
    .mean()
)
np.round(easy_final, 4)


21.3942

### Step 17. How many different items are sold?

In [123]:
# nunique() returns the number of distinct item names directly, without
# building the full frequency table first
chipo['item_name'].nunique()

50

### Step 18. Which are the different items sold?

In [125]:
# value_counts() lists every distinct item name with the number of rows it
# appears in, most frequent first
chipo['item_name'].value_counts()

Chicken Bowl           726
Chicken Burrito        553
Chips and Guacamole    479
Steak Burrito          368
Canned Soft Drink      301
Name: item_name, dtype: int64

### Step 19. Which are the top 10 different items sold?

In [126]:
# Ten most frequent item names. The ranking is by number of rows in which an
# item appears, not by summed quantity.
chipo['item_name'].value_counts().head(10)

Chicken Bowl                    726
Chicken Burrito                 553
Chips and Guacamole             479
Steak Burrito                   368
Canned Soft Drink               301
Steak Bowl                      211
Chips                           211
Bottled Water                   162
Chicken Soft Tacos              115
Chips and Fresh Tomato Salsa    110
Name: item_name, dtype: int64