In [1]:
# Turn off the notebook's periodic autosave (an interval of 0 disables it).
%autosave 0

Autosave disabled


In [2]:
import numpy as np
import pandas as pd
from sqlalchemy import text, create_engine

from env import get_connection

We can create dataframes from:
- Dictionaries
- SQL queries!

In [3]:
# Build a small DataFrame from a dictionary: each key becomes a column label
# and each list supplies that column's values.
pd.DataFrame(
    {
        'name': ['adam', 'edwige', 'marc', 'theo'],
        'eyewear': ['glasses', 'none', 'glasses', 'glasses'],
    }
)

Unnamed: 0,name,eyewear
0,adam,glasses
1,edwige,none
2,marc,glasses
3,theo,glasses


In order to run a SQL query against Codeup's database, we need a number of things:
- A URL with our credentials and desired database
- A SQLAlchemy engine object created on the URL
- A SQL query as a text object
- A call to pd.read_sql(), passing the text query and a connection obtained from engine.connect()

In [4]:
# get_connection is imported from the local env module; it is expected to return
# a connection URL (credentials + database name) for the database named here.
url = get_connection('farmers_market')

In [5]:
# Create the SQLAlchemy engine from the URL; connections for queries come from it.
engine = create_engine(url)

In [6]:
# Join every customer purchase to its matching product record.
# NOTE: SELECT * keeps product_id from both tables, so the resulting DataFrame
# ends up with two columns labelled product_id.
query = '''
        SELECT *
        FROM customer_purchases
        INNER JOIN product ON customer_purchases.product_id = product.product_id
        '''

In [7]:
# Wrap the raw SQL string in a SQLAlchemy text object before handing it to pandas.
text_query = text(query)

In [8]:
# Open the connection in a context manager so it is closed as soon as the query
# results have been read, instead of leaving an unclosed connection behind.
with engine.connect() as connection:
    df = pd.read_sql(text_query, connection)

# Preview the first five rows of the joined result.
df.head()

Unnamed: 0,product_id,vendor_id,market_date,customer_id,quantity,cost_to_customer_per_qty,transaction_time,product_id.1,product_name,product_size,product_category_id,product_qty_type
0,1,7,2019-07-03,14,0.99,6.99,0 days 17:32:00,1,Habanero Peppers - Organic,medium,1,lbs
1,1,7,2019-07-03,14,2.18,6.99,0 days 18:23:00,1,Habanero Peppers - Organic,medium,1,lbs
2,1,7,2019-07-03,15,1.53,6.99,0 days 18:41:00,1,Habanero Peppers - Organic,medium,1,lbs
3,1,7,2019-07-03,16,2.02,6.99,0 days 18:18:00,1,Habanero Peppers - Organic,medium,1,lbs
4,1,7,2019-07-03,22,0.66,6.99,0 days 17:34:00,1,Habanero Peppers - Organic,medium,1,lbs


We will use the farmers_market database on Codeup's SQL server.

We will run the following query:

SELECT *  

FROM customer_purchases  

INNER JOIN product ON customer_purchases.product_id = product.product_id

In [9]:
# Summarise the column labels, non-null counts and dtypes of the joined result.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4221 entries, 0 to 4220
Data columns (total 12 columns):
 #   Column                    Non-Null Count  Dtype          
---  ------                    --------------  -----          
 0   product_id                4221 non-null   int64          
 1   vendor_id                 4221 non-null   int64          
 2   market_date               4221 non-null   object         
 3   customer_id               4221 non-null   int64          
 4   quantity                  4221 non-null   float64        
 5   cost_to_customer_per_qty  4221 non-null   float64        
 6   transaction_time          4221 non-null   timedelta64[ns]
 7   product_id                4221 non-null   int64          
 8   product_name              4221 non-null   object         
 9   product_size              4221 non-null   object         
 10  product_category_id       4221 non-null   int64          
 11  product_qty_type          4221 non-null   object         
dtypes: float64(2), int64(5), object(4), timedelta64[ns](1)

Boolean masking will not make a triumphant return but it's still relevant!

We can subset our dataframe using .loc and .iloc:
- .loc will use labels to access specific rows/columns
- .iloc will use index positions to access specific rows/columns

In [10]:
# Label-based column slice: .loc includes both endpoints, so every column from
# vendor_id through transaction_time is kept. Only the first five rows are shown.
df.head().loc[:, 'vendor_id':'transaction_time']

Unnamed: 0,vendor_id,market_date,customer_id,quantity,cost_to_customer_per_qty,transaction_time
0,7,2019-07-03,14,0.99,6.99,0 days 17:32:00
1,7,2019-07-03,14,2.18,6.99,0 days 18:23:00
2,7,2019-07-03,15,1.53,6.99,0 days 18:41:00
3,7,2019-07-03,16,2.02,6.99,0 days 18:18:00
4,7,2019-07-03,22,0.66,6.99,0 days 17:34:00


In [11]:
# Position-based slice: the end positions are exclusive, so this selects
# rows 5 through 10 and the columns at positions 3 through 7.
df.iloc[5:11, 3:8]

Unnamed: 0,customer_id,quantity,cost_to_customer_per_qty,transaction_time,product_id
5,4,0.27,6.99,0 days 12:20:00,1
6,12,3.6,6.99,0 days 09:33:00,1
7,14,3.04,6.99,0 days 13:05:00,1
8,23,1.49,6.99,0 days 12:26:00,1
9,23,2.56,6.99,0 days 12:46:00,1
10,3,2.48,6.99,0 days 18:40:00,1


In [12]:
# (number of rows, number of columns)
df.shape

(4221, 12)

We can use the .agg() method to perform a specified aggregate function on one column!

We can also select a list of columns to aggregate and/or pass in a list of aggregate functions to apply!

In [13]:
# Add the amount paid for each purchase: quantity multiplied by the per-unit cost.
df = df.assign(
    total_cost=df['quantity'].mul(df['cost_to_customer_per_qty'])
)
df.head()

Unnamed: 0,product_id,vendor_id,market_date,customer_id,quantity,cost_to_customer_per_qty,transaction_time,product_id.1,product_name,product_size,product_category_id,product_qty_type,total_cost
0,1,7,2019-07-03,14,0.99,6.99,0 days 17:32:00,1,Habanero Peppers - Organic,medium,1,lbs,6.9201
1,1,7,2019-07-03,14,2.18,6.99,0 days 18:23:00,1,Habanero Peppers - Organic,medium,1,lbs,15.2382
2,1,7,2019-07-03,15,1.53,6.99,0 days 18:41:00,1,Habanero Peppers - Organic,medium,1,lbs,10.6947
3,1,7,2019-07-03,16,2.02,6.99,0 days 18:18:00,1,Habanero Peppers - Organic,medium,1,lbs,14.1198
4,1,7,2019-07-03,22,0.66,6.99,0 days 17:34:00,1,Habanero Peppers - Organic,medium,1,lbs,4.6134


In [14]:
# Several aggregates of a single column at once; the result is a Series
# indexed by the aggregate names.
df['total_cost'].agg(['mean', 'max', 'min'])

mean     16.400648
max     144.000000
min       0.069800
Name: total_cost, dtype: float64

In [15]:
# List the column labels now that total_cost has been added.
df.columns

Index(['product_id', 'vendor_id', 'market_date', 'customer_id', 'quantity',
       'cost_to_customer_per_qty', 'transaction_time', 'product_id',
       'product_name', 'product_size', 'product_category_id',
       'product_qty_type', 'total_cost'],
      dtype='object')

In [16]:
# Same aggregates applied to two columns: one row per aggregate, one column per field.
df.loc[:, ['quantity', 'total_cost']].agg(['mean', 'max', 'min'])

Unnamed: 0,quantity,total_cost
mean,2.858112,16.400648
max,20.0,144.0
min,0.02,0.0698


We can use the .groupby() method to consider the unique values in a column.

We then specify a second column on which to perform some aggregate calculations!

In [17]:
# Average purchase total for each product.
df.groupby('product_name')['total_cost'].mean()

product_name
Apple Pie                     31.404930
Banana Peppers - Jar          11.890672
Cherry Pie                    31.867826
Habanero Peppers - Organic    16.120788
Jalapeno Peppers - Organic    10.749224
Poblano Peppers - Organic      1.591040
Sweet Corn                     2.540638
Whole Wheat Bread             16.962217
Name: total_cost, dtype: float64

In [18]:
# For each product, count how many purchases came to each distinct total.
df.groupby('product_name')['total_cost'].value_counts()

product_name       total_cost
Apple Pie          18.0          287
                   36.0          172
                   54.0           85
                   72.0           16
                   90.0            7
                                ... 
Whole Wheat Bread  19.5          109
                   32.5           84
                   39.0            5
                   45.5            4
                   52.0            1
Name: total_cost, Length: 435, dtype: int64

The .transform() method performs an aggregate calculation per group and returns a result that is the same length as the dataframe, so every row receives its own group's value!

In [19]:
# Per-product mean of total_cost, repeated on every row of that product and
# rounded to two decimal places.
df['avg_prod_cost'] = (
    df.groupby('product_name')['total_cost']
    .transform('mean')
    .round(2)
)
df

Unnamed: 0,product_id,vendor_id,market_date,customer_id,quantity,cost_to_customer_per_qty,transaction_time,product_id.1,product_name,product_size,product_category_id,product_qty_type,total_cost,avg_prod_cost
0,1,7,2019-07-03,14,0.99,6.99,0 days 17:32:00,1,Habanero Peppers - Organic,medium,1,lbs,6.9201,16.12
1,1,7,2019-07-03,14,2.18,6.99,0 days 18:23:00,1,Habanero Peppers - Organic,medium,1,lbs,15.2382,16.12
2,1,7,2019-07-03,15,1.53,6.99,0 days 18:41:00,1,Habanero Peppers - Organic,medium,1,lbs,10.6947,16.12
3,1,7,2019-07-03,16,2.02,6.99,0 days 18:18:00,1,Habanero Peppers - Organic,medium,1,lbs,14.1198,16.12
4,1,7,2019-07-03,22,0.66,6.99,0 days 17:34:00,1,Habanero Peppers - Organic,medium,1,lbs,4.6134,16.12
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4216,16,4,2020-09-30,4,2.00,0.50,0 days 18:48:00,16,Sweet Corn,Ear,1,unit,1.0000,2.54
4217,16,4,2020-09-30,11,1.00,0.50,0 days 16:30:00,16,Sweet Corn,Ear,1,unit,0.5000,2.54
4218,16,4,2020-09-30,16,5.00,0.50,0 days 18:25:00,16,Sweet Corn,Ear,1,unit,2.5000,2.54
4219,16,4,2020-09-30,18,6.00,0.50,0 days 18:28:00,16,Sweet Corn,Ear,1,unit,3.0000,2.54


We can use np.where() to assign values based on a condition!

In [20]:
# Label each purchase: totals strictly above 100 are 'big spender',
# everything else is 'normal person'.
df['person'] = np.where(
    df['total_cost'].gt(100),
    'big spender',
    'normal person',
)
df.head()

Unnamed: 0,product_id,vendor_id,market_date,customer_id,quantity,cost_to_customer_per_qty,transaction_time,product_id.1,product_name,product_size,product_category_id,product_qty_type,total_cost,avg_prod_cost,person
0,1,7,2019-07-03,14,0.99,6.99,0 days 17:32:00,1,Habanero Peppers - Organic,medium,1,lbs,6.9201,16.12,normal person
1,1,7,2019-07-03,14,2.18,6.99,0 days 18:23:00,1,Habanero Peppers - Organic,medium,1,lbs,15.2382,16.12,normal person
2,1,7,2019-07-03,15,1.53,6.99,0 days 18:41:00,1,Habanero Peppers - Organic,medium,1,lbs,10.6947,16.12,normal person
3,1,7,2019-07-03,16,2.02,6.99,0 days 18:18:00,1,Habanero Peppers - Organic,medium,1,lbs,14.1198,16.12,normal person
4,1,7,2019-07-03,22,0.66,6.99,0 days 17:34:00,1,Habanero Peppers - Organic,medium,1,lbs,4.6134,16.12,normal person


In [21]:
# How many purchases fall under each label.
df['person'].value_counts()

normal person    4218
big spender         3
Name: person, dtype: int64

In [22]:
# Boolean mask: keep only the rows labelled 'big spender'.
df.loc[df['person'].eq('big spender')]

Unnamed: 0,product_id,vendor_id,market_date,customer_id,quantity,cost_to_customer_per_qty,transaction_time,product_id.1,product_name,product_size,product_category_id,product_qty_type,total_cost,avg_prod_cost,person
3017,7,8,2020-05-02,21,6.0,18.0,0 days 11:02:00,7,Apple Pie,"10""",3,unit,108.0,31.4,big spender
3486,8,8,2019-12-18,22,8.0,18.0,0 days 18:15:00,8,Cherry Pie,"10""",3,unit,144.0,31.87,big spender
3503,8,8,2019-12-25,24,6.0,18.0,0 days 17:52:00,8,Cherry Pie,"10""",3,unit,108.0,31.87,big spender
