In [1]:
# Import libraries
import pandas as pd
import numpy as np
import os

# Project root folder.
# Read from the INSTACART_PROJECT_DIR environment variable when it is set, so the
# notebook can run on another machine without editing this cell; otherwise fall
# back to the original local folder.
path = os.environ.get('INSTACART_PROJECT_DIR', r'/Users/josephadamski/Instacart Basket Analysis')

# Import ords_prods_merge from <path>/Data/Prepared Data
# NOTE(review): unpickling can execute arbitrary code - only load pickle files
# that were produced by this project.
ords_prods_merge = pd.read_pickle(os.path.join(path, 'Data', 'Prepared Data', 'ords_prods_merge.pkl'))

# Check shape, then preview the first rows (last expression is the cell output)
print(f"Shape: {ords_prods_merge.shape}")
ords_prods_merge.head()

Shape: (32404859, 15)


Unnamed: 0,product_id,product_name,aisle_id,department_id,prices,order_id,user_id,order_number,orders_day_of_week,order_hour_of_day,days_since_prior_order,first_order_flag,add_to_cart_order,reordered,_merge
0,1,Chocolate Sandwich Cookies,61,19,5.8,3139998,138,28,6,11,3.0,False,5,0,both
1,1,Chocolate Sandwich Cookies,61,19,5.8,1977647,138,30,6,17,20.0,False,1,1,both
2,1,Chocolate Sandwich Cookies,61,19,5.8,389851,709,2,0,21,6.0,False,20,0,both
3,1,Chocolate Sandwich Cookies,61,19,5.8,652770,764,1,3,13,,True,10,0,both
4,1,Chocolate Sandwich Cookies,61,19,5.8,1813452,764,3,4,17,9.0,False,11,1,both


In [2]:
# Create price_label using loc(): one boolean mask per price band
prices = ords_prods_merge['prices']
is_high_range = prices > 15
is_mid_range = (prices <= 15) & (prices > 5)
is_low_range = prices <= 5

# Rows matching none of the masks (e.g. a missing price) are left unlabelled
ords_prods_merge.loc[is_high_range, 'price_label'] = 'High-range product'
ords_prods_merge.loc[is_mid_range, 'price_label'] = 'Mid-range product'
ords_prods_merge.loc[is_low_range, 'price_label'] = 'Low-range product'

# Check values (dropna=False so unlabelled rows would show up as NaN)
ords_prods_merge['price_label'].value_counts(dropna=False)

price_label
Mid-range product     21860860
Low-range product     10126321
High-range product      417678
Name: count, dtype: int64

In [3]:
# Count rows per order day, then list the counts in day-number order
rows_per_day = ords_prods_merge['orders_day_of_week'].value_counts()
rows_per_day.sort_index()

orders_day_of_week
0    6204182
1    5660230
2    4213830
3    3840534
4    3783802
5    4205791
6    4496490
Name: count, dtype: int64

In [4]:
# Create busiest_day column - 0 is busiest, 4 is slowest
# Vectorized with np.select instead of a Python loop over every row:
# the first matching condition wins, and any value matching neither
# condition (including a missing value) falls through to the default.
day_of_week = ords_prods_merge['orders_day_of_week']

ords_prods_merge['busiest_day'] = np.select(
    [day_of_week == 0, day_of_week == 4],
    ['Busiest day', 'Least busy'],
    default='Regularly busy',
)

# Check
ords_prods_merge['busiest_day'].value_counts(dropna=False)

busiest_day
Regularly busy    22416875
Busiest day        6204182
Least busy         3783802
Name: count, dtype: int64

In [5]:
# Rank the order days from most to fewest rows to find the top 2 and bottom 2
rows_per_day_ranked = ords_prods_merge['orders_day_of_week'].value_counts()
rows_per_day_ranked.sort_values(ascending=False)

orders_day_of_week
0    6204182
1    5660230
6    4496490
2    4213830
5    4205791
3    3840534
4    3783802
Name: count, dtype: int64

In [6]:
# Create busiest_days - 0,1 are busiest, 3,4 are slowest
# Vectorized with isin + np.select instead of a Python loop over every row:
# any value in neither group (including a missing value) falls through
# to the default label.
day_of_week = ords_prods_merge['orders_day_of_week']

ords_prods_merge['busiest_days'] = np.select(
    [day_of_week.isin([0, 1]), day_of_week.isin([3, 4])],
    ['Busiest days', 'Slowest days'],
    default='Regularly busy',
)

# Check
ords_prods_merge['busiest_days'].value_counts(dropna=False)

busiest_days
Regularly busy    12916111
Busiest days      11864412
Slowest days       7624336
Name: count, dtype: int64

### Observations on busiest_days Column

The busiest_days column shows:
- Days 0 and 1 (Saturday and Sunday) labeled as "Busiest days" 
- Days 3 and 4 (Tuesday and Wednesday) labeled as "Slowest days"
- Days 2, 5, and 6 (Monday, Thursday, and Friday) labeled as "Regularly busy"

This makes sense as weekends have higher order volumes.

In [7]:
# Count rows per order hour, then list the counts in hour order
rows_per_hour = ords_prods_merge['order_hour_of_day'].value_counts()
rows_per_hour.sort_index()

order_hour_of_day
0      218769
1      115700
2       69375
3       51281
4       53242
5       87961
6      290493
7      891054
8     1718118
9     2454203
10    2761760
11    2736140
12    2618532
13    2660954
14    2689136
15    2662144
16    2535202
17    2087654
18    1636502
19    1258305
20     976156
21     795637
22     634225
23     402316
Name: count, dtype: int64

In [8]:
# Create busiest_period_of_day: one boolean mask per hour band
hour = ords_prods_merge['order_hour_of_day']

# 10 through 16 inclusive
is_peak = hour.ge(10) & hour.le(16)
# 7 up to (not including) 10, or after 16 through 23
is_average = (hour.ge(7) & hour.lt(10)) | (hour.gt(16) & hour.le(23))
# 0 up to (not including) 7
is_quiet = hour.ge(0) & hour.lt(7)

ords_prods_merge.loc[is_peak, 'busiest_period_of_day'] = 'Most orders'

ords_prods_merge.loc[is_average, 'busiest_period_of_day'] = 'Average orders'

ords_prods_merge.loc[is_quiet, 'busiest_period_of_day'] = 'Fewest orders'

In [9]:
# Frequency of each period label (dropna=False so unlabelled rows would be counted too)
period_labels = ords_prods_merge['busiest_period_of_day']
period_labels.value_counts(dropna=False)

busiest_period_of_day
Most orders       18663868
Average orders    12854170
Fewest orders       886821
Name: count, dtype: int64

In [11]:
# Export the updated dataframe to <path>/Data/Prepared Data/ords_prods_merge.pkl
export_file = os.path.join(path, 'Data', 'Prepared Data', 'ords_prods_merge.pkl')
ords_prods_merge.to_pickle(export_file)

# Confirm the export and report the dataframe's final dimensions
print("Dataframe exported successfully!")
print(f"Final shape: {ords_prods_merge.shape}")

Dataframe exported successfully!
Final shape: (32404859, 19)
