# Contents
01 Importing Libraries
02 Creating product price classifications
03 03 Creating hierarchy for busiest days
04 Creating hierarchy for busiest hours of the day

# 01 Importing Libraries

In [47]:
# importing libraries
import pandas as pd
import numpy as np
import os

In [48]:
# Folder location string
path = r'C:\Users\ckra9\Documents\CareerFoundry\Instacart Basket Analysis'

In [49]:
# Importing orders and products merged dataframe
df_ords_prods_merged = pd.read_pickle(os.path.join(path, '02 Data','Prepared Data', 'orders_products_merged.pkl'))

In [50]:
# Importing just the first 1 million rows of orders & products merged dataframe
df = df_ords_prods_merged[:1000000]

In [51]:
df.head()

Unnamed: 0,order_id,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order,product_id,add_to_cart_order,reordered,product_name,aisle_id,department_id,prices,_merge
0,2539329,1,prior,1,2,8,,196,1,0,Soda,77,7,9.0,both
1,2398795,1,prior,2,3,7,15.0,196,1,1,Soda,77,7,9.0,both
2,473747,1,prior,3,3,12,21.0,196,1,1,Soda,77,7,9.0,both
3,2254736,1,prior,4,4,7,29.0,196,1,1,Soda,77,7,9.0,both
4,431534,1,prior,5,4,15,28.0,196,1,1,Soda,77,7,9.0,both


# 02 Creating product price classifications

In [52]:
def price_label(row):

  if row['prices'] <= 5:
    return 'Low-range product'
  elif (row['prices'] > 5) and (row['prices'] <= 15):
    return 'Mid-range product'
  elif row['prices'] > 15:
    return 'High range'
  else: return 'Not enough data'

In [53]:
df['price_range'] = df.apply(price_label, axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['price_range'] = df.apply(price_label, axis=1)


In [54]:
df['price_range'].value_counts()

price_range
Mid-range product    756450
Low-range product    243550
Name: count, dtype: int64

In [55]:
df['prices'].max()

14.8

In [56]:
df.loc[df['prices'] > 15, 'price_range_loc'] = 'High-range product'

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.loc[df['prices'] > 15, 'price_range_loc'] = 'High-range product'


In [57]:
df.loc[(df['prices'] <= 15) & (df['prices'] > 5), 'price_range_loc'] = 'Mid-range product' 

In [58]:
df.loc[df['prices'] <= 5, 'price_range_loc'] = 'Low-range product'

In [59]:
df['price_range_loc'].value_counts(dropna = False)

price_range_loc
Mid-range product    756450
Low-range product    243550
Name: count, dtype: int64

# 03 Creating hierarchy for busiest days

In [60]:
df_ords_prods_merged['order_dow'].value_counts(dropna = False)

order_dow
0    6204182
1    5660230
6    4496490
2    4213830
5    4205791
3    3840534
4    3783802
Name: count, dtype: int64

In [61]:
# Creating a hierarchy for busy days of the week
result = []

for value in df_ords_prods_merged["order_dow"]:
  if value == 0:
    result.append("Busiest day")
  elif value == 4:
    result.append("Least busy")
  else:
    result.append("Regularly busy")

In [62]:
print(result)

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



In [63]:
df_ords_prods_merged['busiest_day'] = result

In [64]:
df_ords_prods_merged['busiest_day'].value_counts(dropna = False)

busiest_day
Regularly busy    22416875
Busiest day        6204182
Least busy         3783802
Name: count, dtype: int64

# Q2

In [65]:
# Creating a new hierarchy for 2 busiest and slowest days of the week
result_2 = []

for value in df_ords_prods_merged["order_dow"]:
  if value == 0 or value == 1:
    result_2.append("Busiest days")
  elif value == 4 or value == 3:
    result_2.append("Least busy days")
  else:
    result_2.append("Regularly busy days")

In [66]:
# Creating the new columns for busiest and slowest days in the dataframe
df_ords_prods_merged['Busiest_days'] = result_2

In [67]:
df_ords_prods_merged.head()

Unnamed: 0,order_id,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order,product_id,add_to_cart_order,reordered,product_name,aisle_id,department_id,prices,_merge,busiest_day,Busiest_days
0,2539329,1,prior,1,2,8,,196,1,0,Soda,77,7,9.0,both,Regularly busy,Regularly busy days
1,2398795,1,prior,2,3,7,15.0,196,1,1,Soda,77,7,9.0,both,Regularly busy,Least busy days
2,473747,1,prior,3,3,12,21.0,196,1,1,Soda,77,7,9.0,both,Regularly busy,Least busy days
3,2254736,1,prior,4,4,7,29.0,196,1,1,Soda,77,7,9.0,both,Least busy,Least busy days
4,431534,1,prior,5,4,15,28.0,196,1,1,Soda,77,7,9.0,both,Least busy,Least busy days


# Q3

In [68]:
# Value counts of new column to check for accuracy
df_ords_prods_merged['Busiest_days'].value_counts(dropna = False)

Busiest_days
Regularly busy days    12916111
Busiest days           11864412
Least busy days         7624336
Name: count, dtype: int64

In [69]:
df_ords_prods_merged.shape

(32404859, 17)

All rows/observations were recorded succesfully. The sum of the groupings has distributed properly after busiest_days column was added.

# 04 Creating hierarchy for busiest hours of the day

# Q4

In [70]:
# Checking for busiest hours
df_ords_prods_merged['order_hour_of_day'].value_counts()

order_hour_of_day
10    2761760
11    2736140
14    2689136
15    2662144
13    2660954
12    2618532
16    2535202
9     2454203
17    2087654
8     1718118
18    1636502
19    1258305
20     976156
7      891054
21     795637
22     634225
23     402316
6      290493
0      218769
1      115700
5       87961
2       69375
4       53242
3       51281
Name: count, dtype: int64

In [71]:
# Creating a hierarchy for busiest hours of the day
result_3 = []

for value in df_ords_prods_merged["order_hour_of_day"]:
  if value in [10, 11, 14, 15, 13, 12, 16, 9]:
    result_3.append("Most orders")
  elif value in [23, 6, 0, 1, 5, 2, 4, 3]:
    result_3.append("Fewest orders")
  else:
    result_3.append("Average orders")

In [76]:
# Adding most orders column to the dataframe
df_ords_prods_merged['Busiest_period_of_day'] = result_3

In [77]:
df_ords_prods_merged.head()

Unnamed: 0,order_id,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order,product_id,add_to_cart_order,reordered,product_name,aisle_id,department_id,prices,_merge,busiest_day,Busiest_days,Busiest_period_of_day
0,2539329,1,prior,1,2,8,,196,1,0,Soda,77,7,9.0,both,Regularly busy,Regularly busy days,Average orders
1,2398795,1,prior,2,3,7,15.0,196,1,1,Soda,77,7,9.0,both,Regularly busy,Least busy days,Average orders
2,473747,1,prior,3,3,12,21.0,196,1,1,Soda,77,7,9.0,both,Regularly busy,Least busy days,Most orders
3,2254736,1,prior,4,4,7,29.0,196,1,1,Soda,77,7,9.0,both,Least busy,Least busy days,Average orders
4,431534,1,prior,5,4,15,28.0,196,1,1,Soda,77,7,9.0,both,Least busy,Least busy days,Most orders


# Q5

In [78]:
# Value counts of new column to check for accuracy and frequency count
df_ords_prods_merged['Busiest_period_of_day'].value_counts(dropna = False)

Busiest_period_of_day
Most orders       21118071
Average orders     9997651
Fewest orders      1289137
Name: count, dtype: int64

In [79]:
# Exporting dataframe as pickle file
df_ords_prods_merged.to_pickle(os.path.join(path, '02 Data','Prepared Data', 'orders_products_combined_new_variables.pkl'))