# IC New Variables Derivation

## This notebook contains the following topics:
### 01. Importing Libraries and Files
### 02. If-Statements with User-Defined Functions
### 03. If-Statements with the loc() Function (Creating new variable 'price_range_loc')
### 04. If-Statements with For-Loops (Creating new variable 'busiest_day' and 'busiest_days')
#### a) Creating conditions for the new column 'busiest_period_of_day' with loc() function
### 05. Exporting Dataframe with New Variables

# 01. Importing Libraries and Files

In [1]:
# Import libraries
import pandas as pd
import numpy as np
import os

In [2]:
# Create folder path
path = r'C:\Users\ezrela\Documents\CareerFoundry\DD-MM-2020 Instacart Basket Analysis'

In [3]:
# Import orders_products_merged.pkl as ords_prods_merged
ords_prods_merged = pd.read_pickle(os.path.join(path, '02 Data', 'Prepared Data', 'orders_products_merged_updated.pkl'))

In [41]:
# Checking output
ords_prods_merged['prices'].describe()

count    3.240486e+07
mean     1.198023e+01
std      4.956554e+02
min      1.000000e+00
25%      4.200000e+00
50%      7.400000e+00
75%      1.130000e+01
max      9.999900e+04
Name: prices, dtype: float64

# 02. If-Statements with User-Defined Functions

In [5]:
# Creating a subset for dataframe
df = ords_prods_merged[:1000000]

In [6]:
df.shape

(1000000, 17)

In [7]:
# Defining function
def price_label(row):

  if row['prices'] <= 5:
    return 'Low-range product'
  elif (row['prices'] > 5) and (row['prices'] <= 15):
    return 'Mid-range product'
  elif row['prices'] > 15:
    return 'High range'
  else: return 'Not enough data'

In [8]:
# Applying function on subset df
df['price_range'] = df.apply(price_label, axis = 1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['price_range'] = df.apply(price_label, axis = 1)


In [9]:
df['price_range'].value_counts(dropna = False)

price_range
Mid-range product    756450
Low-range product    243550
Name: count, dtype: int64

In [10]:
# Checking the max value in 'price_range'
df['prices'].max()

14.8

# 03. If-Statements with the loc() Function

##### Running the loc() function on the subset df

In [11]:
# Creating conditions with the loc() function
df.loc[df['prices'] > 15, 'price_range_loc'] = 'High-range product'

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.loc[df['prices'] > 15, 'price_range_loc'] = 'High-range product'


In [12]:
df.loc[(df['prices'] <= 15) & (df['prices'] > 5), 'price_range_loc'] = 'Mid-range product'

In [13]:
df.loc[df['prices'] <= 5, 'price_range_loc'] = 'Low-range product'

In [14]:
# Checking frequency of newly created variable 'price_range_loc'
df['price_range_loc'].value_counts(dropna = False)

price_range_loc
Mid-range product    756450
Low-range product    243550
Name: count, dtype: int64

##### Running the loc() function on the entire ords_prods_merged df

In [15]:
# Creating conditions with the loc() function
ords_prods_merged.loc[ords_prods_merged['prices'] > 15, 'price_range_loc'] = 'High-range product'

In [16]:
ords_prods_merged.loc[(ords_prods_merged['prices'] <= 15) & (ords_prods_merged['prices'] > 5), 'price_range_loc'] = 'Mid-range product'

In [17]:
ords_prods_merged.loc[ords_prods_merged['prices'] <= 5, 'price_range_loc'] = 'Low-range product'

In [18]:
# Checking frequency of newly created variable 'price_range_loc'
ords_prods_merged['price_range_loc'].value_counts(dropna = False)

price_range_loc
Mid-range product     21860860
Low-range product     10126321
High-range product      417678
Name: count, dtype: int64

# 04. If-Statements with For-Loops

In [19]:
ords_prods_merged.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32404859 entries, 0 to 32404858
Data columns (total 18 columns):
 #   Column                  Dtype   
---  ------                  -----   
 0   order_id                int32   
 1   user_id                 int32   
 2   order_number            int8    
 3   order_day_of_week       int8    
 4   order_hour_of_day       int8    
 5   days_since_prior_order  float16 
 6   new_user_order          bool    
 7   product_id              int32   
 8   add_to_cart_order       int32   
 9   reordered               int8    
 10  _merge 1                category
 11  Unnamed: 0              int64   
 12  product_name            object  
 13  aisle_id                int32   
 14  department_id           int64   
 15  prices                  float64 
 16  _merge                  category
 17  price_range_loc         object  
dtypes: bool(1), category(2), float16(1), float64(1), int32(5), int64(2), int8(4), object(2)
memory usage: 2.1+ GB


In [20]:
# Printing the frequency of the “order_day_of_week” column
ords_prods_merged['order_day_of_week'].value_counts(dropna = False)

order_day_of_week
0    6204182
1    5660230
6    4496490
2    4213830
5    4205791
3    3840534
4    3783802
Name: count, dtype: int64

In [21]:
# Creating conditional for-loop code
result = []

for value in ords_prods_merged["order_day_of_week"]:
  if value == 0:
    result.append("Busiest day")
  elif value == 4:
    result.append("Least busy")
  else:
    result.append("Regularly busy")

In [22]:
result

['Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Least busy',
 'Least busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Least busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Least busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Least busy',
 'Least busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Least busy',
 'Regularly busy',
 'Busiest day',
 'Regularly busy',
 'Reg

In [23]:
# Creating new column on ords_prods_merged for 'result'
ords_prods_merged['busiest_day'] = result

In [24]:
ords_prods_merged['busiest_day'].value_counts(dropna = False)

busiest_day
Regularly busy    22416875
Busiest day        6204182
Least busy         3783802
Name: count, dtype: int64

### Suppose your clients have changed their minds about the labels you created in your “busiest_day” column. Now, they want “Busiest day” to become “Busiest days” (plural). This label should correspond with the two busiest days of the week as opposed to the single busiest day. At the same time, they’d also like to know the two slowest days. Create a new column for this using a suitable method

In [25]:
# Creating conditional for-loop code for 2 busiest days and 2 slowest days
result = []

for value in ords_prods_merged["order_day_of_week"]:
  if value == 0:
    result.append("Busiest day 1")
  elif value == 1:
    result.append("Busiest day 2")
  elif value == 4:
    result.append("Least busy 1")
  elif value == 3:
    result.append("Least busy 2")
  else:
    result.append("Regularly busy")

In [26]:
result

['Regularly busy',
 'Least busy 2',
 'Least busy 2',
 'Least busy 1',
 'Least busy 1',
 'Regularly busy',
 'Busiest day 2',
 'Busiest day 2',
 'Busiest day 2',
 'Least busy 1',
 'Busiest day 2',
 'Regularly busy',
 'Regularly busy',
 'Busiest day 2',
 'Busiest day 2',
 'Regularly busy',
 'Regularly busy',
 'Least busy 1',
 'Least busy 2',
 'Least busy 2',
 'Least busy 2',
 'Least busy 1',
 'Least busy 1',
 'Busiest day 2',
 'Busiest day 2',
 'Busiest day 2',
 'Regularly busy',
 'Regularly busy',
 'Busiest day 2',
 'Regularly busy',
 'Regularly busy',
 'Busiest day 2',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Least busy 2',
 'Regularly busy',
 'Least busy 2',
 'Busiest day 2',
 'Busiest day 2',
 'Regularly busy',
 'Least busy 2',
 'Least busy 2',
 'Regularly busy',
 'Regularly busy',
 'Busiest day 2',
 'Busiest day 2',
 'Regularly busy',
 'Busiest day 2',
 'Busiest day 2',
 'Least busy 1',
 'Regularly busy',
 'Busiest day 1',
 'Busiest day 2',
 'Busiest day 2',
 'Busie

In [27]:
# Creating new column on ords_prods_merged for 'result'
ords_prods_merged['busiest_days'] = result

In [28]:
ords_prods_merged.head()

Unnamed: 0.1,order_id,user_id,order_number,order_day_of_week,order_hour_of_day,days_since_prior_order,new_user_order,product_id,add_to_cart_order,reordered,_merge 1,Unnamed: 0,product_name,aisle_id,department_id,prices,_merge,price_range_loc,busiest_day,busiest_days
0,2539329,1,1,2,8,,True,196,1,0,both,195,Soda,77,7,9.0,both,Mid-range product,Regularly busy,Regularly busy
1,2398795,1,2,3,7,15.0,False,196,1,1,both,195,Soda,77,7,9.0,both,Mid-range product,Regularly busy,Least busy 2
2,473747,1,3,3,12,21.0,False,196,1,1,both,195,Soda,77,7,9.0,both,Mid-range product,Regularly busy,Least busy 2
3,2254736,1,4,4,7,29.0,False,196,1,1,both,195,Soda,77,7,9.0,both,Mid-range product,Least busy,Least busy 1
4,431534,1,5,4,15,28.0,False,196,1,1,both,195,Soda,77,7,9.0,both,Mid-range product,Least busy,Least busy 1


In [29]:
ords_prods_merged['busiest_days'].value_counts(dropna = False)

busiest_days
Regularly busy    12916111
Busiest day 1      6204182
Busiest day 2      5660230
Least busy 2       3840534
Least busy 1       3783802
Name: count, dtype: int64

In [30]:
ords_prods_merged['busiest_day'].value_counts(dropna = False)

busiest_day
Regularly busy    22416875
Busiest day        6204182
Least busy         3783802
Name: count, dtype: int64

##### The difference between the columns 'busiest_day' and 'busiest_days' is in the number of 'Regularly busy', since additional labels were created for busiest day 2 and least busy day 2 from this number ('Regularly busy').

### When too many users make Instacart orders at the same time, the app freezes. The senior technical officer at Instacart wants you to identify the busiest hours of the day. Rather than by hour, they want periods of time labeled “Most orders,” “Average orders,” and “Fewest orders.” Create a new column containing these labels called “busiest_period_of_day.”

In [31]:
# Printing the frequency of the “order_hour_of_day” column
ords_prods_merged['order_hour_of_day'].value_counts(dropna = False)

order_hour_of_day
10    2761760
11    2736140
14    2689136
15    2662144
13    2660954
12    2618532
16    2535202
9     2454203
17    2087654
8     1718118
18    1636502
19    1258305
20     976156
7      891054
21     795637
22     634225
23     402316
6      290493
0      218769
1      115700
5       87961
2       69375
4       53242
3       51281
Name: count, dtype: int64

#### Creating conditions for the new column 'busiest_period_of_day' with loc() function

In [32]:
ords_prods_merged.loc[(ords_prods_merged['order_hour_of_day'] >= 9) & (ords_prods_merged['order_hour_of_day'] <= 16), 'busiest_period_of_day'] = 'Most orders'

In [33]:
ords_prods_merged.loc[(ords_prods_merged['order_hour_of_day'] > 16) & (ords_prods_merged['order_hour_of_day'] <= 22), 'busiest_period_of_day'] = 'Average orders'

In [34]:
ords_prods_merged.loc[ords_prods_merged['order_hour_of_day'] == 7,'busiest_period_of_day'] = 'Average orders'

In [35]:
ords_prods_merged.loc[ords_prods_merged['order_hour_of_day'] == 8,'busiest_period_of_day'] = 'Average orders'

In [36]:
ords_prods_merged.loc[ords_prods_merged['order_hour_of_day'] < 7, 'busiest_period_of_day'] = 'Fewest orders'

In [37]:
ords_prods_merged.loc[ords_prods_merged['order_hour_of_day'] == 23, 'busiest_period_of_day'] = 'Fewest orders'

### Print the frequency for this new column

In [38]:
# Frequency of new column 'busiest_period_of_day'
ords_prods_merged['busiest_period_of_day'].value_counts(dropna = False)

busiest_period_of_day
Most orders       21118071
Average orders     9997651
Fewest orders      1289137
Name: count, dtype: int64

In [39]:
ords_prods_merged.columns

Index(['order_id', 'user_id', 'order_number', 'order_day_of_week',
       'order_hour_of_day', 'days_since_prior_order', 'new_user_order',
       'product_id', 'add_to_cart_order', 'reordered', '_merge 1',
       'Unnamed: 0', 'product_name', 'aisle_id', 'department_id', 'prices',
       '_merge', 'price_range_loc', 'busiest_day', 'busiest_days',
       'busiest_period_of_day'],
      dtype='object')

# 05. Exporting Dataframe with New Variables

In [42]:
# Export ords_prods_merged as orders_products_updated.pkl
ords_prods_merged.to_pickle(os.path.join(path, '02 Data', 'Prepared Data', 'orders_products_updated_4_7.pkl'))