In [1]:
# Importing libraries
import pandas as pd
import numpy as np
import os

In [2]:
path = r'C:\Users\andd0\Documents\InstaCart Basket Analysis'

In [3]:
# Importing the ords_prods_merge
ords_prods_merge = pd.read_pickle(os.path.join(path, '02 Data', 'Prepared Data', 'ords_prods_merge.pkl'))

In [4]:
ords_prods_merge.head()

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices,order_id,user_id,order_number,orders_day_of_week,order_hour_of_day,days_since_prior_order,first_order,add_to_cart_order,reordered,_merge,merge_status
0,1,Chocolate Sandwich Cookies,61,19,5.8,3139998,138,28,6,11,3.0,No,5,0,both,both
1,1,Chocolate Sandwich Cookies,61,19,5.8,1977647,138,30,6,17,20.0,No,1,1,both,both
2,1,Chocolate Sandwich Cookies,61,19,5.8,389851,709,2,0,21,6.0,No,20,0,both,both
3,1,Chocolate Sandwich Cookies,61,19,5.8,652770,764,1,3,13,,Yes,10,0,both,both
4,1,Chocolate Sandwich Cookies,61,19,5.8,1813452,764,3,4,17,9.0,No,11,1,both,both


In [5]:
# To avoid memory issues/processing power, we'll work with a subset
# This code means that the subset will include everything from 0 up to the number placed after the colon
df = ords_prods_merge[:1000000]

In [6]:
df.shape

(1000000, 16)

#### If-else statements

In [8]:
# Creating the 'price_label' function to filter products by price range
def price_label(row):

    if row['prices'] <= 5:
        return 'Low-range product'
    elif (row['prices'] > 5) and (row['prices'] <= 15):
        return 'Mid-range product'
    elif row['prices'] > 15:
        return 'High-range'
    else: return 'Not enough data'

#### 'price_range' column

In [10]:
# Using the function! The code below includes this: df['price_range']. This means that a new column will be added to the
# dataframe. Below that column, we'll find the label for each product ('low-range', 'mid-range', etc.)
# To the right side of the = I'm applying the price_label function on the 'df' dataframe. Axis = 1 means the function will be applied to 
# all the rows. Axis = 0 would mean that the function should be applied to all columns

df['price_range'] = df.apply(price_label, axis = 1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['price_range'] = df.apply(price_label, axis = 1)


In [11]:
df['price_range'].value_counts(dropna = False)

price_range
Mid-range product    659373
Low-range product    331283
High-range             9344
Name: count, dtype: int64

In [12]:
df['prices'].max()

24.5

#### Note:

In [14]:
# The outcome above is not the same as the outcome in the material.

# What can explain the differences?

# Tutor's explanation: 

# "For the subset of 1 million records, Python doesn't have a way to know which 1 million records you want to keep. 

# It just picks some set of 1 million, which may not be the same 1 million records from the example."

#### If-Statements with the loc() Function

In [16]:
# When using the loc() function, it's better to use individual cells for each condition
df.loc[df['prices'] > 15, 'price_range_loc'] = 'High-range product'

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.loc[df['prices'] > 15, 'price_range_loc'] = 'High-range product'


In [17]:
df.loc[(df['prices'] <= 15) & (df['prices'] > 5), 'price_range_loc'] = 'Mid-range product'

In [18]:
df.loc[df['prices'] <= 5, 'price_range_loc'] = 'Low-range product'

In [19]:
df['price_range_loc'].value_counts(dropna = False)

price_range_loc
Mid-range product     659373
Low-range product     331283
High-range product      9344
Name: count, dtype: int64

#### Applying the loc() function to the entire dataframe - not the subset

In [21]:
ords_prods_merge.loc[ords_prods_merge['prices'] > 15, 'price_range_loc'] = 'High-range product'

In [22]:
ords_prods_merge.loc[(ords_prods_merge['prices'] <= 15) & (ords_prods_merge['prices'] > 5), 'price_range_loc'] = 'Mid-range product'

In [23]:
ords_prods_merge.loc[ords_prods_merge['prices'] <= 5, 'price_range_loc'] = 'Low-range product'

In [24]:
ords_prods_merge['price_range_loc'].value_counts(dropna = False)

price_range_loc
Mid-range product     21890146
Low-range product     10131511
High-range product      412555
Name: count, dtype: int64

In [25]:
ords_prods_merge['prices'].max()

25.0

#### User-defined function vs. loc()

In [27]:
# The loc() method runs much faster

# The loc() function applies the conditional filters before searching through the dataframe

# The user-defined function searches through the entire dataframe and then determines where to set the filters

# The loc() function allows tow filter the entire dataframe rather than just a subset

#### If-statements with For loops

In [29]:
# Let's figure out the rank of busiest days - that is, the days when the most orders take place
ords_prods_merge['orders_day_of_week'].value_counts(dropna = False)

orders_day_of_week
0    6209632
1    5665830
6    4500246
2    4217766
5    4209449
3    3844096
4    3787193
Name: count, dtype: int64

In [30]:
# 0 - Saturday is the busiest day (the day when the most orders take place)

# 4 - Wednesday is the least busy day

#### How to use this info?

In [32]:
# Create a new column, “busiest day,” that will contain one of three different values: “Busiest day,” “Least busy,” and “Regularly busy.”

# How? 

# Creating a for-loop that will run through every row in the “orders_day_of_week” column, 

# compare values with the busiest and slowest days, and assign it the corresponding string value.

#### For-loop

In [34]:
result = []

for value in ords_prods_merge["orders_day_of_week"]:
    if value == 0:
        result.append("Busiest day")
    elif value == 4:
        result.append("Least busy")
    else:
        result.append("Regularly busy")

In [35]:
# This for loop will check every single value under the 'orders_day_of_week'.

# Then, it will apply the conditions above. After that, it will push the condition to the 'result = []' list 

#### Outcome:

In [37]:
result

['Regularly busy',
 'Regularly busy',
 'Busiest day',
 'Regularly busy',
 'Least busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Least busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Busiest day',
 'Busiest day',
 'Busiest day',
 'Busiest day',
 'Busiest day',
 'Least busy',
 'Regularly busy',
 'Regularly busy',
 'Busiest day',
 'Regularly busy',
 'Regularly busy',
 'Least busy',
 'Regularly busy',
 'Busiest day',
 'Regularly busy',
 'Busiest day',
 'Least busy',
 'Busiest day',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Bus

#### Creating a column (busiest_day) within the dataframe and setting it equal to result

In [39]:
ords_prods_merge['busiest_day'] = result

In [40]:
ords_prods_merge['busiest_day'].value_counts(dropna = False)

busiest_day
Regularly busy    22437387
Busiest day        6209632
Least busy         3787193
Name: count, dtype: int64

## Task 4.7

#### Step No. 2: Suppose your clients have changed their minds about the labels you created in your “busiest_day” column.

In [43]:
# Now, they want “Busiest day” to become “Busiest days” (plural). 

# This label should correspond with the two busiest days of the week as opposed to the single busiest day. 

# At the same time, they’d also like to know the two slowest days. Create a new column for this using a suitable method.

In [44]:
results = []

for value in ords_prods_merge["orders_day_of_week"]:
    if value in (0, 1):
        results.append("Busiest days")
    elif value in(3, 4):
        results.append("Slowest days")
    else:
        results.append("Regular days")

#### Explaining the code:

In [46]:
# I already have the info about the busiest days: Saturday (0) and Sunday (1) are the busiest days

# Whereas Wednesday (3) and Thursday (4) are the slowest days

# So, all I need is to add a logical operator (or) to the for loop we used a few lines above

# On every loop, the line 'if value in (0, 1)' will check if the day is either Saturday (0) OR Sunday (1); if it is, then it will add 'Busiest days' to
# the 'results = []' list.

# If it's not Saturday OR Sunday, the second line (elif value in(3, 4:) will execute and check whether the day is either Wednesday (3) or
# Thursday (4). If so, 'Slowest days' will be pushed to the 'results = []' list.

# If none of the two previous conditions is met, then the the 'else' will execute, meaning that 'Regular days' will be pushed to the results list.

In [47]:
results

['Regular days',
 'Regular days',
 'Busiest days',
 'Slowest days',
 'Slowest days',
 'Busiest days',
 'Regular days',
 'Slowest days',
 'Busiest days',
 'Busiest days',
 'Regular days',
 'Slowest days',
 'Slowest days',
 'Regular days',
 'Slowest days',
 'Regular days',
 'Regular days',
 'Regular days',
 'Busiest days',
 'Busiest days',
 'Regular days',
 'Busiest days',
 'Busiest days',
 'Busiest days',
 'Busiest days',
 'Busiest days',
 'Busiest days',
 'Busiest days',
 'Busiest days',
 'Busiest days',
 'Slowest days',
 'Regular days',
 'Busiest days',
 'Busiest days',
 'Regular days',
 'Regular days',
 'Slowest days',
 'Busiest days',
 'Busiest days',
 'Busiest days',
 'Busiest days',
 'Slowest days',
 'Busiest days',
 'Busiest days',
 'Busiest days',
 'Slowest days',
 'Regular days',
 'Regular days',
 'Busiest days',
 'Busiest days',
 'Busiest days',
 'Slowest days',
 'Regular days',
 'Busiest days',
 'Regular days',
 'Busiest days',
 'Busiest days',
 'Regular days',
 'Busiest days

#### Creating the 'busiest_days' column

In [49]:
ords_prods_merge['busiest_days'] = results

#### Step No. 3: Check the values of this new column for accuracy. Note any observations in markdown format.

In [51]:
ords_prods_merge['busiest_days'].value_counts(dropna = False)

busiest_days
Regular days    12927461
Busiest days    11875462
Slowest days     7631289
Name: count, dtype: int64

In [52]:
# The sum of the above values is equal to = 32,434,212

#### Step No. 4. The senior technical officer at Instacart wants you to identify the busiest hours of the day.

In [54]:
# Rather than by hour, they want periods of time labeled “Most orders,” “Average orders,” and “Fewest orders.” 

# Create a new column containing these labels called “busiest_period_of_day.”

#### First, let's get a list from the busiest hour to the least busy one

In [56]:
ords_prods_merge['order_hour_of_day'].value_counts(dropna = False)

order_hour_of_day
10    2764390
11    2738585
14    2691563
15    2664522
13    2663272
12    2620800
16    2537469
9     2456661
17    2089452
8     1719952
18    1637922
19    1259382
20     977017
7      891928
21     796362
22     634737
23     402612
6      290770
0      218942
1      115786
5       88057
2       69431
4       53283
3       51317
Name: count, dtype: int64

#### Most orders = first 8 hours; Fewest orders = last 8 hours; Average orders = those in the middle

In [58]:
results_hours = []

for value in ords_prods_merge["order_hour_of_day"]:
    if value in (9, 10, 11, 12, 13, 14, 15, 16):
        results_hours.append("Most orders")
    elif value in (7, 8, 17, 18, 19, 20, 21, 22):
        results_hours.append("Average orders")
    else:
        results_hours.append("Fewest orders")

#### Creating the column 'busiest_period_of_day' and assigning to it the values stored in 'results_hours'

In [60]:
ords_prods_merge['busiest_period_of_day'] = results_hours

#### Step No. 5: Print the frequency for this new column.

In [62]:
ords_prods_merge['busiest_period_of_day'].value_counts(dropna = False)

busiest_period_of_day
Most orders       21137262
Average orders    10006752
Fewest orders      1290198
Name: count, dtype: int64

In [63]:
# The sum of the above values is equal to = 32,434,212

In [64]:
ords_prods_merge.head()

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices,order_id,user_id,order_number,orders_day_of_week,order_hour_of_day,days_since_prior_order,first_order,add_to_cart_order,reordered,_merge,merge_status,price_range_loc,busiest_day,busiest_days,busiest_period_of_day
0,1,Chocolate Sandwich Cookies,61,19,5.8,3139998,138,28,6,11,3.0,No,5,0,both,both,Mid-range product,Regularly busy,Regular days,Most orders
1,1,Chocolate Sandwich Cookies,61,19,5.8,1977647,138,30,6,17,20.0,No,1,1,both,both,Mid-range product,Regularly busy,Regular days,Average orders
2,1,Chocolate Sandwich Cookies,61,19,5.8,389851,709,2,0,21,6.0,No,20,0,both,both,Mid-range product,Busiest day,Busiest days,Average orders
3,1,Chocolate Sandwich Cookies,61,19,5.8,652770,764,1,3,13,,Yes,10,0,both,both,Mid-range product,Regularly busy,Slowest days,Most orders
4,1,Chocolate Sandwich Cookies,61,19,5.8,1813452,764,3,4,17,9.0,No,11,1,both,both,Mid-range product,Least busy,Slowest days,Average orders


In [65]:
ords_prods_merge.to_pickle(os.path.join(path, '02 Data','Prepared Data', 'ords_prods_merge_extra_columns_T4_7.pkl'))