Exercise 4.10 - Coding Etiquette & Excel Reporting

# Final Report (Task 1-4)

List of contents:

1. Import libraries
2. Import 'orders_products_customers_merged' dataset
3. Remove 'first_name' and 'last_name' columns to address PII data
4. Compare customers behavior across geographic regions
5. Create an exclusion flag for low-activity customers and exclude them from the data
6. Export dataframes

## 1 - Import Libraries

In [1]:
# Import Libraries
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import seaborn as sns
import scipy

## 2 - Import 'orders_products_customers_merge' dataframe

In [2]:
# Assigning the folder path to a variable
path = r"C:\Users\Toshiba\09-10-2023 Instacart Basket Analysis"

In [3]:
# Importing the dataframe 'orders_products_customers_merge.pkl' with os library

df_merged = pd.read_pickle(os.path.join(path, '02 Data', 'Prepared Data', 'orders_products_customers_merge.pkl'))

In [4]:
# Apply function to display all columns within the dataframe
pd.set_option('display.max_columns', None)

In [6]:
# Check the output
df_merged.head()

Unnamed: 0,order_id,user_id,order_number,orders_day_of_week,order_hour_of_day,days_since_prior_order,product_id,add_to_cart_order,reordered,product_name,aisle_id,department_id,prices,max_order,loyalty_flag,mean_order,spending_flag,median_order,customer_flag,first_name,last_name,gender,state,age,date_joined,num_of_dependants,marital_status,income
0,2539329,1,1,2,8,11.0,196,1,0,Soda,77,7,9.0,10,New customer,6.367797,Low Spender,20.0,Regular customer,Linda,Nguyen,Female,Alabama,31,2/17/2019,3,married,40423
1,2398795,1,2,3,7,15.0,196,1,1,Soda,77,7,9.0,10,New customer,6.367797,Low Spender,20.0,Regular customer,Linda,Nguyen,Female,Alabama,31,2/17/2019,3,married,40423
2,473747,1,3,3,12,21.0,196,1,1,Soda,77,7,9.0,10,New customer,6.367797,Low Spender,20.0,Regular customer,Linda,Nguyen,Female,Alabama,31,2/17/2019,3,married,40423
3,2254736,1,4,4,7,29.0,196,1,1,Soda,77,7,9.0,10,New customer,6.367797,Low Spender,20.0,Regular customer,Linda,Nguyen,Female,Alabama,31,2/17/2019,3,married,40423
4,431534,1,5,4,15,28.0,196,1,1,Soda,77,7,9.0,10,New customer,6.367797,Low Spender,20.0,Regular customer,Linda,Nguyen,Female,Alabama,31,2/17/2019,3,married,40423


In [8]:
# Check the dimensions
df_merged.shape

(32404859, 28)

# Remove 'first_name' and 'last_name' columns to address PII data

In [9]:
# Drop 'first_name' and 'last_name' columns
df_merged1 = df_merged.drop(columns = ['first_name', 'last_name'])

In [12]:
#Check the result
df_merged1.head(5)

Unnamed: 0,order_id,user_id,order_number,orders_day_of_week,order_hour_of_day,days_since_prior_order,product_id,add_to_cart_order,reordered,product_name,aisle_id,department_id,prices,max_order,loyalty_flag,mean_order,spending_flag,median_order,customer_flag,gender,state,age,date_joined,num_of_dependants,marital_status,income
0,2539329,1,1,2,8,11.0,196,1,0,Soda,77,7,9.0,10,New customer,6.367797,Low Spender,20.0,Regular customer,Female,Alabama,31,2/17/2019,3,married,40423
1,2398795,1,2,3,7,15.0,196,1,1,Soda,77,7,9.0,10,New customer,6.367797,Low Spender,20.0,Regular customer,Female,Alabama,31,2/17/2019,3,married,40423
2,473747,1,3,3,12,21.0,196,1,1,Soda,77,7,9.0,10,New customer,6.367797,Low Spender,20.0,Regular customer,Female,Alabama,31,2/17/2019,3,married,40423
3,2254736,1,4,4,7,29.0,196,1,1,Soda,77,7,9.0,10,New customer,6.367797,Low Spender,20.0,Regular customer,Female,Alabama,31,2/17/2019,3,married,40423
4,431534,1,5,4,15,28.0,196,1,1,Soda,77,7,9.0,10,New customer,6.367797,Low Spender,20.0,Regular customer,Female,Alabama,31,2/17/2019,3,married,40423


# Compare customers behavior across geographic regions

In [13]:
# Check frequency of values within 'state' column
df_merged1['state'].value_counts(dropna = False)

state
Pennsylvania            667082
California              659783
Rhode Island            656913
Georgia                 656389
New Mexico              654494
Arizona                 653964
North Carolina          651900
Oklahoma                651739
Alaska                  648495
Minnesota               647825
Massachusetts           646358
Wyoming                 644255
Virginia                641421
Missouri                640732
Texas                   640394
Colorado                639280
Maine                   638583
North Dakota            638491
Alabama                 638003
Kansas                  637538
Louisiana               637482
Delaware                637024
South Carolina          636754
Oregon                  636425
Arkansas                636144
Nevada                  636139
New York                635983
Montana                 635265
South Dakota            633772
Illinois                633024
Hawaii                  632901
Washington              632852
Mi

## All states will be divided by regions according to Wikipedia list: https://simple.wikipedia.org/wiki/List_of_regions_of_the_United_States

In [14]:
# Create 'regions' list to assign a region for each state
regions = []

for value in df_merged1['state']:
  if value in ['Maine','New Hampshire','Vermont','Massachusetts','Rhode Island','Connecticut','New York','Pennsylvania','New Jersey']:
    regions.append('Northeast')
  elif value in ['Wisconsin','Michigan','Illinois','Indiana','Ohio','North Dakota','South Dakota','Nebraska','Kansas','Minnesota','Iowa','Missouri']:
    regions.append('Midwest')
  elif value in ['Idaho','Montana','Wyoming','Nevada','Utah','Colorado','Arizona','New Mexico']:
    regions.append('West')
  else:
    regions.append('South')

In [15]:
# Create 'regions' column within dataframe
df_merged1['regions'] = regions

In [16]:
# Check the output
df_merged1.head()

Unnamed: 0,order_id,user_id,order_number,orders_day_of_week,order_hour_of_day,days_since_prior_order,product_id,add_to_cart_order,reordered,product_name,aisle_id,department_id,prices,max_order,loyalty_flag,mean_order,spending_flag,median_order,customer_flag,gender,state,age,date_joined,num_of_dependants,marital_status,income,regions
0,2539329,1,1,2,8,11.0,196,1,0,Soda,77,7,9.0,10,New customer,6.367797,Low Spender,20.0,Regular customer,Female,Alabama,31,2/17/2019,3,married,40423,South
1,2398795,1,2,3,7,15.0,196,1,1,Soda,77,7,9.0,10,New customer,6.367797,Low Spender,20.0,Regular customer,Female,Alabama,31,2/17/2019,3,married,40423,South
2,473747,1,3,3,12,21.0,196,1,1,Soda,77,7,9.0,10,New customer,6.367797,Low Spender,20.0,Regular customer,Female,Alabama,31,2/17/2019,3,married,40423,South
3,2254736,1,4,4,7,29.0,196,1,1,Soda,77,7,9.0,10,New customer,6.367797,Low Spender,20.0,Regular customer,Female,Alabama,31,2/17/2019,3,married,40423,South
4,431534,1,5,4,15,28.0,196,1,1,Soda,77,7,9.0,10,New customer,6.367797,Low Spender,20.0,Regular customer,Female,Alabama,31,2/17/2019,3,married,40423,South


In [17]:
# Check frequency of values within 'regions' column
df_merged1['regions'].value_counts()

regions
South        14002341
Midwest       7597325
Northeast     5722736
West          5082457
Name: count, dtype: int64

## Compare customer spendings across regions

In [18]:
# Create crosstab between 'regions' and 'spending_flag' columns
crosstab_reg_spend = pd.crosstab(df_merged1['regions'], df_merged1['spending_flag'], dropna = False)

In [19]:
# Check the output
crosstab_reg_spend

spending_flag,High Spender,Low Spender
regions,Unnamed: 1_level_1,Unnamed: 2_level_1
Midwest,155975,7441350
Northeast,108225,5614511
South,274413,13727928
West,95632,4986825


#### All regions share the same tendency of having more low spenders than high spenders. South region has the highest number of high- and low-spending customers.
####  West has the lowest percentage of low and high spenders.

##  5. Create an exclusion flag for low-activity customers and exclude them from the data

###  Create exclusion 'user_activity_flag'

In [20]:
# Set first condition (low-activity customers if orders number <5) 

df_merged1.loc[df_merged1['max_order'] < 5, 'user_activity_flag'] = 'Low activity'

In [21]:
# Set second condition (regular customers if orders number >=5)
df_merged1.loc[df_merged1['max_order'] >= 5, 'user_activity_flag'] = 'Normal activity'

In [22]:
# Check the output
df_merged1['user_activity_flag'].value_counts(dropna = False)

user_activity_flag
Normal activity    30964564
Low activity        1440295
Name: count, dtype: int64

In [23]:
# Check the output
df_merged1.head()

Unnamed: 0,order_id,user_id,order_number,orders_day_of_week,order_hour_of_day,days_since_prior_order,product_id,add_to_cart_order,reordered,product_name,aisle_id,department_id,prices,max_order,loyalty_flag,mean_order,spending_flag,median_order,customer_flag,gender,state,age,date_joined,num_of_dependants,marital_status,income,regions,user_activity_flag
0,2539329,1,1,2,8,11.0,196,1,0,Soda,77,7,9.0,10,New customer,6.367797,Low Spender,20.0,Regular customer,Female,Alabama,31,2/17/2019,3,married,40423,South,Normal activity
1,2398795,1,2,3,7,15.0,196,1,1,Soda,77,7,9.0,10,New customer,6.367797,Low Spender,20.0,Regular customer,Female,Alabama,31,2/17/2019,3,married,40423,South,Normal activity
2,473747,1,3,3,12,21.0,196,1,1,Soda,77,7,9.0,10,New customer,6.367797,Low Spender,20.0,Regular customer,Female,Alabama,31,2/17/2019,3,married,40423,South,Normal activity
3,2254736,1,4,4,7,29.0,196,1,1,Soda,77,7,9.0,10,New customer,6.367797,Low Spender,20.0,Regular customer,Female,Alabama,31,2/17/2019,3,married,40423,South,Normal activity
4,431534,1,5,4,15,28.0,196,1,1,Soda,77,7,9.0,10,New customer,6.367797,Low Spender,20.0,Regular customer,Female,Alabama,31,2/17/2019,3,married,40423,South,Normal activity


### Remove low-activity customers from dataframe

In [24]:
# Create subset df with only low-activity customers
df_low_activity = df_merged1[df_merged1['user_activity_flag'] == 'Low activity']

In [25]:
# Check the output
df_low_activity['user_activity_flag'].value_counts()

user_activity_flag
Low activity    1440295
Name: count, dtype: int64

In [26]:
# Create subset df with only normal-activity customers
df_normal_activity = df_merged1[df_merged1['user_activity_flag'] == 'Normal activity']

In [27]:
# Check the output
df_normal_activity['user_activity_flag'].value_counts()

user_activity_flag
Normal activity    30964564
Name: count, dtype: int64

In [28]:
# Check the total length of new dataframes
len(df_normal_activity) + len(df_low_activity)

32404859

## 6. Export dataframes

In [29]:
# Export ords_prods_customers dataframe in pkl format
df_merged1.to_pickle(os.path.join(path, '02 Data', 'Prepared Data', 'orders_products_customers_final.pkl'))

In [30]:
# Export df_normal_activity dataframe in pkl format
df_normal_activity.to_pickle(os.path.join(path, '02 Data', 'Prepared Data', 'subset_customers_normal_activity.pkl'))

In [31]:
# Export df_normal_activity dataframe in pkl format
df_low_activity.to_pickle(os.path.join(path, '02 Data', 'Prepared Data', 'subset_customers_low_activity.pkl'))