# Table of Contents
### Import libraries
### Step 1: Import merged data set w/new customers column
### Step 2: Address PII Data
### Step 3: Create regional segmentation of data
### Step 4: Create exclusion flag for low-activity customers, remove from data
### Export Dataframe 

In [1]:
# Import libraries 
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import seaborn as sns
import scipy

In [2]:
# Root folder of the Instacart project, stored as a raw string; the load and
# export cells in this notebook build their file locations from it with os.path.join.
# NOTE(review): absolute, machine-specific path — update it when running elsewhere.
path = r'/Users/davesmac/Desktop/04-2022- Instacart Basket Analysis'

In [3]:
# Lift pandas' display truncation so every column and every row is rendered
for display_option in ('display.max_columns', 'display.max_rows'):
    pd.set_option(display_option, None)

### Step 1: Import data set (merged version containing the new customers column)

In [4]:
# Import the orders_products_all dataframe from the '02 Data/Prepared Data' folder
# NOTE(review): unpickling can execute arbitrary code — only load files produced locally / from a trusted source.
ords_prods_all = pd.read_pickle(os.path.join(path, '02 Data', 'Prepared Data', 'orders_products_all.pkl'))

In [5]:
# Preview the first rows to confirm the import and inspect the available columns
ords_prods_all.head()

Unnamed: 0,order_id,user_id,order_number,orders_day_of_week,order_hour_of_day,days_since_prior_order,first_order,product_id,add_to_cart_order,reordered,product_name,aisle_id,department_id,prices,price_label,Busiest_days,busiest_period_of_day,max_order,loyalty_flag,avg_price_per_order,spender_flag,order_frequency,order_frequency_flag,First Name,Surnam,Gender,STATE,Age,date_joined,n_dependants,marital_status,income,_merge
0,2539329.0,1.0,1.0,2.0,8.0,,True,196.0,1.0,0.0,Soda,77.0,7.0,9.0,Mid-range product,Regular busy,Average orders,10.0,new customer,6.367797,Low Spender,20.259259,Non-frequent customer,Linda,Nguyen,Female,Alabama,31,2/17/2019,3,married,40423,both
1,2398795.0,1.0,2.0,3.0,7.0,15.0,False,196.0,1.0,1.0,Soda,77.0,7.0,9.0,Mid-range product,Slowest days,Average orders,10.0,new customer,6.367797,Low Spender,20.259259,Non-frequent customer,Linda,Nguyen,Female,Alabama,31,2/17/2019,3,married,40423,both
2,473747.0,1.0,3.0,3.0,12.0,21.0,False,196.0,1.0,1.0,Soda,77.0,7.0,9.0,Mid-range product,Slowest days,Average orders,10.0,new customer,6.367797,Low Spender,20.259259,Non-frequent customer,Linda,Nguyen,Female,Alabama,31,2/17/2019,3,married,40423,both
3,2254736.0,1.0,4.0,4.0,7.0,29.0,False,196.0,1.0,1.0,Soda,77.0,7.0,9.0,Mid-range product,Slowest days,Average orders,10.0,new customer,6.367797,Low Spender,20.259259,Non-frequent customer,Linda,Nguyen,Female,Alabama,31,2/17/2019,3,married,40423,both
4,431534.0,1.0,5.0,4.0,15.0,28.0,False,196.0,1.0,1.0,Soda,77.0,7.0,9.0,Mid-range product,Slowest days,Average orders,10.0,new customer,6.367797,Low Spender,20.259259,Non-frequent customer,Linda,Nguyen,Female,Alabama,31,2/17/2019,3,married,40423,both


### Step 2: Address PII Data

In [6]:
# Remove the personally identifiable name fields from the dataframe
# ('Surnam' is the column's actual spelling in the data)
ords_prods_all = ords_prods_all.drop(['First Name', 'Surnam'], axis='columns')

In [7]:
# Confirm the name columns are no longer present
ords_prods_all.head()

Unnamed: 0,order_id,user_id,order_number,orders_day_of_week,order_hour_of_day,days_since_prior_order,first_order,product_id,add_to_cart_order,reordered,product_name,aisle_id,department_id,prices,price_label,Busiest_days,busiest_period_of_day,max_order,loyalty_flag,avg_price_per_order,spender_flag,order_frequency,order_frequency_flag,Gender,STATE,Age,date_joined,n_dependants,marital_status,income,_merge
0,2539329.0,1.0,1.0,2.0,8.0,,True,196.0,1.0,0.0,Soda,77.0,7.0,9.0,Mid-range product,Regular busy,Average orders,10.0,new customer,6.367797,Low Spender,20.259259,Non-frequent customer,Female,Alabama,31,2/17/2019,3,married,40423,both
1,2398795.0,1.0,2.0,3.0,7.0,15.0,False,196.0,1.0,1.0,Soda,77.0,7.0,9.0,Mid-range product,Slowest days,Average orders,10.0,new customer,6.367797,Low Spender,20.259259,Non-frequent customer,Female,Alabama,31,2/17/2019,3,married,40423,both
2,473747.0,1.0,3.0,3.0,12.0,21.0,False,196.0,1.0,1.0,Soda,77.0,7.0,9.0,Mid-range product,Slowest days,Average orders,10.0,new customer,6.367797,Low Spender,20.259259,Non-frequent customer,Female,Alabama,31,2/17/2019,3,married,40423,both
3,2254736.0,1.0,4.0,4.0,7.0,29.0,False,196.0,1.0,1.0,Soda,77.0,7.0,9.0,Mid-range product,Slowest days,Average orders,10.0,new customer,6.367797,Low Spender,20.259259,Non-frequent customer,Female,Alabama,31,2/17/2019,3,married,40423,both
4,431534.0,1.0,5.0,4.0,15.0,28.0,False,196.0,1.0,1.0,Soda,77.0,7.0,9.0,Mid-range product,Slowest days,Average orders,10.0,new customer,6.367797,Low Spender,20.259259,Non-frequent customer,Female,Alabama,31,2/17/2019,3,married,40423,both


In [8]:
# Check dimensions as (rows, columns) after dropping the name columns
ords_prods_all.shape

(32399732, 31)

### Step 3: Create regional segmentation of data

In [9]:
# Create Region labels
# Each STATE is looked up in a state -> region table built from the four
# groupings below. The lookup is vectorised with Series.map, so there is no
# Python-level loop over every row, and it always yields exactly one label per
# row: a STATE that is missing or not listed becomes NaN instead of being
# skipped (a skipped row would leave `region` shorter than the dataframe and
# make the 'Region' column assignment fail).
states_by_region = {
    'Northeast': [
        'Maine', 'New Hampshire', 'Vermont', 'Massachusetts', 'Rhode Island',
        'Connecticut', 'New York', 'Pennsylvania', 'New Jersey',
    ],
    'Midwest': [
        'Wisconsin', 'Michigan', 'Illinois', 'Indiana', 'Ohio', 'North Dakota',
        'South Dakota', 'Nebraska', 'Kansas', 'Minnesota', 'Iowa', 'Missouri',
    ],
    'South': [
        'Delaware', 'Maryland', 'District of Columbia', 'Virginia',
        'West Virginia', 'North Carolina', 'South Carolina', 'Georgia',
        'Florida', 'Kentucky', 'Tennessee', 'Mississippi', 'Alabama',
        'Oklahoma', 'Texas', 'Arkansas', 'Louisiana',
    ],
    'West': [
        'Idaho', 'Montana', 'Wyoming', 'Nevada', 'Utah', 'Colorado', 'Arizona',
        'New Mexico', 'Alaska', 'Washington', 'Oregon', 'California', 'Hawaii',
    ],
}

# Invert the groupings into a flat {state: region} lookup table
state_to_region = {
    state: region_name
    for region_name, states in states_by_region.items()
    for state in states
}

# `region` is a Series sharing the dataframe's index, so it can be assigned
# directly as a new column
region = ords_prods_all['STATE'].map(state_to_region)

In [10]:
# Create Region column from the region labels built in the previous cell
ords_prods_all['Region'] = region

In [11]:
# Count of rows in each region; dropna=False keeps a NaN bucket so any
# rows without a region label would show up in the result
ords_prods_all['Region'].value_counts(dropna = False)

South        10790096
West          8291679
Midwest       7596065
Northeast     5721892
Name: Region, dtype: int64

In [12]:
# Cross-tabulate region (rows) against spending flag (columns);
# dropna=False keeps categories whose entries are all NaN
crosstab_region_spending = pd.crosstab(
    index=ords_prods_all['Region'],
    columns=ords_prods_all['spender_flag'],
    dropna=False,
)

In [13]:
# Display the region x spender_flag counts
crosstab_region_spending

spender_flag,High Spender,Low Spender
Region,Unnamed: 1_level_1,Unnamed: 2_level_1
Midwest,154715,7441350
Northeast,107381,5614511
South,207902,10582194
West,159120,8132559


#### In every region, there are far more low spenders than high spenders (high spenders account for roughly 2% of rows in each region). There does not seem to be a noticeable difference in spending habits across regions.

### Step 4: Create exclusion flag for low-activity customers, remove from data

In [17]:
# Create low-activity flag: rows whose max_order is below 5
# (max_order is presumably the customer's highest order number — TODO confirm)
low_activity_rows = ords_prods_all['max_order'].lt(5)
ords_prods_all.loc[low_activity_rows, 'activity_flag'] = 'low-activity'

In [18]:
# Create normal-activity flag: rows whose max_order is 5 or more
normal_activity_rows = ords_prods_all['max_order'].ge(5)
ords_prods_all.loc[normal_activity_rows, 'activity_flag'] = 'normal-activity'

In [19]:
# Create sample excluding low-activity customers:
# keep only the rows flagged 'normal-activity'
is_normal_activity = ords_prods_all['activity_flag'].eq('normal-activity')
ords_prods_all_norm_act = ords_prods_all.loc[is_normal_activity]

In [20]:
# Check dimensions as (rows, columns) of the normal-activity sample
ords_prods_all_norm_act.shape

(30959687, 33)

In [21]:
# Preview the sample, including the new Region and activity_flag columns
ords_prods_all_norm_act.head()

Unnamed: 0,order_id,user_id,order_number,orders_day_of_week,order_hour_of_day,days_since_prior_order,first_order,product_id,add_to_cart_order,reordered,product_name,aisle_id,department_id,prices,price_label,Busiest_days,busiest_period_of_day,max_order,loyalty_flag,avg_price_per_order,spender_flag,order_frequency,order_frequency_flag,Gender,STATE,Age,date_joined,n_dependants,marital_status,income,_merge,Region,activity_flag
0,2539329.0,1.0,1.0,2.0,8.0,,True,196.0,1.0,0.0,Soda,77.0,7.0,9.0,Mid-range product,Regular busy,Average orders,10.0,new customer,6.367797,Low Spender,20.259259,Non-frequent customer,Female,Alabama,31,2/17/2019,3,married,40423,both,South,normal-activity
1,2398795.0,1.0,2.0,3.0,7.0,15.0,False,196.0,1.0,1.0,Soda,77.0,7.0,9.0,Mid-range product,Slowest days,Average orders,10.0,new customer,6.367797,Low Spender,20.259259,Non-frequent customer,Female,Alabama,31,2/17/2019,3,married,40423,both,South,normal-activity
2,473747.0,1.0,3.0,3.0,12.0,21.0,False,196.0,1.0,1.0,Soda,77.0,7.0,9.0,Mid-range product,Slowest days,Average orders,10.0,new customer,6.367797,Low Spender,20.259259,Non-frequent customer,Female,Alabama,31,2/17/2019,3,married,40423,both,South,normal-activity
3,2254736.0,1.0,4.0,4.0,7.0,29.0,False,196.0,1.0,1.0,Soda,77.0,7.0,9.0,Mid-range product,Slowest days,Average orders,10.0,new customer,6.367797,Low Spender,20.259259,Non-frequent customer,Female,Alabama,31,2/17/2019,3,married,40423,both,South,normal-activity
4,431534.0,1.0,5.0,4.0,15.0,28.0,False,196.0,1.0,1.0,Soda,77.0,7.0,9.0,Mid-range product,Slowest days,Average orders,10.0,new customer,6.367797,Low Spender,20.259259,Non-frequent customer,Female,Alabama,31,2/17/2019,3,married,40423,both,South,normal-activity


### Notebook running slowly; the analysis had to be split into separate notebooks

In [22]:
# Export the normal-activity sample (low-activity customers excluded)
# to the '02 Data/Prepared Data' folder as a pickle
ords_prods_all_norm_act.to_pickle(os.path.join(path,'02 Data','Prepared Data','ords_prods_norm_act.pkl'))

In [15]:
# Export the full ords_prods_all dataframe (not filtered by activity) so the
# analysis can continue in a separate notebook
# NOTE(review): this cell's execution count (15) is lower than that of the
# activity-flag cells (17-18), so the saved file may predate the activity_flag
# column — re-run this cell before relying on the export.
ords_prods_all.to_pickle(os.path.join(path, '02 Data','Prepared Data','ords_prods_all.pkl'))