# Coding Etiquette and Excel Reporting

### List of Contents
#### 1. Data Load 
#### 2. Data Consistency
#### 3. Addressing Security Concerns
#### 4. Region Segmentation and Customer Behaviour
#### 5. Low-activity Customers

In [1]:
# import libraries
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import seaborn as sns
import scipy

#### 1. Data Load

In [2]:
# project root folder; set the INSTACART_PROJECT_DIR environment variable to run
# the notebook on another machine, otherwise the original local path is used
path = os.environ.get(
    'INSTACART_PROJECT_DIR',
    r'C:\Users\chris\OneDrive\Documents\11-23-2023 Instacart Basket Analysis',
)

In [3]:
# load the prepared Instacart dataframe from the project's 'Prepared Data' folder
instacart_pickle = os.path.join(path, '02 Data', 'Prepared Data', 'instacart_data.pkl')
df_instacart = pd.read_pickle(instacart_pickle)

#### 2. Data Consistency

In [4]:
# preview the first ten rows of the loaded dataframe
df_instacart.head(n=10)

Unnamed: 0,order_id,user_id,order_number,orders_day_of_week,order_hour_of_day,days_since_last_order,product_id,add_to_cart_order,reordered,product_name,...,order_frequency_flag,first_name,surname,gender,state,age,date_joined,number_of_dependants,family_status,income
0,2539329,1,1,2,8,0.0,196,1,0,Soda,...,Regular customer,Linda,Nguyen,Female,Alabama,31,2/17/2019,3,married,40423
1,2398795,1,2,3,7,15.0,196,1,1,Soda,...,Regular customer,Linda,Nguyen,Female,Alabama,31,2/17/2019,3,married,40423
2,473747,1,3,3,12,21.0,196,1,1,Soda,...,Regular customer,Linda,Nguyen,Female,Alabama,31,2/17/2019,3,married,40423
3,2254736,1,4,4,7,29.0,196,1,1,Soda,...,Regular customer,Linda,Nguyen,Female,Alabama,31,2/17/2019,3,married,40423
4,431534,1,5,4,15,28.0,196,1,1,Soda,...,Regular customer,Linda,Nguyen,Female,Alabama,31,2/17/2019,3,married,40423
5,3367565,1,6,2,7,19.0,196,1,1,Soda,...,Regular customer,Linda,Nguyen,Female,Alabama,31,2/17/2019,3,married,40423
6,550135,1,7,1,9,20.0,196,1,1,Soda,...,Regular customer,Linda,Nguyen,Female,Alabama,31,2/17/2019,3,married,40423
7,3108588,1,8,1,14,14.0,196,2,1,Soda,...,Regular customer,Linda,Nguyen,Female,Alabama,31,2/17/2019,3,married,40423
8,2295261,1,9,1,16,0.0,196,4,1,Soda,...,Regular customer,Linda,Nguyen,Female,Alabama,31,2/17/2019,3,married,40423
9,2550362,1,10,4,8,30.0,196,1,1,Soda,...,Regular customer,Linda,Nguyen,Female,Alabama,31,2/17/2019,3,married,40423


In [5]:
# list the column labels of the dataframe so the names used in later cells can be verified
df_instacart.columns

Index(['order_id', 'user_id', 'order_number', 'orders_day_of_week',
       'order_hour_of_day', 'days_since_last_order', 'product_id',
       'add_to_cart_order', 'reordered', 'product_name', 'aisle_id',
       'department_id', 'prices', 'prices_range_loc', 'busiest_days',
       'busiest_period_of_day', 'max_order', 'loyalty_flag', 'average_price',
       'spending_flag', 'median_orders', 'order_frequency_flag', 'first_name',
       'surname', 'gender', 'state', 'age', 'date_joined',
       'number_of_dependants', 'family_status', 'income'],
      dtype='object')

In [6]:
# check data shape: (number of rows, number of columns)
df_instacart.shape

(32404859, 31)

#### 3. Addressing Security Concerns

In [7]:
# drop the personally identifiable columns (first name and surname).
# errors='ignore' keeps this cell re-runnable: the last cell of the notebook
# writes the dataframe back to 'instacart_data.pkl' (the file loaded at the
# top), so on a second run these columns are already gone.
df_instacart = df_instacart.drop(columns=['first_name', 'surname'], errors='ignore')

The first name and surname columns were dropped from the dataframe because they contain personally identifiable information (PII).

#### 4. Region Segmentation and Customer Behaviour

In [8]:
# count rows per state, keeping missing values as their own category
state_counts = df_instacart['state'].value_counts(dropna=False)
state_counts

state
Pennsylvania            667082
California              659783
Rhode Island            656913
Georgia                 656389
New Mexico              654494
Arizona                 653964
North Carolina          651900
Oklahoma                651739
Alaska                  648495
Minnesota               647825
Massachusetts           646358
Wyoming                 644255
Virginia                641421
Missouri                640732
Texas                   640394
Colorado                639280
Maine                   638583
North Dakota            638491
Alabama                 638003
Kansas                  637538
Louisiana               637482
Delaware                637024
South Carolina          636754
Oregon                  636425
Arkansas                636144
Nevada                  636139
New York                635983
Montana                 635265
South Dakota            633772
Illinois                633024
Hawaii                  632901
Washington              632852
Mi

In [10]:
# create list for Northeast region (each state listed exactly once)
Northeast = [
    'Maine',
    'New Hampshire',
    'Vermont',
    'Massachusetts',
    'Rhode Island',
    'Connecticut',
    'New York',
    'Pennsylvania',
    'New Jersey',
]

In [11]:
# create list for Midwest region
Midwest = [
    'Wisconsin',
    'Michigan',
    'Illinois',
    'Indiana',
    'Ohio',
    'North Dakota',
    'South Dakota',
    'Nebraska',
    'Kansas',
    'Minnesota',
    'Iowa',
    'Missouri',
]

In [12]:
# create list for South region
South = [
    'Delaware',
    'Maryland',
    'District of Columbia',
    'Virginia',
    'West Virginia',
    'North Carolina',
    'South Carolina',
    'Georgia',
    'Florida',
    'Kentucky',
    'Tennessee',
    'Mississippi',
    'Alabama',
    'Oklahoma',
    'Texas',
    'Arkansas',
    'Louisiana',
]

In [13]:
# create list for West region
West = [
    'Idaho',
    'Montana',
    'Wyoming',
    'Nevada',
    'Utah',
    'Colorado',
    'Arizona',
    'New Mexico',
    'Alaska',
    'Washington',
    'Oregon',
    'California',
    'Hawaii',
]

In [14]:
# set 'region_flag' to 'Northeast' for rows whose state is in the Northeast list
in_northeast = df_instacart['state'].isin(Northeast)
df_instacart.loc[in_northeast, 'region_flag'] = 'Northeast'

In [15]:
# set 'region_flag' to 'Midwest' for rows whose state is in the Midwest list
in_midwest = df_instacart['state'].isin(Midwest)
df_instacart.loc[in_midwest, 'region_flag'] = 'Midwest'

In [16]:
# set 'region_flag' to 'South' for rows whose state is in the South list
in_south = df_instacart['state'].isin(South)
df_instacart.loc[in_south, 'region_flag'] = 'South'

In [17]:
# set 'region_flag' to 'West' for rows whose state is in the West list
in_west = df_instacart['state'].isin(West)
df_instacart.loc[in_west, 'region_flag'] = 'West'

In [18]:
# tally rows per region; dropna=False would surface any rows left without a region
region_counts = df_instacart['region_flag'].value_counts(dropna=False)
region_counts

region_flag
South        10791885
West          8292913
Midwest       7597325
Northeast     5722736
Name: count, dtype: int64

The segmentation results show that the South accounts for the most order records, followed by the West and then the Midwest. The Northeast has the fewest order records.

In [19]:
# cross-tabulate region (rows) against spending flag (columns)
region_spender_crosstab = pd.crosstab(
    index=df_instacart['region_flag'],
    columns=df_instacart['spending_flag'],
    dropna=False,
)

In [20]:
# display the region-by-spending-flag crosstab built in the previous cell
region_spender_crosstab

spending_flag,High spender,Low spender
region_flag,Unnamed: 1_level_1,Unnamed: 2_level_1
Midwest,155975,7441350
Northeast,108225,5614511
South,209691,10582194
West,160354,8132559


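There isn't much difference in spending behaviour between regions of the United States: high spenders account for roughly 2% of records in every region (from about 1.9% in the Northeast to about 2.1% in the Midwest).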

#### 5. Low-activity Customers

In [21]:
# (re)compute 'max_order': the highest order_number recorded for each user_id
# (the column is already present in the loaded data and is overwritten here).
# The string 'max' selects pandas' built-in grouped max; passing np.max to
# transform is deprecated and raises a FutureWarning in recent pandas releases.
df_instacart['max_order'] = df_instacart.groupby(['user_id'])['order_number'].transform('max')

In [22]:
# users whose highest order number is below 5 are labelled 'Low activity'
under_five_orders = df_instacart['max_order'].lt(5)
df_instacart.loc[under_five_orders, 'user_activity'] = 'Low activity'

In [23]:
# users whose highest order number is 5 or more are labelled 'active'
five_or_more_orders = df_instacart['max_order'].ge(5)
df_instacart.loc[five_or_more_orders, 'user_activity'] = 'active'

In [24]:
# tally rows per activity label; dropna=False would surface any unlabelled rows
activity_counts = df_instacart['user_activity'].value_counts(dropna=False)
activity_counts

user_activity
active          30964564
Low activity     1440295
Name: count, dtype: int64

In [25]:
# create a subset dataframe holding only the rows flagged 'active' in 'user_activity'
is_active = df_instacart['user_activity'].eq('active')
df_active_users = df_instacart[is_active]

In [26]:
# verify shape of the active-users subset: (number of rows, number of columns)
df_active_users.shape

(30964564, 31)

In [27]:
# save the active-users subset as a pickle in the project's 'Prepared Data' folder
active_users_pickle = os.path.join(path, '02 Data', 'Prepared Data', 'instacart_active_users.pkl')
df_active_users.to_pickle(active_users_pickle)

In [28]:
# save the full dataframe (name columns dropped, region and activity flags added);
# note: this writes to 'instacart_data.pkl', the same file loaded at the top of the notebook
full_data_pickle = os.path.join(path, '02 Data', 'Prepared Data', 'instacart_data.pkl')
df_instacart.to_pickle(full_data_pickle)