In [1]:
#Table of Contents for 4.10 Changing Types Column Derivations, Analysis
##Importing Data
##Changing Datatypes and dropping columns
##Deriving new columns and reviewing distributions
##Changing datatypes of newly created columns for faster run times

In [2]:
#Importing data
##01 Importing libraries
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import seaborn as sns
import scipy

In [3]:
##02 Defining "path" as a shortcut to the main Instacart folder on my desktop
# NOTE(review): absolute, user-specific location — the notebook only runs unchanged on a machine that has this exact folder.
path = r'/Users/emmawilcox/Desktop/InstacartGroceryBasketAnalysis'

In [4]:
##03 Importing nolow_activity.pkl from the project's prepared-data folder as a dataframe, named "df_nolow_activity"
# The location is built from "path" (same approach as the export step) so the project folder is spelled out in one place only.
# NOTE(review): the folder is written 'prepared data' here and 'Prepared Data' in the export step — the same folder on a
# case-insensitive filesystem, two different folders on a case-sensitive one; confirm the real name.
df_nolow_activity = pd.read_pickle(os.path.join(path, 'Data', 'prepared data', 'nolow_activity.pkl'))

In [5]:
#Changing datatypes for faster run times and dropping 3 columns
##04 Changing mean_order to a float16
df_nolow_activity['mean_order']=df_nolow_activity['mean_order'].astype('float16')

In [6]:
##05 Changing number_of_dependants to a float16
df_nolow_activity['number_of_dependants']=df_nolow_activity['number_of_dependants'].astype('float16')



In [7]:
##06 Changing Activity_level to a float16, changing product_name to a category
df_nolow_activity['Activity_Level']=df_nolow_activity['Activity_Level'].astype('int16')
df_nolow_activity['product_name']=df_nolow_activity['product_name'].astype('category')


In [8]:
##07 Changing loyalty_flag and spending_level to a category
df_nolow_activity['loyalty_flag']=df_nolow_activity['loyalty_flag'].astype('category')
df_nolow_activity['spending_level']=df_nolow_activity['spending_level'].astype('category')

In [9]:
##08 Changing frequency_of_customer and First Name to a category
df_nolow_activity['frequency_of_customer']=df_nolow_activity['frequency_of_customer'].astype('category')
df_nolow_activity['First Name']=df_nolow_activity['First Name'].astype('category')

In [10]:
##09 Changing Gender and STATE to a category
df_nolow_activity['Gender']=df_nolow_activity['Gender'].astype('category')
df_nolow_activity['STATE']=df_nolow_activity['STATE'].astype('category')

In [11]:
##10 Changing date_joined and family_status to a category
df_nolow_activity['date_joined']=df_nolow_activity['date_joined'].astype('category')
df_nolow_activity['family_status']=df_nolow_activity['family_status'].astype('category')

In [12]:
##11 Changing Region to a category
df_nolow_activity['Region']=df_nolow_activity['Region'].astype('category')

In [13]:
##12 Using .drop to remove 4 unwanted columns
# The cell is not re-runnable on its own: once the columns are gone, a second .drop raises a KeyError.
unwanted_columns = ['_merge', 'Check', 'New Check', 'Surname']
df_nolow_activity = df_nolow_activity.drop(columns=unwanted_columns)

In [14]:
#Deriving new columns and reviewing distributions
##13 Using .loc instead of a user defined function, to sort customer incomes into four ranges within column, "Income Range" part 1
df_nolow_activity.loc[df_nolow_activity['income'] <= 40000, 'Income Range'] = 'Lowest Income'

In [15]:
##14 Using .loc instead of a user defined function, to sort customer incomes into four ranges, part 2
df_nolow_activity.loc[(df_nolow_activity['income'] > 40000) & (df_nolow_activity['income'] < 70000), 'Income Range'] = 'Low Income'

In [16]:
##15 Using .loc instead of a user defined function, to sort customer incomes into four ranges, part 3
df_nolow_activity.loc[(df_nolow_activity['income'] > 70000) & (df_nolow_activity['income'] < 100000), 'Income Range'] = 'Mid Income'

In [17]:
##16 Using .loc instead of a user defined function, to sort customer incomes into four ranges, part 4
df_nolow_activity.loc[df_nolow_activity['income'] >= 100000, 'Income Range'] = 'High Income'

In [18]:
##17 Using .valuecounts to review range of Income Range, High Income is the majority
df_nolow_activity['Income Range'].value_counts(dropna = False)

High Income      13108122
Mid Income        7598874
Low Income        6342876
Lowest Income     1467194
NaN                   182
Name: Income Range, dtype: int64

In [19]:
##18 Grouping Age into 3 Lifestages, part 1
result1 = []

for value in df_nolow_activity['Age']:                              
  if value in range(0, 25):
    result1.append("Youth Shopper")
  elif value in range(26, 54):
    result1.append("Mid Age Shopper")
  else:
    result1.append("Senior Shopper")

In [24]:
##18 Grouping Age into 3 Lifestages, part 2
df_nolow_activity['Lifestage'] = result1

In [25]:
##19 Using .valuecounts to review range of Lifestage, customers ages 55+ and 25-54 are nearly the same, with a minority of customers under age 25 
df_nolow_activity['Lifestage'].value_counts(dropna = False)

Senior Shopper     12861555
Mid Age Shopper    12505254
Youth Shopper       3150439
Name: Lifestage, dtype: int64

In [26]:
##20 Using .value_counts to review the existing column, number_of_dependants; the four values occur almost equally often
dependants_counts = df_nolow_activity['number_of_dependants'].value_counts(dropna=False)
dependants_counts

3.0    7151063
2.0    7126908
0.0    7122184
1.0    7117093
Name: number_of_dependants, dtype: int64

In [27]:
##21 Using .loc instead of a user defined function, to sort customers by whether there are any children in household, part 1
df_nolow_activity.loc[df_nolow_activity['number_of_dependants'] <= 0, 'Status_Kids'] = 'No'

In [28]:
##22 Using .loc instead of a user defined function, to sort customers by whether there are any children in household, part 2
df_nolow_activity.loc[df_nolow_activity['number_of_dependants'] >= 1, 'Status_Kids'] = 'Yes'

In [29]:
##23 Using .valuecounts to review range of Status_Kids, majority of customers have children at home
df_nolow_activity['Status_Kids'].value_counts(dropna = False)

Yes    21395064
No      7122184
Name: Status_Kids, dtype: int64

In [30]:
##24 Using .loc instead of a user defined function, to sort customer households by how many children are present, to create an approximate household size estimate, part 1
df_nolow_activity.loc[(df_nolow_activity['number_of_dependants'] >= 0) & (df_nolow_activity['number_of_dependants'] < 1), 'Size of Family'] = 'Smaller Family'

In [31]:
##25 Using .loc instead of a user defined function, to sort customer households by how many children are present, to create an approximate household size estimate, part 2
df_nolow_activity.loc[(df_nolow_activity['number_of_dependants'] >= 1) & (df_nolow_activity['number_of_dependants'] <= 2), 'Size of Family'] = 'Midsize Family'

In [32]:
##26 Using .loc instead of a user defined function, to sort customer households by how many children are present, to create an approximate household size estimate, part 3
df_nolow_activity.loc[(df_nolow_activity['number_of_dependants'] >= 3), 'Size of Family'] = 'Larger Family'

In [33]:
##27 Using .valuecounts to review range of Size of Family, 50% of customers have 1-2 children at home
df_nolow_activity['Size of Family'].value_counts(dropna = False)

Midsize Family    14244001
Larger Family      7151063
Smaller Family     7122184
Name: Size of Family, dtype: int64

In [34]:
##28 Using .loc instead of a user defined function, to derive a new column, "Household_Kind_Budget" with 6 categories, sorting customers by income level and whether children are home, part 1
df_nolow_activity.loc[(df_nolow_activity['Status_Kids'] == 'No') & (df_nolow_activity['Income Range'] == 'Lowest Income'), 'Household_Kind_Budget'] = 'Low_No_Kids'

In [35]:
##29 Using .loc instead of a user defined function, to derive a new column, "Household_Kind_Budget" with 6 categories, sorting customers by income level and whether children are home, part 2
df_nolow_activity.loc[(df_nolow_activity['Status_Kids'] == 'No') & (df_nolow_activity['Income Range'] == 'Low Income'), 'Household_Kind_Budget'] = 'Low_No_Kids'

In [36]:
##30 Using .loc instead of a user defined function, to derive a new column, "Household_Kind_Budget" with 6 categories, sorting customers by income level and whether children are home, part 3
df_nolow_activity.loc[(df_nolow_activity['Status_Kids'] == 'No') & (df_nolow_activity['Income Range'] == 'Mid Income'), 'Household_Kind_Budget'] = 'Mid_No_Kids'

In [37]:
##31 Using .loc instead of a user defined function, to derive a new column, "Household_Kind_Budget" with 6 categories, sorting customers by income level and whether children are home, part 4
df_nolow_activity.loc[(df_nolow_activity['Status_Kids'] == 'No') & (df_nolow_activity['Income Range'] == 'High Income'), 'Household_Kind_Budget'] = 'High_No_Kids'

In [38]:
##32 Using .loc instead of a user defined function, to derive a new column, "Household_Kind_Budget" with 6 categories, sorting customers by income level and whether children are home, part 5
df_nolow_activity.loc[(df_nolow_activity['Status_Kids'] == 'Yes') & (df_nolow_activity['Income Range'] == 'Lowest Income'), 'Household_Kind_Budget'] = 'Low_With_Kids'

In [39]:
##33 Using .loc instead of a user defined function, to derive a new column, "Household_Kind_Budget" with 6 categories, sorting customers by income level and whether children are home, part 6
df_nolow_activity.loc[(df_nolow_activity['Status_Kids'] == 'Yes') & (df_nolow_activity['Income Range'] == 'Low Income'), 'Household_Kind_Budget'] = 'Low_With_Kids'

In [40]:
##34 Using .loc instead of a user defined function, to derive a new column, "Household_Kind_Budget" with 6 categories, sorting customers by income level and whether children are home, part 7
df_nolow_activity.loc[(df_nolow_activity['Status_Kids'] == 'Yes') & (df_nolow_activity['Income Range'] == 'Mid Income'), 'Household_Kind_Budget'] = 'Mid_With_Kids'

In [41]:
##35 Using .loc instead of a user defined function, to derive a new column, "Household_Kind_Budget" with 6 categories, sorting customers by income level and whether children are home, part 8
df_nolow_activity.loc[(df_nolow_activity['Status_Kids'] == 'Yes') & (df_nolow_activity['Income Range'] == 'High Income'), 'Household_Kind_Budget'] = 'High_With_Kids'

In [51]:
##36 Using .valuecounts to review range of Household_Kind_Budget, High income households with children are the majority
df_nolow_activity['Household_Kind_Budget'].value_counts(dropna = False)

High_With_Kids    9814792
Low_With_Kids     5858248
Mid_With_Kids     5721985
High_No_Kids      3293330
Low_No_Kids       1951822
Mid_No_Kids       1876889
NaN                   182
Name: Household_Kind_Budget, dtype: int64

In [52]:
##37 Using .loc instead of a user defined function, to derive a new column, "Household_Status_Budget" with 12 categories, sorting customers by income level and marital status/household type, part 1
df_nolow_activity.loc[(df_nolow_activity['family_status'] == 'single') & (df_nolow_activity['Income Range'] == 'Low Income'), 'Household_Status_Budget'] = 'Low_Single'

In [53]:
##38 Using .loc instead of a user defined function, to derive a new column, "Household_Status_Budget" with 12 categories, sorting customers by income level and marital status/household type, part 2
df_nolow_activity.loc[(df_nolow_activity['family_status'] == 'single') & (df_nolow_activity['Income Range'] == 'Lowest Income'), 'Household_Status_Budget'] = 'Low_Single'

In [54]:
##39 Using .loc instead of a user defined function, to derive a new column, "Household_Status_Budget" with 12 categories, sorting customers by income level and marital status/household type, part 3
df_nolow_activity.loc[(df_nolow_activity['family_status'] == 'single') & (df_nolow_activity['Income Range'] == 'Mid Income'), 'Household_Status_Budget'] = 'Mid_Single'

In [55]:
##40 Using .loc instead of a user defined function, to derive a new column, "Household_Status_Budget" with 12 categories, sorting customers by income level and marital status/household type, part 3
df_nolow_activity.loc[(df_nolow_activity['family_status'] == 'single') & (df_nolow_activity['Income Range'] == 'High Income'), 'Household_Status_Budget'] = 'High_Single'

In [56]:
##41 Using .loc instead of a user defined function, to derive a new column, "Household_Status_Budget" with 12 categories, sorting customers by income level and marital status/household type, part 4
df_nolow_activity.loc[(df_nolow_activity['family_status'] == 'married') & (df_nolow_activity['Income Range'] == 'Low Income'), 'Household_Status_Budget'] = 'Low_Married'

In [47]:
##42 Using .loc instead of a user defined function, to derive a new column, "Household_Status_Budget" with 12 categories, sorting customers by income level and marital status/household type, part 5
df_nolow_activity.loc[(df_nolow_activity['family_status'] == 'married') & (df_nolow_activity['Income Range'] == 'Lowest Income'), 'Household_Status_Budget'] = 'Low_Married'

In [57]:
##43 Using .loc instead of a user defined function, to derive a new column, "Household_Status_Budget" with 12 categories, sorting customers by income level and marital status/household type, part 6 
df_nolow_activity.loc[(df_nolow_activity['family_status'] == 'married') & (df_nolow_activity['Income Range'] == 'Mid Income'), 'Household_Status_Budget'] = 'Mid_Married'

In [58]:
##44 Using .loc instead of a user defined function, to derive a new column, "Household_Status_Budget" with 12 categories, sorting customers by income level and marital status/household type, part 7
df_nolow_activity.loc[(df_nolow_activity['family_status'] == 'married') & (df_nolow_activity['Income Range'] == 'High Income'), 'Household_Status_Budget'] = 'High_Married'

In [59]:
##45 Using .loc instead of a user defined function, to derive a new column, "Household_Status_Budget" with 12 categories, sorting customers by income level and marital status/household type, part 8
df_nolow_activity.loc[(df_nolow_activity['family_status'] == 'divorced/widowed') & (df_nolow_activity['Income Range'] == 'Low Income'), 'Household_Status_Budget'] = 'Low_DivWid'

In [60]:
##46 Using .loc instead of a user defined function, to derive a new column, "Household_Status_Budget" with 12 categories, sorting customers by income level and marital status/household type, part 9
df_nolow_activity.loc[(df_nolow_activity['family_status'] == 'divorced/widowed') & (df_nolow_activity['Income Range'] == 'Lowest Income'), 'Household_Status_Budget'] = 'Low_DivWid'

In [61]:
##47 Using .loc instead of a user defined function, to derive a new column, "Household_Status_Budget" with 12 categories, sorting customers by income level and marital status/household type, part 10 
df_nolow_activity.loc[(df_nolow_activity['family_status'] == 'divorced/widowed') & (df_nolow_activity['Income Range'] == 'Mid Income'), 'Household_Status_Budget'] = 'Mid_DivWid'

In [62]:
##48 Using .loc instead of a user defined function, to derive a new column, "Household_Status_Budget" with 12 categories, sorting customers by income level and marital status/household type, part 11
df_nolow_activity.loc[(df_nolow_activity['family_status'] == 'divorced/widowed') & (df_nolow_activity['Income Range'] == 'High Income'), 'Household_Status_Budget'] = 'High_DivWid'

In [63]:
##49 Using .loc instead of a user defined function, to derive a new column, "Household_Status_Budget" with 12 categories, sorting customers by income level and marital status/household type, part 12
df_nolow_activity.loc[(df_nolow_activity['family_status'] == 'living with parents and siblings') & (df_nolow_activity['Income Range'] == 'Low Income'), 'Household_Status_Budget'] = 'Low_Other'

In [64]:
##50 Using .loc instead of a user defined function, to derive a new column, "Household_Status_Budget" with 12 categories, sorting customers by income level and marital status/household type, part 13
df_nolow_activity.loc[(df_nolow_activity['family_status'] == 'living with parents and siblings') & (df_nolow_activity['Income Range'] == 'Lowest Income'), 'Household_Status_Budget'] = 'Low_Other'

In [65]:
##51 Using .loc instead of a user defined function, to derive a new column, "Household_Status_Budget" with 12 categories, sorting customers by income level and marital status/household type, part 14
df_nolow_activity.loc[(df_nolow_activity['family_status'] == 'living with parents and siblings') & (df_nolow_activity['Income Range'] == 'Mid Income'), 'Household_Status_Budget'] = 'Mid_Other'

In [66]:
##52 Using .loc instead of a user defined function, to derive a new column, "Household_Status_Budget" with 12 categories, sorting customers by income level and marital status/household type, part 15
df_nolow_activity.loc[(df_nolow_activity['family_status'] == 'living with parents and siblings') & (df_nolow_activity['Income Range'] == 'High Income'), 'Household_Status_Budget'] = 'High_Other'

In [67]:
##53 Using .valuecounts to review range of Household_Status_Budget, Married people represent the majority of the customer base
df_nolow_activity['Household_Status_Budget'].value_counts(dropna = False)

High_Married    9770536
Low_Married     5176743
Mid_Married     5077354
High_DivWid     1711569
Low_Single      1627586
High_Single     1581761
Mid_Single      1465153
Low_Other        681505
Mid_Other        644631
Mid_DivWid       411736
Low_DivWid       324236
High_Other        44256
NaN                 182
Name: Household_Status_Budget, dtype: int64

In [54]:
##54 Using .loc instead of a user defined function, to sort customers based on whether they've ordered pet products, part 1
df_nolow_activity.loc[df_nolow_activity['department_id'] == 8, 'Pet_Supply_Buyer'] = 'Yes'

In [55]:
##55 Using .loc instead of a user defined function, to sort customers based on whether they've ordered pet products, part 2
df_nolow_activity.loc[df_nolow_activity['department_id'] != 8, 'Pet_Supply_Buyer'] = 'No'

In [56]:
##56 Using .valuecounts to review distribution of customers who have bought pet products
df_nolow_activity['Pet_Supply_Buyer'].value_counts(dropna = False)

No     28431992
Yes       85256
Name: Pet_Owner, dtype: int64

In [69]:
##55 Using .loc instead of a user defined function, to sort customers based on whether they've bought items from the Meat/Seafood and Dairy/Eggs depts, part 1
df_nolow_activity.loc[df_nolow_activity['department_id'].isin([12, 16]), 'Possible Vegan'] = 'No'

In [70]:
##56 Using .loc instead of a user defined function, to sort customers based on whether they've bought items from the Meat/Seafood and Dairy/Eggs depts, part 2
df_nolow_activity.loc[~df_nolow_activity['department_id'].isin([12, 16]), 'Possible Vegan'] = 'Yes'

In [73]:
##57 Using .valuecounts to review distribution of customers who have bought meat, seafood, dairy or eggs, 81% of th customer base is not ordering from these depts, so a subset of that large segment is where vegans would be found
df_nolow_activity['Possible Vegan'].value_counts(dropna = False)

Yes    23125434
No      5391814
Name: Possible Vegan, dtype: int64

In [74]:
##58 Using .loc instead of a user defined function, to sort customers based on whether they've bought items from the Meat/Seafood dept, part 1
df_nolow_activity.loc[df_nolow_activity['department_id'] == 12, 'Meat Buyer'] = 'Yes'

In [75]:
##59 Using .loc instead of a user defined function, to sort customers based on whether they've bought items from the Meat/Seafood dept, part 2
df_nolow_activity.loc[df_nolow_activity['department_id'] != 12, 'Meat Buyer'] = 'No'

In [76]:
##60 Using .valuecounts to review distribution of customers who have bought meat or seafood, only 2% of customer base is using instacart for meat/seafood buying
df_nolow_activity['Meat Buyer'].value_counts(dropna = False)

No     27896256
Yes      620992
Name: Meat Buyer, dtype: int64

In [78]:
##61 Using .loc instead of a user defined function, to sort customers based on whether they've bought baby items, part 1
df_nolow_activity.loc[df_nolow_activity['department_id'] == 18, 'Baby Item Buyer'] = 'Yes'

In [79]:
##62 Using .loc instead of a user defined function, to sort customers based on whether they've bought baby items, part 2
df_nolow_activity.loc[df_nolow_activity['department_id'] != 18, 'Baby Item Buyer'] = 'No'

In [80]:
##63 Using .valuecounts to review distribution of customers who have bought baby items, 75% of customer base is households with children, but very few customers are using instacart for for baby needs they may have
df_nolow_activity['Baby Item Buyer'].value_counts(dropna = False)

No     28136418
Yes      380830
Name: Baby Item Buyer, dtype: int64

In [66]:
##64 Using .groupby and .agg to get the mean of max_order by region (no department breakdown), the Northeast has the highest mean
rows_by_region = df_nolow_activity.groupby('Region')
rows_by_region.agg({'max_order': ['mean']})

Unnamed: 0_level_0,max_order
Unnamed: 0_level_1,mean
Region,Unnamed: 1_level_2
Midwest Region,34.791129
Northeast Region,35.42675
South Region,35.101279
West Region,35.282401


In [87]:
##64 Using .groupby and .agg to get basic statistics (mean, min, max) of order_number by region, the Midwest has the lowest mean
rows_by_region = df_nolow_activity.groupby('Region')
rows_by_region.agg({'order_number': ['mean', 'min', 'max']})

Unnamed: 0_level_0,order_number,order_number,order_number
Unnamed: 0_level_1,mean,min,max
Region,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
Midwest Region,18.038208,1,99
Northeast Region,18.350079,1,99
South Region,18.149098,1,99
West Region,18.271768,1,99


In [88]:
##65 Using .groupby and .mean to get the average price of purchased items by region, the West has the highest average, by a very small margin
prices_by_region = df_nolow_activity.groupby('Region')['prices']
prices_by_region.mean()

Region
Midwest Region      7.793742
Northeast Region    7.782205
South Region        7.789014
West Region         7.794364
Name: prices, dtype: float64

In [67]:
##66 Using .groupby and .mean to get the average price of items purchased by each customer group, in terms of income and whether children are present. Notably, mid-income families with children pay the highest average product prices
prices_by_kind_budget = df_nolow_activity.groupby('Household_Kind_Budget')['prices']
prices_by_kind_budget.mean()

Household_Kind_Budget
High_No_Kids      7.947858
High_With_Kids    7.951211
Low_No_Kids       7.363180
Low_With_Kids     7.364253
Mid_No_Kids       7.952039
Mid_With_Kids     7.955760
Name: prices, dtype: float64

In [89]:
##67 Using .groupby and .mean to get the average price of items purchased by each customer group, in terms of income and marital status. Notably, mid-income divorced or widowed households pay the highest average product prices
prices_by_status_budget = df_nolow_activity.groupby('Household_Status_Budget')['prices']
prices_by_status_budget.mean()

Household_Status_Budget
High_DivWid     7.950298
High_Married    7.951309
High_Other      7.929603
High_Single     7.945217
Low_DivWid      6.788384
Low_Married     7.326697
Low_Other       7.649533
Low_Single      7.477686
Mid_DivWid      7.978580
Mid_Married     7.955010
Mid_Other       7.961663
Mid_Single      7.944580
Name: prices, dtype: float64

In [93]:
##68 Grouping customers based on when they order, part 1. Night people = those who order between 10PM and 4AM, morning people = those who order between 5Am and 8Am, and regular people = those who order between 9AM and 9PM
result2 = []

for value in df_nolow_activity['time_order_placed']:                              
  if value in (0, 1, 2, 3, 4, 22, 23):
    result2.append("Night People")
  elif value in (5, 6, 7, 8):
    result2.append("Morning People")
  else:
    result2.append("Regular")

In [94]:
##69 Grouping customers based on when they order, part 2
df_nolow_activity['Time Ordering'] = result2

In [95]:
##70 Reviewing distribution of this demographic, the majority of orders are placed during regular hours
df_nolow_activity['Time Ordering'].value_counts(dropna = False)

Regular           24489823
Morning People     2674751
Night People       1352674
Name: Time Ordering, dtype: int64

In [96]:
##71 Using .groupby and .agg to get the mean of max_order for each ordering time, Morning People have the highest mean
rows_by_time = df_nolow_activity.groupby('Time Ordering')
rows_by_time.agg({'max_order': ['mean']})

Unnamed: 0_level_0,max_order
Unnamed: 0_level_1,mean
Time Ordering,Unnamed: 1_level_2
Morning People,38.623792
Night People,34.35449
Regular,34.818667


In [97]:
##72 Using .groupby and .mean to get the average price of items purchased at different times, Night People have the highest average price
prices_by_time = df_nolow_activity.groupby('Time Ordering')['prices']
prices_by_time.mean()

Time Ordering
Morning People    7.804928
Night People      7.811884
Regular           7.788284
Name: prices, dtype: float64

In [74]:
##73 Using .loc instead of a user defined function, to sort customers based on whether they've bought alcohol, part 1
df_nolow_activity.loc[df_nolow_activity['department_id'] == 5, 'Alcohol Buyer'] = 'Yes'

In [75]:
##74 Using .loc instead of a user defined function, to sort customers based on whether they've bought alcohol, part 2
df_nolow_activity.loc[df_nolow_activity['department_id'] != 5, 'Alcohol Buyer'] = 'No'

In [76]:
##75 Reviewing distribution of alcohol-buying customers, the majority of the customer base is not using instacart for this kind of product, and instacart only delivers alcohol in certain states
df_nolow_activity['Alcohol Buyer'].value_counts(dropna = False)

No     28384152
Yes      133096
Name: Alcohol Buyer, dtype: int64

In [103]:
##76 Creating a crosstab to review which depts are most popular with high spenders, part 1
crosstab_dept_level = pd.crosstab(df_nolow_activity['department_id'], df_nolow_activity['spending_level'], dropna = False)


In [104]:
##77 Creating a crosstab to review which depts are most popular with high spenders, part 2
crosstab_dept_level.to_clipboard()

In [78]:
#Changing datatypes of newly created columns for faster run times
##78 Reviewing columns and datatypes using .info, to see which columns are still stored as object before converting them to category
df_nolow_activity.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 28517248 entries, 0 to 32399731
Data columns (total 44 columns):
 #   Column                   Dtype   
---  ------                   -----   
 0   order_id                 int64   
 1   user_id                  int64   
 2   order_number             int64   
 3   orders_day_of_week       int8    
 4   time_order_placed        int8    
 5   days_since_prior_order   float16 
 6   product_id               int64   
 7   add_to_cart_order        int8    
 8   reordered                int8    
 9   product_name             category
 10  aisle_id                 float16 
 11  department_id            float16 
 12  prices                   float64 
 13  price_range_loc          object  
 14  busiest_days_column      object  
 15  busiest_period_of_day    object  
 16  max_order                int64   
 17  loyalty_flag             category
 18  mean_order               float16 
 19  spending_level           category
 20  median_order_time     

In [79]:
##79 Changing Income Range from object to category
df_nolow_activity['Income Range']=df_nolow_activity['Income Range'].astype('category')

In [80]:
##80 Changing Household_Status_Budget, price_range_loc, and busiest_days_column from objects to categories
df_nolow_activity['Household_Status_Budget']=df_nolow_activity['Household_Status_Budget'].astype('category')
df_nolow_activity['price_range_loc']=df_nolow_activity['price_range_loc'].astype('category')
df_nolow_activity['busiest_days_column']=df_nolow_activity['busiest_days_column'].astype('category')

In [81]:
##81 Changing Lifestage, Status_Kids, Size of Family, and Household_Kind_Budget from objects to categories
df_nolow_activity['Lifestage']=df_nolow_activity['Lifestage'].astype('category')
df_nolow_activity['Status_Kids']=df_nolow_activity['Status_Kids'].astype('category')
df_nolow_activity['Size of Family']=df_nolow_activity['Size of Family'].astype('category')
df_nolow_activity['Household_Kind_Budget']=df_nolow_activity['Household_Kind_Budget'].astype('category')

In [82]:
##82 Changing Pet_Owner, Possible Vegan, Meat Buyer, Baby Item Buyer, Time Ordering and Alcohol Buyer from objects to categories
df_nolow_activity['Pet_Supply_Buyer']=df_nolow_activity['Pet_Supply_Buyer'].astype('category')
df_nolow_activity['Possible Vegan']=df_nolow_activity['Possible Vegan'].astype('category')
df_nolow_activity['Meat Buyer']=df_nolow_activity['Meat Buyer'].astype('category')
df_nolow_activity['Baby Item Buyer']=df_nolow_activity['Baby Item Buyer'].astype('category')
df_nolow_activity['Time Ordering']=df_nolow_activity['Time Ordering'].astype('category')
df_nolow_activity['Alcohol Buyer']=df_nolow_activity['Alcohol Buyer'].astype('category')

In [83]:
##83 Exporting the dataframe, with all derived columns, as a pkl, "allcolumnsforvisuals"
export_path = os.path.join(path, 'Data', 'Prepared Data', 'allcolumnsforvisuals.pkl')
df_nolow_activity.to_pickle(export_path)