## Creating customer data profiles

In [1]:
# Importing libraries

import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import seaborn as sns
import scipy

In [2]:
# Showing floats with two decimal places in displayed output

pd.set_option('display.float_format', '{:.2f}'.format)

In [3]:
# Removing the column limit so every column is shown when a DataFrame is displayed
# (None means "no limit"; it is not the pandas default setting)

pd.set_option('display.max_columns', None)

In [4]:
# Defining the data access path
# NOTE(review): absolute, machine-specific Windows path - adjust before running on another machine

path = r'C:\Users\efens\cf_tasks\2023-07 Instacard Basket Analysis'

In [5]:
# Importing the latest project data without low-activity customers
# (loading a pickle can execute code, so only read files from a trusted source)

profiling_data = pd.read_pickle(os.path.join(path, '02 Data', '022 Prepared Data', 'filtered_merged_all.pkl'))

In [6]:
# Previewing the first five rows of the imported data
profiling_data.head()

Unnamed: 0,order_id,user_id,order_number,orders_day_of_week,order_hour_of_day,days_since_prior_order,product_id,add_to_cart_order,reordered,product_name,aisle_id,department_id,prices,price_range_loc,busiest_day,busiest_days,busiest_period_of_day,max_order,loyalty_flag,mean_price,spenders,median_days,frequency_flag,gender,state,age,date_joined,n_dependants,fam_status,income,_merge,region,activity_flag
0,2539329,1,1,2,8,7.0,196,1,0,Soda,77,7,9.0,Mid-range product,Regularly busy,Regularly busy,Average orders,10,New customer,6.37,Low spender,20.0,Regular customer,Female,Alabama,31,2/17/2019,3,married,40423,both,3_South,high-activity customer
1,2398795,1,2,3,7,15.0,196,1,1,Soda,77,7,9.0,Mid-range product,Regularly busy,Least busiest days,Average orders,10,New customer,6.37,Low spender,20.0,Regular customer,Female,Alabama,31,2/17/2019,3,married,40423,both,3_South,high-activity customer
2,473747,1,3,3,12,21.0,196,1,1,Soda,77,7,9.0,Mid-range product,Regularly busy,Least busiest days,Average orders,10,New customer,6.37,Low spender,20.0,Regular customer,Female,Alabama,31,2/17/2019,3,married,40423,both,3_South,high-activity customer
3,2254736,1,4,4,7,29.0,196,1,1,Soda,77,7,9.0,Mid-range product,Least busy,Least busiest days,Average orders,10,New customer,6.37,Low spender,20.0,Regular customer,Female,Alabama,31,2/17/2019,3,married,40423,both,3_South,high-activity customer
4,431534,1,5,4,15,28.0,196,1,1,Soda,77,7,9.0,Mid-range product,Least busy,Least busiest days,Most orders,10,New customer,6.37,Low spender,20.0,Regular customer,Female,Alabama,31,2/17/2019,3,married,40423,both,3_South,high-activity customer


## 01. Exploring data

In [7]:
# Counting rows per value of the 'gender' column
# (these are rows of the merged data, not unique customers - the same user_id appears on many rows)

profiling_data['gender'].value_counts()

Male      15182305
Female    14989119
Name: gender, dtype: int64

In [8]:
# Counting rows per value of the 'fam_status' column (rows of the merged data, not unique customers)

profiling_data['fam_status'].value_counts()

married                             21188130
single                               4961333
divorced/widowed                     2577880
living with parents and siblings     1444081
Name: fam_status, dtype: int64

In [9]:
# Counting rows per value of the 'age' column
# (the output is ordered by frequency, most common age first, not by age)

profiling_data['age'].value_counts()

49    495801
79    495312
31    493449
48    493190
64    491688
       ...  
60    450282
36    449409
66    442594
41    439269
25    436892
Name: age, Length: 64, dtype: int64

In [10]:
# Summary statistics for the 'age' column (count, mean, std, min, quartiles, max)

profiling_data['age'].describe()

count   30171424.00
mean          49.47
std           18.49
min           18.00
25%           33.00
50%           49.00
75%           65.00
max           81.00
Name: age, dtype: float64

In [11]:
# Summary statistics for the 'income' column (count, mean, std, min, quartiles, max)

profiling_data['income'].describe()

count   30171424.00
mean       99770.20
std        43183.42
min        25903.00
25%        67424.00
50%        96823.00
75%       128150.00
max       593901.00
Name: income, dtype: float64

#### Creating the variable `age_group`

In [12]:
# Defining Age_Groups

def age_groups(row):
    """Return the age-bracket label for one customer row.

    Parameters
    ----------
    row : mapping or pandas.Series
        Must provide an 'age' entry (looked up as ``row['age']``).

    Returns
    -------
    str
        One of '18-24', '25-40', '41-54', '55-64', '65+', or
        'Not enough data' when the age is NaN or below 18.
    """
    age = row['age']

    # Half-open brackets [lower, upper): neighbouring brackets share a boundary,
    # so a non-integer age such as 24.5 cannot fall into a gap between them.
    # For whole-number ages the labels are the same as with closed ranges.
    if 18 <= age < 25:
        return '18-24'
    if 25 <= age < 41:
        return '25-40'
    if 41 <= age < 55:
        return '41-54'
    if 55 <= age < 65:
        return '55-64'
    if age >= 65:
        return '65+'

    # NaN compares False against every bound, and ages under 18 match no bracket.
    return 'Not enough data'

In [13]:
# Applying the function age_groups to each row and create a new column 'age_group'
# Only the 'age' column is passed: age_groups reads nothing else, and a row-wise
# apply over the full frame would have to build a row object from every column.

profiling_data['age_group'] = profiling_data[['age']].apply(age_groups, axis=1)

In [14]:
# Counting rows per age group
# ('Not enough data' would be listed here if any age had matched none of the brackets)

profiling_data['age_group'].value_counts()

65+      7989725
25-40    7536161
41-54    6640299
55-64    4678036
18-24    3327203
Name: age_group, dtype: int64

In [15]:
# Checking that the new 'age_group' column was added (first five rows)

profiling_data.head()

Unnamed: 0,order_id,user_id,order_number,orders_day_of_week,order_hour_of_day,days_since_prior_order,product_id,add_to_cart_order,reordered,product_name,aisle_id,department_id,prices,price_range_loc,busiest_day,busiest_days,busiest_period_of_day,max_order,loyalty_flag,mean_price,spenders,median_days,frequency_flag,gender,state,age,date_joined,n_dependants,fam_status,income,_merge,region,activity_flag,age_group
0,2539329,1,1,2,8,7.0,196,1,0,Soda,77,7,9.0,Mid-range product,Regularly busy,Regularly busy,Average orders,10,New customer,6.37,Low spender,20.0,Regular customer,Female,Alabama,31,2/17/2019,3,married,40423,both,3_South,high-activity customer,25-40
1,2398795,1,2,3,7,15.0,196,1,1,Soda,77,7,9.0,Mid-range product,Regularly busy,Least busiest days,Average orders,10,New customer,6.37,Low spender,20.0,Regular customer,Female,Alabama,31,2/17/2019,3,married,40423,both,3_South,high-activity customer,25-40
2,473747,1,3,3,12,21.0,196,1,1,Soda,77,7,9.0,Mid-range product,Regularly busy,Least busiest days,Average orders,10,New customer,6.37,Low spender,20.0,Regular customer,Female,Alabama,31,2/17/2019,3,married,40423,both,3_South,high-activity customer,25-40
3,2254736,1,4,4,7,29.0,196,1,1,Soda,77,7,9.0,Mid-range product,Least busy,Least busiest days,Average orders,10,New customer,6.37,Low spender,20.0,Regular customer,Female,Alabama,31,2/17/2019,3,married,40423,both,3_South,high-activity customer,25-40
4,431534,1,5,4,15,28.0,196,1,1,Soda,77,7,9.0,Mid-range product,Least busy,Least busiest days,Most orders,10,New customer,6.37,Low spender,20.0,Regular customer,Female,Alabama,31,2/17/2019,3,married,40423,both,3_South,high-activity customer,25-40


## 02. Creating profiles

In [16]:
# Exploring which combinations of age group, family status and number of dependants
# occur in the data (row count per combination) before defining the profiles

profiling_data.groupby(['age_group', 'fam_status', 'n_dependants']).size()

age_group  fam_status                        n_dependants
18-24      living with parents and siblings  1                495834
                                             2                472168
                                             3                476079
           married                           1                365594
                                             2                342553
                                             3                371953
           single                            0                803022
25-40      married                           1               1855520
                                             2               1905789
                                             3               1871424
           single                            0               1903428
41-54      married                           1               1684743
                                             2               1629821
                                             

#### Profiles:

In [17]:
## Defining function for data profiling

def demo_profiles(row):
    """Assign a demographic profile label to one customer row.

    The row must provide 'age', 'fam_status' and 'n_dependants'.

    Rules, by family status:
      * 'living with parents and siblings'  -> 'Living with Parents'
      * 'single', age under 25              -> 'Young Living Alone'
      * 'single', age 25 or over            -> 'Adult Living Alone'
      * 'married', exactly 1 dependant      -> 'Married Without Children'
        (presumably the one dependant is the spouse - confirm)
      * 'married', 2 or 3 dependants        -> 'Married With Children'
      * 'divorced/widowed'                  -> 'Divorced / Widowed'
      * anything else                       -> 'Undefined Profile'
    """
    # All three fields are read up front, whichever rule ends up matching.
    customer_age = row['age']
    status = row['fam_status']
    dependants = row['n_dependants']

    fallback = 'Undefined Profile'

    if status == 'living with parents and siblings':
        return 'Living with Parents'

    if status == 'single':
        # Split single customers at age 25.
        if customer_age < 25:
            return 'Young Living Alone'
        if customer_age >= 25:
            return 'Adult Living Alone'
        # Reached only when the age compares False both ways (e.g. NaN).
        return fallback

    if status == 'married':
        if dependants == 1:
            return 'Married Without Children'
        if dependants in (2, 3):
            return 'Married With Children'
        # Any other dependant count has no profile defined.
        return fallback

    if status == 'divorced/widowed':
        # Applies to every age and every number of dependants.
        return 'Divorced / Widowed'

    return fallback

In [18]:
# Applying the function demo_profiles and create a new column 'profile'
# Only the three columns demo_profiles reads are passed, so the row-wise apply
# does not have to build a row object from every column of the frame.

profiling_data['profile'] = profiling_data[['age', 'fam_status', 'n_dependants']].apply(demo_profiles, axis=1)

In [19]:
# Counting rows per profile
# ('Undefined Profile' would be listed here if any row had matched none of the rules)

profiling_data['profile'].value_counts()

Married With Children       14160321
Married Without Children     7027809
Adult Living Alone           4158311
Divorced / Widowed           2577880
Living with Parents          1444081
Young Living Alone            803022
Name: profile, dtype: int64

In [20]:
# Checking that the new 'profile' column was added (first five rows)

profiling_data.head()

Unnamed: 0,order_id,user_id,order_number,orders_day_of_week,order_hour_of_day,days_since_prior_order,product_id,add_to_cart_order,reordered,product_name,aisle_id,department_id,prices,price_range_loc,busiest_day,busiest_days,busiest_period_of_day,max_order,loyalty_flag,mean_price,spenders,median_days,frequency_flag,gender,state,age,date_joined,n_dependants,fam_status,income,_merge,region,activity_flag,age_group,profile
0,2539329,1,1,2,8,7.0,196,1,0,Soda,77,7,9.0,Mid-range product,Regularly busy,Regularly busy,Average orders,10,New customer,6.37,Low spender,20.0,Regular customer,Female,Alabama,31,2/17/2019,3,married,40423,both,3_South,high-activity customer,25-40,Married With Children
1,2398795,1,2,3,7,15.0,196,1,1,Soda,77,7,9.0,Mid-range product,Regularly busy,Least busiest days,Average orders,10,New customer,6.37,Low spender,20.0,Regular customer,Female,Alabama,31,2/17/2019,3,married,40423,both,3_South,high-activity customer,25-40,Married With Children
2,473747,1,3,3,12,21.0,196,1,1,Soda,77,7,9.0,Mid-range product,Regularly busy,Least busiest days,Average orders,10,New customer,6.37,Low spender,20.0,Regular customer,Female,Alabama,31,2/17/2019,3,married,40423,both,3_South,high-activity customer,25-40,Married With Children
3,2254736,1,4,4,7,29.0,196,1,1,Soda,77,7,9.0,Mid-range product,Least busy,Least busiest days,Average orders,10,New customer,6.37,Low spender,20.0,Regular customer,Female,Alabama,31,2/17/2019,3,married,40423,both,3_South,high-activity customer,25-40,Married With Children
4,431534,1,5,4,15,28.0,196,1,1,Soda,77,7,9.0,Mid-range product,Least busy,Least busiest days,Most orders,10,New customer,6.37,Low spender,20.0,Regular customer,Female,Alabama,31,2/17/2019,3,married,40423,both,3_South,high-activity customer,25-40,Married With Children


## 03. Exporting changed df

In [21]:
# Exporting the dataframe, now including 'age_group' and 'profile', as a separate pickle file

profiling_data.to_pickle(os.path.join(path, '02 Data','022 Prepared Data', 'profiling_data.pkl'))