## Grouping Data & Aggregating Variables

In [1]:
# Importing libraries

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os

## 01. Grouping data

In [4]:
# Importing data frames

# Project root folder. It can be overridden through the INSTACART_PROJECT_DIR environment
# variable so the notebook also runs on other machines; without the variable the original
# local folder is used, so existing behaviour is unchanged.
path = os.environ.get('INSTACART_PROJECT_DIR', r'C:\Users\efens\cf_tasks\2023-07 Instacard Basket Analysis')

# NOTE: unpickling can execute arbitrary code - only load pickle files produced by this project.
ords_prods_merged = pd.read_pickle(os.path.join(path, '02 Data', '022 Prepared Data', 'orders_products_merged.pkl'))

In [5]:
# Working subset: the first one million entries of the merged frame (positional slice)

df = ords_prods_merged.iloc[:1_000_000]

In [21]:
# Preview the first rows of the subset
df.head()

Unnamed: 0,order_id,user_id,order_number,orders_day_of_week,order_hour_of_day,days_since_prior_order,product_id,add_to_cart_order,reordered,product_name,aisle_id,department_id,prices,_merge
0,2539329,1,1,2,8,7.0,196,1,0,Soda,77,7,9.0,both
1,2398795,1,2,3,7,15.0,196,1,1,Soda,77,7,9.0,both
2,473747,1,3,3,12,21.0,196,1,1,Soda,77,7,9.0,both
3,2254736,1,4,4,7,29.0,196,1,1,Soda,77,7,9.0,both
4,431534,1,5,4,15,28.0,196,1,1,Soda,77,7,9.0,both


In [23]:
# (rows, columns) of the subset
df.shape

(1000000, 14)

#### Grouping Data with pandas

In [24]:
## Using the groupby() function
# On its own, groupby() only builds a GroupBy object; nothing is computed until an aggregation is applied

df.groupby('product_name')

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x0000028613C650D0>

## 02. Performing a Single Aggregation

Calculating the mean of the “order_number” column grouped by the “department_id” column

- Split the data into groups based on department_id
- Apply the agg() function to each group to obtain the mean values for the “order_number” column

In [25]:
# Mean order_number per department; passing the statistic as a list gives a
# two-level column header (column name, statistic)
(
    df
    .groupby('department_id')
    .agg({'order_number': ['mean']})
)

Unnamed: 0_level_0,order_number
Unnamed: 0_level_1,mean
department_id,Unnamed: 1_level_2
4,18.82578
7,17.472355
13,17.993423
14,19.246334
16,19.463012
17,11.294069
19,19.305237
20,17.599636


In [26]:
# Same aggregation on the whole data frame instead of the subset

(
    ords_prods_merged
    .groupby('department_id')
    .agg({'order_number': ['mean']})
)

Unnamed: 0_level_0,order_number
Unnamed: 0_level_1,mean
department_id,Unnamed: 1_level_2
1,15.457838
2,17.27792
3,17.170395
4,17.811403
5,15.215751
6,16.439806
7,17.225802
8,15.34065
9,15.895474
10,20.197148


In [27]:
# Mean order_number per department, selecting the column on the GroupBy first
# (the result is a Series rather than a frame)

(
    df
    .groupby('department_id')['order_number']
    .mean()
)

department_id
4     18.825780
7     17.472355
13    17.993423
14    19.246334
16    19.463012
17    11.294069
19    19.305237
20    17.599636
Name: order_number, dtype: float64

In [28]:
# Alternative: attribute-style column access on the GroupBy - same result as ['order_number']

df.groupby('department_id').order_number.mean()

department_id
4     18.825780
7     17.472355
13    17.993423
14    19.246334
16    19.463012
17    11.294069
19    19.305237
20    17.599636
Name: order_number, dtype: float64

## 03. Performing Multiple Aggregations

In [29]:
# Computing several statistics (mean, min, max) of "order_number" per department in one call

(
    df
    .groupby('department_id')
    .agg({'order_number': ['mean', 'min', 'max']})
)

Unnamed: 0_level_0,order_number,order_number,order_number
Unnamed: 0_level_1,mean,min,max
department_id,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
4,18.82578,1,99
7,17.472355,1,99
13,17.993423,1,99
14,19.246334,1,99
16,19.463012,1,99
17,11.294069,1,98
19,19.305237,1,99
20,17.599636,1,99


#### Aggregating Data with transform()

Creating a flag for loyal customers:

1. max orders > 40 --> Loyal customer
2. max orders > 10 and <= 40 --> Regular customer
3. max orders <= 10 --> New customer

Steps for creating: 

1. Split the data into groups based on the “user_id” column
2. Apply the transform() function on the “order_number” column to generate the maximum orders for each user
3. Create a new column, “max_order,” into which you’ll place the results of your aggregation

In [30]:
# All steps from above in a single line:
# group by user, take the maximum order_number of each group, and write it to "max_order".
# transform() returns one value per original row, so the result can be assigned directly as a column.
# The string alias 'max' is used instead of np.max: passing NumPy callables to groupby transform
# is deprecated in recent pandas versions, and the alias dispatches to the built-in groupby max.

ords_prods_merged['max_order'] = ords_prods_merged.groupby('user_id')['order_number'].transform('max')

In [31]:
# Inspect the newly created column
ords_prods_merged['max_order']

0           10
1           10
2           10
3           10
4           10
            ..
32404854    31
32404855    31
32404856     3
32404857     3
32404858    16
Name: max_order, Length: 32404859, dtype: int64

In [33]:
# Check that max_order carries the same value on every row of a user
ords_prods_merged.head(15)

Unnamed: 0,order_id,user_id,eval_set,order_number,orders_day_of_week,order_hour_of_day,days_since_prior_order,product_id,add_to_cart_order,reordered,product_name,aisle_id,department_id,prices,_merge,max_order
0,2539329,1,prior,1,2,8,7.0,196,1,0,Soda,77,7,9.0,both,10
1,2398795,1,prior,2,3,7,15.0,196,1,1,Soda,77,7,9.0,both,10
2,473747,1,prior,3,3,12,21.0,196,1,1,Soda,77,7,9.0,both,10
3,2254736,1,prior,4,4,7,29.0,196,1,1,Soda,77,7,9.0,both,10
4,431534,1,prior,5,4,15,28.0,196,1,1,Soda,77,7,9.0,both,10
5,3367565,1,prior,6,2,7,19.0,196,1,1,Soda,77,7,9.0,both,10
6,550135,1,prior,7,1,9,20.0,196,1,1,Soda,77,7,9.0,both,10
7,3108588,1,prior,8,1,14,14.0,196,2,1,Soda,77,7,9.0,both,10
8,2295261,1,prior,9,1,16,0.0,196,4,1,Soda,77,7,9.0,both,10
9,2550362,1,prior,10,4,8,30.0,196,1,1,Soda,77,7,9.0,both,10


In [34]:
# Longer preview so that rows of several different users are included
ords_prods_merged.head(100)

Unnamed: 0,order_id,user_id,eval_set,order_number,orders_day_of_week,order_hour_of_day,days_since_prior_order,product_id,add_to_cart_order,reordered,product_name,aisle_id,department_id,prices,_merge,max_order
0,2539329,1,prior,1,2,8,7.0,196,1,0,Soda,77,7,9.0,both,10
1,2398795,1,prior,2,3,7,15.0,196,1,1,Soda,77,7,9.0,both,10
2,473747,1,prior,3,3,12,21.0,196,1,1,Soda,77,7,9.0,both,10
3,2254736,1,prior,4,4,7,29.0,196,1,1,Soda,77,7,9.0,both,10
4,431534,1,prior,5,4,15,28.0,196,1,1,Soda,77,7,9.0,both,10
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,3226575,360,prior,1,5,12,7.0,196,1,0,Soda,77,7,9.0,both,3
96,1469869,377,prior,3,5,17,3.0,196,9,0,Soda,77,7,9.0,both,3
97,1927023,387,prior,2,4,10,22.0,196,3,0,Soda,77,7,9.0,both,8
98,858092,420,prior,4,1,19,30.0,196,2,0,Soda,77,7,9.0,both,22


In [36]:
# Remove the cap on displayed rows so longer previews are not truncated with "..."
# NOTE: this is a global pandas option - displaying a large frame after this cell renders every row

pd.options.display.max_rows = None

## 04. Deriving Columns with loc()

In [39]:
# Loyal customer: max_order greater than 40

ords_prods_merged.loc[
    ords_prods_merged['max_order'].gt(40),
    'loyalty_flag',
] = 'Loyal customer'

In [40]:
# Regular customer: max_order greater than 10 and at most 40

ords_prods_merged.loc[
    ords_prods_merged['max_order'].gt(10) & ords_prods_merged['max_order'].le(40),
    'loyalty_flag',
] = 'Regular customer'

In [41]:
# New customer: max_order of 10 or fewer (the boundary value 10 is included)

ords_prods_merged.loc[
    ords_prods_merged['max_order'].le(10),
    'loyalty_flag',
] = 'New customer'

In [43]:
# Rows per flag; dropna=False so that rows left without a flag would show up as NaN
ords_prods_merged['loyalty_flag'].value_counts(dropna = False)

Regular customer    15876776
Loyal customer      10284093
New customer         6243990
Name: loyalty_flag, dtype: int64

In [45]:
# Checking: show the flag next to user_id and order_number for the first 60 rows

ords_prods_merged[['user_id', 'loyalty_flag', 'order_number']].head(60)

Unnamed: 0,user_id,loyalty_flag,order_number
0,1,New customer,1
1,1,New customer,2
2,1,New customer,3
3,1,New customer,4
4,1,New customer,5
5,1,New customer,6
6,1,New customer,7
7,1,New customer,8
8,1,New customer,9
9,1,New customer,10


In [46]:
# Preview the frame including the new max_order and loyalty_flag columns
ords_prods_merged.head()

Unnamed: 0,order_id,user_id,eval_set,order_number,orders_day_of_week,order_hour_of_day,days_since_prior_order,product_id,add_to_cart_order,reordered,product_name,aisle_id,department_id,prices,_merge,max_order,loyalty_flag
0,2539329,1,prior,1,2,8,7.0,196,1,0,Soda,77,7,9.0,both,10,New customer
1,2398795,1,prior,2,3,7,15.0,196,1,1,Soda,77,7,9.0,both,10,New customer
2,473747,1,prior,3,3,12,21.0,196,1,1,Soda,77,7,9.0,both,10,New customer
3,2254736,1,prior,4,4,7,29.0,196,1,1,Soda,77,7,9.0,both,10,New customer
4,431534,1,prior,5,4,15,28.0,196,1,1,Soda,77,7,9.0,both,10,New customer


In [50]:
# Drop the eval_set column; a missing column is ignored, so re-running the cell is harmless

ords_prods_merged = ords_prods_merged.drop(columns='eval_set', errors='ignore')

In [51]:
# Confirm that eval_set is no longer among the columns
ords_prods_merged.head()

Unnamed: 0,order_id,user_id,order_number,orders_day_of_week,order_hour_of_day,days_since_prior_order,product_id,add_to_cart_order,reordered,product_name,aisle_id,department_id,prices,_merge,max_order,loyalty_flag
0,2539329,1,1,2,8,7.0,196,1,0,Soda,77,7,9.0,both,10,New customer
1,2398795,1,2,3,7,15.0,196,1,1,Soda,77,7,9.0,both,10,New customer
2,473747,1,3,3,12,21.0,196,1,1,Soda,77,7,9.0,both,10,New customer
3,2254736,1,4,4,7,29.0,196,1,1,Soda,77,7,9.0,both,10,New customer
4,431534,1,5,4,15,28.0,196,1,1,Soda,77,7,9.0,both,10,New customer


In [52]:
# Write the frame with the max_order and loyalty_flag columns to the prepared-data folder

ords_prods_merged.to_pickle(
    os.path.join(path, '02 Data', '022 Prepared Data', 'ords_prods_merged_agg_df.pkl')
)