# 📊 Task 4.8 – Aggregation, Flags & Customer Insights
**Andres G.**

In [None]:
# Importing required libraries
import pandas as pd
import numpy as np
import os

In [None]:
# Project root folder; every load/export below is built relative to it.
# Set the INSTACART_PROJECT_PATH environment variable to run the notebook on
# another machine; when it is unset, the original local folder is used.
path = os.environ.get(
    'INSTACART_PROJECT_PATH',
    r'C:\Users\andd0\Documents\InstaCart Basket Analysis',
)

In [None]:
# Read the Task 4.7 pickle (merged frame with the previously derived columns)
# from the 'Prepared Data' folder under the project path, then preview it.
ords_prods_merge = pd.read_pickle(
    os.path.join(
        path,
        '02 Data',
        'Prepared Data',
        'ords_prods_merge_extra_columns_T4_7.pkl',
    )
)
ords_prods_merge.head()

## 1. Aggregation by `department_id` – Subset vs Full Dataset

In [None]:
# Take the first 1,000,000 rows of the merged frame as the comparison subset
df = ords_prods_merge.iloc[:1_000_000]

# Per-department mean, min and max of order_number within that subset
subset_agg = (
    df
    .groupby('department_id')
    .agg({'order_number': ['mean', 'min', 'max']})
)
subset_agg

In [None]:
# Mean order_number per department_id, computed over every row of the merged frame
full_agg = (
    ords_prods_merge
    .groupby('department_id')['order_number']
    .agg('mean')
)
full_agg

### 🔍 Analysis
The average `order_number` per department remains largely consistent between the subset and the full dataset. Minor fluctuations are present but not significant, suggesting that the subset (the first 1M rows) is fairly representative.

## 2. Creating `loyalty_flag`

In [None]:
# Highest order_number seen for each user_id, repeated on every row of that user
ords_prods_merge['max_order'] = (
    ords_prods_merge
    .groupby('user_id')['order_number']
    .transform('max')
)

# Label every row from its user's max_order:
#   max_order <= 10       -> 'New customer'
#   10 < max_order <= 40  -> 'Regular customer'
#   max_order > 40        -> 'Loyal Customer'
# The three conditions are mutually exclusive, so assignment order does not matter.
# NOTE(review): 'Loyal Customer' is capitalised differently from the other two
# labels; left untouched because the exact strings end up in the exported data.
ords_prods_merge.loc[ords_prods_merge['max_order'].le(10), 'loyalty_flag'] = 'New customer'
ords_prods_merge.loc[
    ords_prods_merge['max_order'].gt(10) & ords_prods_merge['max_order'].le(40),
    'loyalty_flag',
] = 'Regular customer'
ords_prods_merge.loc[ords_prods_merge['max_order'].gt(40), 'loyalty_flag'] = 'Loyal Customer'

# Row count per loyalty label
ords_prods_merge['loyalty_flag'].value_counts()

## 3. Price Comparison by Loyalty Category

In [None]:
# Mean of the prices column within each loyalty_flag group
ords_prods_merge['prices'].groupby(ords_prods_merge['loyalty_flag']).mean()

🔍 The average product price (`prices`) is fairly similar across all loyalty groups.

## 4. Creating `spending_flag`

In [None]:
# Mean of prices over all rows of each user_id, repeated on every row of that user
ords_prods_merge['avg_spend_user'] = (
    ords_prods_merge
    .groupby('user_id')['prices']
    .transform('mean')
)

# Split at a mean price of 10: below -> 'Low spender', 10 or above -> 'High spender'.
# The two conditions are mutually exclusive, so assignment order does not matter.
ords_prods_merge.loc[ords_prods_merge['avg_spend_user'].ge(10), 'spending_flag'] = 'High spender'
ords_prods_merge.loc[ords_prods_merge['avg_spend_user'].lt(10), 'spending_flag'] = 'Low spender'

# Row count per spending label
ords_prods_merge['spending_flag'].value_counts()

## 5. Creating `order_frequency_flag`

In [None]:
# Median of days_since_prior_order for each user_id, repeated on every row of that user
ords_prods_merge['median_days_since_prior'] = (
    ords_prods_merge
    .groupby('user_id')['days_since_prior_order']
    .transform('median')
)

# Label every row from its user's median gap:
#   median <= 10       -> 'Frequent customer'
#   10 < median <= 20  -> 'Regular customer'
#   median > 20        -> 'Non-frequent customer'
# The three conditions are mutually exclusive, so assignment order does not matter.
ords_prods_merge.loc[
    ords_prods_merge['median_days_since_prior'].le(10),
    'order_frequency_flag',
] = 'Frequent customer'
ords_prods_merge.loc[
    ords_prods_merge['median_days_since_prior'].gt(10)
    & ords_prods_merge['median_days_since_prior'].le(20),
    'order_frequency_flag',
] = 'Regular customer'
ords_prods_merge.loc[
    ords_prods_merge['median_days_since_prior'].gt(20),
    'order_frequency_flag',
] = 'Non-frequent customer'

# Row count per frequency label; dropna=False also counts rows left without a label
ords_prods_merge['order_frequency_flag'].value_counts(dropna=False)

## 6. Export Final Data

In [None]:
# Write the merged frame, including the columns added in this notebook,
# to the Task 4.8 pickle in the 'Prepared Data' folder.
ords_prods_merge.to_pickle(
    os.path.join(
        path,
        '02 Data',
        'Prepared Data',
        'ords_prods_merge_final_T4_8.pkl',
    )
)

✅ **Notebook complete. All flags and aggregation tasks executed. Data exported for further use.**