## Add behavioral features and product relationships

In [39]:
import pandas as pd
# Load the transaction table that the feature functions below aggregate over.
# NOTE(review): the column listing printed further down shows 'Unnamed: 0.1' and
# 'Unnamed: 0' in this data — presumably row indexes written by earlier to_csv
# calls; confirm, and consider dropping them at load time.
trans_df_with_promotions = pd.read_csv('data/trans_with_promotions.csv')

In [40]:
def add_behavioral_features(trans_df, price_col="transaction_price", discount_flag="promotion_applied"):
    """Aggregate transaction rows into per-user and per-(user, category) features.

    Args:
        trans_df: DataFrame with ``user_id``, ``transaction_id``,
            ``product_code`` and ``category`` columns, plus the two columns
            named by ``price_col`` and ``discount_flag``.
        price_col: Column whose mean becomes the "average spend" features.
        discount_flag: Column whose mean becomes the "discount sensitivity"
            features.

    Returns:
        A two-element list ``[user_features, cat_features]``:

        * ``user_features`` – one row per ``user_id`` with columns
          ``avg_basket_size``, ``avg_spend``, ``discount_sensitivity``.
        * ``cat_features`` – one row per ``(user_id, category)`` with columns
          ``basket_size_cat``, ``avg_spend_cat``, ``discount_sens_cat``.

        Basket size is the number of non-null ``product_code`` rows divided by
        the number of distinct ``transaction_id`` values in the group.
    """
    # ---------- Per-user aggregates ----------
    per_user = trans_df.groupby("user_id").agg(
        item_rows=("product_code", "count"),
        txn_count=("transaction_id", "nunique"),
        avg_spend=(price_col, "mean"),
        discount_sensitivity=(discount_flag, "mean"),
    )
    # Replace the two helper counts with their ratio, placed as the first
    # feature column.
    items_per_txn = per_user.pop("item_rows") / per_user.pop("txn_count")
    per_user.insert(0, "avg_basket_size", items_per_txn)
    user_features = per_user.reset_index()

    # ---------- Per-(user, category) aggregates ----------
    by_user_cat = trans_df.groupby(["user_id", "category"])
    items_in_cat = by_user_cat["product_code"].count()
    txns_in_cat = by_user_cat["transaction_id"].nunique()

    cat_columns = [
        (items_in_cat / txns_in_cat).rename("basket_size_cat"),
        by_user_cat[price_col].mean().rename("avg_spend_cat"),
        by_user_cat[discount_flag].mean().rename("discount_sens_cat"),
    ]
    cat_features = pd.concat(cat_columns, axis=1).reset_index()

    # ---------- Hand back both tables ----------
    return [user_features, cat_features]


In [41]:
# Unpack the two feature tables returned by add_behavioral_features.
user_features, cat_features = add_behavioral_features(trans_df_with_promotions)

In [42]:
def add_product_relationships(trans_df):
    """Compute per-user category diversity and loyalty features.

    Args:
        trans_df: DataFrame with at least ``user_id`` and ``category`` columns.

    Returns:
        DataFrame with one row per ``user_id`` and the columns
        ``category_diversity`` (number of distinct categories the user bought
        from) and ``loyalty_index`` (share of the user's rows that fall in
        their most frequent category).
    """
    def _top_category_share(categories):
        # Rows in the most frequent category divided by all counted rows.
        freq = categories.value_counts()
        return freq.max() / freq.sum()

    categories_by_user = trans_df.groupby("user_id")["category"]

    # Start from the diversity column, then add the loyalty column; the
    # assignment aligns on the shared user_id index.
    features = categories_by_user.nunique().to_frame("category_diversity")
    features["loyalty_index"] = categories_by_user.apply(_top_category_share)

    return features.reset_index()

In [43]:
user_features.head()

Unnamed: 0,user_id,avg_basket_size,avg_spend,discount_sensitivity
0,user_1,4.492483,12.876816,0.160937
1,user_10,4.479574,13.268269,0.148771
2,user_11,4.492308,12.926989,0.149328
3,user_12,4.468908,13.416525,0.148469
4,user_13,5.488655,8.561471,0.150627


In [44]:
cat_features.head()

Unnamed: 0,user_id,category,basket_size_cat,avg_spend_cat,discount_sens_cat
0,user_1,AIRCARE,1.024793,12.35983,0.137097
1,user_1,ASIAN FOODS,1.054545,5.012506,0.193966
2,user_1,BAKERY BOUGHT IN,1.0,8.115473,0.294118
3,user_1,BAKERY PACKAGED CAKE,1.0,5.771864,0.230769
4,user_1,BAKERY SNACKS,1.0,5.816496,0.269231


In [45]:
# Attach the per-user behavioral features to every transaction row (key: user_id).
all_features_df = trans_df_with_promotions.merge(
    user_features,
    on=['user_id'],
    how='left',               # keep every transaction, even without matching features
    validate='many_to_one',   # raise if user_features ever holds a duplicated user_id
)

# Attach the per-(user, category) features (key: user_id + category).
all_features_df = all_features_df.merge(
    cat_features,
    on=['user_id', 'category'],
    how='left',               # keep every transaction, even without matching features
    validate='many_to_one',   # raise if cat_features ever holds a duplicated key pair
)

# A left merge against unique keys must leave the row count unchanged.
len(all_features_df)

1338619

In [46]:
all_features_df.columns

Index(['Unnamed: 0.1', 'Unnamed: 0', 'transaction_id', 'user_id',
       'product_code', 'category', 'item_name', 'discount_percentage',
       'transaction_date', 'transaction_price', 'age_group', 'gender',
       'income_bracket', 'customer_type', 'state', 'month', 'seasonal_factor',
       'adjusted_spend', 'promotion_applied', 'discount_amount', 'final_spend',
       'avg_basket_size', 'avg_spend', 'discount_sensitivity',
       'basket_size_cat', 'avg_spend_cat', 'discount_sens_cat'],
      dtype='object')

In [47]:
# Build the per-user category diversity and loyalty features. The full merged
# frame is passed in, but add_product_relationships only reads its user_id and
# category columns.
product_relationship_features_df = add_product_relationships(all_features_df)
product_relationship_features_df.head()

Unnamed: 0,user_id,category_diversity,loyalty_index
0,user_1,121,0.070581
1,user_10,123,0.083267
2,user_11,124,0.079349
3,user_12,126,0.068309
4,user_13,126,0.069466


In [48]:
# Attach the per-user diversity / loyalty features to every row (key: user_id).
all_features_df = all_features_df.merge(
    product_relationship_features_df,
    on=['user_id'],
    how='left',               # keep every transaction, even without matching features
    validate='many_to_one',   # raise if the feature table ever holds a duplicated user_id
)

In [49]:
# Discard every row that has a missing value in any column, then report the
# remaining row count.
all_features_df = all_features_df.dropna()
len(all_features_df)

1338619

In [50]:
all_features_df.columns

Index(['Unnamed: 0.1', 'Unnamed: 0', 'transaction_id', 'user_id',
       'product_code', 'category', 'item_name', 'discount_percentage',
       'transaction_date', 'transaction_price', 'age_group', 'gender',
       'income_bracket', 'customer_type', 'state', 'month', 'seasonal_factor',
       'adjusted_spend', 'promotion_applied', 'discount_amount', 'final_spend',
       'avg_basket_size', 'avg_spend', 'discount_sensitivity',
       'basket_size_cat', 'avg_spend_cat', 'discount_sens_cat',
       'category_diversity', 'loyalty_index'],
      dtype='object')

In [51]:
# Final column layout, assembled group by group.
identifier_cols = ['transaction_id', 'user_id', 'product_code', 'item_name', 'category']
transaction_cols = [
    'transaction_date', 'month', 'seasonal_factor', 'transaction_price',
    'discount_percentage', 'discount_amount', 'promotion_applied',
    'adjusted_spend', 'final_spend',
]
demographic_cols = ['age_group', 'gender', 'income_bracket', 'customer_type', 'state']
# Features aggregated per user
user_behavior_cols = ['avg_basket_size', 'avg_spend', 'discount_sensitivity']
# Features aggregated per (user, category) pair
user_category_cols = ['basket_size_cat', 'avg_spend_cat', 'discount_sens_cat']
# Category diversity and loyalty per user
diversity_cols = ['category_diversity', 'loyalty_index']

ordered_columns = (
    identifier_cols
    + transaction_cols
    + demographic_cols
    + user_behavior_cols
    + user_category_cols
    + diversity_cols
)

# Selecting by the list reorders the columns and also drops any column not
# listed (here the 'Unnamed: 0.1' and 'Unnamed: 0' columns).
all_features_df = all_features_df.loc[:, ordered_columns]

# Spell out the abbreviated feature names.
longer_names = {
    'avg_spend': 'average_spend',
    'avg_basket_size': 'average_basket_size',
    'discount_sens_cat': 'discount_sensitivity_cat',
    'avg_spend_cat': 'average_spend_cat',
    'basket_size_cat': 'basket_size_category',
}
all_features_df = all_features_df.rename(columns=longer_names)



In [52]:
# DataFrame.info() prints its summary itself and returns None, so wrapping it
# in print() only adds a stray "None" line after the report.
all_features_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1338619 entries, 0 to 1338618
Data columns (total 27 columns):
 #   Column                    Non-Null Count    Dtype  
---  ------                    --------------    -----  
 0   transaction_id            1338619 non-null  object 
 1   user_id                   1338619 non-null  object 
 2   product_code              1338619 non-null  int64  
 3   item_name                 1338619 non-null  object 
 4   category                  1338619 non-null  object 
 5   transaction_date          1338619 non-null  object 
 6   month                     1338619 non-null  int64  
 7   seasonal_factor           1338619 non-null  float64
 8   transaction_price         1338619 non-null  float64
 9   discount_percentage       1338619 non-null  float64
 10  discount_amount           1338619 non-null  float64
 11  promotion_applied         1338619 non-null  int64  
 12  adjusted_spend            1338619 non-null  float64
 13  final_spend               1

In [53]:
# Persist the final feature table.
# NOTE(review): to_csv writes the row index as an unnamed first column by
# default, which read_csv loads back as 'Unnamed: 0' — confirm downstream
# readers expect that, or pass index=False.
# NOTE(review): 'featuers' in the file name looks like a typo for 'features';
# left unchanged because other code may read this exact path.
all_features_df.to_csv('data/all-featuers.csv')

In [55]:
# Draw 100 random rows; the fixed random_state makes the sample reproducible.
random_sample = all_features_df.sample(n=100, random_state=42)
# sample() already returns a DataFrame, so it is used directly rather than
# being re-wrapped in pd.DataFrame(...).
random_sample_df = random_sample
# Display the sample
random_sample_df

Unnamed: 0,transaction_id,user_id,product_code,item_name,category,transaction_date,month,seasonal_factor,transaction_price,discount_percentage,...,customer_type,state,average_basket_size,average_spend,discount_sensitivity,basket_size_category,average_spend_cat,discount_sensitivity_cat,category_diversity,loyalty_index
306321,7bad5bcecd,user_13,2859430,Soap Free Wash,SKIN CARE,2025-03-22,3,1.2,13.298619,0.300000,...,Loyal,SA,5.488655,8.561471,0.150627,1.041039,10.917049,0.134352,126,0.069466
1060263,621567786c,user_40,3189504,Honey Soy & Garlic Stir Fry Meal Base Pouch,ASIAN FOODS,2025-03-19,3,1.2,3.557694,0.142857,...,Occasional,WA,4.546058,12.965284,0.149652,1.060606,4.781209,0.142857,121,0.076672
1072051,1f402fd2ee,user_41,5319429,Bag Fruit Pastilles,CONFECTIONERY,2023-12-19,12,1.5,5.088421,0.300000,...,Occasional,VIC,4.521000,13.005392,0.153743,1.020588,6.717970,0.144092,124,0.081033
1208266,4b79466dfe,user_45,7534182,Aerosol-Free Automatic Spray Refill Apple Blos...,AIRCARE,2022-11-06,11,1.5,11.610468,0.200000,...,Loyal,WA,5.502288,8.529293,0.148756,1.034803,8.356102,0.151345,126,0.071051
937775,517cb97033,user_36,9091672,Ice Cream Mini Almond 6Pack,ICE CREAM,2024-08-21,8,1.0,12.344283,0.300000,...,Frequent,QLD,4.486287,13.071806,0.149810,1.023474,10.903159,0.142202,124,0.082382
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1159192,84992687b3,user_45,8988221,Kids Chewable Multivitamins Strawberry,VITAMINS,2024-05-28,5,1.0,7.734285,0.500000,...,Loyal,WA,5.502288,8.529293,0.148756,1.124062,13.656592,0.150154,126,0.071051
130989,b47b851eab,user_5,9503543,Teevee Snacks Biscuits Malt Sticks,BISCUITS & COOKIES,2022-08-22,8,1.0,5.733913,0.200000,...,Occasional,NT,4.506006,12.918764,0.143869,1.064286,4.668505,0.127517,121,0.073226
478703,138cef462e,user_18,3034031,Mocha Flavoured Milk,DY MILK,2022-08-11,8,1.0,5.461364,0.200000,...,Occasional,NSW,4.463459,13.420779,0.150036,1.012048,4.286168,0.214286,126,0.068493
283778,159bf947cb,user_13,6314227,No.X Extreme Coffee Beans,COFFEE,2024-03-05,3,1.2,27.230782,0.300000,...,Loyal,SA,5.488655,8.561471,0.150627,1.044676,10.183968,0.157520,126,0.069466


In [56]:
# Save the random sample. The index written alongside the data holds each
# sampled row's label from all_features_df.
# NOTE(review): 'featuers' in the file name looks like a typo for 'features';
# left unchanged because other code may read this exact path.
random_sample_df.to_csv('data/random_sample_all_featuers_df.csv')