## Add Smart Cart Features 
<b> Objective: </b> 
<br>
Compute recency, frequency, and budget-alignment features per user and product, and add behavioral features and product relationships.

In [26]:
# Feature Engineering
import pandas as pd
import numpy as np
from datetime import datetime

In [27]:
# Load the transaction table from a path relative to the notebook's working directory.
# NOTE(review): the .info() output lists 'Unnamed: 0' and 'Unnamed: 0.1' columns, which
# look like index columns written by earlier to_csv calls — confirm and drop upstream if unused.
trans_df_with_promotions = pd.read_csv('data/trans_with_promotions.csv')

In [28]:
# Inspect column names, dtypes and non-null counts of the loaded frame.
trans_df_with_promotions.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1338619 entries, 0 to 1338618
Data columns (total 21 columns):
 #   Column               Non-Null Count    Dtype  
---  ------               --------------    -----  
 0   Unnamed: 0.1         1338619 non-null  int64  
 1   Unnamed: 0           1338619 non-null  int64  
 2   transaction_id       1338619 non-null  object 
 3   user_id              1338619 non-null  object 
 4   product_code         1338619 non-null  int64  
 5   category             1338619 non-null  object 
 6   item_name            1338619 non-null  object 
 7   discount_percentage  1338619 non-null  float64
 8   transaction_date     1338619 non-null  object 
 9   transaction_price    1338619 non-null  float64
 10  age_group            1338619 non-null  object 
 11  gender               1338619 non-null  object 
 12  income_bracket       1338619 non-null  object 
 13  customer_type        1338619 non-null  object 
 14  state                1338619 non-null  object 
 15

In [29]:
# Collapse the line items into one row per basket (transaction_id):
# who bought it and when, which products it contained, its total value
# and the mean discount across its items.
transactions_grouped = (
    trans_df_with_promotions
    .groupby('transaction_id')
    .agg(
        user_id=('user_id', 'first'),
        transaction_date=('transaction_date', 'first'),
        product_code=('product_code', list),            # every product in the basket
        item_name=('item_name', list),
        transaction_price=('transaction_price', 'sum'),      # basket total
        discount_percentage=('discount_percentage', 'mean'),  # mean discount
    )
    .reset_index()
)

In [31]:
# len() of the raw frame counts rows, not distinct transaction_id values (the frame is
# grouped by transaction_id elsewhere to collect multi-product baskets), so label it as rows.
print(f'Number of transaction rows = {len(trans_df_with_promotions)}')

Number of trasnations = 1338619


In [32]:
from datetime import timedelta

def compute_features(trans_df, window_days=30):
    """Build per-(user_id, product_code) preference features from transactions.

    Parameters
    ----------
    trans_df : pandas.DataFrame
        Needs the columns ``user_id``, ``product_code``, ``transaction_date``
        and ``transaction_price``.
        NOTE: ``transaction_date`` is converted to datetime *in place* on the
        frame that is passed in. This side effect is kept from the original
        implementation because code that reuses the same frame afterwards may
        rely on the converted dtype.
    window_days : int, default 30
        Length of the look-back window (in days, counted back from the latest
        transaction date in ``trans_df``) used for the frequency feature.

    Returns
    -------
    pandas.DataFrame
        One row per (user_id, product_code) pair with the columns
        ``user_id``, ``product_code``, ``transaction_date`` (latest purchase
        date of the pair), ``recency_days``, ``freq_<window_days>d``
        (``freq_30d`` by default) and ``budget_alignment``.
    """
    keys = ['user_id', 'product_code']

    # Ensure transaction_date is datetime (mutates the caller's frame, see docstring)
    trans_df['transaction_date'] = pd.to_datetime(trans_df['transaction_date'])

    # Reference point: the latest transaction in the data, not the wall clock
    today = trans_df['transaction_date'].max()

    # --- Recency: days between the reference point and the pair's last purchase ---
    recency = trans_df.groupby(keys)['transaction_date'].max().reset_index()
    recency['recency_days'] = (today - recency['transaction_date']).dt.days

    # --- Frequency: number of purchases of the pair inside the look-back window ---
    freq_col = f'freq_{window_days}d'
    in_window = trans_df['transaction_date'] >= (today - pd.Timedelta(days=window_days))
    freq = trans_df[in_window].groupby(keys).size().rename(freq_col)

    # --- Budget alignment: 1 when the price equals the user's mean spend,
    #     falling linearly to 0 as the relative deviation reaches 100% ---
    # transform('mean') broadcasts the per-user mean back onto each row, which
    # avoids merging the whole transaction frame just to attach one column.
    avg_spend = trans_df.groupby('user_id')['transaction_price'].transform('mean')
    alignment = (
        1 - (trans_df['transaction_price'] - avg_spend).abs() / avg_spend
    ).clip(lower=0)
    budget = (
        trans_df[keys]
        .assign(budget_alignment=alignment)
        .groupby(keys)['budget_alignment']
        .mean()
        .reset_index()
    )

    # --- Merge all ---
    features = (
        recency
        .merge(freq, on=keys, how='left')
        .merge(budget, on=keys, how='left')
    )
    # Only these two columns are "missing means zero": no purchase in the window,
    # or an undefined alignment. A missing date/recency is left as missing rather
    # than being turned into 0, which would read as "bought on the reference day".
    return features.fillna({freq_col: 0, 'budget_alignment': 0})


In [33]:
# Compute the per-(user, product) features, then discard the helper date column
# so that only the keys and the engineered features remain.
preference_features_df = (
    compute_features(trans_df_with_promotions)
    .drop(columns=["transaction_date"])
)

In [34]:
# Preview the first rows of the engineered feature table.
preference_features_df.head()

Unnamed: 0,user_id,product_code,recency_days,freq_30d,budget_alignment
0,user_1,8090,31,0.0,0.636839
1,user_1,26412,545,0.0,0.285238
2,user_1,26445,328,0.0,0.285955
3,user_1,49665,548,0.0,0.0
4,user_1,54492,186,0.0,0.0


In [38]:
# Compare sizes: raw transaction rows vs. rows in the (user_id, product_code) feature table.
[len(trans_df_with_promotions), len(preference_features_df)]

[1338619, 150244]

In [36]:
# Attach the per-(user, product) preference features to every transaction row.
# A left join keeps all transactions; the join keys are user_id and product_code.
all_features_preference_features_df = pd.merge(
    trans_df_with_promotions,
    preference_features_df,
    how='left',
    on=['user_id', 'product_code'],
)

In [37]:
# Spot-check (user, product) pairs bought at least 3 times in the last 30 days.
# Draw up to 5 of them with a fixed seed so the displayed sample is reproducible
# on Restart & Run All, then order by recency and budget alignment.
frequent_pairs = preference_features_df[preference_features_df['freq_30d'] >= 3]
random_sample = (
    frequent_pairs
    .sample(n=min(5, len(frequent_pairs)), random_state=42)  # min() avoids a ValueError on small frames
    .sort_values(by=['recency_days', 'budget_alignment'])
)
# Same frame under the name used previously, kept so existing references still resolve
random_sample_df = random_sample
# Show the DataFrame
random_sample_df

Unnamed: 0,user_id,product_code,recency_days,freq_30d,budget_alignment
26370,user_17,3794319,0,4.0,0.356384
140007,user_6,5241412,0,3.0,0.875054
41343,user_20,9106645,2,3.0,0.522576
33528,user_19,3762935,3,3.0,0.423514
77179,user_32,7524510,7,3.0,0.687064
