Create Aggregate Features

In [2]:
import pandas as pd
df = pd.read_csv('../data/data.csv')

In [3]:
customer_aggregates = df.groupby('CustomerId').agg(
    TotalTransactionAmount=('Amount', 'sum'),
    AvgTransactionAmount=('Amount', 'mean'),
    TransactionCount=('TransactionId', 'count'),
    StdDevTransactionAmount=('Amount', 'std')  
).reset_index()

customer_aggregates['StdDevTransactionAmount'].fillna(0, inplace=True)

print(customer_aggregates.describe())


       TotalTransactionAmount  AvgTransactionAmount  TransactionCount  \
count            3.742000e+03          3.742000e+03       3742.000000   
mean             1.717377e+05          1.571562e+04         25.564404   
std              2.717305e+06          1.676991e+05         96.929602   
min             -1.049000e+08         -4.250000e+05          1.000000   
25%              4.077438e+03          1.000000e+03          2.000000   
50%              2.000000e+04          2.583846e+03          7.000000   
75%              7.996775e+04          4.877614e+03         20.000000   
max              8.345124e+07          8.601821e+06       4091.000000   

       StdDevTransactionAmount  
count             3.742000e+03  
mean              1.360517e+04  
std               9.689344e+04  
min               0.000000e+00  
25%               5.011411e+02  
50%               3.184898e+03  
75%               6.745369e+03  
max               3.309916e+06  


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  customer_aggregates['StdDevTransactionAmount'].fillna(0, inplace=True)
