### Feature Engineering Part

In [2]:
# Standard imports
import pandas as pd
import sys
import os

# Allow importing from the src folder.
# The path is relative, so os.path.abspath resolves it against the current
# working directory: the kernel must be started from a directory that sits
# next to src/ for the import below to succeed.
sys.path.append(os.path.abspath("../src"))

# Import custom preprocessing functions (must come after the sys.path change above)
from data_processing import preprocess_data, build_pipeline


In [3]:
# Read the raw transaction dataset from disk (path is relative to the working directory)
raw_data_path = "../data/raw/data.csv"
raw_df = pd.read_csv(raw_data_path)

# Show the first rows as a quick sanity check
raw_df.head()


Unnamed: 0,TransactionId,BatchId,AccountId,SubscriptionId,CustomerId,CurrencyCode,CountryCode,ProviderId,ProductId,ProductCategory,ChannelId,Amount,Value,TransactionStartTime,PricingStrategy,FraudResult
0,TransactionId_76871,BatchId_36123,AccountId_3957,SubscriptionId_887,CustomerId_4406,UGX,256,ProviderId_6,ProductId_10,airtime,ChannelId_3,1000.0,1000,2018-11-15T02:18:49Z,2,0
1,TransactionId_73770,BatchId_15642,AccountId_4841,SubscriptionId_3829,CustomerId_4406,UGX,256,ProviderId_4,ProductId_6,financial_services,ChannelId_2,-20.0,20,2018-11-15T02:19:08Z,2,0
2,TransactionId_26203,BatchId_53941,AccountId_4229,SubscriptionId_222,CustomerId_4683,UGX,256,ProviderId_6,ProductId_1,airtime,ChannelId_3,500.0,500,2018-11-15T02:44:21Z,2,0
3,TransactionId_380,BatchId_102363,AccountId_648,SubscriptionId_2185,CustomerId_988,UGX,256,ProviderId_1,ProductId_21,utility_bill,ChannelId_3,20000.0,21800,2018-11-15T03:32:55Z,2,0
4,TransactionId_28195,BatchId_38780,AccountId_4841,SubscriptionId_3829,CustomerId_988,UGX,256,ProviderId_4,ProductId_6,financial_services,ChannelId_2,-644.0,644,2018-11-15T03:34:21Z,2,0


In [4]:
# Run the preprocessing pipeline (datetime features + aggregation).
# In the preview below, each row is a different CustomerId, and the customers
# with a single transaction (TxnCount == 1) have no StdAmount / StdValue, so
# those gaps have to be handled before modelling.
agg_df = preprocess_data(raw_df)

# Preview the output
agg_df.head()


Unnamed: 0,CustomerId,TotalAmount,AvgAmount,TxnCount,StdAmount,TotalValue,AvgValue,StdValue
0,CustomerId_1,-10000.0,-10000.0,1,,10000,10000.0,
1,CustomerId_10,-10000.0,-10000.0,1,,10000,10000.0,
2,CustomerId_1001,20000.0,4000.0,5,6558.963333,30400,6080.0,4100.243895
3,CustomerId_1002,4225.0,384.090909,11,560.498966,4775,434.090909,518.805446
4,CustomerId_1003,20000.0,3333.333333,6,6030.478146,32000,5333.333333,3945.461528


In [5]:
# Columns consumed by the feature pipeline.
# Numeric inputs: the per-customer aggregate columns.
numeric_cols = [
    'TotalAmount',
    'AvgAmount',
    'TxnCount',
    'StdAmount',
    'TotalValue',
    'AvgValue',
    'StdValue',
]

# Categorical inputs: these must already be present in the frame, or be
# merged onto it, before the pipeline is fitted.
categorical_cols = [
    'ChannelId',
    'ProductCategory',
]


In [6]:
# Attach one ChannelId / ProductCategory value per customer.
# drop_duplicates keeps the first occurrence, so each customer gets the values
# from their first row in raw_df (not the most frequent value).
cat_cols = raw_df[['CustomerId', 'ChannelId', 'ProductCategory']].drop_duplicates('CustomerId')

# Drop categorical columns left over from a previous run of this cell first:
# without this, re-running the cell merges onto a frame that already has them
# and pandas emits ChannelId_x / ChannelId_y style columns instead.
# validate='many_to_one' makes pandas raise if cat_cols ever has repeated
# CustomerId values, which would silently duplicate customer rows.
agg_df = (
    agg_df
    .drop(columns=['ChannelId', 'ProductCategory'], errors='ignore')
    .merge(cat_cols, on='CustomerId', how='left', validate='many_to_one')
)

# Verify merge worked
agg_df.head()


Unnamed: 0,CustomerId,TotalAmount,AvgAmount,TxnCount,StdAmount,TotalValue,AvgValue,StdValue,ChannelId,ProductCategory
0,CustomerId_1,-10000.0,-10000.0,1,,10000,10000.0,,ChannelId_2,airtime
1,CustomerId_10,-10000.0,-10000.0,1,,10000,10000.0,,ChannelId_2,airtime
2,CustomerId_1001,20000.0,4000.0,5,6558.963333,30400,6080.0,4100.243895,ChannelId_3,financial_services
3,CustomerId_1002,4225.0,384.090909,11,560.498966,4775,434.090909,518.805446,ChannelId_3,airtime
4,CustomerId_1003,20000.0,3333.333333,6,6030.478146,32000,5333.333333,3945.461528,ChannelId_3,financial_services


In [7]:
# Create the sklearn pipeline (build_pipeline is imported from data_processing)
pipeline = build_pipeline(numeric_cols, categorical_cols)

# Fit-transform to produce the final feature matrix.
# NOTE(review): the pipeline is fitted on the whole agg_df here; if a
# train/test split is applied later, fit on the training part only — confirm.
X_transformed = pipeline.fit_transform(agg_df)

# Show the shape of the output as (rows, features)
X_transformed.shape


(3742, 20)

In [9]:
# Rebuild column labels for the transformed matrix so it can be viewed as a DataFrame.
# NOTE(review): `selector` is not used in this cell — remove the import if no other cell needs it.
from sklearn.compose import make_column_selector as selector

# Column names generated by the fitted 'encoder' step of the 'cat' transformer
cat_transformer = pipeline.named_transformers_['cat']
encoder = cat_transformer.named_steps['encoder']
encoded_feature_names = encoder.get_feature_names_out(categorical_cols)

# Numeric names come first, followed by the encoded categorical names
feature_names = [*numeric_cols, *encoded_feature_names]

# Wrap the output in a DataFrame for inspection
pd.DataFrame(X_transformed, columns=feature_names).head()


Unnamed: 0,TotalAmount,AvgAmount,TxnCount,StdAmount,TotalValue,AvgValue,StdValue,ChannelId_ChannelId_1,ChannelId_ChannelId_2,ChannelId_ChannelId_3,ChannelId_ChannelId_5,ProductCategory_airtime,ProductCategory_data_bundles,ProductCategory_financial_services,ProductCategory_movies,ProductCategory_other,ProductCategory_ticket,ProductCategory_transport,ProductCategory_tv,ProductCategory_utility_bill
0,-0.066891,-0.153364,-0.253459,-0.095504,-0.089524,-0.052297,-0.102049,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,-0.066891,-0.153364,-0.253459,-0.095504,-0.089524,-0.052297,-0.102049,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,-0.055849,-0.06987,-0.212186,-0.083421,-0.082011,-0.07571,-0.096217,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
3,-0.061655,-0.091435,-0.150278,-0.145414,-0.091448,-0.109431,-0.133204,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,-0.055849,-0.073846,-0.201868,-0.088882,-0.081422,-0.080169,-0.097816,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
