# Task 3: Feature Engineering

This notebook implements the feature engineering steps outlined in Task 3.

## Objectives:
- Create Aggregate Features
- Extract Features (temporal)
- Encode Categorical Variables
- Handle Missing Values
- Normalize/Standardize Numerical Features
- Feature Engineering with WoE and IV

In [1]:
import pandas as pd
import numpy as np
import sys
import os

# Add src to path
sys.path.append(os.path.abspath(os.path.join('..', 'src')))

from utils import load_data, check_missing_values
from data_processing import (
    DateFeatureExtractor, 
    CustomerAggregator, 
    CategoricalEncoder, 
    MissingValueImputer, 
    WoEEncoder,
    calculate_iv
)

In [2]:
# Load Data
data_path = '../data/raw/data.csv'
df = load_data(data_path)
df.head()

Data loaded successfully from ../data/raw/data.csv
Shape: (95662, 16)


Unnamed: 0,TransactionId,BatchId,AccountId,SubscriptionId,CustomerId,CurrencyCode,CountryCode,ProviderId,ProductId,ProductCategory,ChannelId,Amount,Value,TransactionStartTime,PricingStrategy,FraudResult
0,TransactionId_76871,BatchId_36123,AccountId_3957,SubscriptionId_887,CustomerId_4406,UGX,256,ProviderId_6,ProductId_10,airtime,ChannelId_3,1000.0,1000,2018-11-15T02:18:49Z,2,0
1,TransactionId_73770,BatchId_15642,AccountId_4841,SubscriptionId_3829,CustomerId_4406,UGX,256,ProviderId_4,ProductId_6,financial_services,ChannelId_2,-20.0,20,2018-11-15T02:19:08Z,2,0
2,TransactionId_26203,BatchId_53941,AccountId_4229,SubscriptionId_222,CustomerId_4683,UGX,256,ProviderId_6,ProductId_1,airtime,ChannelId_3,500.0,500,2018-11-15T02:44:21Z,2,0
3,TransactionId_380,BatchId_102363,AccountId_648,SubscriptionId_2185,CustomerId_988,UGX,256,ProviderId_1,ProductId_21,utility_bill,ChannelId_3,20000.0,21800,2018-11-15T03:32:55Z,2,0
4,TransactionId_28195,BatchId_38780,AccountId_4841,SubscriptionId_3829,CustomerId_988,UGX,256,ProviderId_4,ProductId_6,financial_services,ChannelId_2,-644.0,644,2018-11-15T03:34:21Z,2,0


## 1. Feature Extraction (Temporal Features)

In [3]:
date_extractor = DateFeatureExtractor(date_col='TransactionStartTime')
df = date_extractor.transform(df)
df[['TransactionStartTime', 'TransactionHour', 'TransactionDay', 'TransactionMonth', 'TransactionYear']].head()

Unnamed: 0,TransactionStartTime,TransactionHour,TransactionDay,TransactionMonth,TransactionYear
0,2018-11-15 02:18:49+00:00,2,15,11,2018
1,2018-11-15 02:19:08+00:00,2,15,11,2018
2,2018-11-15 02:44:21+00:00,2,15,11,2018
3,2018-11-15 03:32:55+00:00,3,15,11,2018
4,2018-11-15 03:34:21+00:00,3,15,11,2018


## 2. Missing Value Handling

In [4]:
check_missing_values(df)

# Impute missing values if any (example usage, though this dataset might be clean)
imputer = MissingValueImputer(strategy='mean')
# df = imputer.fit_transform(df) # Uncomment if needed

No missing values found.


## 3. Categorical Encoding

In [5]:
categorical_cols = ['ProviderId', 'ProductId', 'ProductCategory', 'ChannelId', 'PricingStrategy']
encoder = CategoricalEncoder(method='onehot', columns=categorical_cols)
encoder.fit(df)
df_encoded = encoder.transform(df)
df_encoded.head()

Unnamed: 0,TransactionId,BatchId,AccountId,SubscriptionId,CustomerId,CurrencyCode,CountryCode,Amount,Value,TransactionStartTime,...,ProductCategory_tv,ProductCategory_utility_bill,ChannelId_ChannelId_1,ChannelId_ChannelId_2,ChannelId_ChannelId_3,ChannelId_ChannelId_5,PricingStrategy_0,PricingStrategy_1,PricingStrategy_2,PricingStrategy_4
0,TransactionId_76871,BatchId_36123,AccountId_3957,SubscriptionId_887,CustomerId_4406,UGX,256,1000.0,1000,2018-11-15 02:18:49+00:00,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
1,TransactionId_73770,BatchId_15642,AccountId_4841,SubscriptionId_3829,CustomerId_4406,UGX,256,-20.0,20,2018-11-15 02:19:08+00:00,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
2,TransactionId_26203,BatchId_53941,AccountId_4229,SubscriptionId_222,CustomerId_4683,UGX,256,500.0,500,2018-11-15 02:44:21+00:00,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
3,TransactionId_380,BatchId_102363,AccountId_648,SubscriptionId_2185,CustomerId_988,UGX,256,20000.0,21800,2018-11-15 03:32:55+00:00,...,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
4,TransactionId_28195,BatchId_38780,AccountId_4841,SubscriptionId_3829,CustomerId_988,UGX,256,-644.0,644,2018-11-15 03:34:21+00:00,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0


## 4. Aggregate Features

In [6]:
aggregator = CustomerAggregator(customer_col='CustomerId', 
                                agg_cols=['Amount', 'Value'], 
                                agg_funcs=['sum', 'mean', 'count', 'std'])
df_agg = aggregator.transform(df)
df_agg.head()

Unnamed: 0,CustomerId,Amount_sum,Amount_mean,Amount_count,Amount_std,Value_sum,Value_mean,Value_count,Value_std
0,CustomerId_1,-10000.0,-10000.0,1,,10000,10000.0,1,
1,CustomerId_10,-10000.0,-10000.0,1,,10000,10000.0,1,
2,CustomerId_1001,20000.0,4000.0,5,6558.963333,30400,6080.0,5,4100.243895
3,CustomerId_1002,4225.0,384.090909,11,560.498966,4775,434.090909,11,518.805446
4,CustomerId_1003,20000.0,3333.333333,6,6030.478146,32000,5333.333333,6,3945.461528


## 5. Weight of Evidence (WoE) and Information Value (IV)

In [7]:
# Calculate IV for features
# Note: WoE usually requires binning for continuous variables. 
# Our simple implementation handles this or assumes pre-binned/categorical.
# Let's check IV for some categorical columns

iv_series = calculate_iv(df, target_col='FraudResult')
print("Information Value (IV) by Feature:")
print(iv_series)

  grouped = df.groupby('feature')['target'].agg(['count', 'sum'])
  grouped = df.groupby('feature')['target'].agg(['count', 'sum'])
  grouped = df.groupby('feature')['target'].agg(['count', 'sum'])


Information Value (IV) by Feature:
TransactionId           1265.965998
BatchId                 1253.774337
TransactionStartTime    1249.942417
AccountId                 39.977705
SubscriptionId            39.975767
CustomerId                36.865511
Amount                     5.396567
Value                      5.352100
ProductId                  3.467704
ProviderId                 3.248544
ChannelId                  1.151377
ProductCategory            0.964321
TransactionDay             0.268766
TransactionHour            0.218036
PricingStrategy            0.079995
TransactionMonth           0.060147
CurrencyCode               0.000007
TransactionYear            0.000007
CountryCode                0.000000
dtype: float64


  grouped = df.groupby('feature')['target'].agg(['count', 'sum'])
  grouped = df.groupby('feature')['target'].agg(['count', 'sum'])
  grouped = df.groupby('feature')['target'].agg(['count', 'sum'])
  grouped = df.groupby('feature')['target'].agg(['count', 'sum'])
  grouped = df.groupby('feature')['target'].agg(['count', 'sum'])


In [8]:
# Apply WoE Encoding to a specific column as an example
woe_encoder = WoEEncoder(target_col='FraudResult')
woe_encoder.fit(df)
df_woe = woe_encoder.transform(df)
df_woe.head()

  grouped = df.groupby('feature')['target'].agg(['count', 'sum'])
  grouped = df.groupby('feature')['target'].agg(['count', 'sum'])
  grouped = df.groupby('feature')['target'].agg(['count', 'sum'])


  grouped = df.groupby('feature')['target'].agg(['count', 'sum'])
  grouped = df.groupby('feature')['target'].agg(['count', 'sum'])
  grouped = df.groupby('feature')['target'].agg(['count', 'sum'])
  grouped = df.groupby('feature')['target'].agg(['count', 'sum'])
  grouped = df.groupby('feature')['target'].agg(['count', 'sum'])


Unnamed: 0,TransactionId,BatchId,AccountId,SubscriptionId,CustomerId,CurrencyCode,CountryCode,ProviderId,ProductId,ProductCategory,ChannelId,Amount,Value,TransactionStartTime,PricingStrategy,FraudResult,TransactionHour,TransactionDay,TransactionMonth,TransactionYear
0,5.105254,5.105254,1.313518,1.313518,0.727403,0.002582,256,-2.982869,-1.565057,-1.592991,0.475079,1000.0,1000,5.105254,2,0,2,15,11,2018
1,5.105254,5.105254,-3.728937,-3.78364,0.727403,0.002582,256,-2.64157,-3.783794,0.568535,-2.613741,-20.0,20,5.105254,2,0,2,15,11,2018
2,5.105254,5.105254,4.594429,4.594429,4.594429,0.002582,256,-2.982869,-2.456734,-1.592991,0.475079,500.0,500,5.105254,2,0,2,15,11,2018
3,5.105254,5.105254,2.233575,2.233575,1.860061,0.002582,256,1.627543,0.389073,1.175522,0.475079,20000.0,21800,5.105254,2,0,3,15,11,2018
4,5.105254,5.105254,-3.728937,-3.78364,1.860061,0.002582,256,-2.64157,-3.783794,0.568535,-2.613741,-644.0,644,5.105254,2,0,3,15,11,2018


## Save Processed Data

In [9]:
# Merge aggregated features back to main df if needed, or save separately
# For this task, we might want to save the feature-engineered dataset

# Example: Merging aggregated features
df_final = pd.merge(df, df_agg, on='CustomerId', how='left')

# Save
os.makedirs('../data/processed', exist_ok=True)
df_final.to_csv('../data/processed/data_featured.csv', index=False)
print("Saved processed data to ../data/processed/data_featured.csv")

Saved processed data to ../data/processed/data_featured.csv
