In [1]:
import sys
import os

sys.path.append(os.path.abspath("../src"))

import pandas as pd
import joblib
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier

In [2]:
from data_processing import (
    extract_date_features,
    build_aggregate_features,
    clean_data,
    build_preprocessor
)

# Read the raw transactions CSV into a DataFrame
raw_csv_path = '../data/raw/data.csv'
data = pd.read_csv(raw_csv_path)


In [3]:
# Preview the first rows of the frame just read from '../data/raw/data.csv'.
data.head()

Unnamed: 0,TransactionId,BatchId,AccountId,SubscriptionId,CustomerId,CurrencyCode,CountryCode,ProviderId,ProductId,ProductCategory,ChannelId,Amount,Value,TransactionStartTime,PricingStrategy,FraudResult
0,TransactionId_76871,BatchId_36123,AccountId_3957,SubscriptionId_887,CustomerId_4406,UGX,256,ProviderId_6,ProductId_10,airtime,ChannelId_3,1000.0,1000,2018-11-15T02:18:49Z,2,0
1,TransactionId_73770,BatchId_15642,AccountId_4841,SubscriptionId_3829,CustomerId_4406,UGX,256,ProviderId_4,ProductId_6,financial_services,ChannelId_2,-20.0,20,2018-11-15T02:19:08Z,2,0
2,TransactionId_26203,BatchId_53941,AccountId_4229,SubscriptionId_222,CustomerId_4683,UGX,256,ProviderId_6,ProductId_1,airtime,ChannelId_3,500.0,500,2018-11-15T02:44:21Z,2,0
3,TransactionId_380,BatchId_102363,AccountId_648,SubscriptionId_2185,CustomerId_988,UGX,256,ProviderId_1,ProductId_21,utility_bill,ChannelId_3,20000.0,21800,2018-11-15T03:32:55Z,2,0
4,TransactionId_28195,BatchId_38780,AccountId_4841,SubscriptionId_3829,CustomerId_988,UGX,256,ProviderId_4,ProductId_6,financial_services,ChannelId_2,-644.0,644,2018-11-15T03:34:21Z,2,0


In [4]:
# Step 1: Extract date features
# In the recorded output this adds transaction_year / transaction_month /
# transaction_day / transaction_hour / transaction_dayofweek columns
# (presumably derived from TransactionStartTime -- confirm in data_processing).
# NOTE: `data` is rebound to the result, so re-running this cell feeds the
# already-processed frame back into extract_date_features.
data = extract_date_features(data)

In [5]:
# Preview the frame after date-feature extraction.
data.head()

Unnamed: 0,TransactionId,BatchId,AccountId,SubscriptionId,CustomerId,CurrencyCode,CountryCode,ProviderId,ProductId,ProductCategory,...,Amount,Value,TransactionStartTime,PricingStrategy,FraudResult,transaction_year,transaction_month,transaction_day,transaction_hour,transaction_dayofweek
0,TransactionId_76871,BatchId_36123,AccountId_3957,SubscriptionId_887,CustomerId_4406,UGX,256,ProviderId_6,ProductId_10,airtime,...,1000.0,1000,2018-11-15 02:18:49+00:00,2,0,2018,11,15,2,3
1,TransactionId_73770,BatchId_15642,AccountId_4841,SubscriptionId_3829,CustomerId_4406,UGX,256,ProviderId_4,ProductId_6,financial_services,...,-20.0,20,2018-11-15 02:19:08+00:00,2,0,2018,11,15,2,3
2,TransactionId_26203,BatchId_53941,AccountId_4229,SubscriptionId_222,CustomerId_4683,UGX,256,ProviderId_6,ProductId_1,airtime,...,500.0,500,2018-11-15 02:44:21+00:00,2,0,2018,11,15,2,3
3,TransactionId_380,BatchId_102363,AccountId_648,SubscriptionId_2185,CustomerId_988,UGX,256,ProviderId_1,ProductId_21,utility_bill,...,20000.0,21800,2018-11-15 03:32:55+00:00,2,0,2018,11,15,3,3
4,TransactionId_28195,BatchId_38780,AccountId_4841,SubscriptionId_3829,CustomerId_988,UGX,256,ProviderId_4,ProductId_6,financial_services,...,-644.0,644,2018-11-15 03:34:21+00:00,2,0,2018,11,15,3,3


In [5]:
# Step 2: Aggregate features
# In the recorded output this adds transaction_count, total_amount, avg_amount
# and std_amount columns; rows sharing a CustomerId show identical values, so
# these look like per-customer aggregates (confirm in data_processing).
# NOTE: `data` is rebound to the result, so this cell is not idempotent on re-run.
data = build_aggregate_features(data)

In [7]:
# Preview the frame with the aggregate columns appended.
data.head()

Unnamed: 0,TransactionId,BatchId,AccountId,SubscriptionId,CustomerId,CurrencyCode,CountryCode,ProviderId,ProductId,ProductCategory,...,FraudResult,transaction_year,transaction_month,transaction_day,transaction_hour,transaction_dayofweek,transaction_count,total_amount,avg_amount,std_amount
0,TransactionId_76871,BatchId_36123,AccountId_3957,SubscriptionId_887,CustomerId_4406,UGX,256,ProviderId_6,ProductId_10,airtime,...,0,2018,11,15,2,3,119,109921.75,923.712185,3042.294251
1,TransactionId_73770,BatchId_15642,AccountId_4841,SubscriptionId_3829,CustomerId_4406,UGX,256,ProviderId_4,ProductId_6,financial_services,...,0,2018,11,15,2,3,119,109921.75,923.712185,3042.294251
2,TransactionId_26203,BatchId_53941,AccountId_4229,SubscriptionId_222,CustomerId_4683,UGX,256,ProviderId_6,ProductId_1,airtime,...,0,2018,11,15,2,3,2,1000.0,500.0,0.0
3,TransactionId_380,BatchId_102363,AccountId_648,SubscriptionId_2185,CustomerId_988,UGX,256,ProviderId_1,ProductId_21,utility_bill,...,0,2018,11,15,3,3,38,228727.2,6019.136842,17169.24161
4,TransactionId_28195,BatchId_38780,AccountId_4841,SubscriptionId_3829,CustomerId_988,UGX,256,ProviderId_4,ProductId_6,financial_services,...,0,2018,11,15,3,3,38,228727.2,6019.136842,17169.24161


In [6]:
# Step 3: Clean unnecessary columns
# Comparing the recorded previews before and after, clean_data removes
# TransactionId, BatchId, AccountId, SubscriptionId, CurrencyCode, CountryCode
# and TransactionStartTime. The result is bound to a new name, so `data` keeps
# referring to the frame that was passed in.
data_clean = clean_data(data)

In [9]:
# Preview the cleaned frame that the model features are taken from.
data_clean.head()

Unnamed: 0,CustomerId,ProviderId,ProductId,ProductCategory,ChannelId,Amount,Value,PricingStrategy,FraudResult,transaction_year,transaction_month,transaction_day,transaction_hour,transaction_dayofweek,transaction_count,total_amount,avg_amount,std_amount
0,CustomerId_4406,ProviderId_6,ProductId_10,airtime,ChannelId_3,1000.0,1000,2,0,2018,11,15,2,3,119,109921.75,923.712185,3042.294251
1,CustomerId_4406,ProviderId_4,ProductId_6,financial_services,ChannelId_2,-20.0,20,2,0,2018,11,15,2,3,119,109921.75,923.712185,3042.294251
2,CustomerId_4683,ProviderId_6,ProductId_1,airtime,ChannelId_3,500.0,500,2,0,2018,11,15,2,3,2,1000.0,500.0,0.0
3,CustomerId_988,ProviderId_1,ProductId_21,utility_bill,ChannelId_3,20000.0,21800,2,0,2018,11,15,3,3,38,228727.2,6019.136842,17169.24161
4,CustomerId_988,ProviderId_4,ProductId_6,financial_services,ChannelId_2,-644.0,644,2,0,2018,11,15,3,3,38,228727.2,6019.136842,17169.24161


In [16]:
# Count missing values per column of the cleaned frame; the Series is the
# cell's last expression so the notebook renders it below the printed header.
print("Missing values in cleaned data:")
missing_per_column = data_clean.isna().sum()
missing_per_column

Missing values in cleaned data:


CustomerId                 0
ProviderId                 0
ProductId                  0
ProductCategory            0
ChannelId                  0
Amount                     0
Value                      0
PricingStrategy            0
FraudResult                0
transaction_year           0
transaction_month          0
transaction_day            0
transaction_hour           0
transaction_dayofweek      0
transaction_count          0
total_amount               0
avg_amount                 0
std_amount               712
dtype: int64

In [17]:
# Fill missing values for std_amount with 0.
# std_amount is the only column with nulls in the report above (presumably
# customers whose standard deviation is undefined -- confirm in
# build_aggregate_features).
# `assign` returns a new frame instead of writing a column into the object
# returned by clean_data, so this cannot trigger SettingWithCopyWarning or
# leak the change into a frame that shares memory with `data_clean`.
data_clean = data_clean.assign(std_amount=data_clean['std_amount'].fillna(0))

In [8]:
# Split the cleaned frame into the label column and everything else.
# FraudResult is used as the target here for pipeline testing purposes.
target_column = 'FraudResult'
y = data_clean[target_column]
X = data_clean.drop(columns=[target_column])

In [9]:
# Step 4: Build preprocessing pipeline
# build_preprocessor comes from the project's data_processing module. The repr
# shown in the next cell's output lists two transformers ('num' and 'cat') with
# remainder='drop', i.e. columns not routed to either branch are discarded.
preprocessor = build_preprocessor()

In [10]:
# Preview the preprocessor: a bare last expression renders the estimator's
# rich repr (transformers and their parameters) in the notebook.
preprocessor

0,1,2
,transformers,"[('num', ...), ('cat', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,missing_values,
,strategy,'median'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,func,<function saf...0019E9B6AB560>
,inverse_func,
,validate,False
,accept_sparse,False
,check_inverse,True
,feature_names_out,
,kw_args,
,inv_kw_args,

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,missing_values,
,strategy,'most_frequent'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,categories,'auto'
,drop,
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'


## Build Full Pipeline (Preprocessing + Model)

In [11]:
# Chain the feature preprocessor with a random-forest classifier.
# random_state is fixed so repeated fits give the same forest.
classifier = RandomForestClassifier(random_state=42)
pipeline = Pipeline(
    steps=[('preprocessor', preprocessor), ('classifier', classifier)]
)

Fit the pipeline

In [12]:
# Fit preprocessing and classifier together on every row of X / y.
# No hold-out split is made in this notebook, so this fit only shows that the
# pipeline runs end to end; it does not yield an evaluation metric.
pipeline.fit(X, y)

print("Pipeline trained successfully.")


Pipeline trained successfully.


Save the pipeline

In [13]:
# Persist the fitted pipeline (preprocessing + classifier) as a single artifact.
model_path = '../models/full_pipeline.pkl'

# joblib.dump does not create missing parent directories; on a fresh checkout
# '../models' may not exist yet, so make sure it does before writing.
os.makedirs(os.path.dirname(model_path), exist_ok=True)
joblib.dump(pipeline, model_path)

print(f"Pipeline saved successfully at '{model_path}'")


Pipeline saved successfully at '../models/full_pipeline.pkl'


Test loading the pipeline

In [14]:
# Round-trip check: read the artifact at '../models/full_pipeline.pkl' back in.
# `pipeline` is rebound, so every later cell uses the deserialized object
# rather than the one fitted in memory.
# NOTE: joblib.load unpickles arbitrary objects -- only load files you trust.
pipeline = joblib.load('../models/full_pipeline.pkl')

print("Pipeline loaded successfully.")

Pipeline loaded successfully.


# Explore Processed Data Shape

In [15]:
# Step 5: Transform the data with the already-fitted preprocessor.
# The preprocessor was fitted as part of pipeline.fit(X, y); fitting it a second
# time is redundant work and would inspect a re-fit object instead of the one
# that ships inside the pipeline. Take the fitted step from `pipeline` and only
# call transform, so the matrix explored below is exactly what the classifier
# receives. `preprocessor` is rebound to that fitted step so later cells that
# read its fitted attributes keep working.
preprocessor = pipeline.named_steps['preprocessor']
X_transformed = preprocessor.transform(X)

print(f"Transformed shape: {X_transformed.shape}")

Transformed shape: (95662, 34)


In [None]:

# Optional: wrap the transformed matrix in a DataFrame with readable column names.

# Input columns handled by each branch of the preprocessor.
# NOTE(review): these lists are assumed to mirror the ones used inside
# build_preprocessor, in the same order -- confirm when that function changes.
cat_features = ['ProductCategory', 'ChannelId', 'ProviderId', 'PricingStrategy']
num_features = [
    'Amount',
    'Value',
    'transaction_count',
    'total_amount',
    'avg_amount',
    'std_amount',
    'transaction_year',
    'transaction_month',
    'transaction_day',
    'transaction_hour',
    'transaction_dayofweek',
]

# Numeric names pass through unchanged; categorical names are expanded by the
# fitted one-hot encoder into one name per category.
ohe = preprocessor.named_transformers_['cat']['onehot']
cat_feature_names = ohe.get_feature_names_out(cat_features)
all_feature_names = [*num_features, *cat_feature_names]

# The transformer may return a sparse matrix; densify it before building the frame.
if hasattr(X_transformed, 'toarray'):
    dense_values = X_transformed.toarray()
else:
    dense_values = X_transformed

X_df = pd.DataFrame(dense_values, columns=all_feature_names)


In [15]:
# Preview the transformed feature matrix with named columns.
X_df.head()

Unnamed: 0,Amount,Value,transaction_count,total_amount,avg_amount,std_amount,transaction_year,transaction_month,transaction_day,transaction_hour,...,ProviderId_ProviderId_1,ProviderId_ProviderId_2,ProviderId_ProviderId_3,ProviderId_ProviderId_4,ProviderId_ProviderId_5,ProviderId_ProviderId_6,PricingStrategy_0,PricingStrategy_1,PricingStrategy_2,PricingStrategy_4
0,0.543642,-0.03126,0.115536,-0.015562,-0.320541,-0.527902,-0.994246,0.913619,0.209366,-3.32291,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
1,-1.18604,-1.832223,0.115536,-0.015562,-0.320541,-0.527902,-0.994246,0.913619,0.209366,-3.32291,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
2,0.370355,-0.353843,-2.147,-1.484777,-0.628879,-7.07006,-0.994246,0.913619,0.209366,-3.32291,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
3,1.293418,1.40465,-0.573815,0.213557,0.62196,0.883382,-0.994246,0.913619,0.209366,-2.650019,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
4,-1.18604,-0.236095,-0.573815,0.213557,0.62196,0.883382,-0.994246,0.913619,0.209366,-2.650019,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
