In [4]:
# Make `src` importable: starting at the working directory and walking up
# through its ancestors, take the first directory that contains a `src` entry
# and put it at the front of sys.path (unless it is already listed).
import sys
from pathlib import Path

_start = Path.cwd()
_match = next(
    (folder for folder in (_start, *_start.parents) if (folder / 'src').exists()),
    None,
)

# `_root` stays None when no ancestor qualifies; the print below then shows None.
_root = None if _match is None else str(_match)
if _root is not None and _root not in sys.path:
    sys.path.insert(0, _root)
print('Using project root:', _root)

Using project root: d:\10  Academy\week4\assigniment\Credit-Risk-Probability-Model


In [10]:
# Imports
import os
import numpy as np
import pandas as pd
from scipy import sparse as sp

from src.data_processing import (
    load_raw,
    transform_features,
    process_raw_data,
    build_preprocessing_pipeline,
    split_features_target,
    train_val_split,
)

# Location of the raw transactions CSV, handed to `load_raw` and `process_raw_data` below.
# NOTE(review): this is a relative path, so it presumably resolves against the kernel's
# working directory rather than the project root found in the first cell — confirm.
RAW_PATH = '../data/raw/data.csv'
# Name of the label column; passed as `target=` to the pipeline functions below.
TARGET = 'FraudResult'

In [13]:
# Load raw data and preview
df_raw = load_raw(RAW_PATH)

# Fail fast with a readable message instead of an opaque error further down:
# the later cells hand `df_raw` and TARGET straight to the preprocessing pipeline.
assert len(df_raw) > 0, f'No rows loaded from {RAW_PATH!r}'
assert TARGET in df_raw.columns, f'Target column {TARGET!r} not found in {RAW_PATH!r}'

print(df_raw.shape)
df_raw.head()

(95662, 16)


Unnamed: 0,TransactionId,BatchId,AccountId,SubscriptionId,CustomerId,CurrencyCode,CountryCode,ProviderId,ProductId,ProductCategory,ChannelId,Amount,Value,TransactionStartTime,PricingStrategy,FraudResult
0,TransactionId_76871,BatchId_36123,AccountId_3957,SubscriptionId_887,CustomerId_4406,UGX,256,ProviderId_6,ProductId_10,airtime,ChannelId_3,1000.0,1000,2018-11-15T02:18:49Z,2,0
1,TransactionId_73770,BatchId_15642,AccountId_4841,SubscriptionId_3829,CustomerId_4406,UGX,256,ProviderId_4,ProductId_6,financial_services,ChannelId_2,-20.0,20,2018-11-15T02:19:08Z,2,0
2,TransactionId_26203,BatchId_53941,AccountId_4229,SubscriptionId_222,CustomerId_4683,UGX,256,ProviderId_6,ProductId_1,airtime,ChannelId_3,500.0,500,2018-11-15T02:44:21Z,2,0
3,TransactionId_380,BatchId_102363,AccountId_648,SubscriptionId_2185,CustomerId_988,UGX,256,ProviderId_1,ProductId_21,utility_bill,ChannelId_3,20000.0,21800,2018-11-15T03:32:55Z,2,0
4,TransactionId_28195,BatchId_38780,AccountId_4841,SubscriptionId_3829,CustomerId_988,UGX,256,ProviderId_4,ProductId_6,financial_services,ChannelId_2,-644.0,644,2018-11-15T03:34:21Z,2,0


In [12]:
# Run the production preprocessing pipeline and inspect outputs.
# `transform_features` is unpacked into the transformed matrix, the target and
# the list of output feature names.
# NOTE(review): the recorded output of this cell lists WoE features for identifier
# columns (TransactionId, BatchId, AccountId, ...) when `woe_columns=None`.
# Target-encoding identifiers may leak the label — confirm this default is intended.
X_t, y, feature_names = transform_features(df_raw, target=TARGET, woe_columns=None)

X_shape = getattr(X_t, 'shape', None)  # tolerate outputs without a .shape attribute
print('Transformed type:', type(X_t))
print('Transformed shape:', X_shape)
print('Target shape:', y.shape)
print('Feature count:', len(feature_names))
print('Sample feature names:', feature_names[:15])

# The three outputs must describe the same matrix: one row per target value and
# one column per feature name. Checking here gives a clear message instead of a
# DataFrame-constructor error in the preview below.
if X_shape is not None:
    assert tuple(X_shape) == (len(y), len(feature_names)), (
        f'Pipeline outputs disagree: X is {tuple(X_shape)}, '
        f'y has {len(y)} rows, {len(feature_names)} feature names'
    )

# For quick inspection, show the first five rows; if the matrix is sparse,
# only that small slice is materialized as a dense array.
preview = X_t[:5]
if sp.issparse(preview):
    preview = preview.toarray()
display(pd.DataFrame(preview, columns=feature_names))

Transformed type: <class 'numpy.ndarray'>
Transformed shape: (95662, 22)
Target shape: (95662,)
Feature count: 22
Sample feature names: ['numeric__CountryCode', 'numeric__Amount', 'numeric__Value', 'numeric__PricingStrategy', 'numeric__transaction_hour', 'numeric__transaction_day', 'numeric__transaction_month', 'numeric__transaction_year', 'numeric__total_amount_sum', 'numeric__average_amount', 'numeric__transaction_count', 'numeric__amount_std', 'numeric__TransactionId_woe', 'numeric__BatchId_woe', 'numeric__AccountId_woe']


Unnamed: 0,numeric__CountryCode,numeric__Amount,numeric__Value,numeric__PricingStrategy,numeric__transaction_hour,numeric__transaction_day,numeric__transaction_month,numeric__transaction_year,numeric__total_amount_sum,numeric__average_amount,...,numeric__TransactionId_woe,numeric__BatchId_woe,numeric__AccountId_woe,numeric__SubscriptionId_woe,numeric__CustomerId_woe,numeric__CurrencyCode_woe,numeric__ProviderId_woe,numeric__ProductId_woe,numeric__ProductCategory_woe,numeric__ChannelId_woe
0,0.0,-0.046371,-0.072291,-0.349252,-2.15553,-0.100739,0.848684,-0.994246,0.170118,-0.067623,...,-0.044962,-0.032938,-0.784616,-0.840168,-0.242825,0.0,-0.672467,0.374772,-0.230697,0.660324
1,0.0,-0.054643,-0.080251,-0.349252,-2.15553,-0.100739,0.848684,-0.994246,0.170118,-0.067623,...,-0.044962,-0.032938,0.951039,0.914338,-0.242825,0.0,-0.450378,-0.214704,0.54909,-0.845347
2,0.0,-0.050426,-0.076352,-0.349252,-2.15553,-0.100739,0.848684,-0.994246,0.165122,-0.072568,...,-0.044962,-0.032938,0.006932,-0.034272,1.058459,0.0,-0.672467,-2.85365,-0.230697,0.660324
3,0.0,0.107717,0.096648,-0.349252,-1.949214,-0.100739,0.848684,-0.994246,0.175567,-0.008155,...,-0.044962,-0.032938,-0.573728,-0.625457,0.120727,0.0,1.96721,0.818861,0.752263,0.660324
4,0.0,-0.059704,-0.075183,-0.349252,-1.949214,-0.100739,0.848684,-0.994246,0.175567,-0.008155,...,-0.044962,-0.032938,0.951039,0.914338,0.120727,0.0,-0.450378,-0.214704,0.54909,-0.845347


In [15]:
# Optional: persist model-ready artifacts under data/processed via the same pipeline.
# The recorded output of this cell shows the result as a dict of file paths keyed
# 'features', 'target' and 'feature_names'.
export_options = dict(
    input_path=RAW_PATH,
    out_dir='../data/processed',
    target=TARGET,
    woe_columns=None,
)
artifacts = process_raw_data(**export_options)
artifacts

{'features': '..\\data\\processed\\features.npy',
 'target': '..\\data\\processed\\target.csv',
 'feature_names': '..\\data\\processed\\feature_names.txt'}

## Notes
- Keep feature logic in `src/data_processing.py` (functions and `Pipeline`).
- Use this notebook to experiment and visually inspect transformations only.
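- When you change the pipeline, restart the kernel and rerun this notebook from top to bottom (Restart & Run All) to verify shapes and feature lists; outputs left over from an out-of-order run can be stale.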
- For unit tests, add or extend cases in `tests/test_data_processing.py`.