# Training

### Dimensionality reduction

There are interesting methods to try:
- AlignedUMAP -> https://umap-learn.readthedocs.io/en/latest/aligned_umap_politics_demo.html
- autoencoders (e.g. VAE)


In [8]:
from pyod.models.vae import VAE
import umap
from fraudetect.dataset import load_data
from fraudetect.preprocessing import load_workflow
from fraudetect.config import COLUMNS_TO_DROP
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import minmax_scale
import joblib
import numpy as np

KeyboardInterrupt: 

In [None]:
# Load the training and test CSVs with the project loader (`load_data`).
raw_data_train, raw_data_pred = (
    load_data(csv_path)
    for csv_path in ("../data/training.csv", "../data/test.csv")
)

In [None]:
# preprocessor
# All options are collected in one mapping and splatted into `load_workflow`.
# The grouping below is editorial (based on the option names only).
workflow_kwargs = dict(
    # estimator / detectors
    classifier=None,
    detector_list=None,  # model_list,
    # columns and identifiers
    cols_to_drop=COLUMNS_TO_DROP,
    uid_cols=[None],
    # engineered features
    session_gap_minutes=3 * 60,
    windows_size_in_days=[1, 7, 30],
    n_clusters=8,
    add_fft=True,
    add_seasonal_features=True,
    use_sincos=False,
    use_spline=True,
    spline_degree=3,
    spline_n_knots=6,
    # encoding / imputation
    cat_encoding_method="binary",
    add_imputer=False,
    imputer_n_neighbors=9,
    # selection / projection
    feature_selector_name="None",  # "selectkbest",
    top_k_best=50,
    do_pca=False,
    pca_n_components=80,
    use_nystrom=True,
    nystroem_components=50,
    nystroem_kernel="poly",
    # runtime
    verbose=True,
    n_jobs=1,
)
data_preprocessor = load_workflow(**workflow_kwargs)

# Data: separate the label column from the feature columns.
y_train = raw_data_train["TX_FRAUD"]
X_train = raw_data_train.drop(columns="TX_FRAUD")


In [None]:
y_train

0        0
1        0
2        0
3        0
4        0
        ..
95657    0
95658    0
95659    0
95660    0
95661    0
Name: TX_FRAUD, Length: 95662, dtype: UInt8

In [None]:
X_train.head()

Unnamed: 0,TRANSACTION_ID,BatchId,AccountId,SubscriptionId,CUSTOMER_ID,CurrencyCode,CountryCode,ProviderId,ProductId,ProductCategory,ChannelId,TX_AMOUNT,Value,TX_DATETIME,PricingStrategy,TX_TIME_DAYS
0,TransactionId_76871,BatchId_36123,AccountId_3957,SubscriptionId_887,CustomerId_4406,UGX,256,ProviderId_6,ProductId_10,airtime,ChannelId_3,1000.0,1000,2018-11-15 02:18:49+00:00,2,0
1,TransactionId_73770,BatchId_15642,AccountId_4841,SubscriptionId_3829,CustomerId_4406,UGX,256,ProviderId_4,ProductId_6,financial_services,ChannelId_2,-20.0,20,2018-11-15 02:19:08+00:00,2,0
2,TransactionId_26203,BatchId_53941,AccountId_4229,SubscriptionId_222,CustomerId_4683,UGX,256,ProviderId_6,ProductId_1,airtime,ChannelId_3,500.0,500,2018-11-15 02:44:21+00:00,2,0
3,TransactionId_380,BatchId_102363,AccountId_648,SubscriptionId_2185,CustomerId_988,UGX,256,ProviderId_1,ProductId_21,utility_bill,ChannelId_3,20000.0,21800,2018-11-15 03:32:55+00:00,2,0
4,TransactionId_28195,BatchId_38780,AccountId_4841,SubscriptionId_3829,CustomerId_988,UGX,256,ProviderId_4,ProductId_6,financial_services,ChannelId_2,-644.0,644,2018-11-15 03:34:21+00:00,2,0


In [None]:
data_preprocessor

In [None]:
# Non-fraudulent rows only (label < 1), re-indexed from 0.
normal_mask = y_train < 1.
X_normal = X_train.loc[normal_mask, :].reset_index(drop=True)
X_normal.head()

Unnamed: 0,TRANSACTION_ID,BatchId,AccountId,SubscriptionId,CUSTOMER_ID,CurrencyCode,CountryCode,ProviderId,ProductId,ProductCategory,ChannelId,TX_AMOUNT,Value,TX_DATETIME,PricingStrategy,TX_TIME_DAYS
0,TransactionId_76871,BatchId_36123,AccountId_3957,SubscriptionId_887,CustomerId_4406,UGX,256,ProviderId_6,ProductId_10,airtime,ChannelId_3,1000.0,1000,2018-11-15 02:18:49+00:00,2,0
1,TransactionId_73770,BatchId_15642,AccountId_4841,SubscriptionId_3829,CustomerId_4406,UGX,256,ProviderId_4,ProductId_6,financial_services,ChannelId_2,-20.0,20,2018-11-15 02:19:08+00:00,2,0
2,TransactionId_26203,BatchId_53941,AccountId_4229,SubscriptionId_222,CustomerId_4683,UGX,256,ProviderId_6,ProductId_1,airtime,ChannelId_3,500.0,500,2018-11-15 02:44:21+00:00,2,0
3,TransactionId_380,BatchId_102363,AccountId_648,SubscriptionId_2185,CustomerId_988,UGX,256,ProviderId_1,ProductId_21,utility_bill,ChannelId_3,20000.0,21800,2018-11-15 03:32:55+00:00,2,0
4,TransactionId_28195,BatchId_38780,AccountId_4841,SubscriptionId_3829,CustomerId_988,UGX,256,ProviderId_4,ProductId_6,financial_services,ChannelId_2,-644.0,644,2018-11-15 03:34:21+00:00,2,0


In [None]:
# Labels of the non-fraudulent rows, re-indexed from 0 to line up with X_normal.
y_normal = y_train[y_train < 1].reset_index(drop=True)
y_normal

0        0
1        0
2        0
3        0
4        0
        ..
95464    0
95465    0
95466    0
95467    0
95468    0
Name: TX_FRAUD, Length: 95469, dtype: UInt8

In [None]:
# Fit the preprocessing workflow on the non-fraudulent rows only and keep the
# transformed matrix; the bare expression at the end displays it.
X_normal_preprocessed = data_preprocessor.fit_transform(X=X_normal,y=y_normal)


X_normal_preprocessed


array([[ 0.76916888,  0.18002487,  0.03142603, ...,  0.35776573,
         0.05049307,  0.05722582],
       [ 0.41847867,  0.05709926,  0.0475924 , ...,  0.19432161,
         0.10949048,  0.08487911],
       [ 0.93809573,  0.25286235,  0.00542369, ...,  0.33325395,
         0.02135384,  0.03912823],
       ...,
       [ 0.18559425,  0.06968068,  0.12318148, ...,  0.23606422,
         0.12954559,  0.06435663],
       [ 0.23986373,  0.24921077,  0.0509406 , ...,  0.38084406,
         0.03301102, -0.01905907],
       [ 0.15204788,  0.1134163 ,  0.09622628, ...,  0.20249949,
         0.13248942,  0.03683468]], shape=(95469, 50))

In [None]:
# Apply the workflow fitted on the non-fraudulent rows to the full training set
# (no refit: only `transform` is called here).
# NOTE(review): the displayed values for the leading rows differ slightly from
# X_normal_preprocessed, so some features presumably depend on which rows are
# present in the batch being transformed — confirm.
X_train_preprocessed = data_preprocessor.transform(X=X_train)
X_train_preprocessed

array([[ 0.76889268,  0.17963444,  0.03134885, ...,  0.35851931,
         0.05062638,  0.0569887 ],
       [ 0.41842482,  0.05710548,  0.04838061, ...,  0.19480482,
         0.10745772,  0.08567429],
       [ 0.92347213,  0.23308547,  0.02245489, ...,  0.38520772,
        -0.02654172,  0.04769175],
       ...,
       [ 0.18558832,  0.06968151,  0.12323673, ...,  0.23609363,
         0.12942678,  0.06440814],
       [ 0.24076195,  0.25094222,  0.05133487, ...,  0.37812979,
         0.03218484, -0.01785956],
       [ 0.15205988,  0.11341369,  0.09612219, ...,  0.20244566,
         0.13272039,  0.03673383]], shape=(95662, 50))

In [None]:
# Count NaN entries in the preprocessed normal-only matrix (this matrix is what
# gets scaled and fed to the VAE further down).
np.isnan(X_normal_preprocessed).sum()

np.int64(0)

#### VAE

In [None]:
# VAE model (pyod detector); the bare name at the end displays its repr.
vae_autoencoder = VAE(
    # detector settings
    contamination=1e-3,
    random_state=41,
    verbose=2,
    # network
    latent_dim=10,
    batch_norm=True,
    output_activation_name="relu",
    # optimisation
    epoch_num=30,
    optimizer_params={"weight_decay": 1e-4},
)
vae_autoencoder

In [None]:
# pyod's deep-learning detectors expose `load` as a classmethod that RETURNS the
# restored model; it does not update the instance it is called on. The original
# call discarded that return value, leaving `vae_autoencoder` untrained.
# The checkpoint must already exist (it is written by the `save` cell below).
vae_autoencoder = VAE.load('../models/vae_autoencoder.joblib')
vae_autoencoder

In [None]:
# Fit only (the model is saved in the next cell): rescale every feature of the
# normal-only matrix to [0, 1], then train the VAE on it.
# NOTE(review): `minmax_scale` derives min/max from the array it is given and
# keeps no fitted scaler, so the identical scaling cannot be replayed on other
# data (e.g. X_train_preprocessed or the test set) — consider a fitted
# MinMaxScaler if the VAE is meant to score other matrices.
X_normal_preprocessed_scaled = minmax_scale(X_normal_preprocessed,feature_range=(0,1))

vae_autoencoder.fit(X_normal_preprocessed_scaled)

Epoch 1/30, loss=0.9424, time=26.02s
Epoch 2/30, loss=0.7807, time=25.72s
Epoch 3/30, loss=0.7444, time=25.83s
Epoch 4/30, loss=0.7230, time=28.23s
Epoch 5/30, loss=0.7007, time=28.42s
Epoch 6/30, loss=0.6910, time=27.28s
Epoch 7/30, loss=0.6866, time=28.75s
Epoch 8/30, loss=0.6810, time=33.94s
Epoch 9/30, loss=0.6769, time=30.95s
Epoch 10/30, loss=0.6723, time=26.25s
Epoch 11/30, loss=0.6686, time=25.82s
Epoch 12/30, loss=0.6659, time=26.46s
Epoch 13/30, loss=0.6651, time=26.82s
Epoch 14/30, loss=0.6614, time=26.67s
Epoch 15/30, loss=0.6606, time=26.83s
Epoch 16/30, loss=0.6628, time=25.90s
Epoch 17/30, loss=0.6607, time=26.40s
Epoch 18/30, loss=0.6582, time=25.75s
Epoch 19/30, loss=0.6580, time=27.41s
Epoch 20/30, loss=0.6592, time=27.32s
Epoch 21/30, loss=0.6578, time=26.69s
Epoch 22/30, loss=0.6560, time=26.70s
Epoch 23/30, loss=0.6560, time=27.08s
Epoch 24/30, loss=0.6571, time=27.17s
Epoch 25/30, loss=0.6541, time=27.79s
Epoch 26/30, loss=0.6529, time=28.19s
Epoch 27/30, loss=0.6

In [None]:
# Persist the VAE to ../models using the detector's own `save` method.
# NOTE(review): the file has a .joblib extension but is written by pyod's
# `save`, not by joblib — confirm the on-disk format before loading it with
# anything other than the matching `load`.
vae_autoencoder.save('../models/vae_autoencoder.joblib')

#### UMAP

In [None]:
# umap
import pandas as pd
from umap import AlignedUMAP

In [None]:


# AlignedUMAP template — NOT yet wired to this notebook's data.
# 1) Suppose you have a DataFrame `df` with:
#    - 'AccountId', 'TransactionStartTime' (datetime), plus feature columns
#    `df` is not defined anywhere above; bind it before running this cell.
feature_cols = ['Amount', 'TimeSinceLastTxn', 'Txn1hCount']  # your engineered features


def first_occurrence_relation(ids_from, ids_to):
    """Map row positions of ``ids_from`` to row positions of ``ids_to``.

    For every position ``i`` in ``ids_from`` whose id also occurs in
    ``ids_to``, the result contains ``i -> j`` where ``j`` is the position of
    the FIRST occurrence of that id in ``ids_to`` (same pairing rule as the
    original nested lookup, but built in a single pass over each sequence).

    Parameters
    ----------
    ids_from, ids_to : iterable of hashable
        Identifier per row of the first / second slice.

    Returns
    -------
    dict[int, int]
        Row index in the first slice -> row index in the second slice.
    """
    first_pos = {}
    for j, key in enumerate(ids_to):
        first_pos.setdefault(key, j)  # keep only the first occurrence
    return {i: first_pos[key] for i, key in enumerate(ids_from) if key in first_pos}


# 2) Create two time slices
df['ts'] = pd.to_datetime(df['TransactionStartTime'])
slice1 = df[(df.ts >= '2024-01-01') & (df.ts < '2024-04-01')]
slice2 = df[(df.ts >= '2024-04-01') & (df.ts < '2024-07-01')]

# 3) Extract feature matrices and account labels
X1, ids1 = slice1[feature_cols].values, slice1['AccountId'].values
X2, ids2 = slice2[feature_cols].values, slice2['AccountId'].values

# 4) Build the relation between the slices: {index_in_X1: index_in_X2} for
#    rows whose account appears in both slices.
shared_rows = first_occurrence_relation(ids1, ids2)
if not shared_rows:
    raise ValueError(
        "No AccountId is present in both slices; AlignedUMAP needs at least "
        "one shared row to align the embeddings."
    )
# Same (i, j) pairs the original loop produced, kept under the original name.
alignment = list(shared_rows.items())

# 5) Run AlignedUMAP
au = AlignedUMAP(
    n_neighbors=15,
    n_components=2,
    alignment_window_size=1
)
# AlignedUMAP expects `relations`: a list with one dict per consecutive pair of
# slices (len(relations) == number_of_slices - 1). It has no `alignment` kwarg.
embeddings = au.fit_transform([X1, X2], relations=[shared_rows])

# `embeddings` is a list of two (N1×2) and (N2×2) arrays
emb1, emb2 = embeddings

# 6) (Optional) merge back for plotting
out1 = pd.DataFrame(emb1, columns=['x','y'], index=slice1.index)
out2 = pd.DataFrame(emb2, columns=['x','y'], index=slice2.index)
viz1 = slice1.join(out1)
viz2 = slice2.join(out2)


# Inference

**TODO**: calibrate the classifier
- https://scikit-learn.org/stable/modules/calibration.html#calibration

In [1]:
import joblib
from fraudetect.config import load_args_from_json
# from fraudetect.preprocessing import FraudFeatureEngineer, FeatureEncoding
# from fraudetect.dataset import MyDatamodule, load_data
# from fraudetect.config import Arguments
import pandas as pd
from datetime import datetime, date
import os

In [2]:
# Load the persisted best run from the optuna runs directory.
# SECURITY: joblib.load unpickles arbitrary objects — only load files you trust.
# NOTE(review): unpickling relies on the installed `fraudetect` classes matching
# the ones that wrote the file; the AttributeError raised when `run` is
# displayed below suggests a version mismatch — confirm.
run = joblib.load(
    "../runs-optuna/small-models_2025-04-16_best-run.joblib"
)
# Older 4-item layout, kept for reference (a later cell unpacks three items):
# results, transform_pipe, datamodule, selector = run

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
run

AttributeError: 'FraudFeatureEngineer' object has no attribute 'add_fraud_rate_features'

In [None]:
# First element of the loaded run (the unpacking cell further down names the
# elements `results, transform_pipe, col_transformer`).
results=run[0]
results

In [None]:
# Take the refit best estimator from the search results.
# NOTE(review): this cell reads `results.best_estimator_`, whereas a later cell
# iterates `results.items()`; both cannot hold for the same object — confirm
# which layout the saved run actually has.
clf = results.best_estimator_
clf

In [None]:
# Load the run's arguments and config from its JSON file.
# NOTE(review): this path is rooted at ../tools/runs-optuna while the run itself
# was loaded from ../runs-optuna — confirm both locations are intended.
# `cfg` is not used by any cell visible in this notebook.
args, cfg = load_args_from_json(
    "../tools/runs-optuna/small-models_2025-04-16_11-37.json"
)

In [None]:
args

Arguments(data_path='../data/training.csv', study_name='small-models_2025-04-16_11-37', work_dir='runs-optuna', run_name='debug', delta_train=50, delta_delay=7, delta_test=20, random_state=41, windows_size_in_days=[1, 7, 30], sampler_names=None, sampler_cfgs=None, model_names=['decisionTree', 'logisticReg', 'svc'], session_gap_minutes=180, onehot_threshold=9, pyod_detectors=['abod', 'cblof', 'hbos', 'iforest', 'knn', 'loda', 'mcd'], disable_pyod_outliers=False, disable_samplers=True, do_pca=True, do_poly_expansion=False, do_feature_selection=True, cv_n_iter=200, cv_gap=5255, cv_method='optuna', n_splits=3, n_jobs=4, scoring='f1', cat_encoding_method='binary', cat_encoding_base_n=4, cat_encoding_hash_method='md5', cat_encoding_hash_n_components=7, add_imputer=False, concat_features=[None], optuna_n_trials=20, cols_to_drop=['CurrencyCode', 'CountryCode', 'SubscriptionId', 'BatchId', 'CUSTOMER_ID', 'AccountId', 'TRANSACTION_ID', 'TX_DATETIME', 'TX_TIME_DAYS'])

In [None]:
# Load the training and test CSVs with the project loader. `load_data` comes
# from `fraudetect.dataset`; its import is commented out in this section's
# import cell, so it must already be in scope from the Training section.
raw_data_train, raw_data_pred = (
    load_data(csv_path)
    for csv_path in ("../data/training.csv", "../data/test.csv")
)

step: load data
step: load data
There is no column FraudResult in loaded data.


In [None]:
raw_data_train.shape

(95662, 17)

In [None]:
raw_data_pred.shape

(45019, 16)

In [None]:
# Predict on the test frame exactly as returned by `load_data`.
# NOTE(review): no explicit preprocessing is applied here, so `clf` presumably
# bundles its own preprocessing steps — confirm.
y_pred = clf.predict(raw_data_pred)

In [None]:
y_pred

array([0., 0., 0., ..., 0., 0., 0.], shape=(45019,))

In [None]:
# (number of rows predicted as fraud, share of all predictions)
n_flagged = y_pred.sum()
n_flagged, n_flagged / len(y_pred)

(np.float64(298.0), np.float64(0.006619427352895444))

In [None]:
# Unpack the saved run into its three components.
results, transform_pipe, col_transformer = run

# Print the best score of every entry that exposes one. Only a missing
# `best_score_` attribute is skipped; the original bare `except: pass` also hid
# unrelated failures (and KeyboardInterrupt), which made problems invisible.
for name, search in results.items():
    try:
        best_score = search.best_score_
    except AttributeError:
        continue  # entry has no best_score_
    print(f"{name}: {best_score}")

In [None]:
# Read the test CSV directly with pandas. This keeps the file's original column
# names (e.g. `TransactionId`), unlike the frames returned by `load_data`, whose
# displayed columns are renamed (e.g. `TRANSACTION_ID`).
test_data = pd.read_csv("../data/test.csv")
test_data.head()

Unnamed: 0,TransactionId,BatchId,AccountId,SubscriptionId,CustomerId,CurrencyCode,CountryCode,ProviderId,ProductId,ProductCategory,ChannelId,Amount,Value,TransactionStartTime,PricingStrategy
0,TransactionId_50600,BatchId_35028,AccountId_2441,SubscriptionId_4426,CustomerId_2857,UGX,256,ProviderId_5,ProductId_3,airtime,ChannelId_3,1000.0,1000,2019-02-13T10:01:40Z,4
1,TransactionId_95109,BatchId_45139,AccountId_3439,SubscriptionId_2643,CustomerId_3874,UGX,256,ProviderId_5,ProductId_15,financial_services,ChannelId_3,2000.0,2000,2019-02-13T10:02:12Z,2
2,TransactionId_47357,BatchId_74887,AccountId_4841,SubscriptionId_3829,CustomerId_2857,UGX,256,ProviderId_4,ProductId_6,financial_services,ChannelId_2,-50.0,50,2019-02-13T10:02:30Z,2
3,TransactionId_28185,BatchId_11025,AccountId_2685,SubscriptionId_4626,CustomerId_3105,UGX,256,ProviderId_5,ProductId_10,airtime,ChannelId_3,3000.0,3000,2019-02-13T10:02:38Z,4
4,TransactionId_22140,BatchId_29804,AccountId_4841,SubscriptionId_3829,CustomerId_3105,UGX,256,ProviderId_4,ProductId_6,financial_services,ChannelId_2,-60.0,60,2019-02-13T10:02:58Z,2


In [None]:
# make submission
submission = pd.read_csv("../data/sample_submission.csv")
submission.head()

Unnamed: 0,TransactionId,FraudResult
0,TransactionId_50600,
1,TransactionId_95109,
2,TransactionId_47357,
3,TransactionId_28185,
4,TransactionId_22140,


In [None]:
# Sanity check: count the rows where the test file and the sample submission
# carry the same TransactionId (should equal the number of test rows).
ids_match = test_data['TransactionId'] == submission['TransactionId']
ids_match.sum()

np.int64(45019)

In [None]:
# Write the predictions into the FraudResult column, then cast that column to
# integer labels and preview the result.
submission['FraudResult'] = y_pred
submission = submission.astype({'FraudResult': 'int'})
submission.head()

Unnamed: 0,TransactionId,FraudResult
0,TransactionId_50600,0
1,TransactionId_95109,0
2,TransactionId_47357,0
3,TransactionId_28185,0
4,TransactionId_22140,0


In [None]:
submission['FraudResult'].sum()

np.int64(298)

In [None]:
# Build the output name from ONE clock reading so the date and time parts always
# describe the same instant (the original read the clock twice, which could
# straddle midnight). The format is unchanged: submission_YYYY-MM-DD_HH-MM.csv
timestamp = datetime.now().strftime("%Y-%m-%d_%H-%M")
filename = os.path.join("../data", f"submission_{timestamp}.csv")

# Names have minute resolution: a second export within the same minute
# overwrites the first.
submission.to_csv(filename,index=False)