# Training

## CatBoost

In [None]:
from fraudetect.dataset import load_data
from fraudetect.preprocessing.preprocessing import  load_workflow
from fraudetect.preprocessing import get_train_val_split
from sklearn.metrics import f1_score

from imblearn.combine import SMOTEENN, SMOTETomek

from catboost import CatBoostClassifier, Pool, EShapCalcType, EFeaturesSelectionAlgorithm
import numpy as np

In [None]:
cols_to_drop = [
                'CurrencyCode',
                'CountryCode',
                'BatchId',
                # 'CUSTOMER_ID',
                'TRANSACTION_ID',
                'TX_DATETIME',
                'TX_TIME_DAYS',
                'SubscriptionId',
                # 'AccountId',
                # 'CustomerUID'
                ]


interaction_cat_cols= [ 
                        'ChannelId',
                        'PricingStrategy',
                        'ProductId',
                        'ProductCategory',
                        'ProviderId'
                    ]

uid_cols=['AccountId','CUSTOMER_ID'] # [None,]

uid_col_name="CustomerUID"

cat_similarity_encode = None # ['ProductCategory',] # None

In [None]:
workflow = load_workflow(
        classifier=None,
        cols_to_drop=cols_to_drop,
        pca_n_components=20,
        detector_list=None,
        n_splits=5,
        cv_gap=5000,
        scoring="f1",
        onehot_threshold=9,
        session_gap_minutes=2448,
        uid_cols=uid_cols,
        uid_col_name=uid_col_name,
        add_fraud_rate_features = True,
        reorder_by=['TX_DATETIME','AccountId'],
        behavioral_drift_cols=[
            'CustomerUID',
        ],
        feature_selector_name = 'selectkbest', # None selectkbest smartcorrelated
        feature_select_estimator=None,
        corr_method="spearman", # spearman
        corr_threshold = 0.81,
        top_k_best=45,
        windows_size_in_days=[1, 3, 7, 30],
        cat_encoding_method= "catboost",
        cat_similarity_encode=cat_similarity_encode,
        nlp_model_name='en_core_web_md',
        add_poly_interactions=True,
        add_cum_features=True,
        n_clusters=5,
        interaction_cat_cols=interaction_cat_cols,
        poly_degree=1,
        poly_cat_encoder_name="count",
        add_fft=False,
        add_seasonal_features=False,
        use_nystrom=False,
        nystroem_components=20,
        use_sincos=True,
        use_spline=False,
        add_imputer=False,
        do_pca=False,
        n_jobs=2,
)
workflow

In [None]:
# Keep only the first 7 steps of `workflow` as the preprocessing-only part.
# NOTE(review): 7 is a magic number tied to the step order produced by load_workflow
# (presumably everything before feature selection) — confirm whenever the workflow options change.
workflow_prep = workflow[:7]
workflow_prep

In [None]:
# Relative path (same "../data/..." convention used by the later load_data calls in this
# notebook) so the cell does not depend on one machine's drive layout.
train_data = load_data("../data/training.csv")

# Split into train / validation frames and targets.
# NOTE(review): val_window_days=30 presumably holds out the final 30 days and id_column is
# used to check account overlap — confirm against get_train_val_split.
X_train, y_train, X_val, y_val = get_train_val_split(train_data=train_data,
                                                     val_window_days=30,
                                                     id_column='AccountId'
                                                     )



In [None]:
X_t = workflow_prep.fit_transform(X_train, y_train)

In [None]:
X_t.dtypes

In [None]:
X_t.info()

In [None]:
# Positional indices (into X_t.columns) of the category / string / object columns.
categorical_columns = X_t.select_dtypes(include=['category','string','object']).columns
is_categorical = X_t.columns.isin(categorical_columns)
cat_features = np.flatnonzero(is_categorical)
cat_features

In [None]:
X_t_val = workflow_prep.transform(X_val)

In [None]:
X_t_val.info()

In [None]:
feature_names = X_t.columns.tolist()

In [None]:
# Feature selection

# Build the CatBoost pools used for feature selection (train) and evaluation (validation).
# NOTE(review): `timestamp` receives the gap in minutes between consecutive rows of
# TX_DATETIME (diff), not an absolute time — confirm this is what CatBoost should use as the
# per-object timestamp.
# NOTE(review): the `cat_features` indices computed in an earlier cell are not passed to
# either Pool — confirm that is intended.
train_pool = Pool(data=X_t,
          label=y_train,
          feature_names=feature_names,
          timestamp=X_train['TX_DATETIME'].diff().dt.total_seconds().fillna(0).astype(float)/60   
    )

val_pool = Pool(X_t_val,
                y_val,
                feature_names=feature_names,
                timestamp=X_val['TX_DATETIME'].diff().dt.total_seconds().fillna(0).astype(float)/60                
    )

model = CatBoostClassifier(depth=2,
                         iterations=1000,
                         eval_metric='F1:use_weights=false',
                         custom_metric=['F1:use_weights=false'],
                         early_stopping_rounds=50,
                         learning_rate=1e-1,
                         loss_function='Logloss',
                         scale_pos_weight=1e3,
                         subsample=0.5,
                         use_best_model=True,
                         rsm=0.2,
                         l2_leaf_reg=1e4
                    )

# Select features
summary = model.select_features(
    train_pool,
    eval_set=val_pool,
    features_for_select=list(range(X_t.shape[1])),
    num_features_to_select=50,
    steps=3,
    algorithm=EFeaturesSelectionAlgorithm.RecursiveByShapValues,
    shap_calc_type=EShapCalcType.Regular,
    train_final_model=True,
    logging_level='Silent',
    plot=False
)

selected_features_indices = summary['selected_features']

## get score
# Validation F1 for every starting tree i, always stopping at the best iteration.
best_iteration = model.get_best_iteration()
scores = [
    f1_score(
        y_true=y_val,
        y_pred=model.predict(val_pool,
                             prediction_type='Class',
                             ntree_start=i,
                             ntree_end=best_iteration),
    )
    for i in range(model.tree_count_-1)
]

# Best F1 over all starting trees.
scores[np.argmax(scores)]


In [None]:
np.argmax(scores)

In [None]:
model.get_best_iteration()

In [None]:
summary.keys()

In [None]:
selected_features_indices

In [None]:
model.eval_metrics(val_pool,
                   metrics=['F1:use_weights=false'],
                   ntree_start=0,
                   ntree_end=model.get_best_iteration()
                   )

In [None]:
model.get_best_iteration()

In [None]:
model.tree_count_

In [None]:
scores = list()
for i in range(model.tree_count_-1):
   y_pred_val = model.predict(val_pool,
                              prediction_type='Class',
                              ntree_start=i,
                              ntree_end=model.get_best_iteration()
                           )

   score = f1_score(y_true=y_val,
            y_pred=y_pred_val
            )
   
   scores.append(score)

scores[np.argmax(scores)]

In [None]:
np.argmax(scores)

In [None]:
model

In [None]:
train_pool = Pool(data=X_t,
          label=y_train.to_numpy(),
          # feature_names=feature_names,
          timestamp=X_train['TX_DATETIME'].diff().dt.total_seconds().fillna(0).astype(float)/60
     )
val_pool = Pool(X_t_val,
                y_val,
               #  feature_names=feature_names,
                timestamp=X_val['TX_DATETIME'].diff().dt.total_seconds().fillna(0).astype(float)/60                
    )
clf = CatBoostClassifier(depth=2,
                         iterations=1000,
                         eval_metric='AUC:use_weights=false',
                         custom_metric=['AUC:use_weights=false'],
                         early_stopping_rounds=50,
                         learning_rate=1e-1,
                         loss_function='Logloss',
                         scale_pos_weight=1e3,
                         has_time=True,
                         subsample=0.5,
                         use_best_model=True,
                         rsm=0.3,
                         l2_leaf_reg=1e4,
                         verbose=0
                         )

clf.fit(train_pool,
        use_best_model=True,
        eval_set=val_pool
        )

# Score the classifier that was just fitted in this cell (`clf`).
# The previous version evaluated `model` (the feature-selection model from the earlier cell),
# so the numbers shown here did not describe `clf` at all.
best_iteration = clf.get_best_iteration()  # loop-invariant, looked up once

scores = list()
for i in range(clf.tree_count_-1):
    # Predict with trees [i, best_iteration) only.
    y_pred_val = clf.predict(val_pool,
                             prediction_type='Class',
                             ntree_start=i,
                             ntree_end=best_iteration
                             )

    score = f1_score(y_true=y_val,
                     y_pred=y_pred_val
                     )

    scores.append(score)

# (best F1, starting tree that achieved it)
scores[np.argmax(scores)],np.argmax(scores)

In [None]:
# f1_score(y_val,clf.predict(X_t_val))

In [None]:
# clf.select_features(X=train_pool,
#                     eval_set=val_pool,
#                     num_features_to_select=50,
                    
#                     )

In [None]:
# clf.get_feature_importance(data=val_pool,
#                             reference_data=train_pool,
#                             type='ShapValues'
#                            ) 

## Train-val splitting

In [None]:
import pandas as pd
import numpy as np
from datetime import timedelta
import datetime

In [None]:
def generate_rolling_group_time_splits(df, date_col, group_col,
                                       val_window_days=30,
                                       n_splits=4,
                                       min_train_days=None):
    """
    Generate rolling, group-aware, time-based splits (no overlap of groups).

    The validation window of ``val_window_days`` days is shifted over the date
    range in ``n_splits`` evenly spaced steps. For each step the training rows are
    those strictly before the window (optionally limited to the last
    ``min_train_days`` days) and the validation rows are those inside the window,
    minus every row whose group also appears in the training rows.

    Parameters
    ----------
    df : pandas.DataFrame
        Input data; it is not modified.
    date_col : str
        Column holding the timestamps (anything ``pd.to_datetime`` accepts).
    group_col : str
        Column holding the group identifier (e.g. an account id).
    val_window_days : int
        Length of each validation window in days.
    n_splits : int
        Number of window positions.
    min_train_days : int or None
        If given, training rows older than this many days before the window
        start are dropped.

    Returns
    -------
    list of (list[int], list[int])
        ``(train_idx, val_idx)`` pairs. The indices are row *positions in the
        frame that was passed in* (usable with ``df.iloc``), listed in
        chronological order. Previously they were positions in an internally
        sorted copy, which only matched the caller's frame when it was already
        sorted by ``date_col``.

    Raises
    ------
    ValueError
        If the frame is empty or its date span is not longer than
        ``val_window_days``.

    Notes
    -----
    The first window starts at the earliest date, so the first split always has
    an empty training set.
    """
    if len(df) == 0:
        raise ValueError("Not enough span for the given val_window_days")

    dates = pd.to_datetime(df[date_col]).reset_index(drop=True)
    groups = df[group_col].reset_index(drop=True)

    # order[k] = original row position of the k-th row in chronological order
    order = np.argsort(dates.values, kind="stable")
    dates = dates.iloc[order].reset_index(drop=True)
    groups = groups.iloc[order].reset_index(drop=True)

    min_date = dates.min()
    max_date = dates.max()
    total_days = (max_date - min_date).days

    max_shift = total_days - val_window_days
    if max_shift <= 0:
        raise ValueError("Not enough span for the given val_window_days")

    shifts = np.linspace(0, max_shift, n_splits)
    splits = []

    for shift in shifts:
        val_start = min_date + timedelta(days=int(shift))
        val_end   = val_start + timedelta(days=val_window_days)

        train_mask = dates < val_start
        if min_train_days is not None:
            train_mask &= dates >= val_start - timedelta(days=min_train_days)
        val_mask = (dates >= val_start) & (dates < val_end)

        # Groups already seen in training are removed from validation.
        overlap = set(groups[train_mask]) & set(groups[val_mask])
        if overlap:
            val_mask &= ~groups.isin(overlap)

        # Map the chronological positions back to positions in the caller's frame.
        train_idx = order[train_mask.to_numpy()].tolist()
        val_idx   = order[val_mask.to_numpy()].tolist()
        splits.append((train_idx, val_idx))

    return splits

In [None]:
# Relative path (same "../data/..." convention used by the later load_data calls in this
# notebook) instead of a machine-specific absolute path.
train_data = load_data("../data/training.csv")

# Features / target split; TX_FRAUD is the label column.
X_train = train_data.drop(columns=['TX_FRAUD'])
y_train = train_data['TX_FRAUD']

In [None]:
X_train.columns

In [None]:
splits = generate_rolling_group_time_splits(X_train, 'TX_DATETIME', 'AccountId',
                                    val_window_days=30,
                                    n_splits=3,
                                    min_train_days=None
                                )


In [None]:
train_idx, _ = splits[1]

_, val_idx = splits[2]

np.intersect1d(train_idx,val_idx)

In [None]:
len(train_idx), len(val_idx)

In [None]:
df_train = train_data.iloc[train_idx,:]
df_val = train_data.iloc[val_idx,:]

In [None]:
df_train['TX_DATETIME'].min(),  df_train['TX_DATETIME'].max()

In [None]:
df_val['TX_DATETIME'].min(),df_val['TX_DATETIME'].max()

In [None]:
df_train.nunique()

In [None]:
df_val.nunique()

In [None]:
np.intersect1d(df_train['AccountId'], df_val['AccountId']).shape

In [None]:
def get_train_delay_test_set(
    transactions_df,
    delta_train=7,
    delta_delay=7,
    delta_test=7,
    sampling_ratio=1.0,
    random_state=0,
):
    """Split transactions into train / delay / test periods counted back from the last transaction.

    The training period starts ``delta_train + delta_delay + delta_test`` days
    before the latest ``TX_DATETIME`` and lasts ``delta_train`` days; the delay
    period is the following ``delta_delay`` days. The test set is collected day
    by day via ``TX_TIME_DAYS``, dropping transactions of accounts that are
    already known to be defrauded.

    Parameters
    ----------
    transactions_df : pandas.DataFrame
        Must contain ``TX_DATETIME``, ``TX_TIME_DAYS``, ``TX_FRAUD``,
        ``AccountId`` and ``TRANSACTION_ID``. It is no longer sorted in place;
        the caller's frame is left untouched.
    delta_train, delta_delay, delta_test : int
        Lengths, in days, of the three periods.
    sampling_ratio : float
        If below 1, fraud and genuine training rows are each subsampled with
        this fraction.
    random_state : int
        Seed for the subsampling.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(train_df, delay_df, test_df)``; train and test are sorted by
        ``TRANSACTION_ID``. With ``delta_test=0`` the test frame is empty
        (previously ``pd.concat`` raised on the empty list).
    """
    # Sorted copy: the original sorted the caller's DataFrame in place.
    transactions_df = transactions_df.sort_values("TX_DATETIME", ascending=True)

    last_date = transactions_df["TX_DATETIME"].iloc[-1]  # last date of the dataset
    start_date_training = last_date - datetime.timedelta(
        days=delta_delay + delta_test + delta_train
    )
    end_date_training = start_date_training + datetime.timedelta(days=delta_train)
    end_date_delay = end_date_training + datetime.timedelta(days=delta_delay)

    # Get the training set data
    train_df = transactions_df[
        (transactions_df.TX_DATETIME >= start_date_training)
        & (transactions_df.TX_DATETIME < end_date_training)
    ]

    # Get the delay set data
    delay_df = transactions_df[
        (transactions_df.TX_DATETIME >= end_date_training)
        & (transactions_df.TX_DATETIME < end_date_delay)
    ]

    # Note: Cards known to be compromised after the delay period are removed from the test set
    # That is, for each test day, all frauds known at (test_day-delay_period) are removed

    # First, get known defrauded customers from the training set
    known_defrauded_customers = set(train_df[train_df.TX_FRAUD == 1].AccountId)

    # Get the relative starting day of training set (easier than TX_DATETIME to collect test data)
    start_tx_time_days_training = train_df.TX_TIME_DAYS.min()

    # Then, for each day of the test set
    test_days = []
    for day in range(delta_test):
        # Get test data for that day
        test_df_day = transactions_df[
            transactions_df.TX_TIME_DAYS
            == start_tx_time_days_training + delta_train + delta_delay + day
        ]

        # Compromised cards from that test day, minus the delay period, are added to the pool of known defrauded customers
        test_df_day_delay_period = transactions_df[
            transactions_df.TX_TIME_DAYS
            == start_tx_time_days_training + delta_train + day - 1
        ]

        new_defrauded_customers = set(
            test_df_day_delay_period[test_df_day_delay_period.TX_FRAUD == 1].AccountId
        )
        known_defrauded_customers = known_defrauded_customers.union(
            new_defrauded_customers
        )

        test_days.append(
            test_df_day[~test_df_day.AccountId.isin(known_defrauded_customers)]
        )

    # An empty list cannot be concatenated; fall back to an empty frame with the same columns.
    test_df = pd.concat(test_days) if test_days else transactions_df.iloc[0:0]

    # If subsample
    if sampling_ratio < 1:
        train_df_frauds = train_df[train_df.TX_FRAUD == 1].sample(
            frac=sampling_ratio, random_state=random_state
        )
        train_df_genuine = train_df[train_df.TX_FRAUD == 0].sample(
            frac=sampling_ratio, random_state=random_state
        )
        train_df = pd.concat([train_df_frauds, train_df_genuine])

    # Sort data sets by ascending order of transaction ID
    train_df = train_df.sort_values("TRANSACTION_ID")
    test_df = test_df.sort_values("TRANSACTION_ID")

    return (train_df, delay_df, test_df)

In [None]:
(train_df, _ ,test_df) = get_train_delay_test_set(train_data,delta_train=40,delta_delay=10,delta_test=20,)

In [None]:
np.intersect1d(train_df['AccountId'], test_df['AccountId']).shape

In [None]:
train_df['AccountId'].nunique()

## Skorch 

In [None]:
cols_to_drop = [
                'CurrencyCode',
                'CountryCode',
                'BatchId',
                # 'CUSTOMER_ID',
                'TRANSACTION_ID',
                'TX_DATETIME',
                'TX_TIME_DAYS',
                'SubscriptionId',
                # 'AccountId',
                # 'CustomerUID'
                ]


interaction_cat_cols= [ 
                        'ChannelId',
                        'PricingStrategy',
                        'ProductId',
                        'ProductCategory',
                        'ProviderId'
                    ]

uid_cols=['AccountId','CUSTOMER_ID'] # [None,]

uid_col_name="CustomerUID"

cat_similarity_encode = None # ['ProductCategory',] # None

In [None]:
workflow = load_workflow(
        classifier=None,
        cols_to_drop=cols_to_drop,
        pca_n_components=20,
        detector_list=None,
        n_splits=5,
        cv_gap=5000,
        scoring="f1",
        onehot_threshold=9,
        session_gap_minutes=2448,
        uid_cols=uid_cols,
        uid_col_name=uid_col_name,
        add_fraud_rate_features = True,
        reorder_by=['TX_DATETIME','AccountId'],
        behavioral_drift_cols=[
            'CustomerUID',
        ],
        feature_selector_name = 'selectkbest', # None selectkbest smartcorrelated
        feature_select_estimator=None,
        corr_method="spearman", # spearman
        corr_threshold = 0.81,
        top_k_best=45,
        windows_size_in_days=[1, 3, 7, 30],
        cat_encoding_method= "woe",
        cat_similarity_encode=cat_similarity_encode,
        nlp_model_name='en_core_web_md',
        add_poly_interactions=True,
        add_cum_features=True,
        n_clusters=0,
        interaction_cat_cols=interaction_cat_cols,
        poly_degree=1,
        poly_cat_encoder_name="count",
        add_fft=False,
        add_seasonal_features=False,
        use_nystrom=False,
        nystroem_components=20,
        use_sincos=True,
        use_spline=False,
        add_imputer=False,
        do_pca=False,
        n_jobs=2,
)
workflow

In [None]:
import torch
from torch.utils.data import Dataset,DataLoader
import numpy as np
import pandas as pd
from tqdm import tqdm

class GroupedTimeSeriesDataset(Dataset):
    """Fixed-length sliding windows of rows, built separately for every group.

    Rows are ordered by ``t``, grouped by ``col_group`` and cut into windows of
    ``seq_len`` consecutive rows. Consecutive windows start
    ``max(1, int(seq_len * (1 - overlap)))`` rows apart. Groups with fewer than
    ``seq_len`` rows contribute no window. Each stored sample is
    ``(features, targets)`` where ``targets`` holds the label of every row in
    the window.

    ``sampler`` and ``mode`` are only stored on the instance; nothing in this
    class reads them.
    """

    def __init__(self, 
                 X,
                 y,
                 t,
                 col_group,
                 sampler=None,
                 mode='train',
                 dimension=1,
                 seq_len=16, 
                 overlap=0.2):
        self.seq_len = seq_len
        self.mode = mode
        self.sampler = sampler
        self.dimension = dimension
        self.samples = []

        # Distance between the starts of two consecutive windows.
        stride = max(1, int(seq_len * (1 - overlap)))

        # One frame holding features, group key, timestamp and target side by side.
        n_features = X.shape[1]
        frame = pd.DataFrame(np.array(X), columns=[f"col_{i}" for i in range(n_features)])
        frame = frame.convert_dtypes()
        frame['group_key'] = np.array(col_group)
        frame['TX_DATETIME'] = np.array(t)
        frame['TX_FRAUD'] = np.array(y)
        self.X_preprocessed = frame

        # Chronological order first, then one sub-frame per group key.
        chronological = frame.sort_values(by=['TX_DATETIME']).drop(columns=['TX_DATETIME'])
        by_group = chronological.groupby('group_key')

        for _, rows in tqdm(by_group,desc="Processing groups"):
            feats = rows.drop(columns=['TX_FRAUD','group_key']).values.astype(np.float32)
            labels = rows['TX_FRAUD'].values.astype(np.int8)

            # Slide the window; the last admissible start is len(rows) - seq_len.
            last_start = len(rows) - seq_len
            start = 0
            while start <= last_start:
                stop = start + seq_len
                self.samples.append((feats[start:stop,:], labels[start:stop]))
                start += stride

    def __len__(self):
        """Number of windows collected over all groups."""
        return len(self.samples)

    def __getitem__(self, idx):
        """Return window ``idx`` as ``(x, y)`` float32 tensors.

        ``x`` is the window transposed to (features, seq_len). With
        ``dimension == 2`` a leading axis is added to ``x`` and ``y`` is
        broadcast to the shape of ``x``.
        """
        feats, labels = self.samples[idx]

        x_seq = torch.tensor(feats, dtype=torch.float32).t()
        y_seq = torch.tensor(labels, dtype=torch.float32)

        if self.dimension != 2:
            return x_seq, y_seq

        x_seq = x_seq.unsqueeze(0)
        return x_seq, torch.ones_like(x_seq)*y_seq


In [None]:
# Relative path (same "../data/..." convention used by the later load_data calls in this
# notebook) instead of a machine-specific absolute path.
train_data = load_data("../data/training.csv")

# X_train = train_data.drop(columns=['TX_FRAUD'])
# y_train = train_data['TX_FRAUD']

In [None]:
X_train, y_train, X_val, y_val = get_train_val_split(train_data=train_data,
                                                            val_window_days=30,
                                                            id_column='AccountId'
                                                        )

In [None]:
X_train.dtypes

In [None]:
# X_ = X_train.copy()
# X_['TX_DATETIME'] = X_['TX_DATETIME'].diff().dt.total_seconds().fillna(0)
# X_.drop(columns=['Value','TRANSACTION_ID','BatchId','TX_TIME_DAYS'],inplace=True)

In [None]:
# # resampler
# x_train_resampled, y_train_resampled = SMOTENC('auto',sampling_strategy=0.4).fit_resample(X_,
#                                                                       y_train.copy()
#                                                                       )

In [None]:
# x_train_resampled['TX_DATETIME'] = pd.to_datetime(x_train_resampled['TX_DATETIME'])

In [None]:
# x_train_resampled['Value'] = x_train_resampled['TX_AMOUNT'].apply(abs) 

In [None]:
# x_train_resampled.nunique()

In [None]:
# x_t_resampled = workflow.fit_transform(x_train_resampled, y_train_resampled)

In [None]:
X_t = workflow.fit_transform(X_train, y_train)

In [None]:
np.isnan(X_t).sum().sum()

In [None]:
# X_t = X_t.convert_dtypes()
# X_t.info()

In [None]:
# Resampling
resampler = SMOTEENN(sampling_strategy=0.15,)
resampler.fit(X_t,y_train)

In [None]:
train_dataset = GroupedTimeSeriesDataset(
                                   X=X_t,
                                   y=y_train,  
                                   t=X_train['TX_DATETIME'],  
                                   col_group=X_train['AccountId'].astype(str) + "_" + X_train['CUSTOMER_ID'].astype(str),
                                   sampler=resampler,
                                   dimension=1,                        
                                   seq_len=16,
                                   overlap=0.2
                                  )

In [None]:
train_dataset[0]

In [None]:
X_t_val = workflow.transform(X=X_val)

In [None]:
# NOTE(review): `clf` is not created in this section. When the notebook is run top to bottom
# it is the CatBoostClassifier fitted in the CatBoost section on pools built from
# `workflow_prep` output, whereas `X_t_val` here comes from the full workflow configured in
# this section (cat_encoding_method="woe"). The feature sets likely differ — confirm this
# evaluation is intended.
y_pred_val = clf.predict(X_t_val)

f1_score(y_pred=y_pred_val, y_true=y_val)

In [None]:
val_dataset = GroupedTimeSeriesDataset(
                                   X=X_t_val,
                                   y=y_val,  
                                   t=X_val['TX_DATETIME'],  
                                   col_group=X_val['AccountId'],    
                                   dimension=2,                    
                                   seq_len=64,
                                   overlap=0.2
                                  )

In [None]:
import torch.nn as nn
import torch
from unet.unet import UNet1D, UNet
import numpy as np
from skorch import NeuralNetClassifier, NeuralNet
from skorch.callbacks import EpochScoring,EarlyStopping
from skorch.helper import predefined_split

class UNetFraudClassifier(nn.Module):
    """Thin ``nn.Module`` wrapper around the project's ``UNet``.

    The wrapped network is configured with batch normalization, max pooling,
    convolutional upsampling, ReLU activations, padding 1 and dropout 0.2.
    """

    def __init__(self, in_channels, out_classes=1,dimensions=2, num_encoding_blocks=3):
        """Build the wrapped UNet.

        Parameters
        ----------
        in_channels : int
            Forwarded to ``UNet(in_channels=...)``.
        out_classes : int
            Forwarded to ``UNet(out_classes=...)``.
        dimensions : int
            Forwarded to ``UNet(dimensions=...)``.
        num_encoding_blocks : int
            Forwarded to ``UNet``; the first layer gets ``2**num_encoding_blocks``
            output channels.
        """
        super().__init__()
        self.unet = UNet(
            in_channels=in_channels,
            out_classes=out_classes,
            dimensions=dimensions,
            # width of the first layer grows with the depth of the encoder
            out_channels_first_layer=2**num_encoding_blocks,
            num_encoding_blocks=num_encoding_blocks,
            dropout=0.2,
            normalization='batch',
            pooling_type='max',
            upsampling_type='conv',
            padding=1,
            activation='ReLU'
        )
        # self.softmax = nn.LogSoftmax(dim=1)

    def forward(self, x):
        """Return the UNet output for ``x`` unchanged (no activation is applied here).

        NOTE(review): presumably raw logits, since the skorch setup below pairs this
        module with BCEWithLogitsLoss — confirm against the UNet implementation.
        """
        x = self.unet(x)
        return x #.squeeze(1) #, self.softmax(x)

In [None]:
unet = UNetFraudClassifier(in_channels=1, out_classes=1,num_encoding_blocks=3)

sum(p.numel() for p in unet.parameters() if p.requires_grad)

In [None]:
unet(torch.rand((1,1,64,64))).shape

In [None]:
x,y=train_dataset[0]

In [None]:
x.shape, y.shape

In [None]:
unet(x.unsqueeze(0)).shape

In [None]:
sk_unet = NeuralNet(
            UNetFraudClassifier,
            module__in_channels=1,
            module__out_classes=1,
            module__num_encoding_blocks=3,
            criterion=nn.BCEWithLogitsLoss(pos_weight=torch.Tensor([50.])),
            optimizer=torch.optim.Adam,
            optimizer__weight_decay=1e-3,
            lr=5e-4,
            max_epochs=100,
            batch_size=16,
            warm_start=True,
            train_split=predefined_split(val_dataset),
            iterator_train__shuffle=True,
            device='cuda' if torch.cuda.is_available() else 'cpu',
            # callbacks=[EarlyStopping(monitor='valid_loss',patience=20,lower_is_better=True,load_best=True),
                    #    EpochScoring(scoring='f1',lower_is_better=False)
                       #]
        )

sk_unet.fit(train_dataset)

In [None]:
sk_unet.predict_proba(val_dataset).shape

### Dimensionality reduction

There are interesting methods to try:
- AlignedUMAP -> https://umap-learn.readthedocs.io/en/latest/aligned_umap_politics_demo.html
- autoencoders (e.g. VAE)


In [None]:
from pyod.models.vae import VAE
import umap
from fraudetect.dataset import load_data
from fraudetect.preprocessing import load_workflow
from fraudetect.config import COLUMNS_TO_DROP
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import minmax_scale
import joblib
import numpy as np

In [None]:
# load data

raw_data_train = load_data("../data/training.csv")

raw_data_pred = load_data("../data/test.csv")

In [None]:
# preprocessor
data_preprocessor = load_workflow(
    classifier=None,
    cols_to_drop=COLUMNS_TO_DROP,
    pca_n_components=80,
    detector_list=None,  # model_list,
    session_gap_minutes=60 * 3,
    uid_cols=[
        None,
    ],
    add_imputer=False,
    reorder_by=['TX_DATETIME'],
    feature_selector_name='None',  # "selectkbest",
    top_k_best=50,
    windows_size_in_days=[1, 7, 30],
    cat_encoding_method='binary',
    imputer_n_neighbors=9,
    n_clusters=0,
    do_pca=False,
    verbose=True,
    n_jobs=1,
    add_fft=False,
    add_seasonal_features=False,
    use_nystrom=False,
    nystroem_components=20,
    nystroem_kernel="poly",
    use_sincos=False,
    use_spline=True,
    spline_degree=3,
    spline_n_knots=6,
)

# Data
y_train = raw_data_train["TX_FRAUD"]
X_train = raw_data_train.drop(columns=['TX_FRAUD'])


In [None]:
y_train

In [None]:
X_train.head()

In [None]:
data_preprocessor

In [None]:
X_t = data_preprocessor.fit_transform(X=X_train,y=y_train)

In [None]:
from sklearn.base import TransformerMixin, BaseEstimator

In [None]:
def get_params(estimator:BaseEstimator|TransformerMixin):
    """Return the parameters of ``estimator``, descending into pipeline steps when possible.

    If any parameter value is not itself an estimator/transformer, the
    estimator's own ``get_params()`` dict is returned. Otherwise, when a
    ``steps`` entry is present, the parameters of the first step that is an
    estimator or transformer are returned (recursively).

    Fixes over the previous version:
    - ``params.key()`` raised ``AttributeError``; membership is now tested on the dict.
    - entries of ``steps`` are ``(name, estimator)`` tuples, so the estimator is
      unpacked before the ``isinstance`` check.
    - the function no longer falls off the end and returns ``None``; the plain
      parameter dict is the fallback (this also covers estimators without any
      parameters, which used to hit the ``params.key()`` crash).

    Parameters
    ----------
    estimator : BaseEstimator | TransformerMixin
        Any object exposing ``get_params()``.

    Returns
    -------
    dict
        Parameter names mapped to their values.
    """
    params = estimator.get_params()
    values = list(params.values())

    if not all(isinstance(v, BaseEstimator) for v in values):
        return params

    if not all(isinstance(v, TransformerMixin) for v in values):
        return params

    for step in params.get('steps', ()):
        # Pipeline steps are (name, estimator) pairs; accept bare estimators too.
        candidate = step[1] if isinstance(step, tuple) else step
        if isinstance(candidate, (BaseEstimator, TransformerMixin)):
            return get_params(candidate)

    return params

In [None]:
data_preprocessor[0].get_params()

In [None]:
data_preprocessor.get_params()

In [None]:
# non - fraudulent
X_normal = X_train.loc[y_train<1.,:].reset_index(drop=True)
X_normal.head()

In [None]:
y_normal = y_train.loc[y_train<1].reset_index(drop=True)
y_normal

In [None]:
X_normal_preprocessed = data_preprocessor.fit_transform(X=X_normal,y=y_normal)


X_normal_preprocessed


In [None]:
X_train_preprocessed = data_preprocessor.transform(X=X_train)
X_train_preprocessed

In [None]:
np.isnan(X_normal_preprocessed).sum()

#### VAE

In [None]:
# VAE model
vae_autoencoder = VAE(contamination=1e-3,verbose=2,epoch_num=30,
                      batch_norm=True,
                      latent_dim=10,
                      optimizer_params={'weight_decay': 1e-04},
                      output_activation_name='relu',
                      random_state=41
                      )
vae_autoencoder

In [None]:
# The original call discarded whatever `load` returned, so the freshly constructed (unfitted)
# detector stayed bound to `vae_autoencoder`.
# NOTE(review): in PyOD's deep-learning base class `load` is expected to return the restored
# detector rather than mutate the instance — confirm for the installed version. The guard
# keeps the current object if `load` returns nothing.
# NOTE(review): this cell runs before the fit/save cells below, so the file must already
# exist from a previous session.
_loaded_vae = vae_autoencoder.load('../models/vae_autoencoder.joblib')
if _loaded_vae is not None:
    vae_autoencoder = _loaded_vae

In [None]:
# fit and save
X_normal_preprocessed_scaled = minmax_scale(X_normal_preprocessed,feature_range=(0,1))

vae_autoencoder.fit(X_normal_preprocessed_scaled)

In [None]:
# save
vae_autoencoder.save('../models/vae_autoencoder.joblib')

#### UMAP

In [None]:
# umap
import pandas as pd
from umap import AlignedUMAP

In [None]:


# 1) Suppose you have a DataFrame `df` with:
#    - 'AccountId', 'TransactionStartTime' (datetime), plus feature columns
feature_cols = ['Amount', 'TimeSinceLastTxn', 'Txn1hCount']  # your engineered features

# 2) Create two time slices
df['ts'] = pd.to_datetime(df['TransactionStartTime'])
slice1 = df[(df.ts >= '2024-01-01') & (df.ts < '2024-04-01')]
slice2 = df[(df.ts >= '2024-04-01') & (df.ts < '2024-07-01')]

# 3) Extract feature matrices and account labels
X1, ids1 = slice1[feature_cols].values, slice1['AccountId'].values
X2, ids2 = slice2[feature_cols].values, slice2['AccountId'].values

# 4) Build the relation between the two slices: {index_in_X1: index_in_X2} for shared accounts.
#    Every row of slice 1 is linked to the first row of the same account in slice 2 (same
#    pairing as before); a lookup table replaces the repeated list(ids2).index(acct) scan.
first_pos_in_slice2 = {}
for j, acct in enumerate(ids2):
    first_pos_in_slice2.setdefault(acct, j)

alignment = {
    i: first_pos_in_slice2[acct]
    for i, acct in enumerate(ids1)
    if acct in first_pos_in_slice2
}

# 5) Run AlignedUMAP
au = AlignedUMAP(
    n_neighbors=15,
    n_components=2,
    alignment_window_size=1
)
# AlignedUMAP expects the keyword `relations`: a list with one dict per consecutive pair of
# slices. The previous `alignment=` keyword is not recognised and fitting fails without
# `relations`.
embeddings = au.fit_transform([X1, X2], relations=[alignment])

# `embeddings` is a list of two (N1×2) and (N2×2) arrays
emb1, emb2 = embeddings

# 6) (Optional) merge back for plotting
out1 = pd.DataFrame(emb1, columns=['x','y'], index=slice1.index)
out2 = pd.DataFrame(emb2, columns=['x','y'], index=slice2.index)
viz1 = slice1.join(out1); viz2 = slice2.join(out2)


# Inference

In [None]:
import joblib
from fraudetect.config import load_args_from_json
from fraudetect.dataset import load_data
from fraudetect.preprocessing import get_train_val_split
from pathlib import Path
# from fraudetect.preprocessing import FraudFeatureEngineer, FeatureEncoding
# from fraudetect.dataset import MyDatamodule, load_data
# from fraudetect.config import Arguments
import pandas as pd
from datetime import datetime, date
from sklearn.model_selection import (TimeSeriesSplit,
                                     TunedThresholdClassifierCV)
from sklearn.calibration import CalibratedClassifierCV
from sklearn.ensemble import (
    StackingClassifier,VotingClassifier
)
from sklearn.frozen import FrozenEstimator
import os
import json
import numpy as np
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from catboost import Pool
from sklearn.metrics import f1_score


In [41]:

# Location of the serialized, already-fitted pipeline to use for inference.
# NOTE(review): absolute, machine-specific Windows path — this cell only runs
# where that file exists; consider a path relative to the repository.
clf_path = r"D:\fraud-detection-galsen\runs-optuna\ensemble-trees-3_2025-04-27_22-14_best-run.joblib"
    
# joblib.load unpickles arbitrary Python objects: only load artifacts from a trusted source.
clf = joblib.load(clf_path)

clf # 5 models for each cross-val split

In [None]:
# joblib.load(r"D:\fraud-detection-galsen\runs-optuna\decisionTree_2025-04-20_03-41.joblib")

In [None]:
# args, cfg = load_args_from_json(
#     r"D:\fraud-detection-galsen\runs-optuna\ensemble-trees-1_2025-04-21_20-00.json"
# )

In [None]:
# clf = run[0][0]
# clf

In [None]:
# args.__dict__

In [42]:
# Training data, loaded through the project loader (the columns displayed below
# include the TX_FRAUD label).
raw_data_train = load_data("../data/training.csv")

# Competition test set to predict on, loaded through the same loader.
raw_data_pred = load_data("../data/test.csv")

In [43]:
raw_data_train.columns

Index(['TRANSACTION_ID', 'BatchId', 'AccountId', 'SubscriptionId',
       'CUSTOMER_ID', 'CurrencyCode', 'CountryCode', 'ProviderId', 'ProductId',
       'ProductCategory', 'ChannelId', 'TX_AMOUNT', 'Value', 'TX_DATETIME',
       'PricingStrategy', 'TX_FRAUD', 'TX_TIME_DAYS'],
      dtype='object')

In [None]:
raw_data_pred.shape

In [44]:
# Split the raw training data into train and validation features/labels.
# NOTE(review): presumably the validation part is the last `val_window_days`
# (30) days and `id_column` drives the "common AccountId" report printed
# below — confirm in get_train_val_split.
X_train, y_train, X_val, y_val = get_train_val_split(train_data=raw_data_train,
                                                            val_window_days=30,
                                                            id_column='AccountId'
                                                        )

Number of common AccountId between train&val:  (7,)


In [None]:
# Every step of the fitted pipeline except the last one acts as the preprocessor.
preprocessor = clf[:-1]

# Transform first, then derive the timestamps (same evaluation order as before).
X_val_transformed = preprocessor.transform(X_val)

# Minutes elapsed since the previous validation row; the first row gets 0.
# NOTE(review): CatBoost's `timestamp` is documented as a per-object timestamp,
# while this passes inter-row gaps — confirm this is intended.
minutes_since_previous_row = (
    X_val['TX_DATETIME'].diff().dt.total_seconds().fillna(0).astype(float) / 60
)

val_pool = Pool(X_val_transformed,
                y_val,
                timestamp=minutes_since_previous_row)

In [None]:
# Final step of the pipeline: the fitted boosted-tree model.
model = clf[-1]
best_iteration = model.get_best_iteration()

# Validation F1 when predicting with the trees in [first_tree, best_iteration),
# for every candidate first tree.
# NOTE(review): `ntree_end` is documented as exclusive, so the tree at
# `best_iteration` itself is left out — confirm this is intended.
scores = [
    f1_score(
        y_true=y_val,
        y_pred=model.predict(val_pool,
                             prediction_type='Class',
                             ntree_start=first_tree,
                             ntree_end=best_iteration),
    )
    for first_tree in range(model.tree_count_ - 1)
]

# Best score and the starting tree that achieves it.
best_start = np.argmax(scores)
fitness = scores[best_start]

fitness, best_start, best_iteration

In [45]:
# Baseline predictions on the test set from the full fitted pipeline.
# The tree-range arguments explored in the previous cell are left disabled
# (commented out), so none of them is passed to predict().
y_pred_origin = clf.predict(raw_data_pred,
                            # prediction_type='Class',
                            # ntree_start=np.argmax(scores),
                            # ntree_end=model.get_best_iteration()
                            )
y_pred_origin

array([0., 0., 0., ..., 0., 0., 0.], shape=(45019,))

In [50]:
# y_pred_origin = clf.predict(raw_data_pred)

y_pred_origin.sum()

np.float64(61.0)

In [47]:
X_train.shape,X_val.shape

((27842, 16), (8065, 16))

In [None]:
# cv=TimeSeriesSplit(n_splits=3,gap=10000)

# # Stacking
# final_estimator=LogisticRegressionCV(Cs=np.logspace(1,4,5),cv=cv,
#                                         scoring='average_precision',
#                                         solver='liblinear',
#                                         )
# clf_stacking =  StackingClassifier([(str(i),pipe) for i,pipe in enumerate(run)],
#                                     final_estimator=final_estimator,
#                                         n_jobs=5,
#                                         cv='prefit')

# clf_stacking.fit(X=X,y=y)
# y_pred_stacked = clf_stacking.predict(raw_data_pred)

In [None]:
# clf_voting =  VotingClassifier([(str(i),FrozenEstimator(pipe)) for i,pipe in enumerate(run)],
#                                voting='soft',
#                                 n_jobs=5,)

# clf_voting

In [None]:
# clf_voting.fit(X=X,y=y)

In [None]:
# y_pred_voting = clf_voting.predict_proba(raw_data_pred)

In [None]:
# y_pred_voting #= y_pred_voting.argmax(axis=1)
# y_pred_voting.sum(), y_pred_voting.sum()/y_pred_voting.shape[0]

In [None]:
# y_pred_stacked.sum(), y_pred_stacked.sum()/y_pred_stacked.shape[0]

In [None]:
# calibrated model
# clf_calibrated = CalibratedClassifierCV(FrozenEstimator(clf),
#                                  method='sigmoid',
#                                  n_jobs=2,
#                                  ensemble=True,
#                                  cv=TimeSeriesSplit(n_splits=3),
#                               )

# clf_calibrated.fit(X_val,y_val)
# y_pred_calibrated = clf_calibrated.predict(raw_data_pred)
# y_pred_calibrated.sum()

In [48]:
# tuning threshold
# Tune the decision threshold of the already-fitted pipeline to maximise F1.
# With cv='prefit' the wrapped `clf` is not refitted: only the threshold is
# searched, on the data passed to fit() (here the validation split), and
# refit=False is required in that mode.
cfl_tuned = TunedThresholdClassifierCV(estimator=clf,
                                       scoring='f1',
                                       cv='prefit',
                                       refit=False
                                       )
cfl_tuned.fit(X_val,y_val)
# Predict the test set with the tuned threshold and count the predicted frauds.
y_pred_tuned = cfl_tuned.predict(raw_data_pred)
y_pred_tuned.sum()

np.float64(59.0)

In [49]:
# Net change in the number of predicted frauds (tuned minus original).
# NOTE(review): flips in opposite directions cancel out in this sum;
# (y_pred_tuned != y_pred_origin).sum() would count the rows that actually differ.
(y_pred_tuned - y_pred_origin).sum()

np.float64(-2.0)

In [51]:
# Read the raw test CSV directly with pandas; the preview below shows its
# original column names (e.g. `TransactionId`).
test_data = pd.read_csv("../data/test.csv")
test_data.head()

Unnamed: 0,TransactionId,BatchId,AccountId,SubscriptionId,CustomerId,CurrencyCode,CountryCode,ProviderId,ProductId,ProductCategory,ChannelId,Amount,Value,TransactionStartTime,PricingStrategy
0,TransactionId_50600,BatchId_35028,AccountId_2441,SubscriptionId_4426,CustomerId_2857,UGX,256,ProviderId_5,ProductId_3,airtime,ChannelId_3,1000.0,1000,2019-02-13T10:01:40Z,4
1,TransactionId_95109,BatchId_45139,AccountId_3439,SubscriptionId_2643,CustomerId_3874,UGX,256,ProviderId_5,ProductId_15,financial_services,ChannelId_3,2000.0,2000,2019-02-13T10:02:12Z,2
2,TransactionId_47357,BatchId_74887,AccountId_4841,SubscriptionId_3829,CustomerId_2857,UGX,256,ProviderId_4,ProductId_6,financial_services,ChannelId_2,-50.0,50,2019-02-13T10:02:30Z,2
3,TransactionId_28185,BatchId_11025,AccountId_2685,SubscriptionId_4626,CustomerId_3105,UGX,256,ProviderId_5,ProductId_10,airtime,ChannelId_3,3000.0,3000,2019-02-13T10:02:38Z,4
4,TransactionId_22140,BatchId_29804,AccountId_4841,SubscriptionId_3829,CustomerId_3105,UGX,256,ProviderId_4,ProductId_6,financial_services,ChannelId_2,-60.0,60,2019-02-13T10:02:58Z,2


In [52]:
# make submission
# Start from the competition's sample submission: the preview shows the
# TransactionId column filled in and an empty FraudResult column.
submission = pd.read_csv("../data/sample_submission.csv")
submission.head()

Unnamed: 0,TransactionId,FraudResult
0,TransactionId_50600,
1,TransactionId_95109,
2,TransactionId_47357,
3,TransactionId_28185,
4,TransactionId_22140,


In [53]:
# Sanity check: count the rows whose TransactionId is identical, position by
# position, in the raw test CSV and in the sample submission. A count equal to
# the number of test rows means both files list the transactions in the same order.
(test_data['TransactionId'] == submission['TransactionId']).sum()

np.int64(45019)

In [54]:
# Label appended to the output file name to identify which predictions were written.
tag = 'tuned'
# The predictions are assigned by position.
# NOTE(review): this assumes `y_pred_tuned` follows the row order of the sample
# submission. The check above compares the raw CSV, not `raw_data_pred`, so
# confirm that load_data and the pipeline do not reorder rows.
submission['FraudResult'] = y_pred_tuned

# Store the labels as integers rather than floats.
submission['FraudResult'] = submission['FraudResult'].astype('int')
submission.head()

Unnamed: 0,TransactionId,FraudResult
0,TransactionId_50600,0
1,TransactionId_95109,0
2,TransactionId_47357,0
3,TransactionId_28185,0
4,TransactionId_22140,0


In [55]:
submission['FraudResult'].sum()

np.int64(59)

In [56]:
# Time-of-day stamp; only needed by the alternative, timestamp-based file name.
current_time = datetime.now().strftime("%H-%M")
# Alternative naming scheme: f"submission_{str(date.today())}_{current_time}.csv"

# Name the submission after the model artifact it came from, plus the variant tag.
submission_basename = f"{Path(clf_path).stem}_{tag}.csv"
filename = os.path.join("../submissions", submission_basename)

filename

'../submissions\\ensemble-trees-3_2025-04-27_22-14_best-run_tuned.csv'

In [57]:
# Make sure the output folder exists before writing: to_csv does not create
# missing directories and would otherwise fail on a fresh checkout.
output_dir = os.path.dirname(filename)
if output_dir:
    os.makedirs(output_dir, exist_ok=True)

# Write the submission without the DataFrame index column.
submission.to_csv(filename,index=False)

# Performance estimation
Public test estimation: recover an estimate of the public test labels from the F1 scores obtained by earlier submissions.

In [None]:
from scipy.optimize import minimize, Bounds
import pandas as pd
import numpy as np

In [None]:
# load data
def _load_fraud_result(submission_name):
    """Read one saved submission and return its `FraudResult` column as a NumPy array.

    Parameters
    ----------
    submission_name : str
        File name of the CSV inside the `../submissions` folder.

    Returns
    -------
    numpy.ndarray
        The values of the `FraudResult` column, in file order.
    """
    # Forward slashes resolve on both Windows and POSIX, like the other
    # relative paths used in this notebook.
    return pd.read_csv(f"../submissions/{submission_name}")['FraudResult'].to_numpy()


y1 = _load_fraud_result('submission_2025-04-18_18-29_EQiuFghN.csv')
y2 = _load_fraud_result('submission_2025-04-16_15-03_ZnyWKEKm.csv')
y3 = _load_fraud_result('submission_2025-04-14_23-29_roKRCvYs.csv')
y4 = _load_fraud_result('submission_2025-04-16_15-06_nDW2jSbL.csv')
y5 = _load_fraud_result('submission_2025-04-18_18-24_mTnW2tLv.csv')

# Maps each submission's id suffix to (F1 score, predictions of that submission).
# NOTE(review): the scores are presumably the public-leaderboard F1 values — confirm.
f1_scores = dict(roKRCvYs=(0,y3),
                 ZnyWKEKm=(0.005135337,y2),
                 nDW2jSbL=(0.26519337,y4),
                 mTnW2tLv=(0.144092219,y5),
                 EQiuFghN=(0.666666666,y1),
                 )

In [None]:
# Differentiable ("soft") F1, kept for reference as an alternative to the
# scikit-learn metric used below:
# from sklearn.metrics import f1_score
# def f1_score(y_truth:np.array, y_pred:np.array):

#     tp = np.dot(y_truth,y_pred)
#     fp = np.dot(1-y_truth,y_pred)
#     fn = np.dot(y_truth,1-y_pred)

#     f1 = tp/(tp + 0.5*(fp+fn) + 1e-8)

#     return f1

def objective(x_0:np.ndarray):
    """Squared mismatch between the F1 scores implied by `x_0` and the recorded ones.

    Parameters
    ----------
    x_0 : np.ndarray
        Candidate label vector; it is rounded to the nearest integer before scoring.

    Returns
    -------
    float
        Sum over all entries of `f1_scores` of
        ``(f1_score(x_0, predictions) - recorded_f1) ** 2``. It is 0 only when
        the candidate reproduces every recorded score.
    """
    # Round so the classification metric receives hard labels.
    # NOTE(review): rounding makes this objective piecewise constant, so a
    # gradient-based optimiser may see a zero gradient and stay at its starting
    # point; the soft F1 above may be a better fit — confirm.
    x_0 = x_0.round()

    # Squared error: a signed sum would be minimised by pushing every F1 to 0
    # instead of matching the recorded scores.
    fitness = sum([(f1_score(x_0,y) - f1) ** 2 for f1,y in f1_scores.values()])

    return fitness


In [None]:
# Initial guess: element-wise average of all submitted prediction vectors.
submitted_predictions = [preds for _, preds in f1_scores.values()]
x0 = sum(submitted_predictions) / len(f1_scores)
x0

In [None]:
# Search for the vector that minimises `objective`, starting from x0, with
# every component bounded to [0, 1].
# NOTE(review): with bounds and no explicit method, SciPy should fall back to
# L-BFGS-B — confirm against the installed SciPy version.
res = minimize(objective,
         x0=x0,
        #  method='CG',
         bounds=Bounds(0,1))

In [None]:
res

In [None]:
res.x

In [None]:
# Score the estimated labels against the first submission. `res.x` is a
# continuous vector in [0, 1], and the classification metric rejects a mix of
# continuous and binary targets, so round it to hard labels as `objective` does.
f1_score(res.x.round(),y1)