Author:
        
        PARK, JunHo, junho@ccnets.org

        
        KIM, JeongYoong, jeongyoong@ccnets.org
        
    COPYRIGHT (c) 2024. CCNets. All Rights reserved.

In [1]:
import sys
path_append = "../"
sys.path.append(path_append)  # Go up one directory from where you are.

import torch
import pandas as pd
from sklearn.model_selection import train_test_split 
from sklearn.preprocessing import StandardScaler, RobustScaler, MinMaxScaler

In [2]:
dataroot = path_append + "../data/credit_card_fraud_detection/creditcard.csv"
df = pd.read_csv(dataroot)
df

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0.0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0
1,0.0,1.191857,0.266151,0.166480,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.167170,0.125895,-0.008983,0.014724,2.69,0
2,1.0,-1.358354,-1.340163,1.773209,0.379780,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0
3,1.0,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.108300,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.50,0
4,2.0,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,-0.009431,0.798278,-0.137458,0.141267,-0.206010,0.502292,0.219422,0.215153,69.99,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
284802,172786.0,-11.881118,10.071785,-9.834783,-2.066656,-5.364473,-2.606837,-4.918215,7.305334,1.914428,...,0.213454,0.111864,1.014480,-0.509348,1.436807,0.250034,0.943651,0.823731,0.77,0
284803,172787.0,-0.732789,-0.055080,2.035030,-0.738589,0.868229,1.058415,0.024330,0.294869,0.584800,...,0.214205,0.924384,0.012463,-1.016226,-0.606624,-0.395255,0.068472,-0.053527,24.79,0
284804,172788.0,1.919565,-0.301254,-3.249640,-0.557828,2.630515,3.031260,-0.296827,0.708417,0.432454,...,0.232045,0.578229,-0.037501,0.640134,0.265745,-0.087371,0.004455,-0.026561,67.88,0
284805,172788.0,-0.240440,0.530483,0.702510,0.689799,-0.377961,0.623708,-0.686180,0.679145,0.392087,...,0.265245,0.800049,-0.163298,0.123205,-0.569159,0.546668,0.108821,0.104533,10.00,0


In [3]:
print('No Frauds', round(df['Class'].value_counts()[0] / len(df) *100,2), '%of the dataset')
print('Frauds', round(df['Class'].value_counts()[1] / len(df) *100,2), '%of the dataset')

No Frauds 99.83 %of the dataset
Frauds 0.17 %of the dataset


In [4]:
# https://fraud-detection-handbook.github.io/fraud-detection-handbook/Chapter_7_DeepLearning/FeedForwardNeuralNetworks.html
class LabeledDataset(torch.utils.data.Dataset):
    def __init__(self, x, y):
        self.x = x
        self.y = y
        
    def __len__(self):
        return len(self.x)

    def __getitem__(self, index):
        vals = torch.tensor(self.x[index], dtype = torch.float32)
        label = torch.tensor(self.y[index], dtype = torch.float32)
        return vals, label

class UnlabelledDataset(torch.utils.data.Dataset):
    def __init__(self, x):
        self.x = x
        
    def __len__(self):
        return len(self.x)

    def __getitem__(self, index):
        vals = torch.tensor(self.x[index], dtype = torch.float32)
        return vals, None

sc = StandardScaler()
df.iloc[:, :-1] = sc.fit_transform(df.iloc[:, :-1])

In [5]:
# number of features
n_elements = df.shape[1]
# number of label classes
# n_classes = y.shape[1]

In [6]:
from tools.setting.ml_params import MLParameters
from tools.setting.data_config import DataConfig
from nn.utils.init import set_random_seed
set_random_seed(0)

from trainer_hub import TrainerHub


In [7]:
data_config = DataConfig(dataset_name = 'CreditCardFraudDetection', task_type='augmentation', obs_shape=[n_elements], label_size=None)

#  Set training configuration from the AlgorithmConfig class, returning them as a Namespace object.
ml_params = MLParameters()

In [8]:
from tools.setting.ml_params import TabularModelParams

ml_params.core_model_name = 'none' 
ml_params.encoder_model_name = 'tabular'
ml_params.encoding_params = TabularModelParams(dropout=0.05)
ml_params.training.max_epoch = 4
# Set the device to GPU if available, else CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu") 

trainer_hub = TrainerHub(ml_params, data_config, device, use_print=True, use_wandb=False, use_full_eval=False) 

In [9]:
df_train, df_test = train_test_split(df, test_size = 0.5, shuffle= False)
X_train, y_train = df_train.iloc[:, :-1].values, df_train.iloc[:, -1:].values
X_test, y_test = df_test.iloc[:, :-1].values, df_test.iloc[:, -1:].values
_df_train = df_train.iloc[:, :].values 

unlabelled_trainset = UnlabelledDataset(_df_train)
trainset = LabeledDataset(X_test, y_test)
testset = LabeledDataset(X_test, y_test)

In [10]:
trainer_hub.train(unlabelled_trainset)

Epochs:   0%|          | 0/4 [00:00<?, ?it/s]

Iterations:   0%|          | 0/2225 [00:00<?, ?it/s]

[0/4][50/2225][Time 0.92]
Unified LR across all optimizers: 0.0001995308238189185
--------------------Training Metrics--------------------
Trainer:  tabular
Inf: 0.5647	Gen: 0.5245	Rec: 0.5736	E: 0.5156	R: 0.6138	P: 0.5334
[0/4][100/2225][Time 0.78]
Unified LR across all optimizers: 0.00019907191565870155
--------------------Training Metrics--------------------
Trainer:  tabular
Inf: 0.5552	Gen: 0.4696	Rec: 0.4684	E: 0.5564	R: 0.5540	P: 0.3828
[0/4][150/2225][Time 0.82]
Unified LR across all optimizers: 0.00019861406295796434
--------------------Training Metrics--------------------
Trainer:  tabular
Inf: 0.5483	Gen: 0.4573	Rec: 0.3894	E: 0.6162	R: 0.4805	P: 0.2984
[0/4][200/2225][Time 0.80]
Unified LR across all optimizers: 0.00019815726328921765
--------------------Training Metrics--------------------
Trainer:  tabular
Inf: 0.4659	Gen: 0.4350	Rec: 0.3619	E: 0.5390	R: 0.3928	P: 0.3311
[0/4][250/2225][Time 0.88]
Unified LR across all optimizers: 0.00019770151423055492
------------------

Iterations:   0%|          | 0/2225 [00:00<?, ?it/s]

[1/4][25/2225][Time 0.82]
Unified LR across all optimizers: 0.00018030592393534033
--------------------Training Metrics--------------------
Trainer:  tabular
Inf: 0.0970	Gen: 0.1036	Rec: 0.1046	E: 0.0959	R: 0.0980	P: 0.1112
[1/4][75/2225][Time 0.82]
Unified LR across all optimizers: 0.0001798912318178735
--------------------Training Metrics--------------------
Trainer:  tabular
Inf: 0.0991	Gen: 0.1050	Rec: 0.1060	E: 0.0981	R: 0.1000	P: 0.1119
[1/4][125/2225][Time 0.78]
Unified LR across all optimizers: 0.00017947749346581006
--------------------Training Metrics--------------------
Trainer:  tabular
Inf: 0.0951	Gen: 0.1026	Rec: 0.1023	E: 0.0954	R: 0.0948	P: 0.1097
[1/4][175/2225][Time 0.80]
Unified LR across all optimizers: 0.0001790647066855505
--------------------Training Metrics--------------------
Trainer:  tabular
Inf: 0.1013	Gen: 0.1081	Rec: 0.1084	E: 0.1010	R: 0.1016	P: 0.1151
[1/4][225/2225][Time 0.79]
Unified LR across all optimizers: 0.00017865286928854052
--------------------

Iterations:   0%|          | 0/2225 [00:00<?, ?it/s]

[2/4][0/2225][Time 0.89]
Unified LR across all optimizers: 0.00016293335327318117
--------------------Training Metrics--------------------
Trainer:  tabular
Inf: 0.0800	Gen: 0.0873	Rec: 0.0854	E: 0.0819	R: 0.0781	P: 0.0926
[2/4][50/2225][Time 0.78]
Unified LR across all optimizers: 0.00016255861695947546
--------------------Training Metrics--------------------
Trainer:  tabular
Inf: 0.0750	Gen: 0.0834	Rec: 0.0796	E: 0.0788	R: 0.0713	P: 0.0880
[2/4][100/2225][Time 0.81]
Unified LR across all optimizers: 0.00016218474251537463
--------------------Training Metrics--------------------
Trainer:  tabular
Inf: 0.0779	Gen: 0.0863	Rec: 0.0837	E: 0.0805	R: 0.0752	P: 0.0922
[2/4][150/2225][Time 0.80]
Unified LR across all optimizers: 0.00016181172795863357
--------------------Training Metrics--------------------
Trainer:  tabular
Inf: 0.0739	Gen: 0.0831	Rec: 0.0788	E: 0.0782	R: 0.0697	P: 0.0880
[2/4][200/2225][Time 0.86]
Unified LR across all optimizers: 0.0001614395713115662
--------------------

Iterations:   0%|          | 0/2225 [00:00<?, ?it/s]

[3/4][25/2225][Time 0.91]
Unified LR across all optimizers: 0.00014689600866445298
--------------------Training Metrics--------------------
Trainer:  tabular
Inf: 0.0695	Gen: 0.0815	Rec: 0.0716	E: 0.0794	R: 0.0596	P: 0.0836
[3/4][75/2225][Time 0.80]
Unified LR across all optimizers: 0.00014655815721980301
--------------------Training Metrics--------------------
Trainer:  tabular
Inf: 0.0639	Gen: 0.0758	Rec: 0.0666	E: 0.0731	R: 0.0547	P: 0.0785
[3/4][125/2225][Time 0.79]
Unified LR across all optimizers: 0.00014622108281191326
--------------------Training Metrics--------------------
Trainer:  tabular
Inf: 0.0684	Gen: 0.0802	Rec: 0.0706	E: 0.0780	R: 0.0589	P: 0.0824
[3/4][175/2225][Time 0.77]
Unified LR across all optimizers: 0.00014588478365364866
--------------------Training Metrics--------------------
Trainer:  tabular
Inf: 0.0706	Gen: 0.0823	Rec: 0.0732	E: 0.0796	R: 0.0615	P: 0.0850
[3/4][225/2225][Time 0.78]
Unified LR across all optimizers: 0.0001455492579619846
-------------------

In [11]:
batch_size = 64  # Lower than the original batch size
# Use DataLoader to handle smaller batches
from torch.nn.utils.rnn import pad_sequence
def collate_fn(batch):
    X, y = zip(*batch)
    # Directly use the tensors from X if they are already tensors, else convert appropriately
    X_padded = pad_sequence([x.clone().detach() if isinstance(x, torch.Tensor) else torch.tensor(x) for x in X], batch_first=True, padding_value=0)
    
    if any(label is None for label in y):
        y_padded = None
    else:
        # Directly use the tensors from y if they are already tensors, else convert appropriately
        y_padded = pad_sequence([label.clone().detach() if isinstance(label, torch.Tensor) else torch.tensor(label) for label in y], batch_first=True, padding_value=-1)
    
    return X_padded, y_padded

In [12]:

train_loader = torch.utils.data.DataLoader(dataset=unlabelled_trainset, batch_size=batch_size, collate_fn = collate_fn, shuffle=False)
# Example: Reduce batch size
recreated_dataset = None
for data, _ in train_loader:
    data = data.to(device)
    batch_recreated_data = trainer_hub.encoder_ccnet.synthesize(data, output_multiplier=2)
    recreated_dataset = torch.cat([recreated_dataset, batch_recreated_data]) if recreated_dataset is not None else batch_recreated_data
recreated_dataset.squeeze_(dim=1)

tensor([[-1.9901e+00, -6.9761e-01,  4.3261e-02,  ..., -7.6110e-02,
          1.8244e-01,  2.3873e-03],
        [-1.9154e+00,  5.8628e-01,  1.8738e-01,  ...,  7.0969e-02,
         -3.2018e-01,  8.0240e-03],
        [-1.7592e+00, -7.6837e-01, -8.2039e-01,  ..., -1.2940e-01,
          1.1841e+00, -3.4368e-04],
        ...,
        [-2.4652e-01, -1.1312e+00,  6.7068e-01,  ..., -4.5710e-01,
         -2.5605e-01,  4.6941e-04],
        [-1.8864e-01, -1.1131e+00,  1.3040e+00,  ..., -6.7787e-02,
         -3.3815e-01, -7.8926e-03],
        [-2.8286e-01, -5.0602e-01,  3.3722e-01,  ..., -1.1421e+00,
         -2.6214e-01,  3.7897e-03]], device='cuda:0')

In [13]:
# seperate the data and labels
recreated_training_data, recreated_labels = recreated_dataset[:, :-1].clone().detach().cpu().numpy(), recreated_dataset[:, -1:].clone().detach().cpu().numpy()
ccnet_recreated_dataset = LabeledDataset(recreated_training_data, recreated_labels)

In [14]:
num_features = recreated_training_data.shape[1]
num_classes = recreated_labels.shape[1]
num_features, num_classes

(30, 1)

In [15]:
class DNN(torch.nn.Module):
    def __init__(self, input_size, output_size, num_layers=4, hidden_size=128):
        super(DNN, self).__init__()
        self.input_size = input_size
        self.output_size = output_size
        self.hidden_size = hidden_size
        
        # Create a list to hold all layers
        layers = []
        
        # Input layer
        layers.append(torch.nn.Linear(input_size, hidden_size))
        layers.append(torch.nn.ReLU())
        
        # Hidden layers
        for _ in range(num_layers - 2):
            layers.append(torch.nn.Linear(hidden_size, hidden_size))
            layers.append(torch.nn.ReLU())
        
        # Output layer
        layers.append(torch.nn.Linear(hidden_size, output_size))
        layers.append(torch.nn.Sigmoid())
        
        # Register all layers
        self.layers = torch.nn.ModuleList(layers)

    def forward(self, x):
        for layer in self.layers:
            x = layer(x)
        return x

In [16]:
def train_supervised_model(model, trainset):
    optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
    set_random_seed(0)
    trainloader = torch.utils.data.DataLoader(trainset, batch_size=batch_size, shuffle=True)
    for epoch in range(2):
        for i, (data, label) in enumerate(trainloader):
            data = data.to(device)
            label = label.to(device)
            output = model(data)
            loss = torch.nn.functional.binary_cross_entropy(output, label)
            loss.backward()
            optimizer.step()
            optimizer.zero_grad()

In [17]:
model_trained_on_original = DNN(input_size= num_features, output_size=num_classes).to(device)

train_supervised_model(model_trained_on_original, trainset)

In [18]:
model_trained_on_recreated = DNN(input_size= num_features, output_size=num_classes).to(device)

train_supervised_model(model_trained_on_recreated, ccnet_recreated_dataset)

In [19]:
from sklearn.metrics import f1_score

def get_f1_score(model, testset, batch_size=batch_size):
    model.eval()  # Set the model to evaluation mode
    y_true = []
    y_pred = []
    data_loader = torch.utils.data.DataLoader(testset, batch_size=batch_size, shuffle=False)

    with torch.no_grad():  # No need to track gradients for inference
        for data, label in data_loader:
            data = data.to(device)
            label = label.to(device)
            output = model(data)
            # Ensure output is squeezed, thresholded, and converted to long for binary classification
            predicted = (output.squeeze() > 0.5).long()
            # Make sure label is also in the correct format (long type)
            y_true.extend(label.squeeze().long().cpu().numpy())
            y_pred.extend(predicted.cpu().numpy())

    # Compute F1 score, using 'binary' because this is a binary classification task
    score = f1_score(y_true, y_pred, average='binary')
    return score

# Usage example with two models:
f1_score_original = get_f1_score(model_trained_on_original, testset)
f1_score_recreated = get_f1_score(model_trained_on_recreated, testset)

print("F1 score of the supervised learning model trained on the original data: ", f1_score_original)
print("F1 score of the supervised learning model trained on the recreated data: ", f1_score_recreated)


F1 score of the supervised learning model trained on the original data:  0.8620689655172413
F1 score of the supervised learning model trained on the recreated data:  0.7634408602150538
