Author:
        
        PARK, JunHo, junho@ccnets.org

        
        KIM, JeongYoong, jeongyoong@ccnets.org
        
    COPYRIGHT (c) 2024. CCNets. All Rights reserved.

In [1]:
import sys
path_append = "../"
sys.path.append(path_append)  # Go up one directory from where you are.

import torch
import pandas as pd
from sklearn.model_selection import train_test_split 
from sklearn.preprocessing import StandardScaler, RobustScaler, MinMaxScaler

In [2]:
dataroot = path_append + "../data/credit_card_fraud_detection/creditcard.csv"
df = pd.read_csv(dataroot)
df

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0.0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0
1,0.0,1.191857,0.266151,0.166480,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.167170,0.125895,-0.008983,0.014724,2.69,0
2,1.0,-1.358354,-1.340163,1.773209,0.379780,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0
3,1.0,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.108300,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.50,0
4,2.0,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,-0.009431,0.798278,-0.137458,0.141267,-0.206010,0.502292,0.219422,0.215153,69.99,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
284802,172786.0,-11.881118,10.071785,-9.834783,-2.066656,-5.364473,-2.606837,-4.918215,7.305334,1.914428,...,0.213454,0.111864,1.014480,-0.509348,1.436807,0.250034,0.943651,0.823731,0.77,0
284803,172787.0,-0.732789,-0.055080,2.035030,-0.738589,0.868229,1.058415,0.024330,0.294869,0.584800,...,0.214205,0.924384,0.012463,-1.016226,-0.606624,-0.395255,0.068472,-0.053527,24.79,0
284804,172788.0,1.919565,-0.301254,-3.249640,-0.557828,2.630515,3.031260,-0.296827,0.708417,0.432454,...,0.232045,0.578229,-0.037501,0.640134,0.265745,-0.087371,0.004455,-0.026561,67.88,0
284805,172788.0,-0.240440,0.530483,0.702510,0.689799,-0.377961,0.623708,-0.686180,0.679145,0.392087,...,0.265245,0.800049,-0.163298,0.123205,-0.569159,0.546668,0.108821,0.104533,10.00,0


In [3]:
print('No Frauds', round(df['Class'].value_counts()[0] / len(df) *100,2), '%of the dataset')
print('Frauds', round(df['Class'].value_counts()[1] / len(df) *100,2), '%of the dataset')

No Frauds 99.83 %of the dataset
Frauds 0.17 %of the dataset


In [4]:
# https://fraud-detection-handbook.github.io/fraud-detection-handbook/Chapter_7_DeepLearning/FeedForwardNeuralNetworks.html
class Dataset(torch.utils.data.Dataset):
    def __init__(self, x, y):
        self.x = x
        self.y = y
        
    def __len__(self):
        return len(self.x)

    def __getitem__(self, index):
        vals = torch.tensor(self.x[index], dtype = torch.float32)
        label = torch.tensor(self.y[index], dtype = torch.float32).unsqueeze(-1)
        return vals, label

y = df[['Class']]
X = df.drop(['Class'],axis=1)

sc = RobustScaler()
X['scaled_amount'] = sc.fit_transform(X['Amount'].values.reshape(-1,1))
X['scaled_time'] = sc.fit_transform(X['Time'].values.reshape(-1,1))
X.drop(['Time','Amount'], axis=1, inplace=True)
X = X[:]

In [5]:
# number of features
n_features = X.shape[1]
# number of label classes
n_classes = y.shape[1]

In [6]:
from tools.setting.ml_params import MLParameters
from tools.setting.data_config import DataConfig
from nn.utils.init import set_random_seed
set_random_seed(0)

from trainer_hub import TrainerHub


In [7]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.5, shuffle= True)

X_train = X_train.iloc[:, :].values 
X_test = X_test.iloc[:, :].values 
y_train = y_train.iloc[:, -1].values
y_test = y_test.iloc[:, -1].values

trainset = Dataset(X_train, y_train)
testset = Dataset(X_test, y_test)

In [8]:
data_config = DataConfig(dataset_name = 'CreditCardFraudDetection', task_type='binary_classification', obs_shape=[n_features], label_size=n_classes)

#  Set training configuration from the AlgorithmConfig class, returning them as a Namespace object.
ml_params = MLParameters()

In [9]:
ml_params.core_model_name = 'gpt' 
ml_params.encoder_model_name = 'none'
ml_params.training.max_epoch = 1
ml_params.learning_rate = 1e-4
ml_params.decay_rate_100k = 0.001
# Set the device to GPU if available, else CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu") 

# Initialize the TrainerHub class with the training configuration, data configuration, device, and use_print and use_wandb flags
trainer_hub = TrainerHub(ml_params, data_config, device, use_print=True, use_wandb=False, use_full_eval=False) 

In [10]:
trainer_hub.train(trainset, testset)

Epochs:   0%|          | 0/1 [00:00<?, ?it/s]

Iterations:   0%|          | 0/2225 [00:00<?, ?it/s]

[0/1][50/2225][Time 4.16]
Unified LR across all optimizers: 9.96483243133418e-05
--------------------Training Metrics--------------------
Trainer:  gpt
Inf: 0.2189	Gen: 0.5123	Rec: 0.5639	E: 0.1673	R: 0.2705	P: 0.8572
--------------------Test Metrics------------------------
accuracy: 1.0000
precision: 1.0000
recall: 1.0000
f1_score: 1.0000

[0/1][100/2225][Time 3.47]
Unified LR across all optimizers: 9.930474487640473e-05
--------------------Training Metrics--------------------
Trainer:  gpt
Inf: 0.0013	Gen: 0.2997	Rec: 0.2996	E: 0.0014	R: 0.0011	P: 0.5980
--------------------Test Metrics------------------------
accuracy: 1.0000
precision: 1.0000
recall: 1.0000
f1_score: 1.0000

[0/1][150/2225][Time 3.68]
Unified LR across all optimizers: 9.896235007383356e-05
--------------------Training Metrics--------------------
Trainer:  gpt
Inf: 0.0015	Gen: 0.2427	Rec: 0.2424	E: 0.0018	R: 0.0012	P: 0.4837
--------------------Test Metrics------------------------
accuracy: 1.0000
precision: 1.0000


### Data Preparation and Synthetic Data Generation

In this section of the notebook, we are performing a series of operations to prepare our training data and utilize a causal cooperative network (CCNet) to generate synthetic data based on the explanations derived from the original training data. Each step is described as follows:

1. **Data Loading**:
   - `training_data, training_labels = trainset[:]`
   This line extracts all the features and labels from `trainset`, which is presumably a pre-loaded dataset formatted for training. Here, slicing `[:]` is used to retrieve all data and labels without any modifications.

2. **Device Assignment**:
   - `training_data = training_data.to(device)`
   - `training_labels = training_labels.to(device)`
   These lines transfer the training data and labels to a designated computing device (`device`). This device could be a CPU or a GPU and is typically specified to optimize computational efficiency. Moving data to the device ensures that all subsequent operations that require computation can leverage hardware acceleration.

3. **Data Explanation**:
   - `explanation = trainer_hub.core_ccnet.explain(training_data)`
   Here, the `explain` method of the `core_ccnet` module within `trainer_hub` is called with the training data. This function is expected to analyze the data and provide an "explanation" for each instance, which could be feature importances or another form of interpretable output that explains why certain predictions might be made from the data.

4. **Synthetic Data Generation**:
   - `recreated_data, recreated_label = trainer_hub.core_ccnet.generate(explanation)`
   This line generates synthetic data and labels by feeding the explanations obtained from the original data into the `generate` method of `core_ccnet`. The generate method uses the explanations to create new data instances that mimic or expand upon the patterns found in the original dataset. This is particularly useful for enhancing dataset diversity, balancing classes, or improving model robustness by providing additional training samples.

By the end of this process, `recreated_data` and `recreated_label` contain newly generated data and labels that can be used for further training, testing, or analysis to enhance the model's performance or robustness against various types of data inputs.


In [None]:
training_data, training_labels = trainset[:]
training_data = training_data.to(device)
training_labels = training_labels.to(device)
explanation = trainer_hub.core_ccnet.explain(training_data)
recreated_data, recreated_label = trainer_hub.core_ccnet.generate(explanation)
recreated_label

In [None]:
# Assuming recreated_data is a PyTorch tensor and y is the labels associated with the data
recreated_data.squeeze_(dim=1)
recreated_label.squeeze_()

# Convert recreated_data to a NumPy array
recreated_data_data_np = recreated_data.cpu().detach().numpy()
recreated_label_data_np = recreated_label.cpu().detach().numpy()

# Create the dataset using the converted data and labels
ccnet_balanced_dataset = Dataset(recreated_data_data_np, recreated_label_data_np)


In [None]:
class DNN(torch.nn.Module):
    def __init__(self, input_size, output_size, num_layers=3, hidden_size=128):
        super(DNN, self).__init__()
        self.input_size = input_size
        self.output_size = output_size
        self.hidden_size = hidden_size
        
        # Create a list to hold all layers
        layers = []
        
        # Input layer
        layers.append(torch.nn.Linear(input_size, hidden_size))
        layers.append(torch.nn.ReLU())
        
        # Hidden layers
        for _ in range(num_layers - 2):
            layers.append(torch.nn.Linear(hidden_size, hidden_size))
            layers.append(torch.nn.ReLU())
        
        # Output layer
        layers.append(torch.nn.Linear(hidden_size, output_size))
        layers.append(torch.nn.Sigmoid())
        
        # Register all layers
        self.layers = torch.nn.ModuleList(layers)

    def forward(self, x):
        for layer in self.layers:
            x = layer(x)
        return x


In [None]:
def train_supervised_model(model, trainset):
    optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
    set_random_seed(0)
    trainloader = torch.utils.data.DataLoader(trainset, batch_size=64, shuffle=True)
    for epoch in range(1):
        for i, (data, label) in enumerate(trainloader):
            data = data.to(device)
            label = label.to(device)
            output = model(data)
            loss = torch.nn.functional.binary_cross_entropy(output, label)
            loss.backward()
            optimizer.step()
            optimizer.zero_grad()

In [None]:
model_trained_on_original = DNN(input_size= n_features, output_size=n_classes).to(device)

train_supervised_model(model_trained_on_original, trainset)

In [None]:
model_trained_on_recreated = DNN(input_size= n_features, output_size=n_classes).to(device)

train_supervised_model(model_trained_on_recreated, ccnet_balanced_dataset)

In [None]:
from sklearn.metrics import f1_score
import torch

def get_f1_score(model, testset):
    model.eval()  # Set the model to evaluation mode
    y_true = []
    y_pred = []
    
    with torch.no_grad():  # No need to track gradients for inference
        test_data, test_labels = testset[:]  # Assuming this retrieves all data and labels
        test_data = test_data.to(device)  # Ensure the data is on the correct device
        test_labels = test_labels.to(device)
        outputs = model(test_data)
        
        # Assuming model outputs a single probability per instance, apply threshold
        predicted = (outputs.squeeze() > 0.5).long()  # Convert probabilities to 0 or 1 based on threshold
        
        y_true.extend(test_labels.cpu().numpy())  # Move labels back to CPU and convert to numpy
        y_pred.extend(predicted.cpu().numpy())  # Move predictions back to CPU and convert to numpy

    # Compute F1 score, using 'binary' because this is a binary classification task
    score = f1_score(y_true, y_pred, average='binary')
    return score

# Usage example with two models:
f1_score_original = get_f1_score(model_trained_on_original, testset)
f1_score_recreated = get_f1_score(model_trained_on_recreated, testset)

print("F1 score of the supervised learning model trained on the original data: ", f1_score_original)
print("F1 score of the supervised learning model trained on the recreated data: ", f1_score_recreated)