In [28]:
# Import packages
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import networkx as nx
import pickle as pkl
import sys

from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import *

import torch.utils.data as data 
import torch
from torch_geometric.data import Data
from torch.nn import Linear
from torch_geometric.nn import GCNConv
from torch.utils.data import Dataset, DataLoader

import os
import torch
import torch.nn.functional as F
from tqdm import tqdm
from torch_geometric.loader import NeighborLoader
from torch.optim.lr_scheduler import ReduceLROnPlateau
from torch_geometric.nn import MessagePassing, SAGEConv
from ogb.nodeproppred import Evaluator #PygNodePropPredDatase

from IPython.display import clear_output


from sklearn import preprocessing

import torch
import torch.nn as nn
import torch.nn.functional as F



In [29]:
# Read files
path = "C:/Users/gsamp/OneDrive/Documents/AI-3/2n Semestre/Projecte de Síntesi 2/GraphAnomaly/dades_guillem/"
df_train_init = pd.read_csv(path + "train_set.csv") 
df_test_init = pd.read_csv(path + "test_set.csv")

In [30]:
path = "C:/Users/gsamp/OneDrive/Documents/AI-3/2n Semestre/Projecte de Síntesi 2/GraphAnomaly/dades_guillem/"
df_classes = pd.read_csv(path + "elliptic_txs_classes.csv") # Nodes' labels
df_edges_init = pd.read_csv(path + "elliptic_txs_edgelist.csv") # Edges
df_features = pd.read_csv(path + "elliptic_txs_features.csv", header=None) # Nodes' features

# Change column names of df_features
colNames1 = {'0': 'txId', 1: "Time step"}
colNames2 = {str(ii+2): "Local_feature_" + str(ii+1) for ii in range(93)}
colNames3 = {str(ii+95): "Aggregate_feature_" + str(ii+1) for ii in range(72)}

colNames = dict(colNames1, **colNames2, **colNames3 )
colNames = {int(jj): item_kk for jj,item_kk in colNames.items()}

df_features = df_features.rename(columns=colNames)

In [31]:
def prep_df(feats: pd.DataFrame, edges: pd.DataFrame):
    #1 és la classe illicit, 2 la  licit
    df_feats = feats.loc[feats['class'].isin([1, 2])]
    df_feats['class'] = df_feats['class'].replace({1: 0, 2: 1})
    df_feats = df_feats.reset_index(drop=True)

    #ens quedem només amb els edges que apareixen en el nodes d'entrenament
    df_edges = edges.loc[((edges['txId1'].isin(df_feats['txId'])) & (df_edges_init['txId2'].isin(df_feats['txId'])))]
    df_edges = df_edges.reset_index(drop=True)
    print(f"contador de valors per classe: \n {df_feats['class'].value_counts()}\n")
    return  df_feats, df_edges

df_train, df_edges_train = prep_df(df_train_init, df_edges_init)
df_test, df_edges_test = prep_df(df_test_init, df_edges_init)

contador de valors per classe: 
 class
1    34654
0     2672
Name: count, dtype: int64

contador de valors per classe: 
 class
1    7365
0    1873
Name: count, dtype: int64



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_feats['class'] = df_feats['class'].replace({1: 0, 2: 1})


In [32]:
def map_idx(feats: pd.DataFrame, edges: pd.DataFrame, save = True, loading_dir = "a"):
    mapping_txid = dict(zip(feats['txId'], list(feats.index)))
    dir = 'dades_guillem/' + str(loading_dir) + '.pkl'
    if save:
        df_edges_mapped = edges.replace({'txId1': mapping_txid, 'txId2': mapping_txid})
        
        df_edges_mapped.to_pickle(loading_dir)
    else:
        df_edges_mapped = pd.read_pickle(loading_dir)
    return df_edges_mapped

df_edges_mapped_train = map_idx(feats = df_train, edges = df_edges_train, save = True, loading_dir='train')
df_edges_mapped_test = map_idx(feats = df_test, edges = df_edges_test, save = True, loading_dir='train')


In [33]:
def get_data(feats: pd.DataFrame, edges:pd.DataFrame):
    x = torch.tensor(feats.drop(columns=['class', 'Time Step', 'txId']).values, dtype=torch.float)
    edge_index = torch.tensor(edges.values, dtype=torch.long).T
    y = torch.tensor(feats['class'].values)
    time = torch.tensor(feats['Time Step'].values)
    data = Data(x=x, edge_index=edge_index, y=y, time=time)
    return data


train_data = get_data(df_train, df_edges_mapped_train)
test_data = get_data(df_test, df_edges_mapped_test)

Prepare data for the AE case

In [34]:
#separate illicit and licit data
def separate_data(feats):
    licit_x = torch.tensor(feats.loc[feats['class'] == 1].drop(columns=['class', 'Time Step']).values, dtype=torch.float)
    licit_y = torch.tensor(feats.loc[feats['class'] == 1]['class'].values)
    licit_data = Data(x=licit_x, y=licit_y)

    illicit_x = torch.tensor(feats.loc[feats['class'] == 0].drop(columns=['class', 'Time Step']).values, dtype=torch.float)
    illicit_y = torch.tensor(feats.loc[feats['class'] == 0]['class'].values)
    illicit_data = Data(x=illicit_x, y=illicit_x)
    return licit_data, illicit_data

train_licit, train_illicit = separate_data(df_train)
test_licit, test_illicit = separate_data(df_test)    

In [35]:
class CustomDataset(Dataset):
    def __init__(self, data):
        self.data = data

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        sample = self.data[idx]
        return torch.tensor(sample, dtype=torch.float32), torch.tensor(sample, dtype=torch.float32)

In [36]:
class Autoencoder(nn.Module):
    def __init__(self, input_dim, encoding_dim=50):
        super(Autoencoder, self).__init__()
        self.encoder = nn.Sequential(
            nn.Linear(input_dim, 100),
            nn.Tanh(),
            nn.Linear(100, 50),
            nn.ReLU()
        )
        self.decoder = nn.Sequential(
            nn.Linear(50, 50),
            nn.Tanh(),
            nn.Linear(50, 100),
            nn.Tanh(),
            nn.Linear(100, input_dim),
            nn.ReLU()
        )
        
    def forward(self, x):
        encoded = self.encoder(x)
        decoded = self.decoder(encoded)
        return decoded

In [37]:
def train_ae(autoencoder, data_loader, criterion, optimizer, val_loader, num_epochs=10, learning_rate=0.001):   
    total_train_loss = [] 
    total_validation_loss = []
    for epoch in range(num_epochs):
        running_loss = 0.0
        for inputs, targets in data_loader:
            optimizer.zero_grad()
            
            # Forward pass
            outputs = autoencoder(inputs)
            
            # Compute the loss
            loss = criterion(outputs, targets)
            
            # Backward pass and optimization
            loss.backward()
            optimizer.step()
            
            running_loss += loss.item() * inputs.size(0)
        
        train_loss = running_loss / len(data_loader.dataset)
        print(f"Epoch [{epoch+1}/{num_epochs}], Training Loss: {train_loss:.4f}")
        total_train_loss.append(train_loss)
        
        #validació dins del mateix train loop
        validation_loss = val_ae(autoencoder=autoencoder, data_loader=val_loader, criterion=criterion)
        print(f"Validation Loss: {validation_loss:.4f}")
        total_validation_loss.append(validation_loss)
    return total_train_loss, total_validation_loss

def val_ae(autoencoder, data_loader, criterion):
    autoencoder.eval()
    val_loss = 0.0
    with torch.no_grad():
        for inputs, targets in data_loader:
            outputs = autoencoder(inputs)
            loss = criterion(outputs, targets)
            val_loss += loss.item() * inputs.size(0)
    epoch_loss = val_loss / len(data_loader.dataset)
    return epoch_loss


In [52]:
batch_size = 256
n= 2000
#samplejar només n elements de les lícites per entrenar el AE
x = torch.tensor(df_train.loc[df_train["class"] == 1].drop(columns=['class', 'txId', 'Time Step']).values)

n = int(x.shape[0] * 0.2)
x_train = x[n:]
x_val = x[:n]
print(x_train.shape)
#amb el scaler la loss passe de 4 a 0.1
train_data = preprocessing.MinMaxScaler().fit_transform(x_train)
val_data = preprocessing.MinMaxScaler().fit_transform(x_val)

# train_data, val_data = train_test_split(x, test_size=0.2, random_state=42)

train_dataset = CustomDataset(train_data)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_dataset = CustomDataset(val_data)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=True)

#hiperparàmetres
lr = 0.001
EPOCHS = 50

input_dim = x_train.shape[1]
model = Autoencoder(input_dim)
criterion = nn.MSELoss()  # Mean Squared Error loss
optimizer = torch.optim.Adadelta(model.parameters(), lr=lr)



training_loss, validation_loss = train_ae(autoencoder=model, data_loader=train_loader, criterion=criterion, optimizer=optimizer, val_loader=val_loader, num_epochs=EPOCHS, learning_rate=lr)

torch.Size([27724, 165])
Epoch [1/50], Training Loss: 0.1505
Validation Loss: 0.1563
Epoch [2/50], Training Loss: 0.1502
Validation Loss: 0.1560
Epoch [3/50], Training Loss: 0.1499
Validation Loss: 0.1557
Epoch [4/50], Training Loss: 0.1496
Validation Loss: 0.1554
Epoch [5/50], Training Loss: 0.1493
Validation Loss: 0.1551
Epoch [6/50], Training Loss: 0.1490
Validation Loss: 0.1548
Epoch [7/50], Training Loss: 0.1487
Validation Loss: 0.1545
Epoch [8/50], Training Loss: 0.1484
Validation Loss: 0.1542
Epoch [9/50], Training Loss: 0.1481
Validation Loss: 0.1538
Epoch [10/50], Training Loss: 0.1478
Validation Loss: 0.1535
Epoch [11/50], Training Loss: 0.1474
Validation Loss: 0.1532
Epoch [12/50], Training Loss: 0.1471
Validation Loss: 0.1529
Epoch [13/50], Training Loss: 0.1468
Validation Loss: 0.1525
Epoch [14/50], Training Loss: 0.1465
Validation Loss: 0.1522
Epoch [15/50], Training Loss: 0.1462
Validation Loss: 0.1519
Epoch [16/50], Training Loss: 0.1458
Validation Loss: 0.1515
Epoch [1

Error de ercunstrucció

In [53]:
model.eval()

# Initialize variables for storing reconstruction error
total_loss = 0
num_batches = 0

licit_loss_history = []

# Iterate through the validation dataset
with torch.no_grad():
    for data in val_loader:
        inputs, _ = data  # Assuming your validation data is a tuple of (input, target)
        outputs = model(inputs)
        loss = criterion(outputs, inputs)  # Compute reconstruction loss
        licit_loss_history.append(float(loss.detach().numpy()))
        total_loss += loss.item()
        num_batches += 1

# Compute average reconstruction error
average_loss = total_loss / num_batches
print("Average Reconstruction Error:", average_loss)
licit_loss_history

Average Reconstruction Error: 0.14087023426379477


[0.14094895124435425,
 0.14107483625411987,
 0.13881078362464905,
 0.14152920246124268,
 0.13856807351112366,
 0.136552631855011,
 0.14153051376342773,
 0.14172132313251495,
 0.13706669211387634,
 0.1386859267950058,
 0.13743913173675537,
 0.1381182223558426,
 0.14345690608024597,
 0.14570249617099762,
 0.13780896365642548,
 0.14732250571250916,
 0.1387769877910614,
 0.14095339179039001,
 0.14010536670684814,
 0.1424579918384552,
 0.14220048487186432,
 0.14212153851985931,
 0.13945499062538147,
 0.14149416983127594,
 0.14258627593517303,
 0.13884416222572327,
 0.13924114406108856,
 0.1497928947210312]