In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import numpy as np
from sklearn.metrics import roc_auc_score, accuracy_score, precision_score, recall_score, f1_score, roc_curve, precision_recall_curve, confusion_matrix

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [2]:
df = pd.read_csv('/Users/edoardozappia/Downloads/creditcard.csv')

In [3]:
frauds = df[df.Class == 1]
normal = df[df.Class == 0]

In [4]:
data = df.drop(['Time'], axis=1)
data['Amount'] = StandardScaler().fit_transform(data['Amount'].values.reshape(-1, 1))

In [5]:
X_train, X_test = train_test_split(data, test_size=0.2, random_state=42)
X_train = X_train[X_train.Class == 0]
X_train = X_train.drop(['Class'], axis=1)

print(X_train.shape)

#y_test = X_test['Class']
#X_test = X_test.drop(['Class'], axis=1)

X_train = X_train.values
#X_test = X_test.values

(227451, 29)


In [6]:
class_0 = X_test[X_test.Class == 0]
class_1 = X_test[X_test.Class == 1]

total_size = len(X_test)
desired_size_class_1 = int(total_size * 0.4)
desired_size_class_0 = total_size - desired_size_class_1

class_1_sampled = class_1.sample(n=desired_size_class_1, replace=True, random_state=42)
class_0_sampled = class_0.sample(n=desired_size_class_0, replace=True, random_state=42)

balanced_data = pd.concat([class_0_sampled, class_1_sampled])

X_test = balanced_data.drop('Class', axis=1)
y_test = balanced_data['Class']

X_test = X_test.values
y_test = y_test.values
print(y_test)

[0 0 0 ... 1 1 1]


In [7]:
print(np.count_nonzero(y_test == 0))
print(np.count_nonzero(y_test == 1))

34178
22784


In [8]:
input_dim = X_train.shape[1]
input_shape=(29,)
print(input_dim)
original_dim = input_dim  # Esempio di dimensione dell'input originale
intermediary_dims = [20, 10 ,8]
latent_dim = 2  # Esempio di dimensione dello spazio latente

29


In [9]:
import tensorflow as tf
import tensorflow_probability as tfp
from tensorflow.keras.layers import Layer
from tensorflow_probability import distributions as tfd

tfk = tf.keras
tfkl = tf.keras.layers
tfpl = tfp.layers

# Definizione della classe MultivariateNormalTriLLayer
class MultivariateNormalTriLLayer(tfkl.Layer):
    def __init__(self, latent_dim, **kwargs):
        super(MultivariateNormalTriLLayer, self).__init__(**kwargs)
        self.latent_dim = latent_dim

    def build(self, input_shape):
        self.untransformed_scale_tril = self.add_weight(name='untransformed_scale_tril',
                                                        shape=(self.latent_dim * (self.latent_dim + 1) // 2,),
                                                        initializer='random_normal',
                                                        trainable=True)
        super(MultivariateNormalTriLLayer, self).build(input_shape)

    def call(self, inputs):
        loc = inputs[..., :self.latent_dim]
        scale_tril = tfp.math.fill_triangular(self.untransformed_scale_tril)
        dist = tfd.MultivariateNormalTriL(loc=loc, scale_tril=scale_tril)
        sample = dist.sample()
        return sample  # Return the sample, not the distribution object

# Definizione della funzione dense_layers
def dense_layers(intermediary_dims):
    return tfk.Sequential([
        tfkl.Dense(units, activation='relu') for units in intermediary_dims
    ])

# Definizione del decoder come una classe Layer
class Decoder(tfkl.Layer):
    def __init__(self, original_dim, intermediary_dims, name='decoder', **kwargs):
        super(Decoder, self).__init__(name=name, **kwargs)
        self.original_dim = original_dim
        self.decoder_hidden = tf.keras.Sequential([
            tf.keras.layers.Dense(units, activation='relu') for units in intermediary_dims
        ] + [
            tf.keras.layers.Dense(original_dim, activation='sigmoid')
        ])

    def call(self, inputs):
        return self.decoder_hidden(inputs)

# Definizione del modello VAE
class VAE(tf.keras.Model):
    def __init__(self, original_dim, intermediary_dims, latent_dim, prior, **kwargs):
        super(VAE, self).__init__(**kwargs)
        self.original_dim = original_dim
        self.intermediary_dims = intermediary_dims
        self.latent_dim = latent_dim
        self.prior = prior

        # Encoder
        self.encoder_inputs = tf.keras.Input(shape=(original_dim,), name='encoder_input')
        self.encoder = self.build_encoder()

        # Decoder
        self.decoder = Decoder(original_dim, intermediary_dims)

    def build_encoder(self):
        x = dense_layers(self.intermediary_dims)(self.encoder_inputs)
        params = tfkl.Dense(tfpl.MultivariateNormalTriL.params_size(self.latent_dim), activation=None)(x)
        z = MultivariateNormalTriLLayer(self.latent_dim)(params)
        return tf.keras.Model(self.encoder_inputs, z, name='encoder')

    def kl_divergence(self, distribution_a):
        distribution_b = self.prior
        return tfpl.KLDivergenceAddLoss(
            tfpl.MultivariateNormalTriL(),
            distribution_b,
            weight=1.0
        )(distribution_a)

    def call(self, inputs):
        z_sample = self.encoder(inputs)
        reconstructed = self.decoder(z_sample)
        return reconstructed

# Definizione dei parametri
input_dim = X_train.shape[1]
original_dim = input_dim
intermediary_dims = [20, 10, 8]
latent_dim = 2
prior = tfd.Independent(tfd.Normal(loc=tf.zeros(latent_dim), scale=1),
                        reinterpreted_batch_ndims=1)

# Creazione dell'istanza del modello VAE
vae = VAE(original_dim, intermediary_dims, latent_dim, prior)

# Compilazione del modello specificando la loss
reconstruction_loss = tf.keras.losses.MeanSquaredError()
vae.compile(optimizer=tf.keras.optimizers.Adam(), loss=reconstruction_loss)

# Allenamento del modello
vae.fit(X_train, X_train, epochs=10, batch_size=2000000)


2024-07-05 12:47:58.148118: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


Epoch 1/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 7s/step - loss: 1.2974
Epoch 2/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 471ms/step - loss: 1.2965
Epoch 3/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 368ms/step - loss: 1.2956
Epoch 4/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 553ms/step - loss: 1.2946
Epoch 5/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 293ms/step - loss: 1.2936
Epoch 6/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 284ms/step - loss: 1.2926
Epoch 7/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 263ms/step - loss: 1.2915
Epoch 8/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 342ms/step - loss: 1.2903
Epoch 9/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 343ms/step - loss: 1.2891
Epoch 10/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 338ms/step - loss: 1.2879


<keras.src.callbacks.history.History at 0x145c51c40>

In [10]:
# Supponendo che vae sia il tuo modello VAE addestrato e X_test e y_test siano i tuoi dati di test
#X_test_encoded = vae.encoder.predict(X_test)

In [11]:
# Calcolo della perdita di ricostruzione su X_test
#reconstructed = vae.decoder(vae.encoder(X_test)).numpy()
#reconstruction_loss = tf.reduce_mean(tf.keras.losses.mean_squared_error(X_test, reconstructed), axis=1)

# Definizione di una soglia arbitraria (può essere ottimizzata)
#threshold = 2.5  # Esempio di soglia, da ottimizzare

# Creazione di un array binario per le previsioni basate sulla soglia
#predictions = reconstruction_loss > threshold


In [12]:
# Calcolo della perdita di ricostruzione su X_test
reconstructed = vae.decoder(vae.encoder(X_test)).numpy()
reconstruction_loss = np.mean(np.square(X_test - reconstructed), axis=1)

# Definizione di una soglia arbitraria (può essere ottimizzata)
threshold = 2.5  # Esempio di soglia, da ottimizzare

# Creazione di un array binario per le previsioni basate sulla soglia
predictions = (reconstruction_loss > threshold).astype(np.int32)

# Definisci la funzione per calcolare le metriche in base al threshold
def calculate_metrics(y_test, predictions, reconstruction_loss):
    # Calcolo delle metriche
    auc = roc_auc_score(y_test, reconstruction_loss)
    accuracy = accuracy_score(y_test, predictions)
    precision = precision_score(y_test, predictions)
    recall = recall_score(y_test, predictions)
    f1 = f1_score(y_test, predictions)
    cm = confusion_matrix(y_test, predictions)
    tn, fp, fn, tp = cm.ravel()
    return auc, accuracy, precision, recall, f1, tn, fp, fn, tp

# Calcolo delle metriche per la soglia arbitraria
auc, accuracy, precision, recall, f1, tn, fp, fn, tp = calculate_metrics(y_test, predictions, reconstruction_loss)

print(f'Metriche con soglia arbitraria:')
print(f'AUC: {auc:.4f}')
print(f'Accuracy: {accuracy:.4f}')
print(f'Precision: {precision:.4f}')
print(f'Recall: {recall:.4f}')
print(f'F1-score: {f1:.4f}')
print(f'TN: {tn}, FP: {fp}, FN: {fn}, TP: {tp}')
print()

# Ottimizzazione del threshold utilizzando la curva ROC
fpr, tpr, thresholds_roc = roc_curve(y_test, reconstruction_loss)
roc_auc_scores = [roc_auc_score(y_test, reconstruction_loss) for threshold in thresholds_roc]
optimal_threshold_roc = thresholds_roc[np.argmax(roc_auc_scores)]

# Applica il threshold ottimizzato dalla curva ROC
predictions_roc = (reconstruction_loss > optimal_threshold_roc).astype(np.int32)

# Calcola le metriche utilizzando l'optimal_threshold_roc
auc_roc, accuracy_roc, precision_roc, recall_roc, f1_roc, tn_roc, fp_roc, fn_roc, tp_roc = calculate_metrics(y_test, predictions_roc, reconstruction_loss)

print(f'ROC Curve Optimization:')
print(f'Optimal Threshold (ROC): {optimal_threshold_roc:.4f}')
print(f'AUC (ROC): {auc_roc:.4f}')
print(f'Accuracy (ROC): {accuracy_roc:.4f}')
print(f'Precision (ROC): {precision_roc:.4f}')
print(f'Recall (ROC): {recall_roc:.4f}')
print(f'F1-score (ROC): {f1_roc:.4f}')
print(f'TN (ROC): {tn_roc}, FP (ROC): {fp_roc}, FN (ROC): {fn_roc}, TP (ROC): {tp_roc}')
print()

# Ottimizzazione del threshold utilizzando la curva precision-recall
precision, recall, thresholds_pr = precision_recall_curve(y_test, reconstruction_loss)
f1_scores = 2 * (precision * recall) / (precision + recall)
optimal_threshold_pr = thresholds_pr[np.argmax(f1_scores)]

# Applica il threshold ottimizzato dalla curva precision-recall
predictions_pr = (reconstruction_loss > optimal_threshold_pr).astype(np.int32)

# Calcola le metriche utilizzando l'optimal_threshold_pr
auc_pr, accuracy_pr, precision_pr, recall_pr, f1_pr, tn_pr, fp_pr, fn_pr, tp_pr = calculate_metrics(y_test, predictions_pr, reconstruction_loss)

print(f'Precision-Recall Curve Optimization:')
print(f'Optimal Threshold (PR): {optimal_threshold_pr:.4f}')
print(f'AUC (PR): {auc_pr:.4f}')
print(f'Accuracy (PR): {accuracy_pr:.4f}')
print(f'Precision (PR): {precision_pr:.4f}')
print(f'Recall (PR): {recall_pr:.4f}')
print(f'F1-score (PR): {f1_pr:.4f}')
print(f'TN (PR): {tn_pr}, FP (PR): {fp_pr}, FN (PR): {fn_pr}, TP (PR): {tp_pr}')


Metriche con soglia arbitraria:
AUC: 0.9529
Accuracy: 0.9116
Precision: 0.9204
Recall: 0.8527
F1-score: 0.8853
TN: 32498, FP: 1680, FN: 3355, TP: 19429

ROC Curve Optimization:
Optimal Threshold (ROC): inf
AUC (ROC): 0.9529
Accuracy (ROC): 0.6000
Precision (ROC): 0.0000
Recall (ROC): 0.0000
F1-score (ROC): 0.0000
TN (ROC): 34178, FP (ROC): 0, FN (ROC): 22784, TP (ROC): 0

Precision-Recall Curve Optimization:
Optimal Threshold (PR): 2.3579
AUC (PR): 0.9529
Accuracy (PR): 0.9264
Precision (PR): 0.9171
Recall (PR): 0.8971
F1-score (PR): 0.9070
TN (PR): 32331, FP (PR): 1847, FN (PR): 2345, TP (PR): 20439


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
