In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.manifold import TSNE
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import accuracy_score
from keras.models import Model, Sequential
from keras.layers import Input, Dense
from keras.optimizers import Adam
import optuna
from optuna.integration import KerasPruningCallback

In [None]:
# Load the raw transactions; later cells rely on a 'Class' label column
cred = pd.read_csv(filepath_or_buffer='creditcard.csv')

## Data Preparation

In [None]:
# Partition the transactions by label: Class 0 -> non-fraud, Class 1 -> fraud
no_fraud = cred.loc[cred['Class'].eq(0)]
fraud = cred.loc[cred['Class'].eq(1)]

### Visualise the two classes

In [None]:
# Create the t-SNE projector used for the 2-D visualisations in this notebook.
# A fixed random_state makes the embedding reproducible across kernel restarts,
# in line with the seeded train/test splits (random_state=42).
tsne = TSNE(n_components=2, random_state=42)

In [None]:
# Plotting sample: 3,000 randomly drawn non-fraud rows followed by every fraud row.
# The draw is seeded so the same rows are selected on every run.
plot_data = pd.concat([no_fraud.sample(3000, random_state=42), fraud])

In [None]:
# Separate the class labels from the feature columns of the plotting sample
y = plot_data['Class']
X = plot_data.drop(columns='Class')

In [None]:
# Project the plotting features to 2-D.
# Reading from plot_data (rather than X) keeps this cell idempotent: re-running
# it does not feed an already-embedded array back into t-SNE.
X = tsne.fit_transform(plot_data.drop('Class', axis=1))

In [None]:
# Scatter the 2-D t-SNE embedding, coloured by class label
fig, ax = plt.subplots(figsize=(7, 5))
scatter = ax.scatter(X[:, 0], X[:, 1], c=y, cmap='coolwarm')
ax.set_title('t-SNE Visualization of Fraudulent and Non-Fraudulent Cases')
ax.set_xlabel('t-SNE Dimension 1')
ax.set_ylabel('t-SNE Dimension 2')

# Legend: one handle per colour produced by the scatter, relabelled by class
legend_labels = ['Non-Fraud', 'Fraud']
class_handles = scatter.legend_elements()[0]
ax.legend(handles=class_handles, labels=legend_labels)
fig.savefig('b4.jpg')
plt.show()

### About two thirds of the non-fraud cases are reserved for train/test; the remaining third, plus all fraud cases, form the validation set

In [None]:
# Hold out 33% of the non-fraud rows for validation; the rest feed train/test
train_test_no_fraud, val_no_fraud = train_test_split(
    no_fraud, random_state=42, test_size=0.33
)

# Validation set = held-out non-fraud rows followed by every fraud row,
# re-indexed from 0
val = pd.concat([val_no_fraud, fraud], ignore_index=True)
y_val = val['Class']
X_val = val.drop(columns='Class')

# The train/test pool is built from non-fraud rows only
y_train_test = train_test_no_fraud['Class']
X_train_test = train_test_no_fraud.drop(columns='Class')

# 90/10 split of that pool into train and test
X_train, X_test, y_train, y_test = train_test_split(
    X_train_test, y_train_test, random_state=42, test_size=0.10
)

### Normalisation to [0,1]

In [None]:
# Min-max scaler targeting the [0, 1] range for every feature
scaler = MinMaxScaler(feature_range=(0, 1))

In [None]:
# Learn the per-feature min/max from the training rows only, then scale them
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)

# Re-use the training statistics for the test and validation features
X_test_scaled, X_val_scaled = (scaler.transform(part) for part in (X_test, X_val))

In [None]:
# Build a small class-balanced validation sample for the Streamlit visualisation.
# Re-attach the labels to the scaled features: y_val comes from the re-indexed
# validation frame, so its index lines up with the default index of the new frame.
val = pd.DataFrame(X_val_scaled).merge(y_val, right_index=True, left_index=True)

# 50 rows per class; seeded so the exported rows are the same on every run
val_st = pd.concat([
    val[val['Class'] == 0].sample(50, random_state=42),
    val[val['Class'] == 1].sample(50, random_state=42),
])

# The export holds features only, so drop the label column (no in-place mutation)
val_st = val_st.drop('Class', axis=1)

In [None]:
# Write the sampled validation features to disk without the index column
val_st.to_csv(path_or_buf='val_st.csv', index=False)

## Autoencoder

In [None]:
def objective(trial):
    """Train one candidate autoencoder for an Optuna trial and return its loss.

    Args:
        trial: Optuna trial used to sample the learning rate and the widths of
            the two hidden-layer sizes.

    Returns:
        Final-epoch reconstruction MSE on the 20% of ``X_train_scaled`` that
        Keras holds out via ``validation_split`` (lower is better).
    """
    # Hyperparameters to optimise; suggest_float(..., log=True) replaces the
    # deprecated suggest_loguniform.
    learning_rate = trial.suggest_float('learning_rate', 1e-5, 1e-1, log=True)
    num_units_1 = trial.suggest_int('num_units_1', 50, 200)
    num_units_2 = trial.suggest_int('num_units_2', 20, 100)

    # Dense autoencoder:
    # features -> num_units_1 -> num_units_2 -> num_units_2 -> num_units_1 -> features
    input_layer = Input(shape=(X_train_scaled.shape[1], ))
    encoded = Dense(num_units_1, activation='tanh')(input_layer)
    encoded = Dense(num_units_2, activation='relu')(encoded)
    decoded = Dense(num_units_2, activation='tanh')(encoded)
    decoded = Dense(num_units_1, activation='tanh')(decoded)
    output_layer = Dense(X_train_scaled.shape[1], activation='relu')(decoded)

    autoencoder = Model(input_layer, output_layer)
    autoencoder.compile(optimizer=Adam(learning_rate=learning_rate), loss='mse')

    # Train the model to reconstruct its input. The pruning callback is given
    # 'val_loss', which Keras computes on the validation_split slice.
    history = autoencoder.fit(X_train_scaled,
                              X_train_scaled,
                              batch_size=256,
                              epochs=10,
                              shuffle=True,
                              validation_split=0.20,
                              callbacks=[KerasPruningCallback(trial, 'val_loss')])

    # Score the trial on that same held-out slice of the training data rather
    # than on X_val_scaled: the validation set includes the fraud rows and is
    # reused for threshold selection, so minimising reconstruction error on it
    # would leak it into model selection.
    return float(history.history['val_loss'][-1])

# Run a 10-trial Optuna search that minimises the value returned by objective
study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=10)

# Report the winning hyperparameters together with the loss they achieved
best_params, best_loss = study.best_params, study.best_value

print(f'Best Hyperparameters: {best_params}')
print(f'Best Loss: {best_loss}')

In [None]:
# Get the best hyperparameters from the Optuna study
best_params = study.best_params

# Rebuild the autoencoder with those hyperparameters:
# features -> num_units_1 -> num_units_2 -> num_units_2 -> num_units_1 -> features
input_layer = Input(shape=(X_train_scaled.shape[1], ))
encoded = Dense(best_params['num_units_1'], activation='tanh')(input_layer)
encoded = Dense(best_params['num_units_2'], activation='relu')(encoded)
decoded = Dense(best_params['num_units_2'], activation='tanh')(encoded)
decoded = Dense(best_params['num_units_1'], activation='tanh')(decoded)
output_layer = Dense(X_train_scaled.shape[1], activation='relu')(decoded)

autoencoder = Model(input_layer, output_layer)
autoencoder.compile(optimizer=Adam(learning_rate=best_params['learning_rate']), loss='mse')

# Train the model on the entire training data (no validation split this time)
autoencoder.fit(X_train_scaled, X_train_scaled, batch_size=256, epochs=10, shuffle=True)

# Evaluate the model on the test data.
# The loss is kept at full precision: an MSE on [0, 1]-scaled features can be
# well below 0.01, so rounding it to two decimals would report 0.0.
test_loss = autoencoder.evaluate(X_test_scaled, X_test_scaled)
print('Test Loss:', test_loss)

## Optimizing Threshold K

In [None]:
# Reconstruct the validation set using the trained autoencoder
val_reconstructed = autoencoder.predict(X_val_scaled)

# Per-row reconstruction error: mean squared error across the features
reconstruction_errors = np.mean(np.square(X_val_scaled - val_reconstructed), axis=1)

# Lower end of the search: mean plus one standard deviation of the errors
initial_threshold = np.mean(reconstruction_errors) + np.std(reconstruction_errors)

# Candidate thresholds run from the initial threshold to the largest observed
# error. The grid is derived from the data because the scale of the errors
# depends on the model; a fixed step and upper bound can step over the whole
# error range or yield an empty grid. linspace always returns 200 candidates.
thresholds = np.linspace(initial_threshold, np.max(reconstruction_errors), 200)

best_threshold = None
# Start below any attainable accuracy so the first candidate is always recorded
best_accuracy = -1.0

# Iterate through the candidate thresholds
for threshold in thresholds:
    # Rows whose error exceeds the threshold are flagged as anomalous (1)
    predictions = (reconstruction_errors > threshold).astype(int)

    # Calculate prediction accuracy against the validation labels
    accuracy = accuracy_score(y_val, predictions)

    # Keep the first threshold that reaches the highest accuracy
    if accuracy > best_accuracy:
        best_accuracy = accuracy
        best_threshold = threshold

# NOTE(review): if fraud rows are a small minority of the validation set,
# accuracy is dominated by the non-fraud class; consider precision/recall or F1
# when choosing the threshold.

# Print the best threshold and accuracy
print("Best Threshold:", best_threshold)
print("Best Accuracy:", best_accuracy)

## Latent Representations

In [None]:
# Encoder-only model: stack the autoencoder's first three layers
# (the input layer followed by the two encoding Dense layers)
la_reps = Sequential()
for layer_idx in range(3):
    la_reps.add(autoencoder.layers[layer_idx])

In [None]:
# Scale the plotting sample with the fitted scaler and encode it
plot_features_scaled = scaler.transform(plot_data.drop(columns='Class'))
X = la_reps.predict(plot_features_scaled)

# Colour labels: zeros for each non-fraud row, then ones for each fraud row
n_non_fraud = int((plot_data['Class'] == 0).sum())
n_fraud = int((plot_data['Class'] == 1).sum())
rep_y1 = np.zeros(n_non_fraud)
rep_y2 = np.ones(n_fraud)
rep_y = np.concatenate([rep_y1, rep_y2])

In [None]:
# Project the encoder output to 2-D.
# The embedding gets its own name so that X is left untouched and re-running
# this cell does not feed an already-embedded array back into t-SNE.
X_embedded = tsne.fit_transform(X)

# Create a scatter plot of the reduced feature vectors
plt.figure(figsize=(7, 5))
scatter = plt.scatter(X_embedded[:, 0], X_embedded[:, 1], c=rep_y, cmap='coolwarm')
plt.title('t-SNE Visualization of Reconstructed Fraudulent and Non-Fraudulent Cases')
plt.xlabel('t-SNE Dimension 1')
plt.ylabel('t-SNE Dimension 2')

# Customize legend labels
legend_labels = ['Non-Fraud', 'Fraud']
plt.legend(handles=scatter.legend_elements()[0], labels=legend_labels)
plt.savefig('afterr.jpg')
plt.show()

In [None]:
# Persist the trained autoencoder to disk: the full model, then the weights alone
autoencoder.save(filepath="autoencoder.h5")
autoencoder.save_weights(filepath="auto_weights.h5")