In [2]:
import pandas as pd
transactions=data = pd.read_csv('normalise_train.csv')


In [None]:
import torch
from torch.utils.data import DataLoader, TensorDataset
import torch.nn as nn
import numpy as np
import pandas as pd
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from sklearn.model_selection import train_test_split

# Step 1: Load and preprocess the dataset
# Assume `transactions` DataFrame is loaded
# Filter for relevant columns
# Update feature columns to include new features
selected_columns = [
    'amount', 'transactions_per_day', 'mean_amount', 'std_amount', 'concentration_ratio', 
    'velocity_flag', 'amount_flag', 'time_flag', 'platform', 'product_category', 'payment_method'
]

# Update the DataFrame and preprocessing
df = transactions[selected_columns]

# One-hot encode categorical columns
categorical_cols = ['platform', 'product_category', 'payment_method']
df_encoded = pd.get_dummies(df, columns=categorical_cols)

# Normalize numerical columns, including the new ones
numerical_cols = ['amount', 'transactions_per_day', 'mean_amount', 'std_amount', 'concentration_ratio']
scaler = MinMaxScaler()
df_encoded[numerical_cols] = scaler.fit_transform(df_encoded[numerical_cols])

# Proceed with the rest of the preprocessing and autoencoder training steps as before

# Split into features (X) and train-test sets
X = df_encoded.values

# Filter normal data (label = 0 assumed)
normal_data = transactions[transactions['label'] == 0]
X_normal = df_encoded.iloc[normal_data.index].values

# Train-test split
train_data, test_data = train_test_split(X_normal, test_size=0.2, random_state=42)

# Step 2: Define the Autoencoder model
class Autoencoder(nn.Module):
    def __init__(self, input_dim, latent_dim):
        super(Autoencoder, self).__init__()
        self.encoder = nn.Sequential(
            nn.Linear(input_dim, 64),
            nn.ReLU(),
            nn.Linear(64, latent_dim),
            nn.ReLU()
        )
        self.decoder = nn.Sequential(
            nn.Linear(latent_dim, 64),
            nn.ReLU(),
            nn.Linear(64, input_dim),
            nn.Sigmoid()
        )

    def forward(self, x):
        encoded = self.encoder(x)
        decoded = self.decoder(encoded)
        return decoded

# Initialize the model
input_dim = train_data.shape[1]
latent_dim = 8  # Adjustable parameter
model = Autoencoder(input_dim=input_dim, latent_dim=latent_dim)

# Step 3: Train the Autoencoder
# Convert data to PyTorch tensors
train_data = train_data.astype(np.float32)

train_tensor = torch.tensor(train_data, dtype=torch.float32)
train_loader = DataLoader(TensorDataset(train_tensor), batch_size=32, shuffle=True)

# Define training parameters
criterion = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

# Training loop
epochs = 50
for epoch in range(epochs):
    model.train()
    epoch_loss = 0
    for batch in train_loader:
        batch_data = batch[0]
        optimizer.zero_grad()
        outputs = model(batch_data)
        loss = criterion(outputs, batch_data)
        loss.backward()
        optimizer.step()
        epoch_loss += loss.item()

    print(f"Epoch {epoch + 1}/{epochs}, Loss: {epoch_loss / len(train_loader):.6f}")

# Step 4: Calculate the reconstruction error threshold
model.eval()
with torch.no_grad():
    reconstructions = model(train_tensor).numpy()
    train_errors = np.mean((train_data - reconstructions) ** 2, axis=1)

# Set the threshold (95th percentile of reconstruction errors)
threshold = np.percentile(train_errors, 96)
print(f"Reconstruction error threshold: {threshold:.6f}")

# Step 5: Evaluate on test data
# Convert test data to PyTorch tensors
test_data = test_data.astype(np.float32)

test_tensor = torch.tensor(X, dtype=torch.float32)

with torch.no_grad():
    test_reconstructions = model(test_tensor).numpy()
    test_errors = np.mean((test_data - test_reconstructions) ** 2, axis=1)

# Classify anomalies
anomalies = test_errors > threshold
print(f"Number of anomalies detected: {np.sum(anomalies)}")

# Step 6: Evaluate on full dataset
# Prepare the full dataset (including normal and fraudulent data)

X=X.astype(np.float32)
full_tensor = torch.tensor(X, dtype=torch.float32)

with torch.no_grad():
    full_reconstructions = model(full_tensor).numpy()
    full_errors = np.mean((X - full_reconstructions) ** 2, axis=1)

# Add anomaly scores to the original dataset
df['Reconstruction_Error'] = full_errors
df['Is_Anomalous'] = full_errors > threshold

# Save results to a CSV
df.to_csv("anomaly_detection_results.csv", index=False)
print("Results saved to anomaly_detection_results.csv")


Epoch 1/50, Loss: 0.102982
Epoch 2/50, Loss: 0.017605
Epoch 3/50, Loss: 0.009301
Epoch 4/50, Loss: 0.005746
Epoch 5/50, Loss: 0.003928
Epoch 6/50, Loss: 0.003023
Epoch 7/50, Loss: 0.002550
Epoch 8/50, Loss: 0.002267
Epoch 9/50, Loss: 0.002022
Epoch 10/50, Loss: 0.001772
Epoch 11/50, Loss: 0.001528
Epoch 12/50, Loss: 0.001384
Epoch 13/50, Loss: 0.001250
Epoch 14/50, Loss: 0.001107
Epoch 15/50, Loss: 0.000909
Epoch 16/50, Loss: 0.000681
Epoch 17/50, Loss: 0.000419
Epoch 18/50, Loss: 0.000328
Epoch 19/50, Loss: 0.000290
Epoch 20/50, Loss: 0.000279
Epoch 21/50, Loss: 0.000273
Epoch 22/50, Loss: 0.000262
Epoch 23/50, Loss: 0.000255
Epoch 24/50, Loss: 0.000252
Epoch 25/50, Loss: 0.000255
Epoch 26/50, Loss: 0.000256
Epoch 27/50, Loss: 0.000253
Epoch 28/50, Loss: 0.000247
Epoch 29/50, Loss: 0.000254
Epoch 30/50, Loss: 0.000248
Epoch 31/50, Loss: 0.000245
Epoch 32/50, Loss: 0.000245
Epoch 33/50, Loss: 0.000243
Epoch 34/50, Loss: 0.000240
Epoch 35/50, Loss: 0.000246
Epoch 36/50, Loss: 0.000252
E

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Reconstruction_Error'] = full_errors
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Is_Anomalous'] = full_errors > threshold
