In [2]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import f1_score, precision_score, recall_score, confusion_matrix
from sklearn.model_selection import train_test_split
import warnings
# --- (Change: We will use neural network libraries) ---
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense

# Ignore unimportant warnings
warnings.filterwarnings('ignore', category=UserWarning)

print("--- Execution Started (Smarter Autoencoder Model) ---")
print("Objective: Use a neural network (Autoencoder) to learn 'normal behavior' only.")

# --- Load Data ---
file_path = 'PS_20174392719_1491204439457_log24.csv'
try:
    df = pd.read_csv(file_path)
    print(f"Successfully loaded the full file ({len(df)} rows).")
except Exception as e:
    print(f"Error during data loading: {e}")
    exit()

# --- Step 1: Building Behavioral Profiles (Full Dataset) ---
# (We need to build profiles on the full dataset for accuracy)
print("\n--- Step 1: Building Behavioral Profiles (Full) ---")
df_received = df[df['type'].isin(['TRANSFER', 'CASH_IN'])]
total_received = df_received.groupby('nameDest')['amount'].sum().to_dict()
unique_senders = df_received.groupby('nameDest')['nameOrig'].nunique().to_dict()
df_cashed_out = df[df['type'] == 'CASH_OUT']
total_cashed_out = df_cashed_out.groupby('nameOrig')['amount'].sum().to_dict()

all_user_ids = set(total_received.keys()) | set(total_cashed_out.keys()) | set(unique_senders.keys())
profiles_list = []
for user_id in all_user_ids:
    received = total_received.get(user_id, 0)
    cashed_out = total_cashed_out.get(user_id, 0)
    senders = unique_senders.get(user_id, 0)
    ratio = (cashed_out / (received + 1e-6))
    ratio = min(ratio, 1.0)
    profiles_list.append({
        'user_id': user_id,
        'dest_cash_out_ratio': ratio,
        'dest_unique_senders': senders
    })
final_profiles = pd.DataFrame(profiles_list)
print("Behavioral profiles created successfully.")

# --- Step 2: Merge Features with Transactions ---
print("\n--- Step 2: Merging Features with Transactions ---")
df_model_data = pd.merge(df, final_profiles, left_on='nameDest', right_on='user_id', how='left')
df_model_data = pd.merge(df_model_data, final_profiles, left_on='nameOrig', right_on='user_id', how='left', suffixes=('_dest', '_orig'))

# Fill NaNs from the merge
df_model_data['dest_cash_out_ratio_dest'] = df_model_data['dest_cash_out_ratio_dest'].fillna(0)
df_model_data['dest_unique_senders_dest'] = df_model_data['dest_unique_senders_dest'].fillna(0)
df_model_data['dest_cash_out_ratio_orig'] = df_model_data['dest_cash_out_ratio_orig'].fillna(0)
df_model_data['dest_unique_senders_orig'] = df_model_data['dest_unique_senders_orig'].fillna(0)

# --- Step 3: Prepare Train/Test Data ---
print("\n--- Step 3: Preparing Train/Test Data ---")

# --- (FIX: Drop rows where the label 'isFraud' is NaN) ---
# The stratify parameter in train_test_split cannot handle NaN labels.
# These rows are unusable for training or testing anyway.
print(f"Original data size: {len(df_model_data)}")
df_model_data = df_model_data.dropna(subset=['isFraud'])
print(f"Data size after dropping NaN labels: {len(df_model_data)}")
# --- (End of Fix) ---

features_list = [
    'amount',
    'dest_cash_out_ratio_dest',
    'dest_unique_senders_dest',
    'dest_cash_out_ratio_orig',
    'dest_unique_senders_orig'
]
df_model_data['type_encoded'] = df_model_data['type'].astype('category').cat.codes
features_list.append('type_encoded')

X_all_features = df_model_data[features_list]
y_all_labels = df_model_data['isFraud'] # The "Correct Answer"

# (1) Scale the features
scaler = StandardScaler()
X_all_features_scaled = scaler.fit_transform(X_all_features)

# (2) Split the data (70% train, 30% test)
X_train, X_test, y_train, y_test = train_test_split(
    X_all_features_scaled, y_all_labels,
    test_size=0.3, random_state=42, stratify=y_all_labels
)

# (3) (Most important step) Create 'clean training data'
# The model will train *only* on the normal transactions from the training set
X_train_normal = X_train[y_train == 0]
print(f"Data split. We will train on {len(X_train_normal)} 'normal' transactions.")

# --- Step 4: Build the Autoencoder Model ---
print("\n--- Step 4: Building the Autoencoder Model ---")
input_dim = X_train_normal.shape[1] # Number of features

input_layer = Input(shape=(input_dim, ))

# Encoder (The "compression" part)
encoder = Dense(input_dim // 2, activation='relu')(input_layer) # 6 -> 3
encoder = Dense(input_dim // 4, activation='relu')(encoder)     # 3 -> 1 (The "bottleneck")

# Decoder (The "reconstruction" part)
decoder = Dense(input_dim // 2, activation='relu')(encoder)     # 1 -> 3
decoder = Dense(input_dim, activation='linear')(decoder)        # 3 -> 6 (Original shape)

# Assemble the model
autoencoder = Model(inputs=input_layer, outputs=decoder)
autoencoder.compile(optimizer='adam', loss='mean_squared_error')
print("Model built successfully.")
autoencoder.summary()

# --- Step 5: Train the Model (On Normal Data Only) ---
print("\n--- Step 5: Training the model (on normal data only)... ---")
# The model "learns" how to reconstruct normal training data
autoencoder.fit(
    X_train_normal, X_train_normal,
    epochs=10, # 10 training cycles (can be increased)
    batch_size=32,
    shuffle=True,
    validation_data=(X_test, X_test), # It validates on the full test set
    verbose=1
)
print("Model training complete.")

# --- Step 6: Evaluate the Model (Calculate "Reconstruction Error") ---
print("\n--- Step 6: Evaluating the Model ---")

# (a) Calculate the reconstruction error for every transaction in the test set
predictions = autoencoder.predict(X_test)
mse = np.mean(np.power(X_test - predictions, 2), axis=1)
df_test = pd.DataFrame({'Reconstruction_Error': mse, 'True_Label': y_test})

# (b) Determine the anomaly "Threshold"
# We will flag anything "weirder" than 99% of the normal transactions
# (We use the clean training data to set this threshold)
train_predictions = autoencoder.predict(X_train_normal)
train_mse = np.mean(np.power(X_train_normal - train_predictions, 2), axis=1)
threshold = np.quantile(train_mse, 0.99) # Set threshold at the 99th percentile
print(f"Anomaly threshold determined at: {threshold:.4f}")

# (c) Make Predictions
# Any transaction whose "error" is higher than the threshold = anomaly (1)
y_pred = [1 if e > threshold else 0 for e in df_test['Reconstruction_Error']]

# (d) Calculate the final score
f1 = f1_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
cm = confusion_matrix(y_test, y_pred)

print("\n--- (Final Result for Autoencoder Model) ---")
print(f"Precision: {precision:.2%}")
print(f"Recall: {recall:.2%}")
print(f"F1-Score (New Final Score): {f1:.2%}")
print("\nConfusion Matrix:")
print(cm)

--- Execution Started (Smarter Autoencoder Model) ---
Objective: Use a neural network (Autoencoder) to learn 'normal behavior' only.
Successfully loaded the full file (1048575 rows).

--- Step 1: Building Behavioral Profiles (Full) ---
Behavioral profiles created successfully.

--- Step 2: Merging Features with Transactions ---

--- Step 3: Preparing Train/Test Data ---
Original data size: 1048575
Data size after dropping NaN labels: 1048575
Data split. We will train on 733203 'normal' transactions.

--- Step 4: Building the Autoencoder Model ---
Model built successfully.



--- Step 5: Training the model (on normal data only)... ---
Epoch 1/10
[1m22913/22913[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m56s[0m 2ms/step - loss: 0.9726 - val_loss: 0.9189
Epoch 2/10
[1m22913/22913[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m55s[0m 2ms/step - loss: 1.0523 - val_loss: 0.9189
Epoch 3/10
[1m22913/22913[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m54s[0m 2ms/step - loss: 0.9841 - val_loss: 0.9188
Epoch 4/10
[1m22913/22913[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m62s[0m 3ms/step - loss: 1.0685 - val_loss: 0.9189
Epoch 5/10
[1m22913/22913[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m54s[0m 2ms/step - loss: 0.8418 - val_loss: 0.9189
Epoch 6/10
[1m22913/22913[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m54s[0m 2ms/step - loss: 0.9925 - val_loss: 0.9190
Epoch 7/10
[1m22913/22913[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m54s[0m 2ms/step - loss: 0.9527 - val_loss: 0.9189
Epoch 8/10
[1m22913/22913[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37