In [10]:
from main import load_and_preprocess_data

df_train, _ = load_and_preprocess_data("fraudTrain.csv")
df_test, _ = load_and_preprocess_data("fraudTest.csv")


Loading and preprocessing data...
Class distribution:
is_fraud
0    1289169
1       7506
Name: count, dtype: int64
Fraud percentage: 0.58%
Loading and preprocessing data...
Class distribution:
is_fraud
0    553574
1      2145
Name: count, dtype: int64
Fraud percentage: 0.39%


In [None]:
import numpy as np
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

from main import load_and_preprocess_data, SEQUENCE_LENGTH
from model import load_lstm_model
from predict import predict_and_analyze


fraud_users_train = df_train[df_train['is_fraud'] == 1]['cc_num'].unique()
fraud_users_test = df_test[df_test['is_fraud'] == 1]['cc_num'].unique()
common_fraud_users = set(fraud_users_train) & set(fraud_users_test)
np.random.seed(42)
fraud_users_train = np.random.choice(list(common_fraud_users), size=20, replace=True)
# Initialize lists to store probabilities
train_fraud_probs = []
train_non_fraud_probs = []
lstm_model, processor = load_lstm_model()
#make a map for the threshold per user
fraud_thresholds = {}
for cc_num in common_fraud_users:
    user_transactions = df_train[df_train['cc_num'] == cc_num]

    user_non_fraud_transactions = user_transactions[user_transactions['is_fraud'] == 0].iloc[:-1]
    user_fraud_transactions = user_transactions[user_transactions['is_fraud'] == 1]

    if not user_fraud_transactions.empty:
        fraud_transaction = user_fraud_transactions.iloc[0].to_dict()
    else:
        print(f"No fraud transactions found for cc_num: {cc_num}")
        continue

    if not user_non_fraud_transactions.empty:
        non_fraud_transaction = user_non_fraud_transactions.iloc[-1].to_dict()
    else:
        print(f"No non-fraud transactions found for cc_num: {cc_num}")
        continue

    fraud_result = predict_and_analyze(
        lstm_model,
        user_non_fraud_transactions,
        SEQUENCE_LENGTH,
        processor,
        actual_transaction=fraud_transaction,
        fraud_transactions=user_fraud_transactions
    )

    non_fraud_result = predict_and_analyze(
        lstm_model,
        user_non_fraud_transactions,
        SEQUENCE_LENGTH,
        processor,
        actual_transaction=non_fraud_transaction,
        fraud_transactions=user_fraud_transactions
    )

    train_fraud_probs.append(fraud_result['total_probability'])
    train_non_fraud_probs.append(non_fraud_result['total_probability'])
    # Store the threshold for this user
    fraud_thresholds[cc_num] = (np.percentile(train_fraud_probs, 90) + np.percentile(train_non_fraud_probs, 90)) / 2
    # Print the results
    print(f"User: {cc_num}")
    print(f"Fraud Transaction Probability: {fraud_result['total_probability']:.10e}")
    print(f"Non-Fraud Transaction Probability: {non_fraud_result['total_probability']:.10e}")
    print(f"Threshold for classification: {fraud_thresholds[cc_num]:.10e}")


avg_fraud_prob = np.mean(train_fraud_probs)
avg_non_fraud_prob = np.mean(train_non_fraud_probs)

print(f"\nAverage Fraud Probability (Train): {avg_fraud_prob:.10e}")
print(f"Average Non-Fraud Probability (Train): {avg_non_fraud_prob:.10e}")





Loading model from models/lstm_transaction_model.h5...
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 222ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 37ms/step


  plt.savefig('prediction_distributions.png', bbox_inches='tight')


User: 4005676619255478
Fraud Transaction Probability: 1.8118749426e-17
Non-Fraud Transaction Probability: 1.6775892839e-18
Threshold for classification: 9.8981689413e-18
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 37ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 35ms/step
User: 3560725013359375
Fraud Transaction Probability: 2.8637358090e-16
Non-Fraud Transaction Probability: 5.5218650240e-17
Threshold for classification: 1.5470631942e-16
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 34ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 34ms/step
User: 3524574586339330
Fraud Transaction Probability: 2.8263140774e-19
Non-Fraud Transaction Probability: 1.5166057648e-20
Threshold for classification: 1.3861651830e-16
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 36ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 35ms/step
User: 4005676619255478
Fraud Transaction Probability: 1.81

In [None]:
from sklearn.metrics import confusion_matrix, classification_report

# Step 3: Evaluate on fraudTest.csv for same users
y_true = []
y_pred = []

for cc_num in common_fraud_users:
    # Get the threshold for this user
    threshold = fraud_thresholds[cc_num]
    user_transactions = df_test[df_test['cc_num'] == cc_num]

    user_non_fraud_transactions = user_transactions[user_transactions['is_fraud'] == 0].iloc[:-1]
    user_fraud_transactions = user_transactions[user_transactions['is_fraud'] == 1]

    if user_fraud_transactions.empty or user_non_fraud_transactions.empty:
        print(f"No transactions found for cc_num: {cc_num}")
        continue

    fraud_transaction = user_fraud_transactions.iloc[0].to_dict()
    non_fraud_transaction = user_non_fraud_transactions.iloc[-1].to_dict()

    fraud_prob = predict_and_analyze(
        lstm_model,
        user_non_fraud_transactions,
        SEQUENCE_LENGTH,
        processor,
        actual_transaction=fraud_transaction,
        fraud_transactions=user_fraud_transactions
    )['total_probability']

    non_fraud_prob = predict_and_analyze(
        lstm_model,
        user_non_fraud_transactions,
        SEQUENCE_LENGTH,
        processor,
        actual_transaction=non_fraud_transaction,
        fraud_transactions=user_fraud_transactions
    )['total_probability']

    print(f"cc_num: {cc_num}, Fraud Probability: {fraud_prob:.10e}, Non-Fraud Probability: {non_fraud_prob:.10e}")
    # Append ground truth and predictions
    y_true.extend([1, 0])  # First is fraud, second is not
    y_pred.extend([
        1 if fraud_prob <= threshold else 0,
        1 if non_fraud_prob >= threshold else 0
    ])

# Step 4: Evaluate model with confusion matrix and classification metrics
print("\n--- Evaluation Results ---")
print("Confusion Matrix:")
print(confusion_matrix(y_true, y_pred))

print("\nClassification Report:")
print(classification_report(y_true, y_pred, digits=4))


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 49ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 31ms/step
cc_num: 4005676619255478, Fraud Probability: 9.9306371168e-35, Non-Fraud Probability: 8.9285048138e-19
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 34ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 34ms/step
cc_num: 3560725013359375, Fraud Probability: 1.1581753517e-32, Non-Fraud Probability: 9.3617580305e-18
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 33ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 38ms/step
cc_num: 3524574586339330, Fraud Probability: 1.9883201887e-32, Non-Fraud Probability: 2.4444567459e-19
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 35ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 34ms/step
cc_num: 4005676619255478, Fraud Probability: 9.9306371168e-35, Non-Fraud Probability: 8.9285048138e-19
[1m1/1[0m 