In [130]:
import pandas as pd
from sklearn.ensemble import IsolationForest
from sklearn.metrics import precision_score, recall_score, f1_score
import matplotlib.pyplot as plt
import seaborn as sns

In [131]:
# Load the transaction table produced by the z-score step and preview it.
source_csv = '../data/transactions_with_zscores.csv'
df = pd.read_csv(source_csv)
df.head()

Unnamed: 0,TransactionID,AccountID,TransactionAmount,TransactionDate,TransactionType,Location,DeviceID,IP Address,MerchantID,Channel,...,zscore_Day,zscore_amount_to_balance_ratio,flag_zscore_TransactionAmount,flag_zscore_TransactionDuration,flag_zscore_LoginAttempts,flag_zscore_AccountBalance,flag_zscore_time_diff,flag_zscore_Hour,flag_zscore_Day,flag_zscore_amount_to_balance_ratio
0,469,34,0.037361,2024-11-04 08:06:23,0,29,395,188,99,2,...,,-0.03921,0,0,0,0,0,0,0,0
1,2044,237,0.098158,2024-11-04 08:06:23,1,4,436,24,82,2,...,,-0.038574,0,0,0,0,0,0,0,0
2,1252,455,0.19766,2024-11-04 08:06:23,0,16,648,25,39,1,...,,-0.038446,0,0,0,0,0,0,0,0
3,35,263,0.064883,2024-11-04 08:06:23,1,39,41,479,1,1,...,,-0.038731,0,0,0,0,0,0,0,0
4,2157,153,0.107846,2024-11-04 08:06:23,1,8,586,340,27,2,...,,-0.038599,0,0,0,0,0,0,0,0


In [132]:
# List every column name so the feature and flag columns used below can be checked.
df.columns

Index(['TransactionID', 'AccountID', 'TransactionAmount', 'TransactionDate',
       'TransactionType', 'Location', 'DeviceID', 'IP Address', 'MerchantID',
       'Channel', 'CustomerAge', 'CustomerOccupation', 'TransactionDuration',
       'LoginAttempts', 'AccountBalance', 'PreviousTransactionDate',
       'time_diff', 'location_change', 'Hour', 'Day', 'DayOfWeek',
       'high_amount_flag', 'many_login_attempts_flag', 'long_time_diff_flag',
       'amount_to_balance_ratio', 'amount_exceeds_balance',
       'suspicious_merchant_flag', 'reactivation_suspect_flag',
       'duration_suspicious_flag', 'age_balance_anomaly_flag',
       'strong_anomaly_label', 'zscore_TransactionAmount',
       'zscore_TransactionDuration', 'zscore_LoginAttempts',
       'zscore_AccountBalance', 'zscore_time_diff', 'zscore_Hour',
       'zscore_Day', 'zscore_amount_to_balance_ratio',
       'flag_zscore_TransactionAmount', 'flag_zscore_TransactionDuration',
       'flag_zscore_LoginAttempts', 'flag_zscore_

In [133]:
# Preview the first rows again; no visible cell has modified `df` since it was
# loaded, so this repeats the preview shown right after the load.
df.head()

Unnamed: 0,TransactionID,AccountID,TransactionAmount,TransactionDate,TransactionType,Location,DeviceID,IP Address,MerchantID,Channel,...,zscore_Day,zscore_amount_to_balance_ratio,flag_zscore_TransactionAmount,flag_zscore_TransactionDuration,flag_zscore_LoginAttempts,flag_zscore_AccountBalance,flag_zscore_time_diff,flag_zscore_Hour,flag_zscore_Day,flag_zscore_amount_to_balance_ratio
0,469,34,0.037361,2024-11-04 08:06:23,0,29,395,188,99,2,...,,-0.03921,0,0,0,0,0,0,0,0
1,2044,237,0.098158,2024-11-04 08:06:23,1,4,436,24,82,2,...,,-0.038574,0,0,0,0,0,0,0,0
2,1252,455,0.19766,2024-11-04 08:06:23,0,16,648,25,39,1,...,,-0.038446,0,0,0,0,0,0,0,0
3,35,263,0.064883,2024-11-04 08:06:23,1,39,41,479,1,1,...,,-0.038731,0,0,0,0,0,0,0,0
4,2157,153,0.107846,2024-11-04 08:06:23,1,8,586,340,27,2,...,,-0.038599,0,0,0,0,0,0,0,0


In [134]:
# Numeric columns fed to the Isolation Forest.
features = [
    'TransactionAmount', 'TransactionDuration',
    'LoginAttempts', 'AccountBalance',
    'time_diff', 'amount_to_balance_ratio',
]

# Model input: only the selected feature columns, in the order listed above.
X = df[features]

In [135]:
# Fit an Isolation Forest on the feature matrix and keep its raw predictions.
# The seed is fixed so repeated runs give the same result.
iso_model = IsolationForest(
    n_estimators=100,
    contamination='auto',
    random_state=42,
)
isolation_labels = iso_model.fit_predict(X)
df['isolation_pred'] = isolation_labels

In [136]:
# Turn the model output into a 0/1 indicator: 1 where the prediction is -1, else 0.
is_outlier = df['isolation_pred'].eq(-1)
df['isolation_outlier_label'] = is_outlier.astype(int)

In [137]:
# Every indicator column that is coerced to a strict 0/1 integer below.
# 'strong_anomaly_label' (the reference label) is listed exactly once; the
# previous version repeated it, which was redundant.
all_flag_candidates = [
    # rule-based flags
    'high_amount_flag', 'many_login_attempts_flag', 'long_time_diff_flag',
    'amount_exceeds_balance', 'suspicious_merchant_flag', 'reactivation_suspect_flag',
    'duration_suspicious_flag', 'age_balance_anomaly_flag',
    # z-score flags
    'flag_zscore_TransactionAmount', 'flag_zscore_TransactionDuration',
    'flag_zscore_LoginAttempts', 'flag_zscore_AccountBalance',
    'flag_zscore_time_diff', 'flag_zscore_amount_to_balance_ratio',
    # Isolation Forest output and the reference label
    'isolation_outlier_label', 'strong_anomaly_label',
]

In [138]:
# Coerce each candidate column to a strict 0/1 integer indicator:
# values greater than zero become 1, everything else becomes 0.
for flag_col in all_flag_candidates:
    is_raised = df[flag_col].gt(0)
    df[flag_col] = is_raised.astype(int)

In [139]:
# Preview the frame after adding the Isolation Forest columns and binarizing the flags.
df.head()

Unnamed: 0,TransactionID,AccountID,TransactionAmount,TransactionDate,TransactionType,Location,DeviceID,IP Address,MerchantID,Channel,...,flag_zscore_TransactionAmount,flag_zscore_TransactionDuration,flag_zscore_LoginAttempts,flag_zscore_AccountBalance,flag_zscore_time_diff,flag_zscore_Hour,flag_zscore_Day,flag_zscore_amount_to_balance_ratio,isolation_pred,isolation_outlier_label
0,469,34,0.037361,2024-11-04 08:06:23,0,29,395,188,99,2,...,0,0,0,0,0,0,0,0,1,0
1,2044,237,0.098158,2024-11-04 08:06:23,1,4,436,24,82,2,...,0,0,0,0,0,0,0,0,1,0
2,1252,455,0.19766,2024-11-04 08:06:23,0,16,648,25,39,1,...,0,0,0,0,0,0,0,0,1,0
3,35,263,0.064883,2024-11-04 08:06:23,1,39,41,479,1,1,...,0,0,0,0,0,0,0,0,1,0
4,2157,153,0.107846,2024-11-04 08:06:23,1,8,586,340,27,2,...,0,0,0,0,0,0,0,0,1,0


In [140]:
# Score each candidate flag against the reference label.
#
# `all_flags` was previously read without being defined in any visible cell,
# so this cell failed with NameError on a fresh kernel. It is now derived
# from `all_flag_candidates`, minus the label column itself (scoring the label
# against itself would trivially give 1.0 for every metric).
label_col = 'strong_anomaly_label'
all_flags = [
    flag
    for flag in dict.fromkeys(all_flag_candidates)  # drops duplicates, keeps order
    if flag != label_col
]

flag_performance = []
for flag in all_flags:
    # zero_division=0 reports an undefined metric (e.g. a flag with no positive
    # predictions) as 0.0 without emitting an UndefinedMetricWarning each time.
    precision = precision_score(df[label_col], df[flag], zero_division=0)
    recall = recall_score(df[label_col], df[flag], zero_division=0)
    f1 = f1_score(df[label_col], df[flag], zero_division=0)
    flag_performance.append({
        'Flag': flag,
        'Precision': precision,
        'Recall': recall,
        'F1 Score': f1
    })

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [141]:
# Collect the per-flag metrics into a table ranked from highest to lowest F1.
metrics_table = pd.DataFrame(flag_performance)
flag_perf_df = metrics_table.sort_values(by='F1 Score', ascending=False)

# Print the ranked table as plain text, without the index column.
print("Flag Performance Metrics:")
print(flag_perf_df.to_string(index=False))

Flag Performance Metrics:
                               Flag  Precision   Recall  F1 Score
             amount_exceeds_balance   1.000000 0.771911  0.871275
           suspicious_merchant_flag   1.000000 0.153115  0.265568
                   high_amount_flag   1.000000 0.144667  0.252768
           many_login_attempts_flag   1.000000 0.128828  0.228251
          flag_zscore_LoginAttempts   1.000000 0.100317  0.182342
           duration_suspicious_flag   0.365482 0.076030  0.125874
      flag_zscore_TransactionAmount   1.000000 0.050686  0.096482
flag_zscore_amount_to_balance_ratio   1.000000 0.003168  0.006316
           age_balance_anomaly_flag   0.230769 0.003168  0.006250
                long_time_diff_flag   0.000000 0.000000  0.000000
          reactivation_suspect_flag   0.000000 0.000000  0.000000
    flag_zscore_TransactionDuration   0.000000 0.000000  0.000000
         flag_zscore_AccountBalance   0.000000 0.000000  0.000000
              flag_zscore_time_diff   0.000000 0.0

In [142]:
# Flags combined into the final anomaly indicator. In the metrics table printed
# above, these are the seven flags reported with a precision of 1.0.
# NOTE(review): a precision of exactly 1.0 may mean the reference label was
# derived from these same flags - confirm before treating this as validation.
selected_flags = [
    'amount_exceeds_balance', 'suspicious_merchant_flag',
    'high_amount_flag', 'many_login_attempts_flag',
    'flag_zscore_LoginAttempts', 'flag_zscore_TransactionAmount',
    'flag_zscore_amount_to_balance_ratio',
]

In [143]:
# A row gets final_anomaly_flag = 1 when the selected flag columns sum to at
# least 1 for that row, otherwise 0.
raised_count = df[selected_flags].sum(axis=1)
df['final_anomaly_flag'] = raised_count.ge(1).astype(int)

In [144]:
# Persist the frame with all flag columns (index omitted).
# The filename previously ended in a doubled ".csv.csv" extension; it is now
# written with a single ".csv".
# NOTE(review): if another notebook or script reads the old ".csv.csv" name,
# update that reader to the corrected path.
df.to_csv('../data/transactions_with_all_flags_final.csv', index=False)