In [1]:
import pandas as pd
import numpy as np
from scipy.stats import zscore

In [2]:
df = pd.read_csv('../data/clean_transactions_with_flags.csv')
df.head()

Unnamed: 0,TransactionID,AccountID,TransactionAmount,TransactionDate,TransactionType,Location,DeviceID,IP Address,MerchantID,Channel,...,high_amount_flag,many_login_attempts_flag,long_time_diff_flag,amount_to_balance_ratio,amount_exceeds_balance,suspicious_merchant_flag,reactivation_suspect_flag,duration_suspicious_flag,age_balance_anomaly_flag,strong_anomaly_label
0,469,34,0.037361,2024-11-04 08:06:23,0,29,395,188,99,2,...,0,0,0,0.060637,0,0,0,0,0,0
1,2044,237,0.098158,2024-11-04 08:06:23,1,4,436,24,82,2,...,0,0,0,0.206647,0,0,0,0,0,0
2,1252,455,0.19766,2024-11-04 08:06:23,0,16,648,25,39,1,...,0,0,0,0.235796,0,0,0,0,0,0
3,35,263,0.064883,2024-11-04 08:06:23,1,39,41,479,1,1,...,0,0,0,0.170428,0,0,0,0,0,0
4,2157,153,0.107846,2024-11-04 08:06:23,1,8,586,340,27,2,...,0,0,0,0.200754,0,1,0,0,0,1


In [3]:
df.columns

Index(['TransactionID', 'AccountID', 'TransactionAmount', 'TransactionDate',
       'TransactionType', 'Location', 'DeviceID', 'IP Address', 'MerchantID',
       'Channel', 'CustomerAge', 'CustomerOccupation', 'TransactionDuration',
       'LoginAttempts', 'AccountBalance', 'PreviousTransactionDate',
       'time_diff', 'location_change', 'Hour', 'Day', 'DayOfWeek',
       'high_amount_flag', 'many_login_attempts_flag', 'long_time_diff_flag',
       'amount_to_balance_ratio', 'amount_exceeds_balance',
       'suspicious_merchant_flag', 'reactivation_suspect_flag',
       'duration_suspicious_flag', 'age_balance_anomaly_flag',
       'strong_anomaly_label'],
      dtype='object')

In [4]:
selected_columns_for_zscore = [
    'TransactionAmount',
    'TransactionDuration',
    'LoginAttempts',
    'AccountBalance',
    'time_diff',
    'Hour',
    'Day',
    'amount_to_balance_ratio'
]

In [5]:
z_scores = df[selected_columns_for_zscore].apply(zscore)
z_scores.columns = ['zscore_' + col for col in selected_columns_for_zscore]

In [6]:
z_flags = (z_scores.abs() > 3).astype(int)
z_flags.columns = ['flag_zscore_' + col for col in selected_columns_for_zscore]

In [7]:
df_zscore = pd.concat([df, z_scores, z_flags], axis=1)

In [8]:
df_zscore.head()

Unnamed: 0,TransactionID,AccountID,TransactionAmount,TransactionDate,TransactionType,Location,DeviceID,IP Address,MerchantID,Channel,...,zscore_Day,zscore_amount_to_balance_ratio,flag_zscore_TransactionAmount,flag_zscore_TransactionDuration,flag_zscore_LoginAttempts,flag_zscore_AccountBalance,flag_zscore_time_diff,flag_zscore_Hour,flag_zscore_Day,flag_zscore_amount_to_balance_ratio
0,469,34,0.037361,2024-11-04 08:06:23,0,29,395,188,99,2,...,,-0.03921,0,0,0,0,0,0,0,0
1,2044,237,0.098158,2024-11-04 08:06:23,1,4,436,24,82,2,...,,-0.038574,0,0,0,0,0,0,0,0
2,1252,455,0.19766,2024-11-04 08:06:23,0,16,648,25,39,1,...,,-0.038446,0,0,0,0,0,0,0,0
3,35,263,0.064883,2024-11-04 08:06:23,1,39,41,479,1,1,...,,-0.038731,0,0,0,0,0,0,0,0
4,2157,153,0.107846,2024-11-04 08:06:23,1,8,586,340,27,2,...,,-0.038599,0,0,0,0,0,0,0,0


In [9]:
df_zscore.columns

Index(['TransactionID', 'AccountID', 'TransactionAmount', 'TransactionDate',
       'TransactionType', 'Location', 'DeviceID', 'IP Address', 'MerchantID',
       'Channel', 'CustomerAge', 'CustomerOccupation', 'TransactionDuration',
       'LoginAttempts', 'AccountBalance', 'PreviousTransactionDate',
       'time_diff', 'location_change', 'Hour', 'Day', 'DayOfWeek',
       'high_amount_flag', 'many_login_attempts_flag', 'long_time_diff_flag',
       'amount_to_balance_ratio', 'amount_exceeds_balance',
       'suspicious_merchant_flag', 'reactivation_suspect_flag',
       'duration_suspicious_flag', 'age_balance_anomaly_flag',
       'strong_anomaly_label', 'zscore_TransactionAmount',
       'zscore_TransactionDuration', 'zscore_LoginAttempts',
       'zscore_AccountBalance', 'zscore_time_diff', 'zscore_Hour',
       'zscore_Day', 'zscore_amount_to_balance_ratio',
       'flag_zscore_TransactionAmount', 'flag_zscore_TransactionDuration',
       'flag_zscore_LoginAttempts', 'flag_zscore_

In [10]:
df_zscore.describe()

Unnamed: 0,TransactionID,AccountID,TransactionAmount,TransactionType,Location,DeviceID,IP Address,MerchantID,Channel,CustomerAge,...,zscore_Day,zscore_amount_to_balance_ratio,flag_zscore_TransactionAmount,flag_zscore_TransactionDuration,flag_zscore_LoginAttempts,flag_zscore_AccountBalance,flag_zscore_time_diff,flag_zscore_Hour,flag_zscore_Day,flag_zscore_amount_to_balance_ratio
count,2512.0,2512.0,2512.0,2512.0,2512.0,2512.0,2512.0,2512.0,2512.0,2512.0,...,0.0,2512.0,2512.0,2512.0,2512.0,2512.0,2512.0,2512.0,2512.0,2512.0
mean,1255.5,246.637739,0.154954,0.773885,21.032643,337.732882,297.083201,48.523885,0.991242,0.430225,...,,9.19293e-18,0.019108,0.0,0.037818,0.0,0.000398,0.0,0.0,0.001194
std,725.296261,143.35339,0.152146,0.418398,12.253549,198.210671,169.659435,29.03247,0.8091,0.286971,...,,1.000199,0.136933,0.0,0.190795,0.0,0.019952,0.0,0.0,0.034544
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,,-0.03947488,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,627.75,123.0,0.042538,1.0,11.0,166.75,150.0,24.0,0.0,0.145161,...,,-0.03884623,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,1255.5,250.0,0.109899,1.0,21.0,335.5,294.0,48.0,1.0,0.435484,...,,-0.0376782,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,1883.25,369.0,0.215894,1.0,31.0,510.0,447.0,73.0,2.0,0.66129,...,,-0.03403916,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,2511.0,494.0,1.0,1.0,42.0,680.0,591.0,99.0,2.0,1.0,...,,43.29613,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0


In [11]:
df_zscore.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2512 entries, 0 to 2511
Data columns (total 47 columns):
 #   Column                               Non-Null Count  Dtype  
---  ------                               --------------  -----  
 0   TransactionID                        2512 non-null   int64  
 1   AccountID                            2512 non-null   int64  
 2   TransactionAmount                    2512 non-null   float64
 3   TransactionDate                      2512 non-null   object 
 4   TransactionType                      2512 non-null   int64  
 5   Location                             2512 non-null   int64  
 6   DeviceID                             2512 non-null   int64  
 7   IP Address                           2512 non-null   int64  
 8   MerchantID                           2512 non-null   int64  
 9   Channel                              2512 non-null   int64  
 10  CustomerAge                          2512 non-null   float64
 11  CustomerOccupation            

In [12]:
df_zscore.to_csv("../data/transactions_with_zscores.csv", index=False)