In [2]:
# Importing important libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as stats
import numpy as np
from datetime import datetime
# Let pandas display up to 100 rows before truncating rendered output.
pd.set_option("display.max_rows", 100)

In [None]:
class Data_Processor:
    '''
    Prepare historical transactions and score a new transaction
    against them using z-scores and a frequency/volume check.

    Attributes:
        data: private, feature-enriched copy of the frame given to the
            constructor (log-scaled ``amount`` plus calendar and
            time-difference columns).
        freq_per_day: average number of transactions per calendar day,
            truncated to an integer; used as the rolling-window size.
    '''

    def __init__(self, data: pd.DataFrame) -> None:
        '''
        Copy the input frame, log transform ``amount`` and derive features.

        Args:
            data: transactions with at least ``timestamp``, ``amount`` and
                ``transaction_hour`` columns (``merchant`` and
                ``merchant_category`` are needed later by
                ``calculate_stats``). An ``Unnamed: 0`` column, presumably
                a leftover CSV index, is dropped when present.
        '''
        # Work on a copy: the caller's frame stays untouched, so the
        # constructor can safely be re-run on the same input.
        frame = data.copy()
        frame = frame.drop(columns='Unnamed: 0', errors='ignore')
        frame = frame.set_index('timestamp', drop=False)

        frame['amount'] = np.log(frame['amount'])
        frame['timestamp'] = pd.to_datetime(frame['timestamp'])

        stamps = frame['timestamp']
        frame['month'] = stamps.dt.month
        # '%U' is the Sunday-based week of the year; calculate_stats derives
        # the week of the incoming transaction the same way.
        frame['week'] = stamps.dt.strftime('%U').astype('int64')
        frame['day'] = stamps.dt.day
        frame['week_day'] = stamps.dt.weekday
        frame['binned_hour'] = frame['transaction_hour'] // 3
        frame['timestamp_lag'] = stamps.shift(1)
        frame['time_diff'] = (stamps - frame['timestamp_lag']).dt.total_seconds()

        self.data = frame

        # Average number of transactions per calendar day. Counting per
        # calendar date (not per day-of-month) keeps the estimate correct
        # when the history spans more than one month.
        daily_counts = stamps.dt.normalize().value_counts()
        self.freq_per_day = np.int64(daily_counts.mean())

    def calculate_z_score(self, value: np.array, mean: np.float64, std: np.float64,
                          att_name: str) -> tuple:
        '''
        Calculate the z-score of ``value`` for the given mean and
        standard deviation.

        Args:
            value: observation(s) to standardise.
            mean: reference mean.
            std: reference standard deviation.
            att_name: label inserted into the summary message.

        Returns:
            ``(z, stat)`` where ``stat`` is a message quoting the
            mean and standard deviation that were used.
        '''
        z = (value - mean) / std
        stat = f'Mean and Standard deviation for {att_name} are {mean}, {std} respectively.'
        return z, stat

    def calculate_rolling_stats(self, data: pd.Series, window: np.int64) -> tuple:
        '''
        Smooth ``data`` with a sliding window and summarise it.

        Args:
            data: series of values (log amounts when called from
                ``calculate_stats``).
            window: rolling-window length in rows.

        Returns:
            ``((avg, std), stat)`` where ``avg`` is the NaN-ignoring mean
            of the rolling means, ``std`` is the rolling standard
            deviation at the last row, and ``stat`` is a summary message.
        '''
        rolling_mean = data.rolling(window, min_periods=1).mean()
        rolling_std = data.rolling(window, min_periods=1).std()
        avg = np.nanmean(rolling_mean)
        std = rolling_std.values[-1]
        stat = f'The rolling avg and standard deviation of amount in last {window} transactions are {avg}, {std} respectively.'
        return ((avg, std), stat)

    def flag_freq_volume(self, data: pd.DataFrame, timestamp: datetime, hour_bin: np.int64,
                         amount: np.float64) -> tuple:
        '''
        Flag a transaction that arrives unusually soon after the previous
        one and is unusually small compared with historic data.

        Args:
            data: historical rows providing ``day``, ``binned_hour``,
                ``amount``, ``time_diff`` and ``timestamp`` columns.
            timestamp: time of the transaction being scored.
            hour_bin: 3-hour bin of the transaction; only used in the
                summary message.
            amount: log-scaled amount of the transaction.

        Returns:
            ``(flag, stats)`` where ``flag`` is true when both the
            time z-score is below -2 and the amount z-score is below -3,
            and ``stats`` is a summary message.
        '''
        # Per (day, binned hour) averages of amount and inter-arrival time.
        # NOTE(review): the averages below are taken over every hour bin,
        # not only ``hour_bin``, although the message mentions that bin;
        # confirm whether filtering on ``hour_bin`` was intended.
        velo_df = data.groupby(['day', 'binned_hour'], as_index=False).agg(
            avg_amount=('amount', 'mean'), avg_time_diff=('time_diff', 'mean'))
        velo_df[['day', 'binned_hour']] = velo_df[['day', 'binned_hour']].astype('int32')

        # Seconds elapsed since the most recent historical transaction.
        # The column maximum is always a scalar, whereas a label lookup on
        # the index returns a Series when the latest timestamp is duplicated.
        last_seen = data['timestamp'].max()
        time_delta = (timestamp - last_seen).total_seconds()

        # Z-score for the time difference
        time_avg = velo_df['avg_time_diff'].mean()
        time_std = velo_df['avg_time_diff'].std()
        Z_time, _ = self.calculate_z_score(time_delta, time_avg, time_std, 'Time Differance')

        # Z-score for the amount
        amount_avg = velo_df['avg_amount'].mean()
        amount_std = velo_df['avg_amount'].std()
        Z_amount, _ = self.calculate_z_score(amount, amount_avg, amount_std, 'Time Differance Amount')

        # High frequency, low amount transactions
        freq_violation = Z_time < -2  # high frequency
        amount_violation = Z_amount < -3  # low amount

        freq_vol_stats= f'Average & standard deviation for time differance and amount spent during binned hour {hour_bin} are\
                        ({time_avg}, {time_std}), ({amount_avg}, {amount_std}) respectively.'

        return ((freq_violation & amount_violation), freq_vol_stats)

    def calculate_stats(self, json_input: dict) -> tuple:
        '''
        Score one transaction against the historical data.

        Args:
            json_input: mapping with a required ``amount`` and optional
                ``timestamp``, ``merchant`` and ``merchant_category``.
                A missing timestamp defaults to the current time in the
                timezone of the historical ``timestamp`` column.

        Returns:
            ``((z_rolling, z_bin_hour, z_merchant_cat, z_merchant),
            (flag, flag_stats))`` where every ``z_*`` entry is the
            ``(z, stat)`` tuple from ``calculate_z_score`` and the second
            element is the result of ``flag_freq_volume``.

        Raises:
            Exception: any error raised while parsing ``json_input`` is
                re-raised after the offending input has been printed.
        '''
        try:
            total_amount = np.log(np.float64(json_input['amount']).item())

            raw_timestamp = json_input.get('timestamp')
            if raw_timestamp is None:
                # Match the timezone of the history so the two can be subtracted.
                timestamp = pd.Timestamp.now(tz=self.data['timestamp'].dt.tz)
            else:
                timestamp = pd.to_datetime(raw_timestamp)

            # Same '%U' week numbering as the ``week`` column of the history;
            # ISO weeks (``timestamp.week``) are numbered differently.
            week = int(timestamp.strftime('%U'))
            hour_bin = timestamp.hour // 3

            merchant = json_input.get('merchant', 'Unknown').lower()
            merchant_cat = json_input.get('merchant_category', 'Unknown').lower()

        except Exception:
            print(json_input)
            raise

        window = self.freq_per_day

        # Start with the transaction's own week and widen backwards one week
        # at a time until at least ``window`` rows are available. The lower
        # bound guarantees termination.
        # NOTE(review): week numbers ignore the year, so histories spanning
        # a year boundary may select rows from the wrong year; confirm.
        earliest_week = self.data['week'].min()
        week_threshold = week
        week_data = self.data[self.data['week'] >= week_threshold]
        while week_data.shape[0] < window and week_threshold > earliest_week:
            week_threshold -= 1
            week_data = self.data[self.data['week'] >= week_threshold]

        rolling_mean, rolling_std = self.calculate_rolling_stats(week_data['amount'], window=window)[0]

        # Hourly stats since past few weeks
        in_hour_bin = week_data['binned_hour'] == hour_bin
        bin_hour_mean = week_data.loc[in_hour_bin, 'amount'].mean()
        bin_hour_std = week_data.loc[in_hour_bin, 'amount'].std()

        # Category level stats
        in_category = week_data['merchant_category'].str.lower() == merchant_cat
        merchant_cat_mean = week_data.loc[in_category, 'amount'].mean()
        merchant_cat_std = week_data.loc[in_category, 'amount'].std()

        # Merchant level stats
        at_merchant = week_data['merchant'].str.lower() == merchant
        merchant_mean = week_data.loc[at_merchant, 'amount'].mean()
        merchant_std = week_data.loc[at_merchant, 'amount'].std()

        # Frequency / volume stats
        hig_freq_low_volume_flag, hig_freq_low_volume_stats = self.flag_freq_volume(
            week_data, timestamp, hour_bin, total_amount)

        z_rolling_amount = self.calculate_z_score(total_amount, rolling_mean, rolling_std, 'Z_Rolling_Amount')
        z_bin_hr_amount = self.calculate_z_score(total_amount, bin_hour_mean, bin_hour_std, 'Z_Bin_hour_Amount')
        z_merchant_cat_amount = self.calculate_z_score(total_amount, merchant_cat_mean, merchant_cat_std, 'Z_Merchant_Cat_Amount')
        z_merchant_amount = self.calculate_z_score(total_amount, merchant_mean, merchant_std, 'Z_Merchant_Amount')

        return ((z_rolling_amount, z_bin_hr_amount, z_merchant_cat_amount, z_merchant_amount), (hig_freq_low_volume_flag, hig_freq_low_volume_stats))
    
# Sample transaction to be scored.
in_file = dict(
    merchant='Amazon Prime',
    merchant_category='Entertainment',
    amount=5.484718,
    timestamp='2024-10-30 23:19:29.399558+00:00',
)

# Historical transactions that serve as the reference data.
data_ = pd.read_csv('synthetic_transactions_v1.csv')

# Build the processor on the history, score the sample and display the result.
obj = Data_Processor(data_)
stat = obj.calculate_stats(in_file)
stat

In [8]:
# Sample transaction to be scored.
in_file = dict(
    merchant='Amazon Prime',
    merchant_category='Entertainment',
    amount=5.484718,
    timestamp='2024-10-30 23:19:29.399558+00:00',
)

# Historical transactions that serve as the reference data.
data_ = pd.read_csv('synthetic_transactions_v1.csv')

# Build the processor on the history, score the sample and display the result.
obj = Data_Processor(data_)
stat = obj.calculate_stats(in_file)
stat

(((np.float64(-2.960851255431856),
   'Mean and Standard deviation for Z_Rolling_Amount are 8.315793414073323, 2.2337588632589216 respectively.'),
  (np.float64(-2.6082296764435124),
   'Mean and Standard deviation for Z_Bin_hour_Amount are 8.03898235841031, 2.4296237161100502 respectively.'),
  (np.float64(-2.961645294074422),
   'Mean and Standard deviation for Z_Merchant_Cat_Amount are 8.035465685040307, 2.1385072744028766 respectively.'),
  (np.float64(-0.8951269253266472),
   'Mean and Standard deviation for Z_Merchant_Amount are 6.307298740447541, 5.144893903516383 respectively.')),
 (np.True_,
  'Average & standard deviation for time differance and amount spent during binned hour 7 are                        (800.1326823314521, 315.1133412203902), (8.240697748588087, 0.48021174151273993) respectively.'))

In [None]:
# Shell escape that starts uvicorn with auto-reload.
# NOTE(review): nothing precedes ":app"; uvicorn's import string is normally
# "<module>:<attribute>" - TODO fill in the module that defines `app`.
!uvicorn :app --reload