<a href="https://colab.research.google.com/github/Bob-Gohardani/nlp-ml/blob/main/Fraud_Detection.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
import math
import warnings
from datetime import datetime
from pandas.core.common import SettingWithCopyWarning
warnings.simplefilter(action="ignore", category=SettingWithCopyWarning)

In [None]:
#########################
# Red Flags definition
#########################

# Red flag 1

# Computing the distance between two strings. It is assumed that str1 precedes str2 in alphabetical order.
# If str1 = some_stringXXXX and str2 = some_stringYYYY, where XXXX and YYYY are four-digit numbers and the string parts
# of str1 and str2 are the same, then distance() equals YYYY - XXXX. Otherwise distance() is 10000.

def distance(str1, str2):
    """Return the numeric gap between two strings that differ only in their last four digits.

    Both strings are split into a prefix (everything except the last four
    characters) and a suffix (the last four characters; the whole string when
    it is shorter than that).

    Args:
        str1: First string; expected to sort before ``str2``.
        str2: Second string.

    Returns:
        ``int(suffix2) - int(suffix1)`` when both suffixes consist of decimal
        digits and the prefixes are equal; otherwise the sentinel ``10000``.
    """
    not_comparable = 10000

    suffix1 = str1[-4:]
    suffix2 = str2[-4:]

    # isdecimal() only accepts characters that int() can parse. isnumeric()
    # also accepts superscripts, fractions, etc., for which int() raises
    # ValueError instead of yielding the sentinel.
    if not (suffix1.isdecimal() and suffix2.isdecimal()):
        return not_comparable

    if str1[:-4] != str2[:-4]:
        return not_comparable

    return int(suffix2) - int(suffix1)

In [None]:
def flag1(df):
    """Add the red-flag columns 'RF1' and 'RF1_density' to a sorted copy of ``df``.

    Rows are ordered by vendor ('Vend#') and, within a vendor, by invoice
    number ('RefDoc'). Rows whose 'Vend#' is missing are discarded. For every
    pair of neighbouring rows of the same vendor the gap between the invoice
    numbers is measured with distance(); the first row of each vendor has
    gap 0.

    * 'RF1' is 1 on both rows of every neighbouring pair whose gap is exactly
      1, otherwise 0. If ``df`` already has an 'RF1' column, rows holding 1
      there stay 1 and every other value becomes 0.
    * 'RF1_density' is 1 on every row that belongs to a window whose gap sum
      is below 21, otherwise 0. A window is the sum of a row's gap and the
      two preceding gaps of the same vendor, i.e. it spans four invoices, or
      three when the window starts at the vendor's first invoice.

    Args:
        df: DataFrame with at least the columns 'Vend#' and 'RefDoc'.

    Returns:
        A new DataFrame with a fresh 0..n-1 index; ``df`` itself is not
        modified.
    """
    # Sorting by (vendor, invoice) puts each vendor's invoices in one
    # contiguous, ascending run without relying on groupby().apply() keeping
    # the grouping column.
    df1 = (
        df.dropna(subset=['Vend#'])
        .sort_values(['Vend#', 'RefDoc'], ascending=True)
        .reset_index(drop=True)
    )

    row_count = len(df1)
    vendors = df1['Vend#'].tolist()
    refs = [str(ref) for ref in df1['RefDoc'].tolist()]

    # Always materialise 'RF1' so the column exists even when nothing is
    # flagged; pre-existing flags are normalised to 0/1.
    if 'RF1' in df1.columns:
        rf1 = [1 if value == 1 else 0 for value in df1['RF1'].tolist()]
    else:
        rf1 = [0] * row_count
    dense = [0] * row_count

    # gaps[i] is the distance between invoice i and its predecessor; it stays
    # 0 for the first invoice of every vendor.
    gaps = [0] * row_count
    group_start = 0
    for pos in range(1, row_count):
        if vendors[pos] != vendors[pos - 1]:
            # New vendor: never compare against the previous vendor's invoice.
            group_start = pos
            continue

        gaps[pos] = distance(refs[pos - 1], refs[pos])
        if gaps[pos] == 1:
            rf1[pos - 1] = 1
            rf1[pos] = 1

        offset = pos - group_start
        if offset >= 2 and gaps[pos - 2] + gaps[pos - 1] + gaps[pos] < 21:
            # With offset == 2 the oldest gap is the vendor's leading 0, so
            # the window covers only three invoices; otherwise it covers four.
            window_start = pos - 3 if offset > 2 else pos - 2
            for member in range(window_start, pos + 1):
                dense[member] = 1

    df1['RF1'] = rf1
    df1['RF1_density'] = dense
    return df1