In [1]:
def forward_pass_salary_identier(sal_df,non_sal_df,income_segment_id=None):
    """
    Performs salary identification (top down approach) for those accounts who have at least one is_salary=1 flag. 

    Arguments:
    -----------------------------------------------------------------
    sal_df: Salary transactions for pattern identification (DataFrame).
    non_sal_df: Non Salary Transaction of those accounts who have atleast one is_salary=1, except flagged month(DataFrame)
    income_segment_id: int, Income segmentation id.
    
    """
    
    # Get only those months for which salary has not been flagged and look for the salary transaction 
    
    df_all = non_sal_df.merge(sal_df, on=['account_number'], how='inner', indicator=True)
    df_all = df_all.rename(columns={'tran_date_x':'transaction_date'})
    
    
    
    # creating mask for filtering on the basis of amount and transaction date
    
    min_date_mask = df_all.loc[:,'transaction_date'] >= df_all.loc[:,'tran_date']
    max_date_mask = df_all.loc[:,'transaction_date'] <= df_all.loc[:,'forward_pass']
    min_amount_mask = df_all.loc[:,'amount_x'] >= df_all.loc[:,'min_amount_threshold_forward']
    max_amount_mask = df_all.loc[:,'amount_x']<= df_all.loc[:,'max_amount_threshold_forward']
    
    # applying mask for filtering on the basis of amount and transaction date
    
    df_all_amount_filtered =  df_all.loc[min_date_mask & max_date_mask & min_amount_mask & max_amount_mask]
    df_all_amount_filtered = df_all_amount_filtered.rename(columns={'amount':'prev_amount','month_start_x':'month_start'})
    df_all_amount_filtered = df_all_amount_filtered.assign(amount_changed = lambda x: abs(x['prev_amount']-x['amount_x']) / x['prev_amount'])

    # creating mask for within day range filter
    
    min_day_mask = df_all_amount_filtered.loc[:,'day_x']>=df_all_amount_filtered.loc[:,'min_day_threshold_forward']
    max_day_mask = df_all_amount_filtered.loc[:,'day_x']<=df_all_amount_filtered.loc[:,'max_day_threshold_forward']
    
    
    # applying mask for within day range filter
    
    df_day_window_filtered = df_all_amount_filtered.loc[min_day_mask & max_day_mask]
    
    # grouping to get min amount changed of the transaction for the same month
    
    df_day_window_filtered_grouped = df_day_window_filtered.groupby(['account_number','month_start']).agg(amount_changed=('amount_changed','min')).reset_index()    
    df_day_window_amt_diff_filtered = df_day_window_filtered.merge(df_day_window_filtered_grouped, on=['account_number','month_start','amount_changed'], how='inner')
    df_day_window_amt_diff_filtered.sort_values(by=["account_number", "transaction_date"],ascending=True, inplace=True)
    df_day_window_amt_diff_filtered.drop_duplicates(subset=["account_number", "month_start"],keep="first", inplace=True)
    
    
    # assigning forward pattern for within day range transaction
    
    df_forward_pattern = df_day_window_amt_diff_filtered.loc[:,['tran_id_x','account_number','transaction_date','amount_x','month_start']]
    df_forward_pattern['pattern_flag']=2
    
    df_forward_pattern = df_forward_pattern.rename(columns = {'amount_x':'amount','tran_id_x':'transaction_id'})
    
    
    # looking for out out day range transactions (pattern 3)
    
    df_forward_pattern_3 = df_all_amount_filtered.merge(df_forward_pattern, on=['account_number','month_start'],how='left',indicator='exist')
    df_forward_pattern_3 = df_forward_pattern_3.loc[df_forward_pattern_3['exist'] == 'left_only']
    df_forward_pattern_3.sort_values(by=["account_number", "month_start","amount_changed"],ascending=True, inplace=True)
    df_forward_pattern_3.drop_duplicates(subset=["account_number", "month_start"],keep="first", inplace=True)
    df_forward_pattern_3 = df_forward_pattern_3.loc[:,['tran_id_x','account_number','transaction_date_x','amount_x','month_start']]
    df_forward_pattern_3 = df_forward_pattern_3.rename(columns={'tran_id_x':'tran_id','transaction_date_x':'tran_date','amount_x':'amount'})
    df_forward_pattern_3['pattern_flag']=3

    df_forward_pattern = df_forward_pattern.rename(columns = {'transaction_date':'tran_date','transaction_id':'tran_id'})
    
    
    
    df_forward_pattern = pd.concat([df_forward_pattern_3,df_forward_pattern])

    
    return df_forward_pattern


    
    
    

In [2]:
def create_salary_pattern(sal_known, conf_fwd_min_amount_range, conf_fwd_max_amount_range, conf_bkd_min_amount_range,conf_bkd_max_amount_range,
                         conf_fwd_min_window, conf_fwd_max_window, conf_bkd_min_window, conf_bkd_max_window,
                         max_tran_date, min_tran_date):
    """
    Creates a salary identification pattern for forward and backward pass 

    Arguments:
    -----------------------------------------------------------------
    sal_known: Salary transactions where is_salary=1.
    conf_fwd_min_amount_range: minimum percentage range for amount in forward pass
    conf_fwd_max_amount_range: maximum percentage range for amount in forward pass
    conf_bkd_min_amount_range: minimum percentage range for amount in backward pass
    conf_bkd_max_amount_range: maximum percentage range for amount in backward pass
    conf_fwd_min_window: min day window for forward pass
    conf_fwd_max_window: max day window for forward pass
    conf_bkd_min_window: min day window for backward pass
    conf_bkd_max_window: max day window for backward pass
    max_tran_date: maximum transaction date in the entire transaction base table
    min_tran_date: minimum transaction date in the entire transaction base table

   
    """
    # creating an empty dataframe to work with
    sal_rec = pd.DataFrame(columns=['tran_id','account_number','tran_date','is_salary','amount','month_start'])
    sal_rec = sal_known.copy()
    
    
    sal_rec.loc[:,'tran_date'] = pd.to_datetime(sal_rec.loc[:,'tran_date'])
    sal_rec.loc[:,'day'] = sal_rec.loc[:,'tran_date'].dt.day
    sal_rec.loc[:,'month_end'] = pd.to_datetime(sal_rec.loc[:,'tran_date'])+pd.offsets.MonthEnd(n=0)
    sal_rec.loc[:,'month_end_day'] = sal_rec.loc[:,'month_end'].dt.day
    sal_rec.loc[:,'month'] = sal_rec.loc[:,'tran_date'].dt.month
    
    
    sal_rec = sal_rec.sort_values(by=["account_number", "tran_date"],ascending=True)
    sal_rec['forward_pass']=sal_rec.groupby(['account_number'])['tran_date'].shift(-1).fillna(max_tran_date)
    sal_rec['backward_pass']=sal_rec.groupby(['account_number'])['tran_date'].shift(1).fillna(min_tran_date)
    sal_rec.reset_index(drop=True)
    
    # setting threshold for amount (works for both forward and backward pass)

    sal_rec['min_amount_threshold_forward'] = conf_fwd_min_amount_range * sal_rec['amount'] 
    sal_rec['max_amount_threshold_forward'] = conf_fwd_max_amount_range * sal_rec['amount'] 

    sal_rec['min_amount_threshold_backward'] = conf_bkd_min_amount_range * sal_rec['amount'] 
    sal_rec['max_amount_threshold_backward'] = conf_bkd_max_amount_range * sal_rec['amount'] 


    # setting threshold for day range

    sal_rec['constant_one']=1
    
    sal_rec['min_day_window_forward'] = sal_rec['day'] - conf_fwd_min_window 
    sal_rec['max_day_window_forward'] = sal_rec['day'] + conf_fwd_max_window 

    sal_rec['min_day_window_backward'] = sal_rec['day'] - conf_bkd_min_window 
    sal_rec['max_day_window_backward'] = sal_rec['day'] + conf_bkd_max_window 
    sal_rec.reset_index(drop=True)
    
    # get min and max window for forward pass
    
    sal_rec['min_day_threshold_forward'] = sal_rec[['constant_one','min_day_window_forward']].max(axis=1)
    sal_rec['max_day_threshold_forward'] = sal_rec[['max_day_window_forward','month_end_day']].min(axis=1)
    
     # get min and max window for backward pass

    sal_rec['min_day_threshold_backward'] = sal_rec[['constant_one','min_day_window_backward']].max(axis=1)
    sal_rec['max_day_threshold_backward'] = sal_rec[['max_day_window_backward','month_end_day']].min(axis=1)
    sal_rec = sal_rec.reset_index(drop=True)
    
    
    return sal_rec

    
    

In [3]:
# notebook imports

import numpy as np
import pandas as pd
from pandas.tseries.offsets import MonthEnd


In [4]:
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

In [5]:
tran_id1 = pd.Series(np.arange(27)).astype(int)
account_number1 = pd.Series(['abc']*27)
date1 = pd.Series(['2020-10-27','2020-11-03','2020-12-05','2020-12-15','2021-01-01','2021-01-02','2021-01-3','2021-1-30','2021-02-04','2021-02-04','2021-02-05','2021-02-08','2021-02-11','2021-03-04','2021-04-01',"2021-04-04","2021-04-30","2021-05-03","2021-06-05","2021-06-08","2021-06-30","2021-07-07","2021-07-15","2021-08-03","2021-09-09","2021-09-10","2021-10-09"])
is_salary1 = pd.Series([0,0,0,0,1, 0, 0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0])
amount1 = pd.Series([9500,8500,9500,10000,11000,12000,200,11000,11000,11500,11000,110,16000,17500,17000,30000,18700,18000,17000,500,18000,17800,30000,120,18000,17800,600000])

In [6]:
tran_id2 = pd.Series(np.arange(27,53,1)).astype(int)
account_number2 = pd.Series(['xyz']*26)
date2 = pd.Series(['2020-11-27','2020-11-03','2020-12-05','2020-12-15','2021-01-01','2021-01-02','2021-01-3','2021-1-30','2021-02-04','2021-02-04','2021-02-05','2021-02-08','2021-02-11','2021-03-04','2021-04-01',"2021-04-04","2021-04-30","2021-05-03","2021-06-05","2021-06-08","2021-06-30","2021-07-07","2021-07-15","2021-08-03","2021-09-09","2021-10-09"])
is_salary2 = pd.Series([0,0,0,0,1, 0, 0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0])
amount2 = pd.Series([5500,7500,9500,10000,11000,12000,200,11000,11000,11500,11000,110,16000,17500,17000,30000,18700,18000,17000,500,18000,17800,30000,120,18000,600000])

In [7]:
df1 = pd.concat([tran_id1,account_number1,date1,is_salary1,amount1],axis=1)
df2 = pd.concat([tran_id2,account_number2,date2,is_salary2,amount2],axis=1)
df = pd.concat([df1,df2])
df.columns = df.columns = ['tran_id','account_number','tran_date','is_salary','amount']
df['tran_date'] = pd.to_datetime(df['tran_date'])

In [8]:
max_tran_date = df.tran_date.max()
min_tran_date = df.tran_date.min()
print('max tran date is: ',max_tran_date)
print('min tran date is: ',min_tran_date)


max tran date is:  2021-10-09 00:00:00
min tran date is:  2020-10-27 00:00:00


In [9]:
df['month_start'] = df['tran_date'].apply(lambda x: x.replace(day=1))

In [10]:
# get salary flagged transaction (where is_salary = 1)

In [11]:
sal_rec = df.loc[df.is_salary==1]
sal_rec

Unnamed: 0,tran_id,account_number,tran_date,is_salary,amount,month_start
4,4,abc,2021-01-01,1,11000,2021-01-01
13,13,abc,2021-03-04,1,17500,2021-03-01
17,17,abc,2021-05-03,1,18000,2021-05-01
4,31,xyz,2021-01-01,1,11000,2021-01-01
13,40,xyz,2021-03-04,1,17500,2021-03-01
17,44,xyz,2021-05-03,1,18000,2021-05-01


In [12]:
sal_rec_final = create_salary_pattern(sal_rec, 0.9, 1.3, 0.7,1.1,
                         4, 4, 4, 4,
                         max_tran_date, min_tran_date)

In [13]:
sal_rec_final

Unnamed: 0,tran_id,account_number,tran_date,is_salary,amount,month_start,day,month_end,month_end_day,month,forward_pass,backward_pass,min_amount_threshold_forward,max_amount_threshold_forward,min_amount_threshold_backward,max_amount_threshold_backward,constant_one,min_day_window_forward,max_day_window_forward,min_day_window_backward,max_day_window_backward,min_day_threshold_forward,max_day_threshold_forward,min_day_threshold_backward,max_day_threshold_backward
0,4,abc,2021-01-01,1,11000,2021-01-01,1,2021-01-31,31,1,2021-03-04,2020-10-27,9900.0,14300.0,7700.0,12100.0,1,-3,5,-3,5,1,5,1,5
1,13,abc,2021-03-04,1,17500,2021-03-01,4,2021-03-31,31,3,2021-05-03,2021-01-01,15750.0,22750.0,12250.0,19250.0,1,0,8,0,8,1,8,1,8
2,17,abc,2021-05-03,1,18000,2021-05-01,3,2021-05-31,31,5,2021-10-09,2021-03-04,16200.0,23400.0,12600.0,19800.0,1,-1,7,-1,7,1,7,1,7
3,31,xyz,2021-01-01,1,11000,2021-01-01,1,2021-01-31,31,1,2021-03-04,2020-10-27,9900.0,14300.0,7700.0,12100.0,1,-3,5,-3,5,1,5,1,5
4,40,xyz,2021-03-04,1,17500,2021-03-01,4,2021-03-31,31,3,2021-05-03,2021-01-01,15750.0,22750.0,12250.0,19250.0,1,0,8,0,8,1,8,1,8
5,44,xyz,2021-05-03,1,18000,2021-05-01,3,2021-05-31,31,5,2021-10-09,2021-03-04,16200.0,23400.0,12600.0,19800.0,1,-1,7,-1,7,1,7,1,7


In [14]:
## non_salary transaction of salary accounts

non_sal_rec = df.loc[df.is_salary==0]
non_sal_rec.loc[:,'month_end'] = pd.to_datetime(non_sal_rec['tran_date'])+pd.offsets.MonthEnd(n=0)
non_sal_rec.loc[:,'month_end_day'] = non_sal_rec['month_end'].dt.day
non_sal_rec.loc[:,'day'] = non_sal_rec.loc[:,'tran_date'].dt.day
non_sal_rec.loc[:,'month'] = non_sal_rec.loc[:,'tran_date'].dt.month
non_sal_rec

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = value
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(ilocs[0], value, pi)


Unnamed: 0,tran_id,account_number,tran_date,is_salary,amount,month_start,month_end,month_end_day,day,month
0,0,abc,2020-10-27,0,9500,2020-10-01,2020-10-31,31,27,10
1,1,abc,2020-11-03,0,8500,2020-11-01,2020-11-30,30,3,11
2,2,abc,2020-12-05,0,9500,2020-12-01,2020-12-31,31,5,12
3,3,abc,2020-12-15,0,10000,2020-12-01,2020-12-31,31,15,12
5,5,abc,2021-01-02,0,12000,2021-01-01,2021-01-31,31,2,1
6,6,abc,2021-01-03,0,200,2021-01-01,2021-01-31,31,3,1
7,7,abc,2021-01-30,0,11000,2021-01-01,2021-01-31,31,30,1
8,8,abc,2021-02-04,0,11000,2021-02-01,2021-02-28,28,4,2
9,9,abc,2021-02-04,0,11500,2021-02-01,2021-02-28,28,4,2
10,10,abc,2021-02-05,0,11000,2021-02-01,2021-02-28,28,5,2


In [15]:
df_all = non_sal_rec.merge(sal_rec_final, on=['account_number','month_start'], how='left', indicator=True)
df_all

Unnamed: 0,tran_id_x,account_number,tran_date_x,is_salary_x,amount_x,month_start,month_end_x,month_end_day_x,day_x,month_x,tran_id_y,tran_date_y,is_salary_y,amount_y,day_y,month_end_y,month_end_day_y,month_y,forward_pass,backward_pass,min_amount_threshold_forward,max_amount_threshold_forward,min_amount_threshold_backward,max_amount_threshold_backward,constant_one,min_day_window_forward,max_day_window_forward,min_day_window_backward,max_day_window_backward,min_day_threshold_forward,max_day_threshold_forward,min_day_threshold_backward,max_day_threshold_backward,_merge
0,0,abc,2020-10-27,0,9500,2020-10-01,2020-10-31,31,27,10,,NaT,,,,NaT,,,NaT,NaT,,,,,,,,,,,,,,left_only
1,1,abc,2020-11-03,0,8500,2020-11-01,2020-11-30,30,3,11,,NaT,,,,NaT,,,NaT,NaT,,,,,,,,,,,,,,left_only
2,2,abc,2020-12-05,0,9500,2020-12-01,2020-12-31,31,5,12,,NaT,,,,NaT,,,NaT,NaT,,,,,,,,,,,,,,left_only
3,3,abc,2020-12-15,0,10000,2020-12-01,2020-12-31,31,15,12,,NaT,,,,NaT,,,NaT,NaT,,,,,,,,,,,,,,left_only
4,5,abc,2021-01-02,0,12000,2021-01-01,2021-01-31,31,2,1,4.0,2021-01-01,1.0,11000.0,1.0,2021-01-31,31.0,1.0,2021-03-04,2020-10-27,9900.0,14300.0,7700.0,12100.0,1.0,-3.0,5.0,-3.0,5.0,1.0,5.0,1.0,5.0,both
5,6,abc,2021-01-03,0,200,2021-01-01,2021-01-31,31,3,1,4.0,2021-01-01,1.0,11000.0,1.0,2021-01-31,31.0,1.0,2021-03-04,2020-10-27,9900.0,14300.0,7700.0,12100.0,1.0,-3.0,5.0,-3.0,5.0,1.0,5.0,1.0,5.0,both
6,7,abc,2021-01-30,0,11000,2021-01-01,2021-01-31,31,30,1,4.0,2021-01-01,1.0,11000.0,1.0,2021-01-31,31.0,1.0,2021-03-04,2020-10-27,9900.0,14300.0,7700.0,12100.0,1.0,-3.0,5.0,-3.0,5.0,1.0,5.0,1.0,5.0,both
7,8,abc,2021-02-04,0,11000,2021-02-01,2021-02-28,28,4,2,,NaT,,,,NaT,,,NaT,NaT,,,,,,,,,,,,,,left_only
8,9,abc,2021-02-04,0,11500,2021-02-01,2021-02-28,28,4,2,,NaT,,,,NaT,,,NaT,NaT,,,,,,,,,,,,,,left_only
9,10,abc,2021-02-05,0,11000,2021-02-01,2021-02-28,28,5,2,,NaT,,,,NaT,,,NaT,NaT,,,,,,,,,,,,,,left_only


In [16]:
# getting only those months for which salry has not been flagged

In [17]:
non_sal_rem = df_all.loc[df_all['_merge'] == 'left_only']
non_sal_rem.drop(['tran_date_y','amount_y','_merge','month_end_day_y','month_end_y','day_y','month_y','is_salary_y'],axis=1,inplace=True)
# non_sal_rem.sample(10)
non_sal_rem.drop(['forward_pass','backward_pass','min_amount_threshold_forward','max_amount_threshold_forward','min_amount_threshold_backward','max_amount_threshold_backward','constant_one','min_day_window_forward','max_day_window_forward','max_day_window_backward','min_day_window_backward','min_day_threshold_forward','max_day_threshold_forward','min_day_threshold_backward','max_day_threshold_backward','tran_id_y'],axis=1,inplace=True)



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


In [18]:
non_sal_rem

Unnamed: 0,tran_id_x,account_number,tran_date_x,is_salary_x,amount_x,month_start,month_end_x,month_end_day_x,day_x,month_x
0,0,abc,2020-10-27,0,9500,2020-10-01,2020-10-31,31,27,10
1,1,abc,2020-11-03,0,8500,2020-11-01,2020-11-30,30,3,11
2,2,abc,2020-12-05,0,9500,2020-12-01,2020-12-31,31,5,12
3,3,abc,2020-12-15,0,10000,2020-12-01,2020-12-31,31,15,12
7,8,abc,2021-02-04,0,11000,2021-02-01,2021-02-28,28,4,2
8,9,abc,2021-02-04,0,11500,2021-02-01,2021-02-28,28,4,2
9,10,abc,2021-02-05,0,11000,2021-02-01,2021-02-28,28,5,2
10,11,abc,2021-02-08,0,110,2021-02-01,2021-02-28,28,8,2
11,12,abc,2021-02-11,0,16000,2021-02-01,2021-02-28,28,11,2
12,14,abc,2021-04-01,0,17000,2021-04-01,2021-04-30,30,1,4


In [19]:
# df_forward_pattern = forward_pass_salary_identier(sal_rec_final,non_sal_rem)

In [20]:
df_forward_pattern = forward_pass_salary_identier(sal_rec_final,non_sal_rem)
df_forward_pattern

Unnamed: 0,tran_id,account_number,tran_date,amount,month_start,pattern_flag
8,24,abc,2021-09-09,18000,2021-09-01,3
18,51,xyz,2021-09-09,18000,2021-09-01,3
0,8,abc,2021-02-04,11000,2021-02-01,2
2,14,abc,2021-04-01,17000,2021-04-01,2
3,18,abc,2021-06-05,17000,2021-06-01,2
4,21,abc,2021-07-07,17800,2021-07-01,2
5,35,xyz,2021-02-04,11000,2021-02-01,2
7,41,xyz,2021-04-01,17000,2021-04-01,2
8,45,xyz,2021-06-05,17000,2021-06-01,2
9,48,xyz,2021-07-07,17800,2021-07-01,2


In [21]:
## create module for backward pass

In [22]:
def backward_pass_salary_identier(foward_pattern,non_sal_df,sal_df,income_segment_id=None):
    """
    Performs salary identification (bottom up approach) for those accounts who have at least one is_salary=1 flag. 

    Arguments:
    -----------------------------------------------------------------
    forward_pattern: Salary pattern identified after forward pass (DataFrame) [tran_id,account_number,tran_date,amount,month_start,pattern_flag].
    non_sal_df: Non Salary Transaction of those accounts who have atleast one is_salary=1, except flagged month(DataFrame)
    sal_df: Salary transactions for pattern identification (DataFrame).
    income_segment_id: int, Income segmentation id.
    
    """
    
    # select only those months for which forward pass could not find the salary pattern

    non_sal_rem_backward = non_sal_df.merge(foward_pattern, on=['account_number','month_start'],how='left',indicator=True)
    non_sal_rem_backward = non_sal_rem_backward.loc[non_sal_rem_backward['_merge'] == 'left_only']
    
    # taking subset from non_sal_rem_backward
    non_sal_rem_backward = non_sal_rem_backward.loc[:,['tran_id_x','account_number','tran_date_x','amount_x','month_start','month_end_day_x','day_x']]

    # renaming columns
    non_sal_rem_backward = non_sal_rem_backward.rename(columns = {'tran_id_x':'tran_id','tran_date_x':'tran_date','amount_x':'amount','month_end_day_x':'month_end_day','day_x':'day'})
    
    
    df_all = non_sal_rem_backward.merge(sal_df, on=['account_number'], how='inner', indicator=True)
    df_all = df_all.rename(columns={'tran_date_x':'transaction_date','tran_id_x':'transaction_id'})
    
    # creating mask for filtering on the basis of amount and transaction date
    
    min_date_mask = df_all.loc[:,'transaction_date'] >= df_all.loc[:,'backward_pass']
    max_date_mask = df_all.loc[:,'transaction_date'] <= df_all.loc[:,'tran_date_y']
    min_amount_mask = df_all.loc[:,'amount_x'] >= df_all.loc[:,'min_amount_threshold_backward']
    max_amount_mask = df_all.loc[:,'amount_x']<= df_all.loc[:,'max_amount_threshold_backward']
    
    # applying mask for filtering on the basis of amount and transaction date (this will help to diffrentiate pattern 2 from pattern 4)
    
    df_all_amount_filtered =  df_all.loc[min_date_mask & max_date_mask & min_amount_mask & max_amount_mask]
    df_all_amount_filtered = df_all_amount_filtered.rename(columns={'amount_y':'prev_amount','month_start_x':'month_start'})
    df_all_amount_filtered = df_all_amount_filtered.assign(amount_changed = lambda x: abs(x['prev_amount']-x['amount_x']) / x['prev_amount'])
    
    # creating mask for within day range filter
    min_day_mask = df_all_amount_filtered.loc[:,'day_x']>=df_all_amount_filtered.loc[:,'min_day_threshold_backward']
    max_day_mask = df_all_amount_filtered.loc[:,'day_x']<=df_all_amount_filtered.loc[:,'max_day_threshold_backward']
    
    
    # applying mask for within day filter
    df_day_window_filtered = df_all_amount_filtered.loc[min_day_mask & max_day_mask]
    
    # grouping and filtering to get minimum amount change
    df_day_window_filtered_grouped = df_day_window_filtered.groupby(['account_number','month_start']).agg(amount_changed=('amount_changed','min')).reset_index()
    df_day_window_amt_diff_filtered = df_day_window_filtered.merge(df_day_window_filtered_grouped, on=['account_number','month_start','amount_changed'], how='inner')
    # if same transaction in same month select transaciton date that is minimum
    df_day_window_amt_diff_filtered.sort_values(by=["account_number", "transaction_date"],ascending=True, inplace=True)
    df_day_window_amt_diff_filtered.drop_duplicates(subset=["account_number", "month_start"],keep="first", inplace=True)
    
    
    # assigning pattern 2 for those accounts for which transaction lies within day
    df_backward_pattern = df_day_window_amt_diff_filtered.loc[:,['transaction_id','account_number','transaction_date','amount_x','month_start']]
    df_backward_pattern['pattern_flag']=2
    df_backward_pattern = df_backward_pattern.rename(columns = {'amount_x':'amount'})
    
    # assign pattern 3 for those account transaction that lies out of day range window
    
    df_backward_pattern_3 = df_all_amount_filtered.merge(df_backward_pattern, on=['account_number','month_start'],how='left',indicator='exist')
    df_backward_pattern_3 = df_backward_pattern_3.loc[df_backward_pattern_3['exist'] == 'left_only']
    
    # if multiple amounts in same month, select minimum change in amount
    # to handle: if same amount in same month select one (takes one randomly)
    df_backward_pattern_3.sort_values(by=["account_number", "month_start","amount_changed"],ascending=True, inplace=True)
    df_backward_pattern_3.drop_duplicates(subset=["account_number", "month_start"],keep="first", inplace=True)
    df_backward_pattern_3 = df_backward_pattern_3.loc[:,['transaction_id_x','account_number','transaction_date_x','amount_x','month_start']]
    df_backward_pattern_3 = df_backward_pattern_3.rename(columns={'transaction_id_x':'tran_id','transaction_date_x':'tran_date','amount_x':'amount'})
    df_backward_pattern_3['pattern_flag']=3
    
    df_backward_pattern = df_backward_pattern.rename(columns = {'transaction_date':'tran_date','transaction_id':'tran_id'})
    
    # concat/union of pattern 2 and pattern 3
    df_backward_pattern = pd.concat([df_backward_pattern_3,df_backward_pattern])
    
    return df_backward_pattern


    

In [23]:
df_backward_pattern = backward_pass_salary_identier(df_forward_pattern,non_sal_rem,sal_rec_final)
df_backward_pattern

Unnamed: 0,tran_id,account_number,tran_date,amount,month_start,pattern_flag
0,0,abc,2020-10-27,9500,2020-10-01,3
0,1,abc,2020-11-03,8500,2020-11-01,2
1,2,abc,2020-12-05,9500,2020-12-01,2
2,29,xyz,2020-12-05,9500,2020-12-01,2


In [24]:
# we can take tran_id, account_number and pattern_flag adn append in df_flag

In [27]:
df_forward_backward = pd.concat([df_forward_pattern,df_backward_pattern]).reset_index(drop=True)
df_forward_backward

Unnamed: 0,tran_id,account_number,tran_date,amount,month_start,pattern_flag
0,24,abc,2021-09-09,18000,2021-09-01,3
1,51,xyz,2021-09-09,18000,2021-09-01,3
2,8,abc,2021-02-04,11000,2021-02-01,2
3,14,abc,2021-04-01,17000,2021-04-01,2
4,18,abc,2021-06-05,17000,2021-06-01,2
5,21,abc,2021-07-07,17800,2021-07-01,2
6,35,xyz,2021-02-04,11000,2021-02-01,2
7,41,xyz,2021-04-01,17000,2021-04-01,2
8,45,xyz,2021-06-05,17000,2021-06-01,2
9,48,xyz,2021-07-07,17800,2021-07-01,2


In [26]:
def salary_pattern_identifier():
    pass