In [1]:
import pandas as pd
import numpy as np
import warnings
import re
# Suppress every warning for the rest of the session; with no category given,
# the "ignore" filter applies to all warning classes, including those raised by pandas.
warnings.filterwarnings("ignore")

In [2]:
# Read the labelled descriptors (transaction_descriptor, store_number, dataset)
# from the CSV next to the notebook and display the resulting frame.
DATA_PATH = 'data.csv'
df = pd.read_csv(DATA_PATH)
df

Unnamed: 0,transaction_descriptor,store_number,dataset
0,DOLRTREE 2257 00022574 ROSWELL,2257,train
1,AUTOZONE #3547,3547,train
2,TGI FRIDAYS 1485 0000,1485,train
3,BUFFALO WILD WINGS 003,3,train
4,J. CREW #568 0,568,train
...,...,...,...
295,MCDONALD'S F2151,F2151,test
296,NST BEST BUY #1403 332411,1403,test
297,CVS/PHARMACY #06689,6689,test
298,BANANA REPUBLIC #8109,8109,test


In [3]:
#Descriptors fall into two groups: those containing '#' and those without it
has_hash = df['transaction_descriptor'].str.contains('#')
print('Include #:', df[has_hash].shape)
print('Exclude #:', df[~has_hash].shape)

Include #: (159, 3)
Exclude #: (141, 3)


In [4]:
#Rows for the brands whose store_number contains a letter
special_brand_pattern = "|".join(["MCDONALD'S", "KFC", "PAPA MURPHY'S"])
df[df['transaction_descriptor'].str.contains(special_brand_pattern)]

Unnamed: 0,transaction_descriptor,store_number,dataset
17,MCDONALD'S F1013,F1013,train
20,MCDONALD'S F16829,F16829,train
29,MCDONALD'S F5579,F5579,train
37,MCDONALD'S F2128,F2128,train
40,MCDONALD'S F3172,F3172,train
74,MCDONALD'S F26490,F26490,train
90,MCDONALD'S F35869MCDONALD'S F35869,F35869,train
94,KFC G020029,G020029,train
104,MCDONALD'S F565 CLARKSVILLE TN,F565,validation
137,MCDONALD'S F6297,F6297,validation


In [5]:
#Split dataset
# Each split is taken as an explicit copy: later cells add a 'predicted' column
# to these frames, and assigning into a filtered selection of `df` is the
# pattern pandas flags with SettingWithCopyWarning. Copies make the splits
# independent of `df` and of each other.
train = df[df['dataset'] == 'train'].copy()
val = df[df['dataset'] == 'validation'].copy()
test = df[df['dataset'] == 'test'].copy()

In [6]:
# Display the validation split.
val

Unnamed: 0,transaction_descriptor,store_number,dataset
100,DEL TACO 833,833,validation
101,NNT BURLNGTON STORE472605,472605,validation
102,WENDY'S #05320,5320,validation
103,DUNKIN #337734 Q35,337734,validation
104,MCDONALD'S F565 CLARKSVILLE TN,F565,validation
...,...,...,...
195,ROSS STORES #15,15,validation
196,SPRINT STORE #346,346,validation
197,SPEEDWAY 07134 4343 OL,7134,validation
198,THE HOME DEPOT #8550,8550,validation


In [7]:
# Display the test split.
test

Unnamed: 0,transaction_descriptor,store_number,dataset
200,IN-N-OUT BURGER #242,242,test
201,BP#9442088LIBERTYVILLE B,9442088,test
202,JCPENNEY 1419,1419,test
203,ROSS STORES #1019,1019,test
204,WM SUPERCENTER #38,38,test
...,...,...,...
295,MCDONALD'S F2151,F2151,test
296,NST BEST BUY #1403 332411,1403,test
297,CVS/PHARMACY #06689,6689,test
298,BANANA REPUBLIC #8109,8109,test


In [8]:
#Create metric function
def matching_accuracy(y_true,y_pred):
    """Return the percentage of positions where the prediction equals the label.

    Parameters
    ----------
    y_true, y_pred : list, tuple, numpy array or pandas Series
        Labels and predictions, compared position by position.

    Returns
    -------
    str
        Accuracy in percent, rounded to two decimals and followed by '%'
        (for example '96.0%'). Empty inputs give 'nan%'.

    Raises
    ------
    ValueError
        If the two inputs do not have the same shape.
    """
    # Coerce to object arrays so `==` is always evaluated element by element.
    # On plain lists `y_true == y_pred` is a single bool, which previously
    # produced a wrong score.
    labels = np.asarray(y_true, dtype=object)
    guesses = np.asarray(y_pred, dtype=object)
    if labels.shape != guesses.shape:
        raise ValueError(
            f'y_true and y_pred must have the same shape, got {labels.shape} and {guesses.shape}'
        )

    pop = labels.size
    if pop == 0:
        # Same text the 0/0 division used to yield, without the runtime warning.
        return 'nan%'

    true_pred = np.count_nonzero(labels == guesses)

    return f'{np.round(true_pred/pop*100,2)}%'

In [9]:
#Model: rule-based store-number extractors
def get_id_hash(num):
    """Return the first digit run found after the first '#', without leading zeros.

    Only the text between the first and the second '#' (or the end of the
    string) is searched. Raises IndexError when the descriptor has no '#'
    or that segment holds no digits.
    """
    after_hash = num.split('#')[1]
    digit_runs = re.findall('[0-9]+', after_hash)
    first_run = digit_runs[0]
    # int() drops leading zeros, e.g. '06689' -> '6689'.
    return str(int(first_run))

def get_id_else(num):
    """Extract the store number from a descriptor that has no '#' marker.

    Rules, applied in order:

    1. If the descriptor names MCDONALD'S, KFC or PAPA MURPHY'S, return the
       first token made of capital letters immediately followed by digits,
       unchanged (for example 'F1013'). When no such token exists, continue
       with the numeric rules instead of failing.
    2. If the descriptor holds at least two digit runs, remove the first
       substring shaped like digits + punctuation + letters (every copy of
       that exact substring) so its digits are not considered.
    3. Of the remaining digit runs, return the first one when, with leading
       zeros dropped, it is as long as the longest raw run; otherwise return
       the first of the longest raw runs. Leading zeros are dropped from the
       result.

    Parameters
    ----------
    num : str
        Transaction descriptor text.

    Returns
    -------
    str
        The extracted store number.

    Raises
    ------
    IndexError
        If no digit run is left to choose from.
    """
    special_brands = ("MCDONALD'S", "KFC", "PAPA MURPHY'S")
    if any(brand in num for brand in special_brands):
        lettered_ids = re.findall('[A-Z]+[0-9]+', num)
        if lettered_ids:
            return lettered_ids[0]
        # No letter-prefixed id present: fall through to the numeric rules.

    if len(re.findall('[0-9]+', num)) >= 2:
        # One pattern is used for both the check and the removal. Previously
        # the check accepted lowercase letters while the removal required
        # capitals, so a lowercase match raised IndexError.
        noise = re.search('[0-9]+[^A-Za-z0-9 ]+[A-Za-z]+', num)
        if noise is not None:
            num = num.replace(noise.group(0), '')

    digit_runs = re.findall('[0-9]+', num)
    run_lengths = [len(run) for run in digit_runs]
    # Indexing an empty list raises IndexError when no digits remain.
    first_id = str(int(digit_runs[0]))
    longest = max(run_lengths)
    if len(first_id) == longest:
        return first_id
    # list.index returns the first occurrence, matching np.argmax on ties.
    return str(int(digit_runs[run_lengths.index(longest)]))

def get_id_full(num):
    """Extract the store number, choosing the rule set by whether '#' is present.

    Descriptors containing '#' go to get_id_hash; all others go to get_id_else.
    """
    extractor = get_id_hash if '#' in num else get_id_else
    return extractor(num)

In [10]:
#Score the extractor on the train split, then list the rows it gets wrong
train_descriptors = train['transaction_descriptor']
train['predicted'] = train_descriptors.apply(get_id_full)
train_missed = train['store_number'].ne(train['predicted'])
print(matching_accuracy(y_true=train['store_number'], y_pred=train['predicted']))
train.loc[train_missed]

96.0%


Unnamed: 0,transaction_descriptor,store_number,dataset,predicted
0,DOLRTREE 2257 00022574 ROSWELL,2257,train,22574
48,NNT LANE BRYANT 455460,5546,train,455460
65,NNT SEARS HOMETOWN 231594,2315,train,231594
99,RACETRAC485 00004853,485,train,4853


In [11]:
#Score the extractor on the validation split, then list the rows it gets wrong
val_descriptors = val['transaction_descriptor']
val['predicted'] = val_descriptors.apply(get_id_full)
val_missed = val['store_number'].ne(val['predicted'])
print(matching_accuracy(y_true=val['store_number'], y_pred=val['predicted']))
val.loc[val_missed]

96.0%


Unnamed: 0,transaction_descriptor,store_number,dataset,predicted
119,PIER 1 IMPORTS00008060,806,validation,8060
146,NNT SEARS HOMETOWN 000415,4,validation,415
160,OCHARLEYS206BOWLGR 000000,206,validation,0
167,SUBWAY 00040055,4005,validation,40055


In [12]:
#Score the extractor on the test split, then list the rows it gets wrong
test_descriptors = test['transaction_descriptor']
test['predicted'] = test_descriptors.apply(get_id_full)
test_missed = test['store_number'].ne(test['predicted'])
print(matching_accuracy(y_true=test['store_number'], y_pred=test['predicted']))
test.loc[test_missed]

96.0%


Unnamed: 0,transaction_descriptor,store_number,dataset,predicted
223,NNT POLO/RL WRENTHA130571,13057,test,130571
231,NNT SEARS HOMETOWN 862751,8627,test,862751
236,CASEYS GEN STORE 2597 SLOAN IA51055,2597,test,51055
292,SUBWAY 00032128,3212,test,32128
