# Image to Information
#### Extracting Named Entities using Conditional Random Fields


# Dataset
We will be using a publicly available dataset consisting of receipts. Open a file in the below folders to look at the structure of the OCR outputs and annotations for the dataset we're using for this exercise.

In [47]:
allOCRSet = "datasets/CleanOCR-all" # A folder with all OCR files
annotationSet = "datasets/Annotations-all" # A folder with all Annotation files

# Introduction to Optical Character Recognition (OCR)
Use Pytesseract to apply OCR on a sample receipt image.

In [48]:
import pytesseract
from PIL import Image

image_path = "datasets/SplitYB.png" # Receipt image from a restaurant

# Send receipt image through tesseract OCR
def getOCRText(image_path):
    image_data = Image.open(image_path)
    image_text = pytesseract.image_to_string(image_data)
    return image_text

print(getOCRText(image_path))

 

Split Yerba Buena

145 4th Street
San Francisco, CA 94103
Phone 415-296-8009

10/22/2018 1:35:32 PM
Order Id: AAAALGS4ACDE
33 - +++DINE HERE+++

2 VEGGIE BURGER (@10.95) $21.90
1 SONOMA $11.95
2 Sweet Potato Herb & Parmesan FREN $11.90
_ 2 Fresh Squeezed Lemonade (@2.95) $5.90

SF Employer Mandates $2.32
Sub Total $53.97
Sales Tax $4.59
Order Total $58 .56
American Express $58 .56

Card#: RRR 1000
Authorization: 573076

--> Order Closed <--

Order online at splitbread.com

 



# Text Labeling
Tokenize, Annotate and Tag a receipt for the total amount and merchant/business name

In [49]:
import pandas as pd
import os
import json

In [60]:
nerDataset = "datasets/ner-dataset.csv"
receiptNum = 0
def dataAnnotation():
	allCleanOCRFiles = os.listdir(allOCRSet) # Load all Clean OCR Files
	allIXFiles = os.listdir(annotationSet) # Load all Clean OCR Files
	outfile = open(nerDataset, 'w')
	global receiptNum
	receiptNum = 1
	outfile.write('Receipt #,Word,POS,Tag' + '\n')

	for ixfile in allIXFiles:
		filename = os.fsdecode(ixfile)
		for ocrfile in allCleanOCRFiles:
			ocrFilename = os.fsdecode(ocrfile)
			if filename == ocrFilename and filename != '.gitkeep':
				ixData = {}
				ocrCleanLine = ""
				with open(annotationSet + '/' + filename) as f1:
					ixData = json.load(f1)
				with open(allOCRSet + '/' + filename) as f2:
					ocrCleanLine = f2.readlines()
				annotationLine = ""
				wordNum = 0
				words = ocrCleanLine[0].replace(',', '').split(' ')
				isAmount = False
				isDate = False
				isMerchant = False
				beginMerchant = False
				beginDate = False

				merchantParts = ixData['company'].split(' ')
				dateParts = ixData['date'].split(' ')
				
				for word in words:
					annotationLine = ""
					word = word.replace(',', ' ')
					if wordNum == 0:
						if word == dateParts[0] and not isDate:
							annotationLine = 'Receipt: ' + str(receiptNum) + ',' + word + ',' + "DT" + ',' + 'B-date'
							isDate = True
							# Only expect a I-date if there are multiple parts to the date
							if len(dateParts) > 1:
								beginDate = True
							else:
								beginDate = False
						elif word == ixData['total'] and not isAmount:
							annotationLine = 'Receipt: ' + str(receiptNum) + ',' + word + ',' + "AM" + ',' + 'B-amt'
							isAmount = True
						elif word == merchantParts[0] and not isMerchant:
							annotationLine = 'Receipt: ' + str(receiptNum) + ',' + word + ',' + "MR" + ',' + 'B-mer'
							isMerchant = True
							beginMerchant = True
						else:
							annotationLine = 'Receipt: ' + str(receiptNum) + ',' + word + ',' + "NN" + ',' + 'O'
					else:
						#Date tagging
						if word == dateParts[0] and not isDate:
							annotationLine = ',' + word + ',' + "DT" + ',' + 'B-date'
							isDate = True
							# Only expect a I-date if there are multiple parts to the date
							if len(dateParts) > 1:
								# print("This date has many parts: " + ixData['date'])
								beginDate = True
							else:
								beginDate = False
						elif word in ixData['date'] and word != dateParts[0] and word != dateParts[-1] and beginDate:
							annotationLine = ',' + word + ',' + "DT" + ',' + 'I-date'
						elif word == dateParts[-1] and beginDate:
							beginDate = False
							annotationLine = ',' + word + ',' + "DT" + ',' + 'I-date'
						# Amount tagging
						elif word == ixData['total'] and not isAmount:
							annotationLine = ',' + word + ',' + "AM" + ',' + 'B-amt'
							isAmount = True

						# Merchant tagging
						elif word == merchantParts[0] and not isMerchant:
							beginMerchant = True
							isMerchant = True
							annotationLine = ',' + word + ',' + "MR" + ',' + 'B-mer'
						elif word in ixData['company'] and word != merchantParts[0] and word != merchantParts[-1] and beginMerchant:
							annotationLine = ',' + word + ',' + "MR" + ',' + 'I-mer'
						elif word == merchantParts[-1] and beginMerchant:
							beginMerchant = False
							annotationLine = ',' + word + ',' + "MR" + ',' + 'I-mer'

						#Other tagging
						else:
							annotationLine = ',' + word + ',' + "NN" + ',' + 'O'
					wordNum += 1						
					outfile.write(annotationLine + '\n')
				receiptNum += 1
	outfile.close
    
dataAnnotation()
print ('Annotations for %d files completed!' %receiptNum)

Annotations for 827 files completed!


# NER Model Training
Train a model using Conditional Random Fields algorithm with Named Entity Recognition concept to extract amount and merchant fields.

In [61]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn_crfsuite import CRF
from sklearn_crfsuite.metrics import flat_f1_score
from sklearn_crfsuite.metrics import flat_classification_report
from sklearn.datasets import make_blobs

In [62]:
#Reading the csv file
df = pd.read_csv('datasets/ner-dataset.csv', encoding = "ISO-8859-1")

#Display first 10 rows
df.head(10)
df.describe()
#Displaying the unique Tags
df['Tag'].unique()
#Checking null values, if any.
df.isnull().sum()
df = df.fillna(method = 'ffill')

In [63]:
class sentence(object):
    def __init__(self, df):
        self.n_sent = 1
        self.df = df
        self.empty = False
        agg = lambda s : [(w, p, t) for w, p, t in zip(s['Word'].values.tolist(),
                                                       s['POS'].values.tolist(),
                                                       s['Tag'].values.tolist())]
        self.grouped = self.df.groupby("Receipt #").apply(agg)
        self.sentences = [s for s in self.grouped]
        
    def get_text(self):
        try:
            s = self.grouped['Receipt: {}'.format(self.n_sent)]
            self.n_sent +=1
            return s
        except:
            return None
        
#Displaying one full sentence
getter = sentence(df)
sentences = [" ".join([s[0] for s in sent]) for sent in getter.sentences]
sentences[0]

#sentence with its pos and tag.
sent = getter.get_text()
# print(sent)
sentences = getter.sentences

## Feature Functions

In [64]:
def listToString(s):  
    
    # initialize an empty string 
    str1 = ""  
    
    # traverse in the string   
    for ele in s:  
        str1 += ele   
    
    # return string   
    return str1

def word2features(sent, i):
    word = sent[i][0]
    postag = sent[i][1]

    features = {
        'bias': 1.0,
        'word.lower()': word.lower(),
        'word[-3:]': word[-3:],
        'word[-2:]': word[-2:],
        'word.isupper()': word.isupper(),
        'word.istitle()': word.istitle(),
        'word.isdigit()': word.isdigit(),
        'postag': postag,
        'postag[:2]': postag[:2],
    }
    if i > 0:
        word1 = sent[i-1][0]
        postag1 = sent[i-1][1]
        features.update({
            '-1:word.lower()': word1.lower(),
            '-1:word.istitle()': word1.istitle(),
            '-1:word.isupper()': word1.isupper(),
            '-1:postag': postag1,
            '-1:postag[:2]': postag1[:2],
        })
    else:
        features['BOS'] = True

    if i < len(sent)-1:
        word1 = sent[i+1][0]
        postag1 = sent[i+1][1]
        features.update({
            '+1:word.lower()': word1.lower(),
            '+1:word.istitle()': word1.istitle(),
            '+1:word.isupper()': word1.isupper(),
            '+1:postag': postag1,
            '+1:postag[:2]': postag1[:2],
        })
    else:
        features['EOS'] = True

    return features


def sent2features(sent):
    return [word2features(sent, i) for i in range(len(sent))]

def sent2labels(sent):
    return [label for token, postag, label in sent]

def sent2tokens(sent):
    return [token for token, postag, label in sent]

## Train

In [65]:
X = [sent2features(s) for s in sentences]
y = [sent2labels(s) for s in sentences]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.6)

crf = CRF(algorithm = 'lbfgs',
         c1 = 0.1,
         c2 = 0.1,
         max_iterations = 100,
         all_possible_transitions = False)
crf.fit(X_train, y_train)



CRF(algorithm='lbfgs', all_possible_transitions=False, c1=0.1, c2=0.1,
    keep_tempfiles=None, max_iterations=100)

## Test 

In [66]:
#Predicting on the test set.
y_pred = crf.predict(X_test)

for i in range(len(y_pred)):
	prediction = y_pred[i]
	testList = X_test[i]
	testSentence = ""
	for testTuple in testList:
		testSentence = testSentence + testTuple['word.lower()'] + ' '
	words = testSentence.split(" ")
	x = 0
	for wordPrediction in prediction:
		if wordPrediction == 'B-date' or wordPrediction == 'B-amt' or wordPrediction == 'B-mer' or wordPrediction == 'I-mer' or wordPrediction == 'I-date':
			print(words[x],wordPrediction)
		x +=1

caldera B-mer
brewery I-mer
& I-mer
restaurant I-mer
12/28/2017 B-date
sanyu B-mer
stationery I-mer
shop I-mer
7.00 B-amt
11/12/2017 B-date
casa B-mer
chan I-mer
restaurant I-mer
21-dec-2014 B-date
moonstar B-mer
restaurant I-mer
37.37 B-amt
10/29/2018 B-date
one B-mer
three I-mer
seafood I-mer
restaurant I-mer
sdn I-mer
bhd I-mer
28-04-2018 B-date
65.70 B-amt
syarikat B-mer
perniagaan I-mer
gin I-mer
kee I-mer
12/01/2018 B-date
7.95 B-amt
argonaut B-mer
diner2000 I-mer
gardenia B-mer
bakeries I-mer
(kl) I-mer
sdn I-mer
bhd I-mer
21/08/2017 B-date
73.55 B-amt
hasha B-mer
petrokiosk I-mer
50.00 B-amt
hms B-mer
esquire I-mer
grille I-mer
apr04 B-date
16 I-date
aeon B-mer
co. I-mer
(m) I-mer
bhd I-mer
4.40 B-amt
12/04/2018 B-date
mr. B-mer
d.i.y. I-mer
(m) I-mer
sdn I-mer
bhd I-mer
7.15 B-amt
15-07-18 B-date
wahin B-mer
hardware I-mer
sdn I-mer
bhd I-mer
04/12/17 B-date
42.40 B-amt
guardian B-mer
health I-mer
and I-mer
beauty I-mer
sdn I-mer
bhd I-mer
13.78 B-amt
07/07/17 B-date
teo B-mer

bhd I-mer
28.00 B-amt
26-03-18 B-date
one B-mer
three I-mer
seafood I-mer
restaurant I-mer
sdn I-mer
bhd I-mer
28-05-2018 B-date
112.35 B-amt
mr B-mer
d.i.y. I-mer
(m) I-mer
sdn I-mer
bhd I-mer
30.90 B-amt
18-11-18 B-date
mr. B-mer
d.i.y. I-mer
(m) I-mer
sdn I-mer
bhd I-mer
19-04-18 B-date
b B-mer
& I-mer
best I-mer
restaurant I-mer
20/03/2017 B-date
14.85 B-amt
landfall B-mer
restaurant I-mer
april I-mer
to I-mer
december I-mer
06/10/17 B-date
10 B-mer
gram I-mer
gourmet I-mer
sbn I-mer
bhd I-mer
11-06-2018 B-date
15.00 B-amt
99 B-mer
speed I-mer
mart I-mer
s/b I-mer
20-02-18 B-date
9.90 B-amt
ideal B-mer
menu I-mer
group I-mer
sdn I-mer
bhd I-mer
10/06/2018 B-date
12.00 B-amt
mizu B-mer
mentai I-mer
sdn. I-mer
bhd. I-mer
31-03-2018 B-date
65.55 B-amt
eco-shop B-mer
marketing I-mer
sdn I-mer
bhd I-mer
26/02/2018 B-date
16.30 B-amt
el B-mer
meson I-mer
mexican I-mer
restaurant I-mer
grotto B-mer
pizzeria I-mer
& I-mer
tavern I-mer
5/12/2017 B-date
perniagaan B-mer
zheng I-mer
hui I-mer

ikano B-mer
handel I-mer
sdn I-mer
bhd I-mer
27/07/17 B-date
538.00 B-amt
aik B-mer
huat I-mer
hardware I-mer
enterprise I-mer
(setia I-mer
alam) I-mer
sdn I-mer
bhd I-mer
10/05/2017 B-date
16.00 B-amt
gardenia B-mer
bakeries I-mer
(kl) I-mer
sdn I-mer
bhd I-mer
30/07/2017 B-date
20.21 B-amt
unihakka B-mer
international I-mer
sdn I-mer
bhd I-mer
09 B-date
apr I-date
2018 I-date
$8.20 B-amt
99 B-mer
speed I-mer
mart I-mer
s/b I-mer
05-02-18 B-date
141.50 B-amt
99 B-mer
speed I-mer
mart I-mer
s/b I-mer
14-12-16 B-date
61.65 B-amt
gerbang B-mer
alaf I-mer
restaurants I-mer
sdn I-mer
bhd I-mer
21/05/2018 B-date
70.30 B-amt
mr. B-mer
d.i.y. I-mer
(m) I-mer
sdn I-mer
bhd I-mer
53.20 B-amt
28-04-18 B-date
one B-mer
three I-mer
seafood I-mer
restaurant I-mer
sdn I-mer
bhd I-mer
15-06-2018 B-date
148.50 B-amt
moonlight B-mer
cake I-mer
house I-mer
sdn I-mer
bhd I-mer
20/06/2018 B-date
rm14.20 B-amt
ocean B-mer
lc I-mer
packaging I-mer
enterprise I-mer
27/06/2018 B-date
rm152.00 B-amt
king's B-m

## Generate Report
Print the results of how your model was trained and tested to verify

In [67]:
f1_score = flat_f1_score(y_test, y_pred, average = 'weighted')
print(f1_score)

report = flat_classification_report(y_test, y_pred)
print(report)

0.9995389380400328




              precision    recall  f1-score   support

       B-amt       1.00      1.00      1.00       379
      B-date       1.00      1.00      1.00       453
       B-mer       0.97      0.98      0.98       466
      I-date       1.00      1.00      1.00        82
       I-mer       0.99      0.99      0.99      1405
           O       1.00      1.00      1.00     44796

    accuracy                           1.00     47581
   macro avg       0.99      1.00      0.99     47581
weighted avg       1.00      1.00      1.00     47581



# Practical Application
Go back above to Text Labeling section and add the below code under #Date Tagging section. Continue with the steps to generate a model that now extracts 3 fields.

In [None]:
						# Date tagging
						if word == dateParts[0] and not isDate:
							annotationLine = ',' + word + ',' + "DT" + ',' + 'B-date'
							isDate = True
							# Only expect a I-date if there are multiple parts to the date
							if len(dateParts) > 1:
								# print("This date has many parts: " + ixData['date'])
								beginDate = True
							else:
								beginDate = False
						elif word in ixData['date'] and word != dateParts[0] and word != dateParts[-1] and beginDate:
							annotationLine = ',' + word + ',' + "DT" + ',' + 'I-date'
						elif word == dateParts[-1] and beginDate:
							beginDate = False
							annotationLine = ',' + word + ',' + "DT" + ',' + 'I-date'

## Test End to End by uploading a new image - OCR to NER

In [37]:
new_image_path = "datasets/lunch.png"
test_file = getOCRText(new_image_path)
print(test_file)

Bon Appetit
Oracle Redwood
Cafe 300

ES
rm CHK 2042
6/14/2018 | 1.05 PM
1! & g Salad, Large Ceasar 5 09
1 Side, Chicken 2.39

 

eeeERKHK( 03

VISA:087861

Credit Authorization 6,13
VISA $8.13
EKKKKKKKKKKD 4D

Subtota | $7.48
Tax $0.65
Payment $8.13

Change Due $O.00

Seeerterene Check Closed -----------
6/14/2018 1:05 PM

   



In [38]:
def get_test_sentence(test_file):
    #Reading the csv file
    trial = pd.read_csv(test_file, encoding = "ISO-8859-1")

    #Display first 10 rows
    trial.head(10)
    trial.describe()
    #Displaying the unique Tags
    trial['Tag'].unique()
    #Checking null values, if any.
    trial.isnull().sum()
    trial = trial.fillna(method = 'ffill')
    trial_getter = sentence(trial) # Check last token - seems to be repeating 
    trial_sentences = [" ".join([s[0] for s in sent]) for sent in trial_getter.sentences]
    trial_sentences[0]

    #sentence with its pos and tag.
    trial_sent = trial_getter.get_text()
    trial_sentences = trial_getter.sentences
    return trial_sentences

trial_sentences = get_test_sentence(test_file)

FileNotFoundError: [Errno 2] No such file or directory: 'Bon Appetit\nOracle Redwood\nCafe 300\n\nES\nrm CHK 2042\n6/14/2018 | 1.05 PM\n1! & g Salad, Large Ceasar 5 09\n1 Side, Chicken 2.39\n\n \n\neeeERKHK( 03\n\nVISA:087861\n\nCredit Authorization 6,13\nVISA $8.13\nEKKKKKKKKKKD 4D\n\nSubtota | $7.48\nTax $0.65\nPayment $8.13\n\nChange Due $O.00\n\nSeeerterene Check Closed -----------\n6/14/2018 1:05 PM\n\n   \n\x0c'

In [None]:
tX = [sent2features(s) for s in trial_sentences]
ty = [sent2labels(s) for s in trial_sentences]

#Predicting on the one test set.
X_test = tX
y_test = ty
y_pred = crf.predict(X_test)

f1_score = flat_f1_score(y_test, y_pred, average = 'weighted')
print(f1_score)

report = flat_classification_report(y_test, y_pred)
print(report)