# Image to Information
#### Extracting Named Entities using Conditional Random Fields


# Dataset
We will be using a publicly available dataset consisting of receipts. Open a file in the below folders to look at the structure of the OCR outputs and annotations for the dataset we're using for this exercise.

In [146]:
import warnings
warnings.filterwarnings('ignore')

allOCRSet = "datasets/CleanOCR-all" # A folder with all OCR files
allAnnotationSet = "datasets/Annotations-all" # A folder with all Annotation files

# Introduction to Optical Character Recognition (OCR)
Use Pytesseract to apply OCR on a sample receipt image.

In [None]:
import pytesseract
from PIL import Image

image_path = "datasets/SplitYB.png" # Receipt image from a restaurant

# Send receipt image through tesseract OCR
def getOCRText(image_path):
    image_data = Image.open(image_path)
    image_text = pytesseract.image_to_string(image_data)
    return image_text

print(getOCRText(image_path))

# Text Labeling
Tokenize, Annotate and Tag a receipt for the total amount and merchant/business name

In [90]:
import pandas as pd
import os
import json

In [133]:
allNerDataset = "datasets/ner-dataset.csv"

def dataAnnotation(OCRset, annotationSet, nerDataset):
	allCleanOCRFiles = os.listdir(OCRset) # Load all Clean OCR Files
	allIXFiles = os.listdir(annotationSet) # Load all Clean OCR Files
	outfile = open(nerDataset, 'w')
	receiptNum = 1
	outfile.write('Receipt #,Word,POS,Tag' + '\n')

	for ixfile in allIXFiles:
		filename = os.fsdecode(ixfile)
		for ocrfile in allCleanOCRFiles:
			ocrFilename = os.fsdecode(ocrfile)
			if filename == ocrFilename and not filename.startswith('.'):
				ixData = {}
				ocrCleanLine = ""
				print(filename)
				with open(annotationSet + '/' + filename) as f1:
					ixData = json.load(f1)
				with open(OCRset + '/' + filename) as f2:
					ocrCleanLine = f2.readlines()
				annotationLine = ""
				wordNum = 0
				words = ocrCleanLine[0].replace(',', '').split(' ')
				isAmount = False
				isDate = False
				isMerchant = False
				beginMerchant = False
				beginDate = False

				merchantParts = ixData['company'].split(' ')
				dateParts = ixData['date'].split(' ')
				
				for word in words:
					annotationLine = ""
					word = word.replace(',', ' ')
					if wordNum == 0:
						if word == dateParts[0] and not isDate:
							annotationLine = 'Receipt: ' + str(receiptNum) + ',' + word + ',' + "DT" + ',' + 'B-date'
							isDate = True
							# Only expect a I-date if there are multiple parts to the date
							if len(dateParts) > 1:
								beginDate = True
							else:
								beginDate = False
						elif word == ixData['total'] and not isAmount:
							annotationLine = 'Receipt: ' + str(receiptNum) + ',' + word + ',' + "AM" + ',' + 'B-amt'
							isAmount = True
						elif word == merchantParts[0] and not isMerchant:
							annotationLine = 'Receipt: ' + str(receiptNum) + ',' + word + ',' + "MR" + ',' + 'B-mer'
							isMerchant = True
							beginMerchant = True
						else:
							annotationLine = 'Receipt: ' + str(receiptNum) + ',' + word + ',' + "NN" + ',' + 'O'
					else:
						#Date tagging
						if word == dateParts[0] and not isDate:
							annotationLine = ',' + word + ',' + "DT" + ',' + 'B-date'
							isDate = True
							# Only expect a I-date if there are multiple parts to the date
							if len(dateParts) > 1:
								# print("This date has many parts: " + ixData['date'])
								beginDate = True
							else:
								beginDate = False
						elif word in ixData['date'] and word != dateParts[0] and word != dateParts[-1] and beginDate:
							annotationLine = ',' + word + ',' + "DT" + ',' + 'I-date'
						elif word == dateParts[-1] and beginDate:
							beginDate = False
							annotationLine = ',' + word + ',' + "DT" + ',' + 'I-date'
						# Amount tagging
						elif word == ixData['total'] and not isAmount:
							annotationLine = ',' + word + ',' + "AM" + ',' + 'B-amt'
							isAmount = True

						# Merchant tagging
						elif word == merchantParts[0] and not isMerchant:
							beginMerchant = True
							isMerchant = True
							annotationLine = ',' + word + ',' + "MR" + ',' + 'B-mer'
						elif word in ixData['company'] and word != merchantParts[0] and word != merchantParts[-1] and beginMerchant:
							annotationLine = ',' + word + ',' + "MR" + ',' + 'I-mer'
						elif word == merchantParts[-1] and beginMerchant:
							beginMerchant = False
							annotationLine = ',' + word + ',' + "MR" + ',' + 'I-mer'

						#Other tagging
						else:
							annotationLine = ',' + word + ',' + "NN" + ',' + 'O'
					wordNum += 1						
					outfile.write(annotationLine + '\n')
				receiptNum += 1
	outfile.close
	print ('Annotations for %d files completed!' %receiptNum)
dataAnnotation(allOCRSet, allAnnotationSet, allNerDataset)

X51006555072.txt
X51006557117.txt
1083-receipt.txt
X51005568884.txt
X51005711441.txt
1191-receipt.txt
X51005806685.txt
X51008099041.txt
1050-receipt.txt
X51005303661.txt
1132-receipt.txt
X51006913055.txt
X51006332649.txt
X51005676545.txt
1020-receipt.txt
X51005719814.txt
1142-receipt.txt
X51007339151.txt
X51005722699.txt
X51005685355.txt
X51007339150.txt
X51005268200.txt
X51006828217.txt
X51005676544.txt
X51006466056.txt
X51006913054.txt
X51008099083.txt
X51007846368.txt
X51006387971.txt
X51006913068.txt
X51008099054.txt
1012-receipt.txt
X51006619772.txt
1170-receipt.txt
X51005301659.txt
X51005705804.txt
X51005568891.txt
X51006556591.txt
X51005711454.txt
X51007231336.txt
1062-receipt.txt
X51005763940.txt
1100-receipt.txt
X51006008197.txt
1007-receipt.txt
1198-receipt.txt
X51008145505.txt
X51006619566.txt
1165-receipt.txt
X51005711442.txt
X51005711456.txt
X51005301667.txt
1077-receipt.txt
X51008142068.txt
1115-receipt.txt
X51006387813.txt
X51006619758.txt
X51005677328.txt
X51006414429.t

X51007339110.txt
X51005719882.txt
X51006555806.txt
1153-receipt.txt
X51005719883.txt
X51007339111.txt
X51007339139.txt
X51005453729.txt
X51005749905.txt
X51005676539.txt
X51005442383.txt
X51007846301.txt
X51005442397.txt
X51005433552.txt
X51008164991.txt
1003-receipt.txt
X51006387850.txt
1161-receipt.txt
X51005889296.txt
X51005447833.txt
X51005711401.txt
1073-receipt.txt
X51006867435.txt
X51006414720.txt
1111-receipt.txt
X51006393376.txt
X51006679216.txt
X51006414718.txt
1156-receipt.txt
1034-receipt.txt
X51008123604.txt
X51006557184.txt
X00016469620.txt
X51006620161.txt
1126-receipt.txt
1044-receipt.txt
X51008114282.txt
X51005361895.txt
X51005442344.txt
X51006619694.txt
1185-receipt.txt
X51008123599.txt
X51008114321.txt
X51005442378.txt
X51006414485.txt
1178-receipt.txt
X51007846305.txt
X51006556852.txt
X51006619496.txt
X51005719893.txt
X51007339115.txt
X51006555817.txt
1097-receipt.txt
1108-receipt.txt
X51005719886.txt
X51007339100.txt
1114-receipt.txt
X51007339114.txt
X51005757243.t

# NER Model Training
Train a model using Conditional Random Fields algorithm with Named Entity Recognition concept to extract amount and merchant fields.

In [134]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn_crfsuite import CRF
from sklearn_crfsuite.metrics import flat_f1_score
from sklearn_crfsuite.metrics import flat_classification_report
from sklearn.datasets import make_blobs

In [135]:
#Reading the csv file
df = pd.read_csv('datasets/ner-dataset.csv', encoding = "ISO-8859-1")

#Display first 10 rows
df.head(10)
df.describe()
#Displaying the unique Tags
df['Tag'].unique()
#Checking null values, if any.
df.isnull().sum()
df = df.fillna(method = 'ffill')

In [136]:
class sentence(object):
    def __init__(self, df):
        self.n_sent = 1
        self.df = df
        self.empty = False
        agg = lambda s : [(w, p, t) for w, p, t in zip(s['Word'].values.tolist(),
                                                       s['POS'].values.tolist(),
                                                       s['Tag'].values.tolist())]
        self.grouped = self.df.groupby("Receipt #").apply(agg)
        self.sentences = [s for s in self.grouped]
        
    def get_text(self):
        try:
            s = self.grouped['Receipt: {}'.format(self.n_sent)]
            self.n_sent +=1
            return s
        except:
            return None
        
#Displaying one full sentence
getter = sentence(df)
sentences = [" ".join([s[0] for s in sent]) for sent in getter.sentences]
sentences[0]

#sentence with its pos and tag.
sent = getter.get_text()
# print(sent)
sentences = getter.sentences

## Feature Functions

In [137]:
def listToString(s):  
    
    # initialize an empty string 
    str1 = ""  
    
    # traverse in the string   
    for ele in s:  
        str1 += ele   
    
    # return string   
    return str1

def word2features(sent, i):
    word = sent[i][0]
    postag = sent[i][1]

    features = {
        'bias': 1.0,
        'word.lower()': word.lower(),
        'word[-3:]': word[-3:],
        'word[-2:]': word[-2:],
        'word.isupper()': word.isupper(),
        'word.istitle()': word.istitle(),
        'word.isdigit()': word.isdigit(),
        'postag': postag,
        'postag[:2]': postag[:2],
    }
    if i > 0:
        word1 = sent[i-1][0]
        postag1 = sent[i-1][1]
        features.update({
            '-1:word.lower()': word1.lower(),
            '-1:word.istitle()': word1.istitle(),
            '-1:word.isupper()': word1.isupper(),
            '-1:postag': postag1,
            '-1:postag[:2]': postag1[:2],
        })
    else:
        features['BOS'] = True

    if i < len(sent)-1:
        word1 = sent[i+1][0]
        postag1 = sent[i+1][1]
        features.update({
            '+1:word.lower()': word1.lower(),
            '+1:word.istitle()': word1.istitle(),
            '+1:word.isupper()': word1.isupper(),
            '+1:postag': postag1,
            '+1:postag[:2]': postag1[:2],
        })
    else:
        features['EOS'] = True

    return features


def sent2features(sent):
    return [word2features(sent, i) for i in range(len(sent))]

def sent2labels(sent):
    return [label for token, postag, label in sent]

def sent2tokens(sent):
    return [token for token, postag, label in sent]

## Train

In [138]:
X = [sent2features(s) for s in sentences]
y = [sent2labels(s) for s in sentences]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.6)

crf = CRF(algorithm = 'lbfgs',
         c1 = 0.1,
         c2 = 0.1,
         max_iterations = 100,
         all_possible_transitions = False)
crf.fit(X_train, y_train)



CRF(algorithm='lbfgs', all_possible_transitions=False, c1=0.1, c2=0.1,
    keep_tempfiles=None, max_iterations=100)

## Test 

In [139]:
#Predicting on the test set.
y_pred = crf.predict(X_test)

for i in range(len(y_pred)):
	prediction = y_pred[i]
	testList = X_test[i]
	testSentence = ""
	for testTuple in testList:
		testSentence = testSentence + testTuple['word.lower()'] + ' '
	words = testSentence.split(" ")
	x = 0
	for wordPrediction in prediction:
		if wordPrediction == 'B-date' or wordPrediction == 'B-amt' or wordPrediction == 'B-mer' or wordPrediction == 'I-mer' or wordPrediction == 'I-date':
			print(words[x],wordPrediction)
		x +=1

el B-mer
pollo I-mer
loco I-mer
restaurant I-mer
jun28'17 B-date
skca B-mer
hardware I-mer
& I-mer
timber I-mer
sdn. I-mer
bhd. I-mer
142.00 B-amt
01-nov-2017 B-date
triple B-mer
six I-mer
point I-mer
enterprise I-mer
666 I-mer
25-03-2018 B-date
7.60 B-amt
best B-mer
denki I-mer
malaysia I-mer
89.00 B-amt
01/05/16 B-date
brasas B-mer
restaurant I-mer
02-mar-2019 B-date
guardian B-mer
health I-mer
and I-mer
beauty I-mer
sdn I-mer
bhd I-mer
2.28 B-amt
18/05/18 B-date
general B-mer
poor's I-mer
tavern I-mer
04/10/16 B-date
pasaraya B-mer
borong I-mer
pintar I-mer
sdn I-mer
bhd I-mer
3.20 B-amt
chares I-mer
12/31/2018 B-date
caldera B-mer
brewery I-mer
& I-mer
restaurant I-mer
12/28/2017 B-date
mr. B-mer
d.i.y. I-mer
(kuchai) I-mer
sdn I-mer
bhd I-mer
17-04-16 B-date
restoran B-mer
wan I-mer
sheng I-mer
06-05-2018 B-date
5.00 B-amt
mcdonald's B-mer
restaurant I-mer
212/7/2013 B-date
7.27 B-amt
99 B-mer
speed I-mer
mart I-mer
s/b I-mer
07-02-17 B-date
37.45 B-amt
aco B-mer
restaurants I-mer

sdn I-mer
bhd I-mer
27/07/2017 B-date
94.19 B-amt
mr. B-mer
d.i.y. I-mer
(m) I-mer
sdn I-mer
bhd I-mer
06-04-18 B-date
gardenia B-mer
bakeries I-mer
(kl) I-mer
sdn I-mer
bhd I-mer
14/10/2017 B-date
33.80 B-amt
yum's B-mer
restaurant I-mer
6/24/2018 B-date
22.03 B-amt
03/22/13 B-date
dining B-mer
room I-mer
t5 I-mer
main I-mer
dining I-mer
delmonico's B-mer
hallan I-mer
steakhouse I-mer
08/22/2015 B-date
ikano B-mer
handel I-mer
sdn I-mer
bhd I-mer
23/12/17 B-date
73.30 B-amt
artrange B-mer
stationers I-mer
& I-mer
print I-mer
sdn I-mer
bhd I-mer
10/04/2017 B-date
15.40 B-amt
kmf B-mer
foodicious I-mer
sdn I-mer
bhd I-mer
09-04-2018 B-date
31.00 B-amt
pasir B-mer
emas I-mer
hardware I-mer
sdn I-mer
bhd I-mer
23/06/18 B-date
36.00 B-amt
restoran B-mer
wan I-mer
sheng I-mer
20-06-2018 B-date
9.60 B-amt
99 B-mer
speed I-mer
mart I-mer
s/b I-mer
27.90 B-amt
indah B-mer
gift I-mer
& I-mer
home I-mer
deco I-mer
19/10/2018 B-date
60.30 B-amt
kedai B-mer
papan I-mer
yew I-mer
chuan I-mer
06/03/

12/05/2018 B-date
75.70 B-amt
gardenia B-mer
bakeries I-mer
(kl) I-mer
sdn I-mer
bhd I-mer
10/10/2017 B-date
27.20 B-amt
ohana B-mer
hawaiian I-mer
bbq I-mer
6/1/2013 B-date
super B-mer
seven I-mer
cash I-mer
& I-mer
carry I-mer
sdn I-mer
bhd I-mer
12-02-2018 B-date
rm367.10 B-amt
king's B-mer
confectionery I-mer
s/b I-mer
19.20 B-amt
mr. B-mer
d.i.y. I-mer
(m) I-mer
sdn I-mer
bhd I-mer
5.00 B-amt
30-04-18 B-date
farmasi B-mer
maluri I-mer
s/b I-mer
02/03/18 B-date
79.35 B-amt
gardenia B-mer
bakeries I-mer
(kl) I-mer
sdn I-mer
bhd I-mer
25/07/2017 B-date
24.83 B-amt
l B-mer
two I-mer
florist I-mer
& I-mer
handicraft I-mer
9.00 B-amt
21/03/2018 B-date
perniagaan B-mer
riang I-mer
ria I-mer
22.60 B-amt
10-05-2017 B-date
amtech B-mer
electrical I-mer
supplies I-mer
27/06/18 B-date
136.00 B-amt
segi B-mer
cash I-mer
& I-mer
carry I-mer
sdn. I-mer
bhd. I-mer
25 B-date
may I-date
2017 I-date
118.35 B-amt
western B-mer
eastern I-mer
stationery I-mer
sdn. I-mer
bhd I-mer
30-04-2018 B-date
unih

bhd I-mer
20/11/2017 B-date
39.80 B-amt
checkers B-mer
hypermarket I-mer
sdn I-mer
bhd I-mer
(jalan I-mer
klang I-mer
lama) I-mer
20.00 B-amt
popular B-mer
book I-mer
co. I-mer
(m) I-mer
sdn I-mer
bhd I-mer
11/02/18 B-date
11.80 B-amt
crisfield B-mer
seafood I-mer
restaurant I-mer
12/29/2017 B-date
subang B-mer
healthcare I-mer
sdn I-mer
29/10/2017 B-date
s B-mer
s B-mer
28.90 B-amt
are I-mer
bhd I-mer
99 B-mer
speed I-mer
mart I-mer
s/b I-mer
05-02-18 B-date
141.50 B-amt
nick's B-mer
tuscan I-mer
grill I-mer
mar18'18 B-date
broadview B-mer
marketing I-mer
sdn I-mer
15/05/2017 B-date
4.70 B-amt
again. I-mer
02/11/19 B-date
unihakka B-mer
international I-mer
sdn I-mer
bhd I-mer
09 B-date
jun I-date
2018 I-date
rm7.70 B-amt
hms B-mer
esquire I-mer
grille I-mer
apr04 B-date
16 I-date
hon B-mer
hwa I-mer
hardware I-mer
trading I-mer
21/09/2017 B-date
10.40 B-amt
the B-mer
golden I-mer
aug06' B-date
15 I-date
22.70 B-amt
lasource B-mer
02-08-2015 B-date
ann B-mer
giap I-mer
trading I-mer
sd

## Generate Report
Print the results of how your model was trained and tested to verify

In [140]:
f1_score = flat_f1_score(y_test, y_pred, average = 'weighted')
print(f1_score)

report = flat_classification_report(y_test, y_pred)
print(report)

0.9995275565513414




              precision    recall  f1-score   support

       B-amt       1.00      1.00      1.00       392
      B-date       1.00      1.00      1.00       455
       B-mer       0.96      0.99      0.98       465
      I-date       1.00      1.00      1.00        93
       I-mer       1.00      0.99      0.99      1413
           O       1.00      1.00      1.00     45642

    accuracy                           1.00     48460
   macro avg       0.99      1.00      0.99     48460
weighted avg       1.00      1.00      1.00     48460



# Practical Application

Lets now test a new receipt that the machine learning model hasn't encountered before. You can upload a receipt you have handy or use the lunch receipt we've provided in the datasets directory.

## Test End to End by uploading a new image - OCR to NER

In [141]:
new_image_path = "datasets/lunch.png"
test_file = getOCRText(new_image_path)
print(test_file)

test_ocr_sentence = test_file.replace("\n", " ")
print(test_ocr_sentence)


Bon Appetit
Oracle Redwood
Cafe 300

ES
rm CHK 2042
6/14/2018 | 1.05 PM
1! & g Salad, Large Ceasar 5 09
1 Side, Chicken 2.39

 

eeeERKHK( 03

VISA:087861

Credit Authorization 6,13
VISA $8.13
EKKKKKKKKKKD 4D

Subtota | $7.48
Tax $0.65
Payment $8.13

Change Due $O.00

Seeerterene Check Closed -----------
6/14/2018 1:05 PM

   

Bon Appetit Oracle Redwood Cafe 300  ES rm CHK 2042 6/14/2018 | 1.05 PM 1! & g Salad, Large Ceasar 5 09 1 Side, Chicken 2.39     eeeERKHK( 03  VISA:087861  Credit Authorization 6,13 VISA $8.13 EKKKKKKKKKKD 4D  Subtota | $7.48 Tax $0.65 Payment $8.13  Change Due $O.00  Seeerterene Check Closed ----------- 6/14/2018 1:05 PM      


## Annotate the test image

1. Add the OCR output from above to the OCR file in CleanOCR-test directory
2. Add the corresponding annotations to a file in the Annotations-test directory

Ensure that the 2 files have identical names in order for the next step to work as expected.

In [143]:
testOCRSet = "datasets/CleanOCR-test" # A folder with all OCR files
testAnnotationSet = "datasets/Annotations-test" # A folder with all Annotation files
testNerDataset = "datasets/ner-dataset-test.csv"

dataAnnotation(testOCRSet,testAnnotationSet, testNerDataset)

lunch.txt
Annotations for 2 files completed!


In [144]:
def get_test_sentence(test_file):
    #Reading the csv file
    trial = pd.read_csv(test_file, encoding = "ISO-8859-1")

    #Display first 10 rows
    trial.head(10)
    trial.describe()
    #Displaying the unique Tags
    trial['Tag'].unique()
    #Checking null values, if any.
    trial.isnull().sum()
    trial = trial.fillna(method = 'ffill')
    trial_getter = sentence(trial) # Check last token - seems to be repeating 
    trial_sentences = [" ".join([s[0] for s in sent]) for sent in trial_getter.sentences]
    trial_sentences[0]

    #sentence with its pos and tag.
    trial_sent = trial_getter.get_text()
    trial_sentences = trial_getter.sentences
    return trial_sentences

trial_sentences = get_test_sentence(testNerDataset)

In [145]:
tX = [sent2features(s) for s in trial_sentences]
ty = [sent2labels(s) for s in trial_sentences]

#Predicting on the one test set.
X_test = tX
y_test = ty
y_pred = crf.predict(X_test)

f1_score = flat_f1_score(y_test, y_pred, average = 'weighted')
print(f1_score)

report = flat_classification_report(y_test, y_pred)
print(report)

1.0
              precision    recall  f1-score   support

      B-date       1.00      1.00      1.00         1
       B-mer       1.00      1.00      1.00         1
       I-mer       1.00      1.00      1.00         1
           O       1.00      1.00      1.00        60

    accuracy                           1.00        63
   macro avg       1.00      1.00      1.00        63
weighted avg       1.00      1.00      1.00        63



