In [1]:
import numpy as np
import pandas as pd
import cv2
import pytesseract
from glob import glob
import spacy
import re
import string
import warnings
# Silences every Python warning for the rest of the session, which includes
# any pandas deprecation / SettingWithCopy warnings raised by later cells.
warnings.filterwarnings('ignore')

In [2]:
def cleanText(txt):
    """Normalise one OCR token: lower-case it and delete unwanted characters.

    The value is coerced with ``str()``, lower-cased, and then every
    whitespace character and every character in the punctuation list below
    is deleted. Characters outside that list (for example ``@ . , - / _``)
    are left in place.
    """
    # Whitespace plus the punctuation subset this notebook strips.
    unwanted = string.whitespace + "!#$%&\'()*+:;<=>?[\\]^`{|}~"
    # A single deletion table covers both kinds of character in one pass.
    drop_table = str.maketrans('', '', unwanted)
    return str(txt).lower().translate(drop_table)

In [3]:
# Load NER model

# NOTE(review): the pipeline is loaded from an absolute Windows path, so this
# cell only runs on a machine where that directory exists.
model_ner = spacy.load(r"C:\InternProject\Output\model-best")

In [4]:
# Load Image
image_path = './Data/6.jpg'
image = cv2.imread(image_path)
# cv2.imread does not raise for a missing or unreadable file, it returns None.
# Fail here with a clear message instead of an obscure error inside the OCR call.
if image is None:
    raise FileNotFoundError(f"Could not read image: {image_path}")

# cv2.imshow('BusinessCard',image)
# cv2.waitKey(0)
# cv2.destroyAllWindows()

# Extract Data using Pytesseract
# NOTE(review): the image is passed exactly as OpenCV loaded it (BGR channel
# order); confirm this matches how the NER training data was extracted.
tessData=pytesseract.image_to_data(image)

# Convert to Dataframe
# The OCR result is tab-separated text: the first line is the header and
# every following line is one row.
tessList=list(map(lambda x: x.split('\t'), tessData.split('\n')))
df= pd.DataFrame(tessList[1:],columns=tessList[0])
df.dropna(inplace=True)  #->Drop any missing values
df['text'] = df['text'].apply(cleanText)

# Convert data into content
# .copy() gives df_clean its own data; a later cell adds columns to it, which
# would otherwise be a write into a filtered slice of df.
df_clean=df.query('text!=""').copy()
content = " ".join(df_clean['text'].tolist())
# print(content)

# Prediction from NER Model
doc=model_ner(content)

In [5]:
from spacy import displacy
import os
import webbrowser
from pathlib import Path

# jupyter=False makes displacy return the markup as a string, which is then
# written to disk below instead of being shown inline.
html = displacy.render(doc, style="ent", jupyter=False)

output_path = os.path.abspath("ner_output.html")
with open(output_path, "w", encoding="utf-8") as f:
    f.write(html)

print(f"NER visualization saved to: {output_path}")
# Path.as_uri() builds a well-formed, percent-encoded file:// URI. Gluing
# "file://" onto a raw path is not a valid URI for Windows drive paths or for
# paths that contain spaces or '#'.
webbrowser.open(Path(output_path).as_uri())


NER visualization saved to: C:\InternProject\ner_output.html


True

In [6]:
### Tagging

In [7]:
# Serialise the Doc to a plain dict and list its top-level keys.
docjson=doc.to_json()
docjson.keys()

dict_keys(['text', 'ents', 'tokens'])

In [8]:
# The text the model ran on; the token start/end offsets used in the next
# cell are indices into this string.
doc_text=docjson['text']
doc_text

'cell 8099948528 ga 8466045457 email lictsrikant@gmail.com life insurance corporation of india seosrika ntht@gmail “com thathineni srikanth insurance advisor agent code no. 0316164y life insurance corporation of india br. off. lic office, trimulgherry, sec’bad - 500 016. add. borabanda, hyderabad - 500 018. lictsrikant8099948528.blogspot.in, interviewsinhyderabad.blogspot.in facebook.com/lictsrikant8099948528, facebook.com/thathineni.srikanth.9 promote your business online pybo'

In [9]:
# One row per token. Recover each token's text by slicing doc_text with the
# token's start/end character offsets.
dataframe_tokens=pd.DataFrame(docjson['tokens'])
# Built straight from the two offset columns: the earlier row-wise lambda read
# x[0]/x[1] from a row indexed by 'start'/'end', i.e. positional integer
# lookup on a label-indexed Series, which pandas has deprecated.
dataframe_tokens['token']=[
    doc_text[start:end]
    for start, end in zip(dataframe_tokens['start'], dataframe_tokens['end'])
]
dataframe_tokens.head(10)

Unnamed: 0,id,start,end,token
0,0,0,4,cell
1,1,5,15,8099948528
2,2,16,18,ga
3,3,19,29,8466045457
4,4,30,35,email
5,5,36,57,lictsrikant@gmail.com
6,6,58,62,life
7,7,63,72,insurance
8,8,73,84,corporation
9,9,85,87,of


In [10]:
# Entity spans predicted by the model; only the start offset and the label
# are needed to tag the tokens.
right_table=pd.DataFrame(docjson['ents'])[['start','label']]
# Left join keeps every token; tokens with no entity starting at their offset
# get a missing label. A 'label' column left over from an earlier run of this
# cell is dropped first, so re-running does not produce label_x / label_y.
dataframe_tokens=pd.merge(
    dataframe_tokens.drop(columns='label', errors='ignore'),
    right_table, how='left', on='start')

In [11]:
# Tokens that matched no entity in the left join have a missing label;
# replace every missing value with the outside tag 'O'.
dataframe_tokens = dataframe_tokens.fillna('O')
dataframe_tokens.head(10)

Unnamed: 0,id,start,end,token,label
0,0,0,4,cell,O
1,1,5,15,8099948528,B-PHONE
2,2,16,18,ga,O
3,3,19,29,8466045457,B-PHONE
4,4,30,35,email,O
5,5,36,57,lictsrikant@gmail.com,B-EMAIL
6,6,58,62,life,B-ORG
7,7,63,72,insurance,I-ORG
8,8,73,84,corporation,I-ORG
9,9,85,87,of,I-ORG


In [12]:
# Peek at the OCR rows that kept non-empty text after cleaning.
df_clean.head()

Unnamed: 0,level,page_num,block_num,par_num,line_num,word_num,left,top,width,height,conf,text
12,5,1,3,1,1,1,722,53,64,28,90.167198,cell
14,5,1,3,1,1,3,822,53,203,28,96.483879,8099948528
17,5,1,3,2,1,1,55,55,85,89,47.185547,ga
18,5,1,3,2,1,2,822,95,203,28,96.435081,8466045457
20,5,1,3,2,2,1,593,136,93,25,89.28978,email


In [13]:
# Join Label to df_clean DataFrame

# content was built by joining the cleaned words with single spaces, so each
# word's character span inside it follows from the word lengths:
#   end   = running total of (length + 1 separator), minus the last separator
#   start = end - length
# Computed column-wise and attached with assign(): the earlier row-wise lambda
# read x[0]/x[1] positionally from a label-indexed row (deprecated in pandas),
# and assign() returns a new frame instead of writing into a query() result.
df_clean = df_clean.assign(
    end=lambda d: (d['text'].str.len() + 1).cumsum() - 1,
    start=lambda d: d['end'] - d['text'].str.len(),
)
df_clean.head(10)

Unnamed: 0,level,page_num,block_num,par_num,line_num,word_num,left,top,width,height,conf,text,end,start
12,5,1,3,1,1,1,722,53,64,28,90.167198,cell,4,0
14,5,1,3,1,1,3,822,53,203,28,96.483879,8099948528,15,5
17,5,1,3,2,1,1,55,55,85,89,47.185547,ga,18,16
18,5,1,3,2,1,2,822,95,203,28,96.435081,8466045457,29,19
20,5,1,3,2,2,1,593,136,93,25,89.28978,email,35,30
22,5,1,3,2,2,3,709,136,316,31,86.249245,lictsrikant@gmail.com,57,36
25,5,1,3,3,1,1,46,170,33,14,96.406654,life,62,58
26,5,1,3,3,1,2,85,151,92,42,95.806709,insurance,72,63
27,5,1,3,3,1,3,183,170,117,14,96.909729,corporation,84,73
28,5,1,3,3,1,4,306,170,20,14,96.172005,of,87,85


In [14]:
# Inner Join with start

# Pair each OCR row with the model token that begins at the same character
# offset; rows without a partner on either side are dropped by the inner join.
token_labels = dataframe_tokens[['start','token','label']]
dataframe_info = df_clean.merge(token_labels, on='start', how='inner')
dataframe_info.head(10)

Unnamed: 0,level,page_num,block_num,par_num,line_num,word_num,left,top,width,height,conf,text,end,start,token,label
0,5,1,3,1,1,1,722,53,64,28,90.167198,cell,4,0,cell,O
1,5,1,3,1,1,3,822,53,203,28,96.483879,8099948528,15,5,8099948528,B-PHONE
2,5,1,3,2,1,1,55,55,85,89,47.185547,ga,18,16,ga,O
3,5,1,3,2,1,2,822,95,203,28,96.435081,8466045457,29,19,8466045457,B-PHONE
4,5,1,3,2,2,1,593,136,93,25,89.28978,email,35,30,email,O
5,5,1,3,2,2,3,709,136,316,31,86.249245,lictsrikant@gmail.com,57,36,lictsrikant@gmail.com,B-EMAIL
6,5,1,3,3,1,1,46,170,33,14,96.406654,life,62,58,life,B-ORG
7,5,1,3,3,1,2,85,151,92,42,95.806709,insurance,72,63,insurance,I-ORG
8,5,1,3,3,1,3,183,170,117,14,96.909729,corporation,84,73,corporation,I-ORG
9,5,1,3,3,1,4,306,170,20,14,96.172005,of,87,85,of,I-ORG


In [15]:
# dataframe_info.tail(10)

In [16]:
# Bounding Box

In [17]:
# Keep only the rows the model tagged as part of an entity.
# .copy() gives bb_df its own data: later cells add and overwrite columns on
# it, which would otherwise be writes into a filtered slice of dataframe_info.
bb_df=dataframe_info.query('label != "O" ').copy()
bb_df.head()

Unnamed: 0,level,page_num,block_num,par_num,line_num,word_num,left,top,width,height,conf,text,end,start,token,label
1,5,1,3,1,1,3,822,53,203,28,96.483879,8099948528,15,5,8099948528,B-PHONE
3,5,1,3,2,1,2,822,95,203,28,96.435081,8466045457,29,19,8466045457,B-PHONE
5,5,1,3,2,2,3,709,136,316,31,86.249245,lictsrikant@gmail.com,57,36,lictsrikant@gmail.com,B-EMAIL
6,5,1,3,3,1,1,46,170,33,14,96.406654,life,62,58,life,B-ORG
7,5,1,3,3,1,2,85,151,92,42,95.806709,insurance,72,63,insurance,I-ORG


In [18]:
img=image.copy()

# One rectangle per tagged word, captioned with its label at the top-left
# corner. int() covers the case where the geometry is still text as parsed
# from the OCR output.
boxes = bb_df[['left','top','width','height','label']]
for left, top, width, height, label in boxes.itertuples(index=False, name=None):
    top_left = (int(left), int(top))
    bottom_right = (top_left[0] + int(width), top_left[1] + int(height))
    cv2.rectangle(img, top_left, bottom_right, (0,255,0), 2)
    cv2.putText(img, str(label), top_left, cv2.FONT_HERSHEY_PLAIN, 1, (255,0,0), 2)

# Opens a native window and blocks until a key is pressed.
cv2.imshow('Predictions',img)
cv2.waitKey(0)
cv2.destroyAllWindows()


In [19]:
# Strip the BIO prefix ("B-" / "I-") so all words of one entity share a plain
# label. The prefix is removed by pattern rather than by slicing off the first
# two characters, so running this cell again leaves already-stripped labels
# untouched (slicing would turn 'PHONE' into 'ONE').
bb_df = bb_df.assign(label=bb_df['label'].str.replace(r'^[BI]-', '', regex=True))
bb_df.head()

Unnamed: 0,level,page_num,block_num,par_num,line_num,word_num,left,top,width,height,conf,text,end,start,token,label
1,5,1,3,1,1,3,822,53,203,28,96.483879,8099948528,15,5,8099948528,PHONE
3,5,1,3,2,1,2,822,95,203,28,96.435081,8466045457,29,19,8466045457,PHONE
5,5,1,3,2,2,3,709,136,316,31,86.249245,lictsrikant@gmail.com,57,36,lictsrikant@gmail.com,EMAIL
6,5,1,3,3,1,1,46,170,33,14,96.406654,life,62,58,life,ORG
7,5,1,3,3,1,2,85,151,92,42,95.806709,insurance,72,63,insurance,ORG


In [20]:
# group the label
class groupgen():
    """Hand out a running group id that advances whenever the label changes.

    State lives on the instance: ``id`` starts at 0 and holds the latest
    group id, ``text`` holds the label that id was issued for.
    """

    def __init__(self):
        self.id, self.text = 0, ''

    def getgroup(self,text):
        """Return the current group id, advancing it first if ``text`` differs from the previous call."""
        if self.text != text:
            # Label changed: open a new group and remember the label.
            self.text = text
            self.id += 1
        return self.id

grp_gen = groupgen()

In [21]:
# Number consecutive runs of the same label. A fresh groupgen is created for
# each run of this cell because the generator keeps its counter between calls;
# re-applying one shared instance would continue from the previous run's ids.
bb_df['group'] = bb_df['label'].apply(groupgen().getgroup)

In [22]:
### ✅ Instead of using a class we can also apply a function on the DataFrame

# Module-level state shared by successive calls to group().
group_id = 0
last_text = ''

def group(text):
    """Return a running group id that advances when ``text`` differs from the previous call.

    Reads and updates the module-level ``last_text`` and ``group_id``.
    """
    global last_text, group_id
    if text != last_text:
        # Label changed: remember it and move on to the next id.
        last_text = text
        group_id += 1
    return group_id



In [23]:
# group() keeps its counter in module-level variables; reset them first so
# that re-running this cell numbers the groups from 1 again instead of
# continuing from the previous run.
last_text, group_id = '', 0
bb_df['group'] = bb_df['label'].apply(group)


In [24]:
# right and bottom of bounding box
# The four geometry columns are cast to int first so the sums below are
# arithmetic additions.
box_cols = ['left','top','width','height']
bb_df[box_cols] = bb_df[box_cols].astype(int)
bb_df = bb_df.assign(
    right=bb_df['left'] + bb_df['width'],
    bottom=bb_df['top'] + bb_df['height'],
)

In [25]:
# Check the new group / right / bottom columns.
bb_df.head()

Unnamed: 0,level,page_num,block_num,par_num,line_num,word_num,left,top,width,height,conf,text,end,start,token,label,group,right,bottom
1,5,1,3,1,1,3,822,53,203,28,96.483879,8099948528,15,5,8099948528,PHONE,1,1025,81
3,5,1,3,2,1,2,822,95,203,28,96.435081,8466045457,29,19,8466045457,PHONE,1,1025,123
5,5,1,3,2,2,3,709,136,316,31,86.249245,lictsrikant@gmail.com,57,36,lictsrikant@gmail.com,EMAIL,2,1025,167
6,5,1,3,3,1,1,46,170,33,14,96.406654,life,62,58,life,ORG,3,79,184
7,5,1,3,3,1,2,85,151,92,42,95.806709,insurance,72,63,insurance,ORG,3,177,193


In [26]:
# tagging: groupby group
# Only the box edges, the label, the token text and the group id itself are
# carried into the grouping.
col_group = ['left','top','right','bottom','label','token','group']
group_tag_img = bb_df.loc[:, col_group].groupby('group')
group_tag_img

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x0000026B09C170E0>

In [27]:
# Collapse each group to one row: the box that encloses every word of the
# group, the group's label, and its tokens joined with spaces.
# 'min' / 'max' are passed by name because handing the Python builtins to
# agg() is deprecated in pandas. np.unique is kept so 'label' stays an array
# per group, which is the shape the label-unwrapping cell below expects.
img_tagging = group_tag_img.agg({
    'left':'min',
    'right':'max',
    'top':'min',
    'bottom':'max',
    'label':np.unique,
    'token':lambda x: " ".join(x)
})

In [28]:
# One row per group; 'label' is still wrapped in an array at this point.
img_tagging

Unnamed: 0_level_0,left,right,top,bottom,label,token
group,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,822,1025,53,123,[PHONE],8099948528 8466045457
2,709,1025,136,167,[EMAIL],lictsrikant@gmail.com
3,46,374,151,193,[ORG],life insurance corporation of india
4,668,1025,165,209,[EMAIL],seosrika ntht@gmail “
5,310,755,227,259,[NAME],thathineni srikanth
6,399,669,271,296,[DES],insurance advisor
7,47,882,395,427,[ORG],life insurance corporation of india
8,46,917,506,533,[WEB],lictsrikant8099948528.blogspot.in interviewsin...


In [29]:
# Converting label col from NumPy array to a single string
# Values that are already plain strings are passed through, so running this
# cell a second time does not cut each label down to its first character.
img_tagging['label']=img_tagging['label'].apply(
    lambda x: x if isinstance(x, str) else x[0])
img_tagging

Unnamed: 0_level_0,left,right,top,bottom,label,token
group,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,822,1025,53,123,PHONE,8099948528 8466045457
2,709,1025,136,167,EMAIL,lictsrikant@gmail.com
3,46,374,151,193,ORG,life insurance corporation of india
4,668,1025,165,209,EMAIL,seosrika ntht@gmail “
5,310,755,227,259,NAME,thathineni srikanth
6,399,669,271,296,DES,insurance advisor
7,47,882,395,427,ORG,life insurance corporation of india
8,46,917,506,533,WEB,lictsrikant8099948528.blogspot.in interviewsin...


In [30]:
img_bb = image.copy()
# One rectangle per tagged group, captioned with the group's label at its
# top-left corner. Columns are selected by name, so the loop does not depend
# on the column order of img_tagging.
corners = img_tagging[['left','top','right','bottom','label']]
for x1, y1, x2, y2, label in corners.itertuples(index=False, name=None):
    cv2.rectangle(img_bb, (x1, y1), (x2, y2), (0,255,0), 2)
    cv2.putText(img_bb, label, (x1, y1), cv2.FONT_HERSHEY_PLAIN, 1, (255,0,255), 2)

# Opens a native window and blocks until a key is pressed.
cv2.imshow('Bounding Box BusinessCard',img_bb)
cv2.waitKey(0)
cv2.destroyAllWindows()

In [31]:
# Parsing Data/ Parser

In [32]:
def parser(text,label):
    """Clean one token according to the entity label it was tagged with.

    Parameters
    ----------
    text : str
        Token text to clean.
    label : str
        Entity label without its BIO prefix. 'PHONE', 'EMAIL', 'WEB', 'NAME',
        'DES' and 'ORG' are handled; any other value returns ``text`` unchanged.

    Returns
    -------
    str
        PHONE: digits only.
        EMAIL: lower-cased; letters, digits, spaces and ``@ _ . -`` are kept.
        WEB: lower-cased; letters, digits, spaces and ``: / . % # -`` are kept.
        NAME / DES: lower-cased, reduced to letters and spaces, title-cased.
        ORG: lower-cased, reduced to letters, digits and spaces, title-cased.
    """
    # Every pattern is a raw string: the earlier '\-' inside a normal string
    # literal is an invalid escape sequence, which newer Python versions warn about.
    if label == 'PHONE':
        return re.sub(r'\D', '', text)

    if label == 'EMAIL':
        return re.sub(r'[^A-Za-z0-9@_.\- ]', '', text.lower())

    if label == 'WEB':
        return re.sub(r'[^A-Za-z0-9:/.%#\- ]', '', text.lower())

    if label in ('NAME', 'DES'):
        return re.sub(r'[^a-z ]', '', text.lower()).title()

    if label == 'ORG':
        return re.sub(r'[^a-z0-9 ]', '', text.lower()).title()

    # Unrecognised label (including the empty one for 'O' tokens): no cleaning.
    return text

In [33]:
info_array = dataframe_info[['token','label']].values
entities = dict(NAME=[],ORG=[],DES=[],PHONE=[],EMAIL=[],WEB=[])
previous = 'O'

for token, label in info_array:
    # 'B-PHONE' -> ('B', 'PHONE'); the outside tag 'O' -> ('O', '')
    bio_tag, label_tag = label[0], label[2:]

    # step -1 parse the token
    text = parser(token,label_tag)

    if bio_tag in ('B','I'):
        # An I-token extends the latest entry only when it directly follows a
        # token of the same label; every other tagged token opens a new entry.
        continues_entity = bio_tag == 'I' and previous == label_tag
        if continues_entity:
            # Words of names, organisations and designations are separated by
            # a space; pieces of the other labels are glued together.
            joiner = " " if label_tag in ("NAME",'ORG','DES') else ""
            entities[label_tag][-1] = entities[label_tag][-1] + joiner + text
        else:
            entities[label_tag].append(text)

    previous = label_tag


In [34]:
# Final result: cleaned entity strings collected per label.
entities

{'NAME': ['Thathineni Srikanth'],
 'ORG': ['Life Insurance Corporation Of India',
  'Life Insurance Corporation Of India'],
 'DES': ['Insurance Advisor'],
 'PHONE': ['8099948528', '8466045457'],
 'EMAIL': ['lictsrikant@gmail.com', 'seosrikantht@gmail'],
 'WEB': ['lictsrikant8099948528.blogspot.in',
  'interviewsinhyderabad.blogspot.in']}