In [1]:
import fitz
import pandas as pd 

In [2]:
# Rebuild readable text from word tuples (e.g. the words found inside one annotation box).
def make_text(words):
    """Join word tuples into text, one output line per distinct y1 value.

    Args:
        words: iterable of word tuples shaped like the items returned by
            ``page.get_text("words")``. Only three fields are read:
            ``w[0]`` (horizontal sort key), ``w[3]`` (the y1 coordinate used
            to group words into lines) and ``w[4]`` (the word string).

    Returns:
        str: words whose y1 rounds to the same one-decimal value are joined
        with single spaces in ascending ``w[0]`` order; the lines are then
        joined with newline characters in ascending y1 order. Empty input
        yields an empty string.
    """
    line_dict = {}
    # sorted() rather than words.sort(): the caller's list keeps its order.
    for w in sorted(words, key=lambda w: w[0]):
        y1 = round(w[3], 1)  # tolerate tiny baseline differences within a line
        line_dict.setdefault(y1, []).append(w[4])
    return "\n".join(" ".join(line_dict[y1]) for y1 in sorted(line_dict))

In [3]:
# Open the source PDF with PyMuPDF; `doc` is indexed by page number in the cells below
doc = fitz.open('unstructured.pdf')

## First page, first annotation

In [4]:
# Take the first page of the document (index 0)
page1 = doc[0]

In [5]:
# Extract every word on the page; each entry is a tuple (a sample is shown in the next cell)
words = page1.get_text("words")

In [6]:
# Inspect one entry: the first four numbers form the word's rectangle, the fifth item is its text
words[0]

(373.25,
 246.29598999023438,
 402.6600341796875,
 253.18099975585938,
 'DIRECTION',
 0,
 0,
 0)

In [7]:
# Accumulates the text extracted for the first annotation of the first page
first_annots=[]

In [8]:
# Rectangle (coordinates) of the first annotation on the page
# NOTE(review): first_annot is presumably None on a page without annotations,
# in which case .rect would fail here -- confirm before reusing on other PDFs
rec=page1.first_annot.rect
rec

Rect(366.260009765625, 27.6500244140625, 573.97998046875, 61.3800048828125)

In [9]:
# Keep the words whose rectangle is `in` rec (inside the annotation box),
# rebuild their text with make_text and store it.
# NOTE: re-running this cell appends the same text to first_annots again.
mywords = [w for w in words if fitz.Rect(w[:4]) in rec]
ann= make_text(mywords)
first_annots.append(ann)
print(first_annots)

['LOCAL REPORT NUMBER *\n2 1 0 0 9 0 4 8']


In [11]:
# Number of annotation texts collected so far
len(first_annots)

1

In [12]:
# Show the collected annotation text
print(first_annots)

['LOCAL REPORT NUMBER *\n2 1 0 0 9 0 4 8']


## All annotations on all pages of the PDF

In [13]:
# Accumulates the extracted text of every annotation in the document
all_annots=[]

In [14]:
# Walk every page of the document and collect the text inside each annotation rectangle.
all_annots = []  # start fresh so re-running this cell does not append duplicates
# range(len(doc)) so that the last page is included as well.
for pageno in range(len(doc)):
    page = doc[pageno]
    words = page.get_text("words")
    # page.annots() only yields annotation objects, so no None check is needed.
    for annot in page.annots():
        rec = annot.rect
        # Keep only the words whose rectangle is inside the annotation box.
        mywords = [w for w in words if fitz.Rect(w[:4]) in rec]
        ann = make_text(mywords)
        all_annots.append(ann)

In [15]:
#len(all_annots)

In [16]:
# Show every collected annotation text (one string per annotation)
print(all_annots)

['LOCAL REPORT NUMBER *\n2 1 0 0 9 0 4 8', 'CRASH SEVERITY\nFATAL\n1-\nSERIOUS INJURY\nSUSPECTED\n2-\n5\nMINOR INJURY\n3-\nSUSPECTED\nINJURY POSSIBLE\n4-\nPROPERTY DAMAGE\n5-\nONLY', 'NUMBER OF UNITS\n0 1', 'CRASH DATE/TIME *\n0 4 2 5 2 0 2 1 0 5 2 8', 'LOCATION: CITY, VILLAGE, TOWNSHIP *\nMANSFIELD', 'COUNTY *\n7 0', 'LOCALITY *\n1. CITY\n2. VILLAGE\n1\n3. TOWNSHIP', 'LATITUDE\n4 0 7 9 0 6 2 0\n.', 'LONGITUDE\n-8 2 5 1 3 2 1 0\n.', 'MANNER OF CRASH COLLISION/IMPACT\n1 - NOT COLLISSION\n4 - REAR-TO-REAR\nBETWEEN\n5 - BACKING\nTWO MOTOR\n6 - ANGLE\nVEHICLES IN\n7 - SIDESWIPE, SAME DIRECTION\nTRANSPORT\n8 - SIDESWIPE, OPPOSITE DIRECTION\n2 - REAR-END\n9 - OTHER / UNKNOWN\n3 - HEAD-ON\n1', "OFFICER'S NAME*\nCAROLYN YOUNG", "OFFICER'S BADGE NUMBER*\n1 7 5 4", 'VEHICLE IDENTIFICATION #\n2 H K R W 6 H 3 1 J H 2 0 9 8 1 1', 'VEHICLE YEAR\n2 0 1 8', 'VEHICLE MAKE\nHONDA', 'VEHICLE MODEL\nCRV', 'COLOR\nSILVER', 'INSURANCE COMPANY\nPROGRESSIVE', 'LP STATE\nO H', 'LICENSE PLATE#\nJHW3885', '# OCC

## Cleaning for Excel

In [17]:
# Will hold, per annotation, the result of splitting its text at the first line break
cont=[]

In [18]:
# Cut each annotation at its first line break: [column name, remaining text]
cont.extend(text.split('\n', 1) for text in all_annots)

In [19]:
# Drop the marker characters *, # and : from every name/value, then trim whitespace
liss = [
    [part.replace('*', '').replace('#', '').replace(':', '').strip() for part in pair]
    for pair in cont
]

In [20]:
# Separate the cleaned pairs into column names (keys) and their values.
# An annotation with a single line of text has no value part after the split,
# so fall back to '' instead of raising IndexError.
keys = [pair[0] for pair in liss]
values = [pair[1] if len(pair) > 1 else '' for pair in liss]

In [21]:
# Remove the spaces from values that contain no letters (spaced-out numbers
# such as '2 1 0 0 9 0 4 8'); any value containing a letter is left untouched,
# wherever the letter sits. Empty values are handled without special cases.
for idx, value in enumerate(values):
    if not any(ch.isalpha() for ch in value):
        values[idx] = value.replace(' ', '')

In [22]:
# Convert to a dictionary: column name -> value (a repeated name keeps only its last value)
report=dict(zip(keys,values))

In [23]:
# The vehicle identification value mixes letters and digits, so the digit-only
# cleanup above leaves its spaces in place; remove them explicitly here.
report['VEHICLE IDENTIFICATION']=report['VEHICLE IDENTIFICATION'].replace(' ','')
report

{'LOCAL REPORT NUMBER': '21009048',
 'CRASH SEVERITY': 'FATAL\n1-\nSERIOUS INJURY\nSUSPECTED\n2-\n5\nMINOR INJURY\n3-\nSUSPECTED\nINJURY POSSIBLE\n4-\nPROPERTY DAMAGE\n5-\nONLY',
 'NUMBER OF UNITS': '01',
 'CRASH DATE/TIME': '042520210528',
 'LOCATION CITY, VILLAGE, TOWNSHIP': 'MANSFIELD',
 'COUNTY': '70',
 'LOCALITY': '1. CITY\n2. VILLAGE\n1\n3. TOWNSHIP',
 'LATITUDE': '40790620\n.',
 'LONGITUDE': '-82513210\n.',
 'MANNER OF CRASH COLLISION/IMPACT': '1 - NOT COLLISSION\n4 - REAR-TO-REAR\nBETWEEN\n5 - BACKING\nTWO MOTOR\n6 - ANGLE\nVEHICLES IN\n7 - SIDESWIPE, SAME DIRECTION\nTRANSPORT\n8 - SIDESWIPE, OPPOSITE DIRECTION\n2 - REAR-END\n9 - OTHER / UNKNOWN\n3 - HEAD-ON\n1',
 "OFFICER'S NAME": 'CAROLYN YOUNG',
 "OFFICER'S BADGE NUMBER": '1754',
 'VEHICLE IDENTIFICATION': '2HKRW6H31JH209811',
 'VEHICLE YEAR': '2018',
 'VEHICLE MAKE': 'HONDA',
 'VEHICLE MODEL': 'CRV',
 'COLOR': 'SILVER',
 'INSURANCE COMPANY': 'PROGRESSIVE',
 'LP STATE': 'O H',
 'LICENSE PLATE': 'JHW3885',
 'OCCUPANTS': '1',


In [24]:
# Multiple-choice fields: the annotation text contains every printed option plus
# the selected option number, so the selected number shows up twice as a leading
# character. Keep only the first chunk that starts with that repeated character.
choice_fields = ['LOCALITY', 'MANNER OF CRASH COLLISION/IMPACT', 'CRASH SEVERITY']
val_after = []
for field in choice_fields:
    text = report[field]

    # Cut the text in front of every digit (a digit at position 0 simply
    # starts the first chunk).
    chunks = []
    start = 0
    for pos in range(1, len(text)):
        if '0' <= text[pos] <= '9':
            chunks.append(text[start:pos])
            start = pos
    chunks.append(text[start:])
    print(chunks)

    # The first leading character that is seen a second time marks the selection.
    # chunk[:1] (not chunk[0]) so that an empty text cannot raise IndexError.
    seen = set()
    repeated = ''
    for chunk in chunks:
        lead = chunk[:1]
        if lead in seen:
            repeated = lead
            break
        seen.add(lead)

    # No repeated character means no selection was detected: leave this field
    # unchanged. Each field is written by name, so a miss on one field can no
    # longer shift the results of the others.
    if not repeated:
        continue
    selected = next(chunk for chunk in chunks if chunk[:1] == repeated)
    val_after.append(selected)
    report[field] = selected

['1. CITY\n', '2. VILLAGE\n', '1\n', '3. TOWNSHIP']
['1 - NOT COLLISSION\n', '4 - REAR-TO-REAR\nBETWEEN\n', '5 - BACKING\nTWO MOTOR\n', '6 - ANGLE\nVEHICLES IN\n', '7 - SIDESWIPE, SAME DIRECTION\nTRANSPORT\n', '8 - SIDESWIPE, OPPOSITE DIRECTION\n', '2 - REAR-END\n', '9 - OTHER / UNKNOWN\n', '3 - HEAD-ON\n', '1']
['FATAL\n', '1-\nSERIOUS INJURY\nSUSPECTED\n', '2-\n', '5\nMINOR INJURY\n', '3-\nSUSPECTED\nINJURY POSSIBLE\n', '4-\nPROPERTY DAMAGE\n', '5-\nONLY']


In [25]:
# Flatten line breaks to spaces, trim, and wrap each value in a one-element list
# (the form used to build the DataFrame afterwards)
for key, raw in report.items():
    report[key] = [raw.replace('\n', ' ').strip()]

## Exporting to Excel (CSV)

In [26]:
# Build a DataFrame from `report`: each key becomes a column, each one-element list its single row
data=pd.DataFrame.from_dict(report)

In [27]:
# Display the resulting one-row table
data

Unnamed: 0,LOCAL REPORT NUMBER,CRASH SEVERITY,NUMBER OF UNITS,CRASH DATE/TIME,"LOCATION CITY, VILLAGE, TOWNSHIP",COUNTY,LOCALITY,LATITUDE,LONGITUDE,MANNER OF CRASH COLLISION/IMPACT,...,INSURANCE COMPANY,LP STATE,LICENSE PLATE,OCCUPANTS,UNIT,NAME,DATE OF BIRTH,AGE,GENDER,ADDRESS
0,21009048,5 MINOR INJURY,1,42520210528,MANSFIELD,70,1. CITY,40790620 .,-82513210 .,1 - NOT COLLISSION,...,PROGRESSIVE,O H,JHW3885,1,1,"LAST, FIRST, MIDDLE STANTO, QUINN NICOLE",4131980,41,F,"CITY, STATE, ZIP 1754 BROWNSTONE BLVD - H, TOL..."


In [28]:
# Write the table to result.csv, leaving out the DataFrame index
data.to_csv('result.csv',index=False)