Load relevant modules.

In [1]:
import pytictoc
t = pytictoc.TicToc()
t.tic()

In [2]:
from PIL import Image 
import pytesseract 
import sys 
from pdf2image import convert_from_path, convert_from_bytes
import os 
from pymongo import MongoClient
import pandas as pd
import re
import gridfs
import datetime
#do not print warnings
import warnings
warnings.filterwarnings('ignore')

connect to local MongoDB

In [3]:
client = MongoClient("localhost", 27017)
db = client["Batch_GI"]
coll = "Batch_GI"
db_coll = db[coll]

#gridfs is used for saving binary files to MongoDB
fs = gridfs.GridFS(db)

Determine rlevant file paths (for the pdf files).

In [4]:
path = "/home/controllingde/G/Batch_GI/"

In [5]:
files = os.listdir(path)

In [7]:
#find all pdf files
pdffiles = [path + x for x in files if x.find(".pdf") > 0]

Create an empty pandas dataframe for later usage. This dataframe will contain the structured data for all pdfs and will be exported to a csv afterwards.

In [8]:
colnames = ['batch', 'item', '_id', 'FS', 'GI', 'MHD', 'filename']
total_df = pd.DataFrame(columns=colnames)

This is the main part. 

In [9]:
datepattern = ".(1[0-2]|0[1-9]|\d)\/([2-9]\d[1-9]\d|[1-9]\d)."
batchsign = "Ch.-B.:"
i = 1
tl = len(pdffiles)

timestamp = str(datetime.datetime.now()).replace(" ", "_")
timestamp = timestamp.replace(":", "-")
timestamp = timestamp[0:19]

new_filename = "Matching_Batch_" + timestamp + ".csv"

for f in pdffiles:
    
    t2 = pytictoc.TicToc()
    t2.tic()
    
    pages = convert_from_path(f, 600)     
    
    image_counter = 1
    
    for p in pages:
        filename = "page_" + str(image_counter) + ".jpg"
        p.save(filename, "JPEG")
        image_counter += 1
    filelimit = image_counter-1
    
    impdict = {}
    for i in range(1, filelimit + 1):
        filename = "page_"+str(i)+".jpg"
        text = str(((pytesseract.image_to_string(Image.open(filename)))))
        text = text.replace('-\n', '')  
    
        batchind_start = text.rindex(batchsign)
    
        batchsub1 = text[(batchind_start + len(batchsign) + 1) : ]
        impdict["batch"] = batchsub1[0: batchsub1.find(" ")]
        impdict["item"] = batchsub1[(batchsub1.find(" ") + 1): batchsub1.find("\n")]
    
        impdict["_id"] = impdict["batch"] + "_" + impdict["item"]
    
        FSsub = batchsub1[batchsub1.find("FS-"): ]
        impdict["FS"] = FSsub[0: FSsub.find("\n")]

        GIsub = batchsub1[batchsub1.find("GI-"): ]
        impdict["GI"] = GIsub[0: GIsub.find("\n")]

        #FOLsub = batchsub1[batchsub1.find("FOL-"): ]
        #impdict["FOL"] = FOLsub[0: FOLsub.find("\n")]
        
        MHD = re.search(datepattern, text)
        impdict["MHD"] = text[MHD.start() : MHD.end()-1]
             
        impdict["filename"] = f
        impdict["fulltext"] = text.split("\n")
        
        impdict["pdffile"] = fs.put(open(f, 'rb'))
        
    db_coll.remove({"_id": impdict["_id"]})
    db_coll.insert_one(impdict)
    
    del impdict["fulltext"]
    del impdict["pdffile"]
    pre_df = pd.DataFrame.from_dict(impdict, orient="index").transpose()
    total_df = pd.concat([total_df, pre_df])
    
    print("{} of {} processed".format(i, tl))
    
    i += 1
    
    t2.toc()
    
total_df.to_csv(path + new_filename)
total_df.set_index(["_id"], inplace=True, verify_integrity=True)
t.toc()

1 of 2 processed
Elapsed time is 15.970626 seconds.
1 of 2 processed
Elapsed time is 14.259570 seconds.
Elapsed time is 14194.804630 seconds.


Apparently it takes about 15 seconds for one file and page to process. This us due since we are working here with PDF files, where a photo-copy (i.e. basically an image) is stored.
  
Let's take a look at the resulting dataframe.

In [10]:
total_df.head()

Unnamed: 0_level_0,batch,item,FS,GI,MHD,filename
_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
VY0519002-A_1010526,VY0519002-A,1010526,FS-1411-100-1117-02-APL,GI-1411-1117-01-APL,01/2022,/home/controllingde/G/Batch_GI/GItest01.pdf
PZ0418194-BU_ee Pe 4010807,PZ0418194-BU,ee Pe 4010807,"FS-1295-12x4_0,5-0718-01-APL",GI-1295-081 7-01-APL,10/2020,/home/controllingde/G/Batch_GI/GItest02.pdf


Finally let's have a look at the data in MongoDB.

In [12]:
#check mongodb
r = db_coll.find()
for l in r:
    print(l)
    print("\n")

{'_id': 'PZ0418194-BU_1010807', 'batch': 'PZ0418194-BU', 'item': '1010807', 'FS': 'FS-1295-12x4 0,5-0718-01-APL eked', 'GI': 'GI-1295-081 7-01-APL', 'MHD': ' 10/2020', 'filename': '/home/controllingde/G/Batch_GI/GItest02.pdf', 'fulltext': ['Nr.:', 'Version (V):', '', 'Ansichtsmusterkontrolle / Freisetzung Seite 1 von 1', '', 'Anlage zu AA-06002 und AA-06005', '', 'PUREN', '', 'PS Tes Phra la) Sips ge', '', ' ', '', 'Piperacillin/Tazobactam PUREN 4 g/0,5 g', '', 'Pulver zur Herstellung einer Infusionsl6sung', '', 'Ch.-B.: PZ0418194-BU 1010807', '', 'Verw. bis: 10/2020 12 Durchstechflaschen', 'groke:', 'Prufpunkte Datum/ Kurzel', 'Packmittelnummer: FS-1295-12x4 0,5-0718-01-APL eked', '', 'Aktuelle Version eingesetzt oO', 'Packmittelnummer: GI-1295-081 7-01-APL', '', 'Bezeichnung:', '', 'ee', '', '   ', ' ', '', 'Faltschachtel', '', '  ', '  ', '', 'a', '', 'Packungsbeilage', '', ' ', '  ', '   ', '', 'Aktuelle Version eingesetzt Oo 18. Olt. 2019,', '', '(', 'Bedrucktes Pri- Packmittelnum