In [5]:
# script for extracting ANTECH CBC info from pdf
# based on pdfplumber @https://github.com/jsvine/pdfplumber
# by Zac Zou

# modules required
import numpy as np
import pandas as pd
import pdfplumber

# results pdf to read; the Excel export is written next to it under the same name
file = "SD23-1-1 to SD23-1-41 CBC Y2300229.pdf"

# items blacklist: table rows with these names are not exported
items_black = ['Differential', 'Blood Parasites', 'Platelet Estimate']

# crop boxes, written as (x0, top, x1, bottom) fractions of the page width/height
ID_BOX = (0.16, 0.168, 0.24, 0.182)     # region holding the animal ID
TABLE_BOX = (0.072, 0.245, 0.5, 0.7)    # region holding the CBC table

# the table is cut off at the first line that holds only this word
TABLE_END_MARKER = 'Comment'


def _crop_text(page, box, **extract_kwargs):
    """Extract the text found inside `box` on `page`.

    page           -- a pdfplumber page
    box            -- (x0, top, x1, bottom), each a fraction of the page size
    extract_kwargs -- forwarded unchanged to `extract_text`
    """
    # the size is cast to float before scaling, as the script has always done
    # (presumably because some pdfplumber versions do not return floats -- TODO confirm)
    width, height = float(page.width), float(page.height)
    x0, top, x1, bottom = box
    region = page.crop((x0*width, top*height, x1*width, bottom*height))
    return region.extract_text(**extract_kwargs)


def _parse_cbc_lines(lines, blacklist):
    """Convert the text lines of one table into a {item name: value} dict.

    A line is read as `<name>  <value>` or `<name>  <value>  <value>`, the
    fields being separated by two or more spaces. Lines without such a
    separator and lines whose name is in `blacklist` are ignored. Commas are
    removed from the values. A line carrying two values yields two entries,
    `<name> (absolute)` and `<name> (percentage)`.
    """
    parsed = {}
    for line in lines:
        fields = line.split('  ', 1)
        if len(fields) < 2:
            # no double-space separator: not a "name  value" line
            continue
        key = fields[0].strip()
        if key in blacklist:
            continue
        values = [v.strip() for v in fields[1].strip().replace(',', '').split('  ', 1)]
        if len(values) == 1:
            parsed[key] = values[0]
        else:
            parsed[key+' (absolute)'] = values[0]
            parsed[key+' (percentage)'] = values[1]
    return parsed


# collect info page by page: one {item: value} dict per animal ID
columns = {}
# the context manager closes the pdf even if a page fails to parse
with pdfplumber.open(file) as pdf:
    for page_number, page in enumerate(pdf.pages, start=1):
        # crop the page into the 2 parts that contain the ID and the table
        animal_id = _crop_text(page, ID_BOX)
        table_lines = _crop_text(page, TABLE_BOX, layout=True).split('\n')

        # locate the end of the table; trailing padding is ignored because its
        # length depends on the crop width and the layout settings
        tail_index = next(
            (k for k, line in enumerate(table_lines) if line.rstrip() == TABLE_END_MARKER),
            None,
        )
        if tail_index is None:
            print(f'Skipping page {page_number}, which is not in a standard format. Consider entering the values manually. \n(Mice ID: {animal_id})')
            continue

        # a repeated (or empty) ID replaces the earlier column, so say so
        if animal_id in columns:
            print(f'Warning: page {page_number} repeats ID {animal_id!r}; the earlier values are overwritten.')
        columns[animal_id] = _parse_cbc_lines(table_lines[:tail_index], items_black)

# final clean-up: build the frame once, then keep only the items that have a
# non-empty value for every animal
result = pd.DataFrame(columns).replace('', np.nan).dropna()

# swap the '.pdf' suffix for '.xlsx'; str.strip('.pdf') would instead remove
# any of the characters '.', 'p', 'd', 'f' from both ends of the name
stem = file[:-len('.pdf')] if file.endswith('.pdf') else file
result.to_excel(stem + '.xlsx')
print('\nDataframe generated. Excel file exported. ')
result


Dataframe generated. Excel file exported. 


Unnamed: 0,Sd23-1-1,Sd23-1-2,Sd23-1-3,Sd23-1-4,Sd23-1-5,Sd23-1-6,Sd23-1-7,Sd23-1-8,Sd23-1-9,Sd23-1-10,...,Sd23-1-32,Sd23-1-33,Sd23-1-34,Sd23-1-35,Sd23-1-36,Sd23-1-37,Sd23-1-38,Sd23-1-39,Sd23-1-40,Sd23-1-41
WBC,3.1 (LOW),4.9 (LOW),1.2 (LOW),3.6 (LOW),3.9 (LOW),6.6,5.5,5.6,4.7 (LOW),5.4 (LOW),...,7.1,5.2 (LOW),7.2,10.8,5.4 (LOW),7.6,9.1,11.4 (HIGH),11.0,7.1
RBC,7.9,7.9,9.1,9.0,9.7,8.7,9.5,8.8,8.5,8.0,...,8.7,8.9,8.8,8.7,9.3,8.7,8.6,8.6,8.7,9.8
HGB,14.1,14.7,14.1,15.5,15.1,14.6,14.6,14.1,14.6,13.5,...,14.2,15.2,15.4,14.8,15.4,15.4,15.3,15.0,14.3,15.7
HCT,48,48,49,55 (HIGH),53 (HIGH),52 (HIGH),51 (HIGH),50.0,51 (HIGH),49,...,51 (HIGH),53 (HIGH),53 (HIGH),52 (HIGH),54 (HIGH),52 (HIGH),52 (HIGH),53 (HIGH),52 (HIGH),55 (HIGH)
MCV,61,61,54,61,55,60,54,57.0,59,61,...,59,59,61,60,58,60,60,62,60,56
MCH,17.8,18.6,15.4,17.1,15.6,16.8,15.4,16.0,17.1,16.8,...,16.2,17.0,17.7,16.9,16.6,17.8,17.8,17.4,16.5,16.0
MCHC,29,31,29,28,29,28,28,28.0,29,28,...,28,29,29,28,29,30,30,28,27,28
Platelet Count,2450,2576,675,2178,1698,2098,1870,1985.0,2969,2258,...,2194,2802,2251,1484,2168,2882,3387,2004,1268,1830
Neutrophils (absolute),434,931,228,180,195,198,220,280.0,188,324,...,568,312,576,108,270,380,819,570,660,568
Neutrophils (percentage),14,19,19,5,5,3,4,5.0,4,6,...,8,6,8,1,5,5,9,5,6,8
