In [10]:
import os
import pandas as pd
import numpy as np
from PIL import Image
import concurrent.futures
import queue
import time
import tesserocr
#from multiprocessing import cpu_count, Pool
from pdf2image import convert_from_path
from pdf2image import convert_from_bytes
from langdetect import detect

# Import the source data
The data is provided as a directory that is three levels deep (the third level is ommited in the following listing).
``` bash
fiete@ubu:~/Documents/studium/analyse_semi_und_unstrukturierter_daten$ tree -d -L 1 CAPTUM
CAPTUM
├── Allergic Diseases
├── ANA
├── Angioedema
├── anti-FcεRI
├── Antihistamine
├── Anti-IgE
├── anti-TPO IgE ratio
├── ASST
├── Basophil
├── BAT
├── BHRA
├── CRP
├── Cyclosporine
├── D-Dimer
├── dsDNA
├── Duration
├── Eosinophil
├── IL-24
├── Omalizumab
├── Severity
├── Thyroglobulin
├── Total IgE
└── TPO
```

To work further with the source data, it is useful to have a list of file paths for the pdfs. The following creates a list of all pdf files in the `CAPTUM` source folder.

In [20]:
def get_filepaths(path):
    pdf_filepaths = []
    for root, directories, files in os.walk(path, topdown=False):
        for name in files:
            if name[-4:] == '.pdf':
                pdf_filepaths.append(os.path.join(root, name))
    return pdf_filepaths

def get_language(path):
        text = ''
        for p in range(0,5):
            image_path = path[:-4] + '_' + str(p) + '.jpg'
            if os.path.isfile(image_path):
                if len(text) <= 500:
                    text += tesserocr.file_to_text(image_path)
                else:
                    print('Reached required number of words for language detection after ' + str(p) + ' pages.')
                    break
            else:
                break
        return detect(text[:500]) # returns i.e en or de

tesserocr_queue = queue.Queue()

def perform_ocr(img):
    tess_api = None
    try:
        tess_api = tesserocr_queue.get(block=True, timeout=300)
        tess_api.SetImage(img)
        text = tess_api.GetUTF8Text()
        return text
    except tesserocr_queue.Empty:
        print('Empty exception caught!')
        return None
    finally:
        if tess_api is not None:
            tesserocr_queue.put(tess_api)

def run_threaded_ocr_on_pdf(ocr_images, num_threads, language):
    # Setup Queue
    for _ in range(num_threads):
        tesserocr_queue.put(tesserocr.PyTessBaseAPI(lang=language))

    # Perform OCR using ThreadPoolExecutor
    start = time.time()
    with concurrent.futures.ThreadPoolExecutor(max_workers=num_threads) as executor:
        res = executor.map(perform_ocr, ocr_images)
    end = time.time()

    # Restoring queue
    for _ in range(num_threads):
        api = tesserocr_queue.get(block=True)
        api.End()

    tesserocr_queue.queue.clear()
    return (res, end - start)

def ocr_pdf(filepath, language, threads):
    # Pdf to image
    with open(filepath, 'rb') as raw_pdf:
        ocr_entities = convert_from_bytes(raw_pdf.read(), dpi=300, thread_count=4, grayscale=True)

    print(f'Starting OCR for file { os.path.basename(filepath) }')
    result_iterator, total_time = run_threaded_ocr_on_pdf(ocr_entities, threads, language)

    text = ''
    number_of_pages = 0
    for item in result_iterator:
        text += item
        number_of_pages += 1
    
    print(f'OCR finished in {str(total_time)} seconds with an average of {str(total_time / number_of_pages)} seconds per page.')
    return (text, number_of_pages)

## Initialize dataframe with pdf filepaths

In [12]:
root_dir = './CAPTUM'
df = pd.DataFrame(get_filepaths(root_dir), columns = ['filepath'])
df.head()

Unnamed: 0,filepath
0,./CAPTUM/CRP/ANA/Asero 2017.pdf
1,./CAPTUM/CRP/ANA/Magen 2015.pdf
2,./CAPTUM/CRP/Severity/Kolkhir 2017 .pdf
3,./CAPTUM/CRP/Severity/Baek 2014.pdf
4,./CAPTUM/CRP/Severity/Kasperska-Zajac 2015.pdf


## Check data for duplicate entries
We can identify duplicate pdfs by computing the checksum of each file and then counting the unique values. So let us define the checksum function `get_checksum()`:

In [13]:
# https://stackoverflow.com/questions/16874598/how-do-i-calculate-the-md5-checksum-of-a-file-in-python#16876405
import hashlib

def get_checksum(filepath: str) -> str:
    # Open,close, read file and calculate MD5 on its contents 
    with open(filepath, 'rb') as file_to_check:
        # read contents of the file
        data = file_to_check.read()    
        # pipe contents of the file through
        return hashlib.md5(data).hexdigest()

# check that it works
file_one, file_one_copy, file_two = "./pdf_1.pdf", "./pdf_1 copy.pdf", "./pdf_2.pdf"
assert get_checksum(file_one) == get_checksum(file_one_copy), "should be equal"
assert get_checksum(file_one) != get_checksum(file_two), "should not be equal"

Then we can create a pandas dataframe from the list of filepath's and also add a checksum column that is computed using our `get_checksum()` function.

In [14]:
df['checksum'] = df['filepath'].map(get_checksum)
df.head()

Unnamed: 0,filepath,checksum
0,./CAPTUM/CRP/ANA/Asero 2017.pdf,2fad223ae2232cb9e855d3ece9e34b72
1,./CAPTUM/CRP/ANA/Magen 2015.pdf,c721aaea67a47811324b3c860dde612b
2,./CAPTUM/CRP/Severity/Kolkhir 2017 .pdf,aed2cb292fdffefe2a319b9d7e517bb3
3,./CAPTUM/CRP/Severity/Baek 2014.pdf,989e3eca08259c9a898acc551473f55f
4,./CAPTUM/CRP/Severity/Kasperska-Zajac 2015.pdf,2ed156f4fd5cfa00198f3f6f590940e0


In the final step, we can analyse the results of this activity. It seems that our available data is in reality only half as large as it initially appears.

In [15]:
print('Total number of pdfs: {}'.format(df['checksum'].count()))
print('Total number of unique pdfs: {}'.format(len(df['checksum'].unique())))
df['checksum']


Total number of pdfs: 1047
Total number of unique pdfs: 464


0       2fad223ae2232cb9e855d3ece9e34b72
1       c721aaea67a47811324b3c860dde612b
2       aed2cb292fdffefe2a319b9d7e517bb3
3       989e3eca08259c9a898acc551473f55f
4       2ed156f4fd5cfa00198f3f6f590940e0
                      ...               
1042    fb22292adf8f35656fde0e54dc0cee51
1043    6a5635468c99716fc18b91b7b6ebaeaf
1044    6cfd7540663be0f6d7fb72f776339b71
1045    849adffe6101df0a030cf425f661e1ed
1046    f13be81ffbff55e031a34ef81d43cbff
Name: checksum, Length: 1047, dtype: object

Now we create a df of unique pdfs by removing duplicate checksums

In [16]:
df_unique = df.drop_duplicates(subset=['checksum'])
df_unique.head()

Unnamed: 0,filepath,checksum
0,./CAPTUM/CRP/ANA/Asero 2017.pdf,2fad223ae2232cb9e855d3ece9e34b72
1,./CAPTUM/CRP/ANA/Magen 2015.pdf,c721aaea67a47811324b3c860dde612b
2,./CAPTUM/CRP/Severity/Kolkhir 2017 .pdf,aed2cb292fdffefe2a319b9d7e517bb3
3,./CAPTUM/CRP/Severity/Baek 2014.pdf,989e3eca08259c9a898acc551473f55f
4,./CAPTUM/CRP/Severity/Kasperska-Zajac 2015.pdf,2ed156f4fd5cfa00198f3f6f590940e0


# Extracting the text
The next step is to read the text from the pdfs. We will do this using Optical Character Recognition (OCR)

## Identify the document language based on a sample of pages

In [17]:
df_unique['lang'] = df_unique['filepath'].map(get_language)
df_unique.lang = df_unique.lang.map({'en':'eng','de':'deu'})
df_unique = df_unique.sort_values(by='lang')
df_unique.reset_index(drop=True, inplace=True)
df_unique.head()

tection after 1 pages.
Reached required number of words for language detection after 1 pages.
Reached required number of words for language detection after 1 pages.
Reached required number of words for language detection after 1 pages.
Reached required number of words for language detection after 1 pages.
Reached required number of words for language detection after 1 pages.
Reached required number of words for language detection after 1 pages.
Reached required number of words for language detection after 1 pages.
Reached required number of words for language detection after 1 pages.
Reached required number of words for language detection after 1 pages.
Reached required number of words for language detection after 1 pages.
Reached required number of words for language detection after 1 pages.
Reached required number of words for language detection after 1 pages.
Reached required number of words for language detection after 1 pages.
Reached required number of words for language detectio

Unnamed: 0,filepath,checksum,lang
0,./CAPTUM/CRP/ANA/Asero 2017.pdf,2fad223ae2232cb9e855d3ece9e34b72,eng
1,./CAPTUM/Allergic Diseases/Omalizumab/Llanos 2...,eef921176bb03136228aefffb9727ae3,eng
2,./CAPTUM/Allergic Diseases/Omalizumab/Clark 20...,c947cef954a9796cdc2bab0cd8507874,eng
3,./CAPTUM/Allergic Diseases/Omalizumab/Damask 2...,3f5cbea57ace7d351641e634c7008478,eng
4,./CAPTUM/Allergic Diseases/Omalizumab/Palacios...,a39d7763465b87b81a72259037b3c158,eng


### Running OCR on the images

In [21]:
# check optimal number of threads with tesser_perf.py
threads = 8

for index, row in df_unique.iterrows():
    text, number_of_pages = ocr_pdf(row.filepath, row.lang ,threads)
    df_unique.loc[index, 'text'] = text
    df_unique.loc[index, 'number_of_pages'] = number_of_pages

df_unique.number_of_pages = df_unique.number_of_pages.astype(int)   # is decimal otherwise
print(df_unique.head())
df_unique.to_csv('captum.csv')

rage of 1.2425532937049866 seconds per page.
Starting OCR for file Kyriakou 2018.pdf
OCR finished in 2.7095561027526855 seconds with an average of 0.30106178919474286 seconds per page.
Starting OCR for file Salman 2019  .pdf
OCR finished in 7.834311008453369 seconds with an average of 0.3561050458387895 seconds per page.
Starting OCR for file Maurer 2013.pdf
OCR finished in 11.13097858428955 seconds with an average of 0.9275815486907959 seconds per page.
Starting OCR for file Sanchez 2018.pdf
OCR finished in 6.654712438583374 seconds with an average of 0.9506732055119106 seconds per page.
Starting OCR for file Nam 2012.pdf
OCR finished in 5.362809181213379 seconds with an average of 1.0725618362426759 seconds per page.
Starting OCR for file Kulthanan 2017 .pdf
OCR finished in 30.386348724365234 seconds with an average of 4.340906960623605 seconds per page.
Starting OCR for file Asero 2018.pdf
OCR finished in 2.859804630279541 seconds with an average of 0.408543518611363 seconds per pag

TypeError: Expected bytes, got float