In [118]:
import os
import re
import pandas as pd
import numpy as np
from PIL import Image
import concurrent.futures
import queue
import time
import tesserocr
#from multiprocessing import cpu_count, Pool
from pdf2image import convert_from_path
from pdf2image import convert_from_bytes
from langdetect import detect

# Import the source data
The data is provided as a directory that is three levels deep (the third level is ommited in the following listing).
``` bash
fiete@ubu:~/Documents/studium/analyse_semi_und_unstrukturierter_daten$ tree -d -L 1 CAPTUM
CAPTUM
├── Allergic Diseases
├── ANA
├── Angioedema
├── anti-FcεRI
├── Antihistamine
├── Anti-IgE
├── anti-TPO IgE ratio
├── ASST
├── Basophil
├── BAT
├── BHRA
├── CRP
├── Cyclosporine
├── D-Dimer
├── dsDNA
├── Duration
├── Eosinophil
├── IL-24
├── Omalizumab
├── Severity
├── Thyroglobulin
├── Total IgE
└── TPO
```

To work further with the source data, it is useful to have a list of file paths for the pdfs. The following creates a list of all pdf files in the `CAPTUM` source folder.

In [119]:
def get_filepaths(path):
    pdf_filepaths = []
    for root, directories, files in os.walk(path, topdown=False):
        for name in files:
            if name[-4:] == '.pdf':
                pdf_filepaths.append(os.path.join(root, name))
    return pdf_filepaths

def get_language(path):
        text = ''
        with open(path, 'rb') as raw_pdf:
            ocr_entities = convert_from_bytes(raw_pdf.read(), dpi=300, thread_count=4, grayscale=True) # return List[PIL.Image]
        for p in range(0,10):
            if len(text) <= 500:
                    text += tesserocr.image_to_text(ocr_entities[p])
            else:
                print('Reached required number of words for language detection after ' + str(p) + ' pages.')
                break
        return detect(text[:500]) # returns i.e en or de

tesserocr_queue = queue.Queue()

def perform_ocr(img):
    tess_api = None
    try:
        tess_api = tesserocr_queue.get(block=True, timeout=300)
        tess_api.SetImage(img)
        text = tess_api.GetUTF8Text()
        return text
    except tesserocr_queue.Empty:
        print('Empty exception caught!')
        return None
    finally:
        if tess_api is not None:
            tesserocr_queue.put(tess_api)

def run_threaded_ocr_on_pdf(ocr_images, num_threads, language):
    # Setup Queue
    for _ in range(num_threads):
        tesserocr_queue.put(tesserocr.PyTessBaseAPI(lang=language))

    # Perform OCR using ThreadPoolExecutor
    start = time.time()
    with concurrent.futures.ThreadPoolExecutor(max_workers=num_threads) as executor:
        res = executor.map(perform_ocr, ocr_images)
    end = time.time()

    # Restoring queue
    for _ in range(num_threads):
        api = tesserocr_queue.get(block=True)
        api.End()

    tesserocr_queue.queue.clear()
    return (res, end - start)

def ocr_pdf(filepath, language, threads):
    if (language not in ['eng', 'deu']):
        print(f"Got language code { language }, using 'eng' for ocr.")
        language = 'eng'
    # Pdf to image
    with open(filepath, 'rb') as raw_pdf:
        ocr_entities = convert_from_bytes(raw_pdf.read(), dpi=300, thread_count=4, grayscale=True)

    print(f'Starting OCR for file { os.path.basename(filepath) }')
    result_iterator, total_time = run_threaded_ocr_on_pdf(ocr_entities, threads, language)

    text = ''
    number_of_pages = 0
    for item in result_iterator:
        text += item
        number_of_pages += 1
    
    # print(f'OCR finished in {str(total_time)} seconds with an average of {str(total_time / number_of_pages)} seconds per page.')
    return (text, number_of_pages)

def clean_text(text: str) -> str:
    # https://www.kaggle.com/arijzou/text-preprocessing-disaster-tweets
    url_pattern = r'''(?i)\b((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:'".,<>?«»“”‘’]))'''
    # https://www.emailregex.com/
    email_pattern = r'''(?:[a-z0-9!#$%&'*+/=?^_`{|}~-]+(?:\.[a-z0-9!#$%&'*+/=?^_`{|}~-]+)*|"(?:[\x01-\x08\x0b\x0c\x0e-\x1f\x21\x23-\x5b\x5d-\x7f]|\\[\x01-\x09\x0b\x0c\x0e-\x7f])*")@(?:(?:[a-z0-9](?:[a-z0-9-]*[a-z0-9])?\.)+[a-z0-9](?:[a-z0-9-]*[a-z0-9])?|\[(?:(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.){3}(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?|[a-z0-9-]*[a-z0-9]:(?:[\x01-\x08\x0b\x0c\x0e-\x1f\x21-\x5a\x53-\x7f]|\\[\x01-\x09\x0b\x0c\x0e-\x7f])+)\])'''
    text = text.replace("\n"," ")                   # new lines
    text = " ".join(text.split())                   # consecutive spaces
    #text = re.sub(url_pattern, '', text)            # urls
    #text = re.sub(email_pattern, '', text)          # e-mails
    #text = re.sub(r'\[[0-9]+\]','', text)           # quotation references
    #text = re.sub(r'[:°<>,="”~{}()!\[\]]','', text) # meaningless characters
    #text = text.replace(':', '')                    # colons
    text = text.lower()                             # turn lowercase
    return text

## Initialize dataframe with pdf filepaths

In [120]:
root_dir = './CAPTUM'
captum_csv = 'captum.csv'

# make this notebook rerunnable (idempotent)
if os.path.isfile(captum_csv):
    captum = pd.read_csv(captum_csv, index_col=0)
else:
    captum = pd.DataFrame(get_filepaths(root_dir), columns = ['filepath'])

captum.head()

Unnamed: 0,filepath,checksum,lang,text,number_of_pages
0,./CAPTUM/CRP/ANA/Asero 2017.pdf,2fad223ae2232cb9e855d3ece9e34b72,eng,Clinical Allergology - Original Paper\n\nInter...,5
1,./CAPTUM/Allergic Diseases/Omalizumab/Palacios...,a39d7763465b87b81a72259037b3c158,eng,1duosnuey\ soulny 1duosnueyy souiny 1duosnuey\...,7
2,./CAPTUM/Allergic Diseases/Omalizumab/Incorvai...,eac49b8d8cc3fbb1bf253eb33303e599,eng,"Drug Design, Development and Therapy\n\n8\n\nD...",11
3,./CAPTUM/Allergic Diseases/Omalizumab/Cusack 2...,5c9c87edcc49054b5710fec3f498777d,eng,"QJM Advance Access published March 9, 2016\n\n...",14
4,./CAPTUM/Allergic Diseases/Omalizumab/Ke 2018.pdf,f1f6e0f31707f860336c9ba7d2d49567,eng,aT ee ee\n\nReal-World Characteristics and Tre...,11


## Check data for duplicate entries
We can identify duplicate pdfs by computing the checksum of each file and then counting the unique values. So let us define the checksum function `get_checksum()`:

In [121]:
# https://stackoverflow.com/questions/16874598/how-do-i-calculate-the-md5-checksum-of-a-file-in-python#16876405
import hashlib

def get_checksum(filepath: str) -> str:
    # Open,close, read file and calculate MD5 on its contents 
    with open(filepath, 'rb') as file_to_check:
        # read contents of the file
        data = file_to_check.read()    
        # pipe contents of the file through
        return hashlib.md5(data).hexdigest()

# check that it works
file_one, file_one_copy, file_two = "./pdf_1.pdf", "./pdf_1 copy.pdf", "./pdf_2.pdf"
assert get_checksum(file_one) == get_checksum(file_one_copy), "should be equal"
assert get_checksum(file_one) != get_checksum(file_two), "should not be equal"

Then we can create a pandas dataframe from the list of filepath's and also add a checksum column that is computed using our `get_checksum()` function.

In [122]:
if 'checksum' not in captum.columns:
    captum['checksum'] = captum['filepath'].map(get_checksum)
    captum.head()
else:
    print('Skipped cell')

Skipped cell


In the final step, we can analyse the results of this activity. It seems that our available data is in reality only half as large as it initially appears.

In [123]:
print('Total number of pdfs: {}'.format(captum['checksum'].count()))
print('Total number of unique pdfs: {}'.format(len(captum['checksum'].unique())))
captum['checksum']


Total number of pdfs: 464
Total number of unique pdfs: 464


0      2fad223ae2232cb9e855d3ece9e34b72
1      a39d7763465b87b81a72259037b3c158
2      eac49b8d8cc3fbb1bf253eb33303e599
3      5c9c87edcc49054b5710fec3f498777d
4      f1f6e0f31707f860336c9ba7d2d49567
                     ...               
459    618e7a3878680f56615727f92a241794
460    bcdf98982a6308d4c6ff11b2568a1da2
461    64c79e3174b21fff8317ed5ebcf94c5c
462    7ee96581891eaf2761ebffab6ef05491
463    1eba773a39a5b01bfef1a04337cfce7e
Name: checksum, Length: 464, dtype: object

Now we create a df of unique pdfs by removing duplicate checksums

In [124]:
captum = captum.drop_duplicates(subset=['checksum'])
captum.head()

Unnamed: 0,filepath,checksum,lang,text,number_of_pages
0,./CAPTUM/CRP/ANA/Asero 2017.pdf,2fad223ae2232cb9e855d3ece9e34b72,eng,Clinical Allergology - Original Paper\n\nInter...,5
1,./CAPTUM/Allergic Diseases/Omalizumab/Palacios...,a39d7763465b87b81a72259037b3c158,eng,1duosnuey\ soulny 1duosnueyy souiny 1duosnuey\...,7
2,./CAPTUM/Allergic Diseases/Omalizumab/Incorvai...,eac49b8d8cc3fbb1bf253eb33303e599,eng,"Drug Design, Development and Therapy\n\n8\n\nD...",11
3,./CAPTUM/Allergic Diseases/Omalizumab/Cusack 2...,5c9c87edcc49054b5710fec3f498777d,eng,"QJM Advance Access published March 9, 2016\n\n...",14
4,./CAPTUM/Allergic Diseases/Omalizumab/Ke 2018.pdf,f1f6e0f31707f860336c9ba7d2d49567,eng,aT ee ee\n\nReal-World Characteristics and Tre...,11


# Extracting the text
The next step is to read the text from the pdfs. We will do this using Optical Character Recognition (OCR)

## Identify the document language based on a sample of pages

In [125]:
if 'lang' not in captum.columns:
    captum['lang'] = captum['filepath'].map(get_language)
    captum.lang = captum.lang.replace({'en':'eng','de':'deu'})
    captum = captum.sort_values(by='lang')
    captum.reset_index(drop=True, inplace=True)
    captum.head()
    captum.to_csv(captum_csv)
else:
    print('Skipped cell')

Skipped cell


### Running OCR on the images

In [126]:
if 'text' not in captum.columns:
    # check optimal number of threads with tesser_perf.py
    threads = 8

    for index, row in captum.iterrows():
        text, number_of_pages = ocr_pdf(row.filepath, row.lang ,threads)
        captum.loc[index, 'text'] = text
        captum.loc[index, 'number_of_pages'] = number_of_pages

    captum.number_of_pages = captum.number_of_pages.astype(int)   # is decimal otherwise
    print(captum.head())
    captum.to_csv(captum_csv)

In [127]:
captum.head()
captum.text

0      Clinical Allergology - Original Paper\n\nInter...
1      1duosnuey\ soulny 1duosnueyy souiny 1duosnuey\...
2      Drug Design, Development and Therapy\n\n8\n\nD...
3      QJM Advance Access published March 9, 2016\n\n...
4      aT ee ee\n\nReal-World Characteristics and Tre...
                             ...                        
459    10\n\n11\n\n12\n\n13\n\n14\n\n15\n\n16\n\n17\n...
460    ®\n\nCheck for\nDOI: 10.1111/j)dv.15350 JEADV ...
461    Clinical & Experimental Immunology\n\nThe Jour...
462    van den Elzen et al. Clin Trans! Allergy (2017...
463    Atopic dermatitis and skin disease\n\nExpressi...
Name: text, Length: 464, dtype: object

## Clean the text
Remove
- New lines (\n)
- Consecutive spaces

In [128]:
captum = pd.read_csv(captum_csv)
for index,row in captum.iterrows():
    captum.loc[index, 'text'] = clean_text(row['text'])
captum.head()

373/464 length: 28003
374/464 length: 18107
375/464 length: 24861
376/464 length: 35520
377/464 length: 29231
378/464 length: 36434
379/464 length: 25278
380/464 length: 29587
381/464 length: 39840
382/464 length: 28636
383/464 length: 24620
384/464 length: 27676
385/464 length: 39193
386/464 length: 34395
387/464 length: 25398
388/464 length: 15265
389/464 length: 21285
390/464 length: 32929
391/464 length: 27501
392/464 length: 11943
393/464 length: 13338
394/464 length: 51434
395/464 length: 17129
396/464 length: 42356
397/464 length: 11818
398/464 length: 44714
399/464 length: 56347
400/464 length: 42234
401/464 length: 24002
402/464 length: 49265
403/464 length: 29505
404/464 length: 23799
405/464 length: 46249
406/464 length: 44414
407/464 length: 46480
408/464 length: 26351
409/464 length: 38962
410/464 length: 52752
411/464 length: 17538
412/464 length: 48409
413/464 length: 15804
414/464 length: 44230
415/464 length: 45088
416/464 length: 29374
417/464 length: 30315
418/464 le

Unnamed: 0.1,Unnamed: 0,filepath,checksum,lang,text,number_of_pages
0,0,./CAPTUM/CRP/ANA/Asero 2017.pdf,2fad223ae2232cb9e855d3ece9e34b72,eng,clinical allergology - original paper internat...,5
1,1,./CAPTUM/Allergic Diseases/Omalizumab/Palacios...,a39d7763465b87b81a72259037b3c158,eng,1duosnuey\ soulny 1duosnueyy souiny 1duosnuey\...,7
2,2,./CAPTUM/Allergic Diseases/Omalizumab/Incorvai...,eac49b8d8cc3fbb1bf253eb33303e599,eng,"drug design, development and therapy 8 dove re...",11
3,3,./CAPTUM/Allergic Diseases/Omalizumab/Cusack 2...,5c9c87edcc49054b5710fec3f498777d,eng,"qjm advance access published march 9, 2016 qua...",14
4,4,./CAPTUM/Allergic Diseases/Omalizumab/Ke 2018.pdf,f1f6e0f31707f860336c9ba7d2d49567,eng,at ee ee real-world characteristics and treatm...,11


In [129]:
captum.to_csv(captum_csv)