### This notebook tests an ELT/ETL pipeline that reads through all the arXiv PDF tar files, converts them to raw searchable PDFs, and then converts those into our training data via pdfplumber

pdfplumber is used to extract the bounding boxes for characters and words from each PDF; these are saved as CSV files.
pdf2image is then used to save each PDF page as a .png image. pdf2image depends on poppler, which is installed here via conda:
1. conda create -n pdf python=3.7
2. conda activate pdf
3. conda install -c conda-forge poppler
4. pip install pdf2image

### TO DO
1. convert the following functions into @ray.remote
    2. unzip tars for a given year
    3. move pdf in each unzipped folder to blob
    4. delete pdfs from local

In [1]:
import os, uuid
from azure.storage.blob import BlobServiceClient, BlobClient, ContainerClient
from pathlib import Path
import shutil
import time
import glob
import pdfplumber
import pandas as pd
from pdf2image import convert_from_path
from pdf2image.exceptions import (
 PDFInfoNotInstalledError,
 PDFPageCountError,
 PDFSyntaxError
)
import ray
import tarfile
from retry import retry
import sys
import os

'''
For ray to work all the files and modules have to be within the original sys.path.
for some reason you can't append another path and import files from there. Therefore
have to set the PYTHONPATH for each new process. In this case the run_pdfplumber
function is calling src.fixunicode which is in another child directory
'''
os.environ['PYTHONPATH'] = os.path.dirname(os.getcwd())
sys.path.append(os.path.dirname(os.getcwd()))
import config
import src.blob_data_transfer as blob_pull
import src.fixunicode as fixunicode

In [4]:
# Show the current working directory; the PYTHONPATH / sys.path entries set in the import cell are its parent directory.
os.getcwd()

'/home/arxiv/doc_intel_etl/notebooks'

In [3]:
# Start a local Ray runtime with default settings so the @ray.remote tasks in this notebook can run.
# Uncomment ray.shutdown() to stop an already-running runtime before re-running this cell.
# ray.shutdown()
ray.init()

2020-07-08 04:54:18,424	INFO resource_spec.py:212 -- Starting Ray with 10.89 GiB memory available for workers and up to 5.45 GiB for objects. You can adjust these settings with ray.init(memory=<bytes>, object_store_memory=<bytes>).
2020-07-08 04:54:18,743	INFO services.py:1170 -- View the Ray dashboard at [1m[32mlocalhost:8265[39m[22m


{'node_ip_address': '172.17.0.2',
 'raylet_ip_address': '172.17.0.2',
 'redis_address': '172.17.0.2:32002',
 'object_store_address': '/tmp/ray/session_2020-07-08_04-54-18_422911_51649/sockets/plasma_store',
 'raylet_socket_name': '/tmp/ray/session_2020-07-08_04-54-18_422911_51649/sockets/raylet',
 'webui_url': 'localhost:8265',
 'session_dir': '/tmp/ray/session_2020-07-08_04-54-18_422911_51649'}

Traceback (most recent call last):
  File "/opt/conda/lib/python3.7/site-packages/ray/dashboard/dashboard.py", line 1220, in <module>
    dashboard.run()
  File "/opt/conda/lib/python3.7/site-packages/ray/dashboard/dashboard.py", line 594, in run
    aiohttp.web.run_app(self.app, host=self.host, port=self.port)
  File "/opt/conda/lib/python3.7/site-packages/aiohttp/web.py", line 433, in run_app
    reuse_port=reuse_port))
  File "/opt/conda/lib/python3.7/asyncio/base_events.py", line 587, in run_until_complete
    return future.result()
  File "/opt/conda/lib/python3.7/site-packages/aiohttp/web.py", line 359, in _run_app
    await site.start()
  File "/opt/conda/lib/python3.7/site-packages/aiohttp/web_runner.py", line 104, in start
    reuse_port=self._reuse_port)
  File "/opt/conda/lib/python3.7/asyncio/base_events.py", line 1389, in create_server
    % (sa, err.strerror.lower())) from None
OSError: [Errno 99] error while attempting to bind on address ('::1', 8265, 0, 0): cannot assig

In [2]:
# credentials for blob with our raw data
# All three values are read from the project `config` module rather than being hard-coded in the notebook.
storage_name = config.azure_blob['storage_name']
key = config.azure_blob['key']
connect_str = config.azure_blob['connect_str']

In [3]:
# Blob container that the move_*_to_blob tasks upload into.
CONTAINER = config.azure_blob['container']
# Passed to blob_pull.get_blob_file_list to select which blobs to keep.
file_type = 'tar'
# Passed to blob_pull.get_blob_file_list / copy_blob.
# NOTE(review): presumably the index of the year component in the blob path — confirm in src.blob_data_transfer.
year_del = 2
# Passed to blob_pull.get_blob_list as the blob prefix to list.
prefix = 'arxiv_dl/pdf'

# TODO
1. Need to add a try and exception handling for "unzip_tar"
2. Add proper logging

Data is in the following format:  arxiv_dl/pdf/year/tar_file in the raw data directory
We need to do the following:
Class variables:
1. full_blob_list
2. blob_service_client
3. CONTAINER
4. 

In [4]:
@ray.remote
def unzip_tar(file, tar_path, extract_path):
    """Extract one tar archive into ``extract_path``.

    Polls every 0.5 s until ``tar_path/file`` exists, then extracts every
    member of the archive. Archives that cannot be read are reported on
    stdout and skipped (best-effort) rather than failing the task.

    Args:
        file: Name of the tar archive inside ``tar_path``.
        tar_path: Directory that holds the archive.
        extract_path: Directory the archive members are extracted into.

    Raises:
        ValueError: If the path exists but is not a regular file.
    """
    archive_path = os.path.join(tar_path, file)

    # Wait for the archive to appear on disk before touching it.
    while not os.path.exists(archive_path):
        time.sleep(0.5)

    if not os.path.isfile(archive_path):
        raise ValueError("%s isn't a file yet" % archive_path)

    try:
        # The context manager closes the archive even when extraction fails part-way.
        with tarfile.open(archive_path) as tar:
            # NOTE(review): extractall trusts the member paths stored in the archive;
            # only use this on archives from a trusted source.
            tar.extractall(path=extract_path)
    except tarfile.ReadError:
        # Can't open empty tar files because it produces a read error, so skip them.
        print("Can't open because it's empty: ", file)
    except Exception as e:
        # Any other failure is still best-effort, but report what actually went wrong
        # instead of mislabelling it as an empty archive.
        print("Can't extract %s: %r" % (file, e))

@ray.remote
def move_pdf_to_blob(filepath):
    """Upload one local PDF to the blob container; a ``None`` path is ignored.

    The blob name keeps the first two components of ``filepath`` plus its
    file name, dropping any folders in between.
    """
    if filepath is None:
        return

    parts = filepath.split('/')
    # Create a blob client using the shortened local path as the name for the blob
    pdf_blob_path = '/'.join((parts[0], parts[1], parts[-1]))
    # NOTE(review): blob_service_client is not assigned in any visible cell of this
    # notebook — confirm where it is created before running on a fresh kernel.
    blob_client = blob_service_client.get_blob_client(container=CONTAINER, blob=pdf_blob_path)
    with open(filepath, "rb") as pdf:
        blob_client.upload_blob(pdf)

@retry(tries=10, delay=1)
def open_pdf(filepath):
    """Read every page of a PDF with pdfplumber.

    Args:
        filepath: Path of the PDF to read.

    Returns:
        A ``(chars, words, text)`` tuple: the character objects and extracted
        words of all pages, and the concatenated page text after
        ``fixunicode.fix_unicode``. Returns ``None`` when the PDF could not be
        read; the failure is printed, not raised.
    """
    try:
        chars = []
        words = []
        page_texts = []
        with pdfplumber.open(filepath) as pdf:
            for current_page in pdf.pages:
                chars.extend(current_page.chars)
                words.extend(current_page.extract_words(x_tolerance=1, y_tolerance=0))
                # extract_text may return None for a page without any text; treat that
                # as an empty page instead of failing the whole document.
                page_texts.append(current_page.extract_text(x_tolerance=1, y_tolerance=0) or "")

        # run text through very basic unicode normalization routines before sending back
        text = fixunicode.fix_unicode("".join(page_texts))

        return chars, words, text
    except Exception as e:
        # NOTE(review): because the exception is swallowed here, the @retry decorator
        # never sees a failure and so never retries — confirm which behaviour is wanted.
        print("Error: can't open ", filepath, repr(e))
        return None
        
@ray.remote
def run_pdfplumber(filepath, char_path, word_path, text_path):
    """Extract chars, words and text from one PDF and write them to disk.

    Polls every 0.5 s until ``filepath`` exists, then writes
    ``<name>_chars.csv`` into ``char_path``, ``<name>_words.csv`` into
    ``word_path`` and ``<name>.txt`` into ``text_path``.

    Args:
        filepath: Path of the PDF to process ('/'-separated).
        char_path: Output directory for the per-character CSV.
        word_path: Output directory for the per-word CSV.
        text_path: Output directory for the plain-text file.

    Returns:
        ``filepath`` on success. On any failure the PDF is deleted from local
        disk and ``None`` is returned.
    """
    stem = filepath.split('/')[-1].replace('.pdf', '')

    while not os.path.exists(filepath):
        time.sleep(0.5)
    try:
        # open_pdf returns None on failure, which makes this unpacking raise and
        # sends us to the cleanup branch below.
        chars, words, text = open_pdf(filepath)
        pd.DataFrame(chars).to_csv(char_path + '/' + stem + '_chars.csv', index=False)
        pd.DataFrame(words).to_csv(word_path + '/' + stem + '_words.csv', index=False)

        # write full pdf text to a .txt file; explicit UTF-8 so the result does not
        # depend on the worker's locale, and `with` so the handle is always closed
        with open(text_path + '/' + stem + '.txt', 'w', encoding='utf-8') as f:
            f.write(text)

        return filepath
    except Exception as e:
        print("Can't unpack: ", filepath, repr(e))
        # Drop PDFs that could not be processed.
        os.remove(filepath)
        return None
    

@ray.remote
def move_csv_to_blob(csv):
    """Upload one local CSV to the blob container; a ``None`` path is ignored.

    The blob name keeps the first four components of ``csv`` plus its file name.
    """
    if csv is None:
        return

    parts = csv.split('/')
    csv_blob_path = '/'.join(parts[:4] + [parts[-1]])
    # NOTE(review): blob_service_client is not assigned in any visible cell of this
    # notebook — confirm where it is created before running on a fresh kernel.
    blob_client = blob_service_client.get_blob_client(container=CONTAINER, blob=csv_blob_path)
    with open(csv, "rb") as csv_file:
        blob_client.upload_blob(csv_file)

@ray.remote
def convert_to_image(filepath, image_path):
    """Render every page of a PDF to a PNG file inside ``image_path``.

    Pages are saved as ``<pdf name>_<page index>.png``.

    Args:
        filepath: Path of the PDF to render ('/'-separated).
        image_path: Directory the PNG files are written into.

    Returns:
        The list of PNG paths written, or ``None`` when ``filepath`` does not
        exist or the conversion fails (the failure is printed, not raised).
    """
    if not os.path.exists(filepath):
        return None

    pdf_stem = filepath.split('/')[-1].replace('.pdf', '')
    image_list = []
    try:
        images = convert_from_path(filepath)
        for i, image in enumerate(images):
            fname = image_path + "/" + pdf_stem + "_" + str(i) + ".png"
            image.save(fname, "PNG")
            # append, not extend: extend would add the path one character at a time
            image_list.append(fname)
        return image_list
    except Exception as e:
        # Include the underlying error so failed conversions can be diagnosed.
        print("{} can't be converted to image: {!r}".format(filepath, e))
        return None
    
@ray.remote
def move_image_to_blob(img):
    """Upload one local image to the blob container; a ``None`` path is ignored.

    The blob name keeps the first three components of ``img`` plus its file name.
    """
    if img is None:
        return

    parts = img.split('/')
    img_blob_path = '/'.join(parts[:3] + [parts[-1]])
    # NOTE(review): blob_service_client is not assigned in any visible cell of this
    # notebook — confirm where it is created before running on a fresh kernel.
    blob_client = blob_service_client.get_blob_client(container=CONTAINER, blob=img_blob_path)
    with open(img, "rb") as img_file:
        blob_client.upload_blob(img_file)

@ray.remote
def move_txt_to_blob(txt):
    """Upload one local text file to the blob container; a ``None`` path is ignored.

    The blob name keeps the first four components of ``txt`` plus its file name.
    """
    if txt is None:
        return

    parts = txt.split('/')
    txt_blob_path = '/'.join(parts[:4] + [parts[-1]])
    # NOTE(review): blob_service_client is not assigned in any visible cell of this
    # notebook — confirm where it is created before running on a fresh kernel.
    blob_client = blob_service_client.get_blob_client(container=CONTAINER, blob=txt_blob_path)
    with open(txt, "rb") as txt_file:
        blob_client.upload_blob(txt_file)

In [5]:
# List the blobs under `prefix`, keep the `file_type` entries, and copy one year's
# archives via blob_pull.copy_blob. The year list from blob storage is overridden
# so only 1991 is processed.
full_blob_list = blob_pull.get_blob_list(prefix)
pdf_tar_list, year_list = blob_pull.get_blob_file_list(file_type, full_blob_list, year_del)
year_list = ['1991']
year = year_list[0]
# for year in year_list:
tar_path = blob_pull.copy_blob(year, pdf_tar_list, year_del)

# Local working directories, one per artifact type, each with a per-year sub-folder.
extract_path = 'arxiv_pdf/'+year
pdfplumber_path = 'arxiv_training_data/pdfplumber'
char_path = pdfplumber_path+'/chars/'+year
word_path = pdfplumber_path+'/words/'+year
text_path = pdfplumber_path+'/text/'+year
image_path = "arxiv_training_data/pdf_images/"+year

# Create them all up front; existing directories are left untouched.
for _work_dir in (extract_path, char_path, word_path, text_path, image_path):
    Path(_work_dir).mkdir(parents=True, exist_ok=True)
    
    # work on all the pdfs for a given year. Each work item is parallelized through ray to
    # go through all the pdfs. Work items are run sequentially so we don't run into any io
    # issues
#         plumbed_pdf_list = ray.get([run_pdfplumber.remote(pdf, char_path, word_path, text_path) for pdf in pdf_list])
#         print("Finished pdfplumber for year: ", year)
    
#         ray.get([move_pdf_to_blob.remote(pdf) for pdf in plumbed_pdf_list])
#         print("Moved pdfs to blob")
#         csv_list = []
#         csv_list.extend(glob.glob(word_path+'/*.csv'))
#         csv_list.extend(glob.glob(char_path+'/*.csv'))
#         ray.get([move_csv_to_blob.remote(csv) for csv in csv_list])
#         print("Moved csvs to blob for year: ", year)
#         image_list = []
#         image_list.extend(glob.glob(image_path+'/*.png'))
#         ray.get([move_image_to_blob.remote(img) for img in image_list])
#         print("Moved images to blob")
#         text_list = []
#         text_list.extend(glob.glob(text_path+'/*.txt'))
#         ray.get([move_txt_to_blob.remote(txt) for txt in text_list])
#         print("Moved text files to blob")

#         # delete all the paths and files we saved on local
#         time.sleep(30)
#         shutil.rmtree('arxiv_pdf')
#         shutil.rmtree('arxiv_dl')
#         shutil.rmtree('arxiv_training_data')

In [8]:
# extract all the tar files for a given year
# ray.get([unzip_tar.remote(file, tar_path, extract_path) for file in os.listdir(tar_path)])
# print("Finished unzipping {} tar files".format(len(os.listdir(tar_path))))

# Gather every extracted PDF for the year: one glob per sub-folder of extract_path.
sub_folders = os.listdir(extract_path)
pdf_list = [
    pdf
    for folder in sub_folders
    for pdf in glob.glob(extract_path+'/'+folder+'/*.pdf')
]

In [10]:
# Render every extracted PDF to PNGs in parallel and wait for all tasks to finish.
# A task yields None when its PDF is missing or the conversion fails.
ray.get([convert_to_image.remote(pdf, image_path) for pdf in pdf_list])

[2m[36m(pid=51698)[0m arxiv_pdf/1991/9111/hep-th9111006.pdf can't be converted to image
[2m[36m(pid=51698)[0m arxiv_pdf/1991/9111/hep-th9111056.pdf can't be converted to image
[2m[36m(pid=51698)[0m arxiv_pdf/1991/9111/hep-th9111009.pdf can't be converted to image
[2m[36m(pid=51698)[0m arxiv_pdf/1991/9111/hep-th9111007.pdf can't be converted to image
[2m[36m(pid=51698)[0m arxiv_pdf/1991/9111/hep-th9111037.pdf can't be converted to image
[2m[36m(pid=51700)[0m arxiv_pdf/1991/9111/hep-th9111020.pdf can't be converted to image
[2m[36m(pid=51700)[0m arxiv_pdf/1991/9111/hep-th9111019.pdf can't be converted to image
[2m[36m(pid=51700)[0m arxiv_pdf/1991/9111/hep-th9111002.pdf can't be converted to image
[2m[36m(pid=51699)[0m arxiv_pdf/1991/9111/hep-th9111048.pdf can't be converted to image
[2m[36m(pid=51697)[0m arxiv_pdf/1991/9111/hep-th9111021.pdf can't be converted to image
[2m[36m(pid=51697)[0m arxiv_pdf/1991/9111/hep-th9111057.pdf can't be converted to image

[None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,

[2m[36m(pid=51698)[0m arxiv_pdf/1991/9112/hep-th9112057.pdf can't be converted to image
[2m[36m(pid=51698)[0m arxiv_pdf/1991/9107/hep-lat9107001.pdf can't be converted to image
[2m[36m(pid=51700)[0m arxiv_pdf/1991/9112/hep-th9112070.pdf can't be converted to image
[2m[36m(pid=51700)[0m arxiv_pdf/1991/9112/hep-th9112003.pdf can't be converted to image
[2m[36m(pid=51699)[0m arxiv_pdf/1991/9112/hep-th9112073.pdf can't be converted to image
[2m[36m(pid=51699)[0m arxiv_pdf/1991/9112/hep-th9112042.pdf can't be converted to image
[2m[36m(pid=51697)[0m arxiv_pdf/1991/9112/hep-th9112043.pdf can't be converted to image
[2m[36m(pid=51696)[0m arxiv_pdf/1991/9112/hep-lat9112001.pdf can't be converted to image
[2m[36m(pid=51694)[0m arxiv_pdf/1991/9112/hep-th9112030.pdf can't be converted to image
[2m[36m(pid=51695)[0m arxiv_pdf/1991/9112/hep-th9112005.pdf can't be converted to image
[2m[36m(pid=51695)[0m arxiv_pdf/1991/9107/hep-lat9107002.pdf can't be converted to im

In [29]:
def get_list(path, file_type):
    """Return the files under ``path`` that match the glob pattern ``file_type``.

    ``path`` and ``file_type`` are joined with a path separator, so it does not
    matter whether ``path`` ends with one or ``file_type`` starts with one.
    ``**`` in the pattern matches any number of nested directories.

    Args:
        path: Base directory to search in. An empty string means the pattern
            is used as given.
        file_type: Glob pattern relative to ``path``, e.g. ``'words/*.csv'``.

    Returns:
        List of matching paths (empty when nothing matches).
    """
    if not path:
        return glob.glob(file_type, recursive=True)
    # Plain concatenation silently produced patterns like 'dirwords/*.csv' when the
    # separator was forgotten; join explicitly instead.
    pattern = os.path.join(path, file_type.lstrip('/'))
    return glob.glob(pattern, recursive=True)

In [31]:
# The word CSVs are written to <pdfplumber_path>/words/<year>/, so search that tree.
# The leading '/' supplies the separator between the base path and the pattern.
csv_list = get_list(pdfplumber_path, '/words/**/*.csv')
len(csv_list)

0

In [8]:
import src.blob_data_transfer as blob_pull

In [9]:
# Try the shared upload helper on the first ten text files.
# NOTE(review): text_list is not assigned in any visible, uncommented cell of this
# notebook — confirm where it comes from before running on a fresh kernel.
[blob_pull.send_to_blob(txt, 4) for txt in text_list[:10]]

[None, None, None, None, None, None, None, None, None, None]

In [19]:
# Check the blob name derived from a local path: the first `split` components
# followed by the file name.
file = text_list[1]
split = 4
file_blob_path = '/'.join(file.split('/')[:split] + [file.split('/')[-1]])
print(file_blob_path)
# blob_client = blob_service_client.get_blob_client(container=CONTAINER, blob=file_blob_path)
# with open(file, "rb") as f:
#     blob_client.upload_blob(f, overwrite=True)

arxiv_training_data/pdfplumber/text/1991/pdfplumber


In [28]:
# Display the base directory that holds the chars/words/text outputs.
pdfplumber_path

'arxiv_training_data/pdfplumber'