In [86]:
###############################################################################
# Author: Amin Boroomand, March 2023
#Owner: Symbiosis.vc
# This code aims to find investment names using different 
# PdF readers and different NLP packages to find which is
# accurate in finding investment name. In this code we cropped PDF margines
################################################################################

In [87]:
#imports
import os
import subprocess
import PyPDF2
import pdfminer
import pdfrw
import spacy
import pandas as pd
from stanfordnlp.server import CoreNLPClient
import os
from pdfCropMargins import crop
import re
from pdfminer.high_level import extract_text
import subprocess

In [88]:
#setting CORENLP_HOME environment variable
corenlp_home_path = '/Users/amin/Desktop/33/code_test/stanford-corenlp-4.5.3'
os.environ['CORENLP_HOME'] = corenlp_home_path

In [89]:
#reading files with different PDF reader packages
def read_pdf_file(pdf_reader, file_path):
    if pdf_reader == 'PyPDF2':
        return read_pdf_PyPDF2(file_path)
    elif pdf_reader == 'pdfminer':
        return read_pdf_pdfminer(file_path)
    elif pdf_reader == 'pdfrw':
        return read_pdf_pdfrw(file_path)
    
def read_pdf_PyPDF2(file_path):
    pdf_file = open(file_path, 'rb')
    pdf_reader = PyPDF2.PdfFileReader(pdf_file)
    text = ''
    for page_num in range(pdf_reader.numPages):
        text += pdf_reader.getPage(page_num).extractText()
    pdf_file.close()
    return text

def read_pdf_pdfminer(file_path):
    text = extract_text(file_path)
    return text


def read_pdf_pdfrw(file_path):
    pdf = pdfrw.PdfReader(file_path)
    text = ''
    for page in pdf.pages:
        try:
            text += page.Contents.stream.decode()
        except (AttributeError, TypeError, pdfrw.errors.PdfParseError):
            pass
    return text

In [93]:
#cleaning

def clean_text(text):
    # Remove header/footer
    text = re.sub(r"Header:\s.*", "", text, flags=re.MULTILINE)
    text = re.sub(r"Footer:\s.*", "", text, flags=re.MULTILINE)

    # Remove page numbers and unnecessary information
    text = re.sub(r"Page\s\d+", "", text)
    text = re.sub(r"\n\d+\n", "", text)

    return text

#corp margine
def crop_pdf_margins(file_path, pdf_folder):
    cropped_folder = os.path.join(pdf_folder, "cropped")
    os.makedirs(cropped_folder, exist_ok=True)
    file_name = os.path.basename(file_path)
    cropped_file_name = file_name[:-4] + "_cropped.pdf"
    cropped_file_path = os.path.join(cropped_folder, cropped_file_name)

    cmd = f'pdf-crop-margins -o "{cropped_file_path}" -all -p 1 -a4 -40 -40 -120 -40 "{file_path}"'
    
    try:
        result = subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=True, text=True)
        output = result.stdout
        error = result.stderr
        print("Output:", output)
        print("Error:", error)
        
        print(f"Cropping: {file_path}")
        print(f"Saving cropped file to: {cropped_file_path}")
        
        if not os.path.exists(cropped_file_path):
            print("Cropped file not found. Please check the cropping process.")
            return file_path

    except Exception as e:
        print(f"Error cropping {file_path}: {str(e)}")
        return file_path

    return cropped_file_path


In [94]:
#find company name using different NLP tools
def extract_company_name(nlp_package, text):
    if nlp_package == 'SpaCy':
        return extract_company_name_spacy(text)
    elif nlp_package == 'Stanford NLP':
        return extract_company_name_stanford(text)


def extract_company_name_spacy(text):
    nlp = spacy.load("en_core_web_sm")
    doc = nlp(text)
    for ent in doc.ents:
        if ent.label_ == "ORG":
            return ent.text
    return None

def extract_company_name_stanford(text):
    with CoreNLPClient(annotators=['ner'], timeout=600000,
                       memory='4G', stdout='corenlp_stdout.log',
                       stderr='corenlp_stderr.log') as client:
        annotated = client.annotate(text)
        for sentence in annotated.sentence:
            for token in sentence.token:
                if token.ner == 'ORGANIZATION':
                    return token.word
    return None

In [95]:
def main():
    pdf_readers = ['PyPDF2', 'pdfminer', 'pdfrw']
    nlp_packages = ['SpaCy']
    pdf_folder = '/Users/amin/Desktop/33/test'

    data = {'File_name': [], 'PyPDF2_SpaCy': [], 'pdfminer_SpaCy': [], 'pdfrw_SpaCy': []}
    df = pd.DataFrame(data)

    for file in os.listdir(pdf_folder):
        if file.endswith(".pdf"):
            file_path = os.path.join(pdf_folder, file)
            cropped_file_path = crop_pdf_margins(file_path, pdf_folder)
            file_name = file[:-4]
            df = df.append({'File_name': file_name}, ignore_index=True)

            for pdf_reader in pdf_readers:
                text = read_pdf_file(pdf_reader, cropped_file_path)
                text = clean_text(text)
                for nlp_package in nlp_packages:
                    company_name = extract_company_name(nlp_package, text)
                    column_name = f"{pdf_reader}_{nlp_package}"
                    df.loc[df['File_name'] == file_name, column_name] = company_name

    print(df)
    df.to_csv('names_test_NLP_cropp.csv', index=False)

if __name__ == "__main__":
    main()


Output: 
Error: 
Usage: pdf-crop-margins [-h] [-o OUTFILE_PATH_OR_DIR] [-v] [-gui] [-p PCT] [-p4 PCT PCT PCT PCT] [-pt] [-a BP] [-a4 BP BP BP BP] [-cs] [-csm4 BP BP BP BP] [-ap BP] [-ap4 BP BP BP BP] [-u] [-m INT]
                        [-m4 INT INT INT INT] [-mp INT] [-s] [-ms INT] [-e] [-g PAGESTR] [-c [d|m|p|gr|gb|o]] [-gs] [-gsr] [-t BYTEVAL] [-nb INT] [-ns INT] [-x DPI] [-y DPI] [-sr STR] [-gf INT]
                        [-b [m|c|t|a|b]] [-f [m|c|t|a|b]] [-r] [-A] [-gsf] [-nc] [-pv PROG] [-mo] [-q] [-nco] [-pf] [-sc STR] [-su STR] [-ss STR] [-pw PASSWD] [-pc] [-khc] [-kvc] [-spr FLOAT:FLOAT]
                        [-prw FLOAT FLOAT FLOAT FLOAT] [-dcb STR] [-dcw STR] [-i] [-pdl] [-gsp PATH] [-ppp PATH] [--version] [-wcdf FILEPATH]
                        PDF_FILE [PDF_FILE ...]
Error in pdf-crop-margins (pdfCropMargins): argument -a/--absoluteOffset: invalid float value: 'll'

Cropping: /Users/amin/Desktop/33/test/Zehna Corporate Deck SymBiosis.pdf
Saving cropped file to: /Users


Cropping: /Users/amin/Desktop/33/test/G_ACET_IOC_4.8.21.pdf
Saving cropped file to: /Users/amin/Desktop/33/test/cropped/G_ACET_IOC_4.8.21_cropped.pdf
Output: 
Error: 
Usage: pdf-crop-margins [-h] [-o OUTFILE_PATH_OR_DIR] [-v] [-gui] [-p PCT] [-p4 PCT PCT PCT PCT] [-pt] [-a BP] [-a4 BP BP BP BP] [-cs] [-csm4 BP BP BP BP] [-ap BP] [-ap4 BP BP BP BP] [-u] [-m INT]
                        [-m4 INT INT INT INT] [-mp INT] [-s] [-ms INT] [-e] [-g PAGESTR] [-c [d|m|p|gr|gb|o]] [-gs] [-gsr] [-t BYTEVAL] [-nb INT] [-ns INT] [-x DPI] [-y DPI] [-sr STR] [-gf INT]
                        [-b [m|c|t|a|b]] [-f [m|c|t|a|b]] [-r] [-A] [-gsf] [-nc] [-pv PROG] [-mo] [-q] [-nco] [-pf] [-sc STR] [-su STR] [-ss STR] [-pw PASSWD] [-pc] [-khc] [-kvc] [-spr FLOAT:FLOAT]
                        [-prw FLOAT FLOAT FLOAT FLOAT] [-dcb STR] [-dcw STR] [-i] [-pdl] [-gsp PATH] [-ppp PATH] [--version] [-wcdf FILEPATH]
                        PDF_FILE [PDF_FILE ...]
Error in pdf-crop-margins (pdfCropMargins): argument 


Cropping: /Users/amin/Desktop/33/test/CG_GRNA_IOC_12.8.22.pdf
Saving cropped file to: /Users/amin/Desktop/33/test/cropped/CG_GRNA_IOC_12.8.22_cropped.pdf
Output: 
Error: 
Usage: pdf-crop-margins [-h] [-o OUTFILE_PATH_OR_DIR] [-v] [-gui] [-p PCT] [-p4 PCT PCT PCT PCT] [-pt] [-a BP] [-a4 BP BP BP BP] [-cs] [-csm4 BP BP BP BP] [-ap BP] [-ap4 BP BP BP BP] [-u] [-m INT]
                        [-m4 INT INT INT INT] [-mp INT] [-s] [-ms INT] [-e] [-g PAGESTR] [-c [d|m|p|gr|gb|o]] [-gs] [-gsr] [-t BYTEVAL] [-nb INT] [-ns INT] [-x DPI] [-y DPI] [-sr STR] [-gf INT]
                        [-b [m|c|t|a|b]] [-f [m|c|t|a|b]] [-r] [-A] [-gsf] [-nc] [-pv PROG] [-mo] [-q] [-nco] [-pf] [-sc STR] [-su STR] [-ss STR] [-pw PASSWD] [-pc] [-khc] [-kvc] [-spr FLOAT:FLOAT]
                        [-prw FLOAT FLOAT FLOAT FLOAT] [-dcb STR] [-dcw STR] [-i] [-pdl] [-gsp PATH] [-ppp PATH] [--version] [-wcdf FILEPATH]
                        PDF_FILE [PDF_FILE ...]
Error in pdf-crop-margins (pdfCropMargins): argum


Cropping: /Users/amin/Desktop/33/test/2022-3-18 Apeximmune non-confidential Baird.pdf
Saving cropped file to: /Users/amin/Desktop/33/test/cropped/2022-3-18 Apeximmune non-confidential Baird_cropped.pdf
Output: 
Error: 
Usage: pdf-crop-margins [-h] [-o OUTFILE_PATH_OR_DIR] [-v] [-gui] [-p PCT] [-p4 PCT PCT PCT PCT] [-pt] [-a BP] [-a4 BP BP BP BP] [-cs] [-csm4 BP BP BP BP] [-ap BP] [-ap4 BP BP BP BP] [-u] [-m INT]
                        [-m4 INT INT INT INT] [-mp INT] [-s] [-ms INT] [-e] [-g PAGESTR] [-c [d|m|p|gr|gb|o]] [-gs] [-gsr] [-t BYTEVAL] [-nb INT] [-ns INT] [-x DPI] [-y DPI] [-sr STR] [-gf INT]
                        [-b [m|c|t|a|b]] [-f [m|c|t|a|b]] [-r] [-A] [-gsf] [-nc] [-pv PROG] [-mo] [-q] [-nco] [-pf] [-sc STR] [-su STR] [-ss STR] [-pw PASSWD] [-pc] [-khc] [-kvc] [-spr FLOAT:FLOAT]
                        [-prw FLOAT FLOAT FLOAT FLOAT] [-dcb STR] [-dcw STR] [-i] [-pdl] [-gsp PATH] [-ppp PATH] [--version] [-wcdf FILEPATH]
                        PDF_FILE [PDF_FILE ...]
E

FloatObject (b'0.00-5677566') invalid; use 0.0 instead


Output: 
Error: 
Usage: pdf-crop-margins [-h] [-o OUTFILE_PATH_OR_DIR] [-v] [-gui] [-p PCT] [-p4 PCT PCT PCT PCT] [-pt] [-a BP] [-a4 BP BP BP BP] [-cs] [-csm4 BP BP BP BP] [-ap BP] [-ap4 BP BP BP BP] [-u] [-m INT]
                        [-m4 INT INT INT INT] [-mp INT] [-s] [-ms INT] [-e] [-g PAGESTR] [-c [d|m|p|gr|gb|o]] [-gs] [-gsr] [-t BYTEVAL] [-nb INT] [-ns INT] [-x DPI] [-y DPI] [-sr STR] [-gf INT]
                        [-b [m|c|t|a|b]] [-f [m|c|t|a|b]] [-r] [-A] [-gsf] [-nc] [-pv PROG] [-mo] [-q] [-nco] [-pf] [-sc STR] [-su STR] [-ss STR] [-pw PASSWD] [-pc] [-khc] [-kvc] [-spr FLOAT:FLOAT]
                        [-prw FLOAT FLOAT FLOAT FLOAT] [-dcb STR] [-dcw STR] [-i] [-pdl] [-gsp PATH] [-ppp PATH] [--version] [-wcdf FILEPATH]
                        PDF_FILE [PDF_FILE ...]
Error in pdf-crop-margins (pdfCropMargins): argument -a/--absoluteOffset: invalid float value: 'll'

Cropping: /Users/amin/Desktop/33/test/JMP_NXTC_IOC_5.26.20.pdf
Saving cropped file to: /Users/amin/Desk

28                                      Johns Hopkins          NaN  
