In [78]:
############################################################
# Author: Amin Boroomand, March 2023
#Owner: Symbiosis.vc
# This code aims to find investment names using different 
# PdF readers and different NLP packages to find which is
# accurate in finding investment name
############################################################

In [79]:
#imports
import os
import PyPDF2
import pdfminer
import pdfrw
import spacy
import pandas as pd
from stanfordnlp.server import CoreNLPClient
import os
from pdfCropMargins import crop


In [80]:
#setting CORENLP_HOME environment variable
corenlp_home_path = '/Users/amin/Desktop/33/code_test/stanford-corenlp-4.5.3'
os.environ['CORENLP_HOME'] = corenlp_home_path

In [81]:
#reading files with different PDF reader packages
def read_pdf_file(pdf_reader, file_path):
    if pdf_reader == 'PyPDF2':
        return read_pdf_PyPDF2(file_path)
    elif pdf_reader == 'pdfminer':
        return read_pdf_pdfminer(file_path)
    elif pdf_reader == 'pdfrw':
        return read_pdf_pdfrw(file_path)
    
def read_pdf_PyPDF2(file_path):
    pdf_file = open(file_path, 'rb')
    pdf_reader = PyPDF2.PdfFileReader(pdf_file)
    text = ''
    for page_num in range(pdf_reader.numPages):
        text += pdf_reader.getPage(page_num).extractText()
    pdf_file.close()
    return text

def read_pdf_pdfminer(file_path):
    text = extract_text(file_path)
    return text


def read_pdf_pdfrw(file_path):
    pdf = pdfrw.PdfReader(file_path)
    text = ''
    for page in pdf.pages:
        try:
            text += page.Contents.stream.decode()
        except (AttributeError, TypeError, pdfrw.errors.PdfParseError):
            pass
    return text

In [82]:
#cleaning

def clean_text(text):
    # Remove header/footer
    text = re.sub(r"Header:\s.*", "", text, flags=re.MULTILINE)
    text = re.sub(r"Footer:\s.*", "", text, flags=re.MULTILINE)

    # Remove page numbers and unnecessary information
    text = re.sub(r"Page\s\d+", "", text)
    text = re.sub(r"\n\d+\n", "", text)

    return text

#corp margine

def crop_pdf_margins(file_path):
    cropped_file_path = file_path.replace('.pdf', '_cropped.pdf')
    crop(["-o", cropped_file_path, file_path, "--margins", "20,20,60,20"])
    return cropped_file_path

In [83]:
#find company name using different NLP tools
def extract_company_name(nlp_package, text):
    if nlp_package == 'SpaCy':
        return extract_company_name_spacy(text)
    elif nlp_package == 'Stanford NLP':
        return extract_company_name_stanford(text)


def extract_company_name_spacy(text):
    nlp = spacy.load("en_core_web_sm")
    doc = nlp(text)
    for ent in doc.ents:
        if ent.label_ == "ORG":
            return ent.text
    return None

def extract_company_name_stanford(text):
    with CoreNLPClient(annotators=['ner'], timeout=600000,
                       memory='4G', stdout='corenlp_stdout.log',
                       stderr='corenlp_stderr.log') as client:
        annotated = client.annotate(text)
        for sentence in annotated.sentence:
            for token in sentence.token:
                if token.ner == 'ORGANIZATION':
                    return token.word
    return None

In [84]:
#running the code
def main():
    pdf_readers = ['PyPDF2', 'pdfminer', 'pdfrw']
    #nlp_packages = ['SpaCy', 'Stanford NLP']
    nlp_packages = ['SpaCy']
    pdf_folder = '/Users/amin/Desktop/33/test'

    data = {'File_name': [], 'PyPDF2_SpaCy': [], 'PyPDF2_Stanford NLP': [], 'pdfminer_SpaCy': [], 'pdfminer_Stanford NLP': [], 'pdfrw_SpaCy': [], 'pdfrw_Stanford NLP': []}
    df = pd.DataFrame(data)

    for file in os.listdir(pdf_folder):
        if file.endswith(".pdf"):
            file_path = os.path.join(pdf_folder, file)
            file_name = file[:-4]
            df = df.append({'File_name': file_name}, ignore_index=True)

            for pdf_reader in pdf_readers:
                text = read_pdf_file(pdf_reader, file_path)
                text = clean_text(text)
                for nlp_package in nlp_packages:
                    company_name = extract_company_name(nlp_package, text)
                    column_name = f"{pdf_reader}_{nlp_package}"
                    df.loc[df['File_name'] == file_name, column_name] = company_name

    print(df)
    df.to_csv('names_test_NLP.csv', index=False)

if __name__ == "__main__":
    main()


Xref table not zero-indexed. ID numbers for objects will be corrected.
Xref table not zero-indexed. ID numbers for objects will be corrected.
FloatObject (b'0.00-5677566') invalid; use 0.0 instead
Superfluous whitespace found in object header b'1' b'0'
Superfluous whitespace found in object header b'2' b'0'
Superfluous whitespace found in object header b'3' b'0'
Superfluous whitespace found in object header b'18' b'0'
Superfluous whitespace found in object header b'41' b'0'
Superfluous whitespace found in object header b'61' b'0'
Superfluous whitespace found in object header b'206' b'0'
Superfluous whitespace found in object header b'214' b'0'
Superfluous whitespace found in object header b'231' b'0'
Superfluous whitespace found in object header b'234' b'0'
Superfluous whitespace found in object header b'241' b'0'
Superfluous whitespace found in object header b'254' b'0'
Superfluous whitespace found in object header b'257' b'0'
Superfluous whitespace found in object header b'267' b'0'


Superfluous whitespace found in object header b'135' b'0'
Superfluous whitespace found in object header b'136' b'0'
Superfluous whitespace found in object header b'137' b'0'
Superfluous whitespace found in object header b'138' b'0'
Superfluous whitespace found in object header b'139' b'0'
Superfluous whitespace found in object header b'140' b'0'
Superfluous whitespace found in object header b'141' b'0'
Superfluous whitespace found in object header b'142' b'0'
Superfluous whitespace found in object header b'143' b'0'
Superfluous whitespace found in object header b'144' b'0'
Superfluous whitespace found in object header b'145' b'0'
Superfluous whitespace found in object header b'146' b'0'
Superfluous whitespace found in object header b'147' b'0'
Superfluous whitespace found in object header b'148' b'0'
Superfluous whitespace found in object header b'149' b'0'
Superfluous whitespace found in object header b'150' b'0'
Superfluous whitespace found in object header b'151' b'0'
Superfluous wh

Superfluous whitespace found in object header b'298' b'0'
Superfluous whitespace found in object header b'297' b'0'
Superfluous whitespace found in object header b'296' b'0'
Superfluous whitespace found in object header b'302' b'0'
Superfluous whitespace found in object header b'301' b'0'
Superfluous whitespace found in object header b'300' b'0'
Superfluous whitespace found in object header b'349' b'0'
Superfluous whitespace found in object header b'312' b'0'
Superfluous whitespace found in object header b'311' b'0'
Superfluous whitespace found in object header b'310' b'0'
Superfluous whitespace found in object header b'348' b'0'
Superfluous whitespace found in object header b'304' b'0'
Superfluous whitespace found in object header b'314' b'0'
Superfluous whitespace found in object header b'316' b'0'
Superfluous whitespace found in object header b'318' b'0'
Superfluous whitespace found in object header b'320' b'0'
Superfluous whitespace found in object header b'322' b'0'
Superfluous wh

Superfluous whitespace found in object header b'522' b'0'
Superfluous whitespace found in object header b'523' b'0'
Superfluous whitespace found in object header b'524' b'0'
Superfluous whitespace found in object header b'525' b'0'
Superfluous whitespace found in object header b'526' b'0'
Superfluous whitespace found in object header b'527' b'0'
Superfluous whitespace found in object header b'528' b'0'
Superfluous whitespace found in object header b'529' b'0'
Superfluous whitespace found in object header b'530' b'0'
Superfluous whitespace found in object header b'531' b'0'
Superfluous whitespace found in object header b'532' b'0'
Superfluous whitespace found in object header b'533' b'0'
Superfluous whitespace found in object header b'534' b'0'
Superfluous whitespace found in object header b'535' b'0'
Superfluous whitespace found in object header b'536' b'0'
Superfluous whitespace found in object header b'537' b'0'
Superfluous whitespace found in object header b'538' b'0'
Superfluous wh

Superfluous whitespace found in object header b'658' b'0'
Superfluous whitespace found in object header b'659' b'0'
Superfluous whitespace found in object header b'660' b'0'
Superfluous whitespace found in object header b'661' b'0'
Superfluous whitespace found in object header b'662' b'0'
Superfluous whitespace found in object header b'663' b'0'
Superfluous whitespace found in object header b'664' b'0'
Superfluous whitespace found in object header b'665' b'0'
Superfluous whitespace found in object header b'666' b'0'
Superfluous whitespace found in object header b'667' b'0'
Superfluous whitespace found in object header b'668' b'0'
Superfluous whitespace found in object header b'669' b'0'
Superfluous whitespace found in object header b'670' b'0'
Superfluous whitespace found in object header b'671' b'0'
Superfluous whitespace found in object header b'672' b'0'
Superfluous whitespace found in object header b'673' b'0'
Superfluous whitespace found in object header b'674' b'0'
Superfluous wh

                                            File_name  \
0                      Zehna Corporate Deck SymBiosis   
1                                 PS_TCRR_NOTE_1.5.23   
2                      Grace Sciece Platform (9.2.20)   
3                       AcuamarkDx Series A_Corp Deck   
4                                     coding.bio_deck   
5         Centurion_BioPharma-Non-Confidential_2021_2   
6                              JPM_SRZN_NOTE_11.28.22   
7                                   G_ACET_IOC_4.8.21   
8                                C_ARQT_NOTE_11.15.22   
9                                  B_RXRX_IOC_9.21.21   
10               Dianomi Non-Confidential Slides - v3   
11                           KBC_MAAT.FR_IOC_12.14.21   
12                               Engrail Therapeutics   
13                                 WB_VOR_IOC_1.25.22   
14                                 E_FNCH_IOC_8.10.21   
15                                CG_GRNA_IOC_12.8.22   
16                             

In [85]:

# get the current working directory
cwd = os.getcwd()

# print the current working directory
print("Current working directory:", cwd)

Current working directory: /Users/amin/Desktop/33/code_test
