In [68]:
import os
import re
from io import StringIO
from pdfminer.high_level import extract_text
from transformers import pipeline
import pandas as pd

In [69]:
def read_pdf_files(folder_path):
    pdf_files = [f for f in os.listdir(folder_path) if f.endswith(".pdf")]
    pdf_texts = []

    for pdf_file in pdf_files:
        text = extract_text(os.path.join(folder_path, pdf_file))
        pdf_texts.append((pdf_file[:-4], text))
    return pdf_texts

In [63]:
def clean_text(text):
    # Remove header, footer, and page number
    text = re.sub(r'\n+|\r+|\t+', ' ', text)
    text = re.sub(r'Page \d+', '', text)
    return text

In [64]:
def find_fin_assoc(text):
    associations = []
    for company, website in [
        ('Goldman Sachs', 'www.goldmansachs.com'),
                             ('JPMorgan Chase', 'www.jpmorganchase.com'),
                             ('Morgan Stanley', 'www.morganstanley.com')]:
        if company in text:
            associations.append(company)
            text = text.replace(company, '').replace(website, '')
    return ', '.join(associations), text

In [65]:
def get_company_name(text):
    nlp_qa = pipeline("question-answering")
    question = "Which company is the company name?"
    answer = nlp_qa(question=question, context=text)
    return answer['answer']

In [66]:
def create_dataframe(pdf_texts):
    data = []

    for file_name, text in pdf_texts:
        text = clean_text(text)
        fin_assoc, text = find_fin_assoc(text)
        company_name = get_company_name(text)

        data.append({
            "File_name": file_name,
            "Company_Name": company_name,
            "Issue": 0,
            "Method": 0,
            "Website": 0,
            "FinAssoc": fin_assoc,
            "OtherCompAssoc": 0,
            "Location": 0,
            "number of words": 0,
            "computational": 0,
            "age": 0,
            "Phase": 0,
            "Bio-Keywords": 0,
            "Human": 0,
            "Treatment": 0,
            "Diagnosis": 0
        })

    df = pd.DataFrame(data)
    return df

In [67]:

def main():
    folder_path = "/Users/amin/Desktop/33/code_test"
    pdf_texts = read_pdf_files(folder_path)
    df = create_dataframe(pdf_texts)
    print(df)

if __name__ == "__main__":
    main()


No model was supplied, defaulted to distilbert-base-cased-distilled-squad and revision 626af31 (https://huggingface.co/distilbert-base-cased-distilled-squad).
Using a pipeline without specifying a model name and revision in production is not recommended.
No model was supplied, defaulted to distilbert-base-cased-distilled-squad and revision 626af31 (https://huggingface.co/distilbert-base-cased-distilled-squad).
Using a pipeline without specifying a model name and revision in production is not recommended.
No model was supplied, defaulted to distilbert-base-cased-distilled-squad and revision 626af31 (https://huggingface.co/distilbert-base-cased-distilled-squad).
Using a pipeline without specifying a model name and revision in production is not recommended.
No model was supplied, defaulted to distilbert-base-cased-distilled-squad and revision 626af31 (https://huggingface.co/distilbert-base-cased-distilled-squad).
Using a pipeline without specifying a model name and revision in production 

                                    File_name              Company_Name  \
0                         PS_TCRR_NOTE_1.5.23    TCR2 Therapeutics, Inc   
1              Grace Sciece Platform (9.2.20)             Grace Science   
2               AcuamarkDx Series A_Corp Deck                ResearchDx   
3                      JPM_SRZN_NOTE_11.28.22          Company-Specific   
4                           G_ACET_IOC_4.8.21     GUGGENHEIM SECURITIES   
5                          B_RXRX_IOC_9.21.21       Jonathan Livescault   
6        Dianomi Non-Confidential Slides - v3      Dianomi Therapeutics   
7                          WB_VOR_IOC_1.25.22       Net Interest Income   
8                        LT_AVXL_NOTE_6.22.21           GROWTH ANALYSIS   
9   Graviton Corporate Presentation July 2022  Graviton Holding Company   
10                        CT one-pager Dec 22     circadiantherapeutics   

    Issue  Method  Website        FinAssoc  OtherCompAssoc  Location  \
0       0       0        0 