In [1]:
import os
import PyPDF2
import pandas as pd
from transformers import AutoTokenizer, AutoModelForQuestionAnswering

def read_pdf(file_path):
    """
    Read a PDF file and return its text content using PyPDF2.

    Args:
        file_path (str): The path of the PDF file.

    Returns:
        str: The text content of the PDF file.
    """
    with open(file_path, 'rb') as file:
        pdf_reader = PyPDF2.PdfFileReader(file)
        text = ''
        for page_num in range(pdf_reader.getNumPages()):
            text += pdf_reader.getPage(page_num).extractText()
    return text

def extract_company_name(text):
    """
    Use the deepset/roberta-base-squad2 model to extract the company name from the text.

    Args:
        text (str): The text content to analyze.

    Returns:
        str: The company name.
    """
    tokenizer = AutoTokenizer.from_pretrained("deepset/roberta-base-squad2")
    model = AutoModelForQuestionAnswering.from_pretrained("deepset/roberta-base-squad2")

    question = "In maximum 3 words, What is the name of the subject bio-based company?"
    inputs = tokenizer(question, text, return_tensors="pt", max_length=512, truncation=True)
    outputs = model(**inputs)
    answer_start = outputs.start_logits.argmax(dim=-1).item()
    answer_end = outputs.end_logits.argmax(dim=-1).item() + 1
    answer = tokenizer.convert_tokens_to_string(tokenizer.convert_ids_to_tokens(inputs["input_ids"][0][answer_start:answer_end]))

    return answer.strip()

def process_pdf_files(directory):
    """
    Read PDF files in a directory, extract company names, and build a pandas DataFrame.

    Args:
        directory (str): The path of the directory containing the PDF files.

    Returns:
        pd.DataFrame: A pandas DataFrame with the file names and company names.
    """
    data = []

    for file_name in os.listdir(directory):
        if file_name.endswith('.pdf'):
            file_path = os.path.join(directory, file_name)
            text = read_pdf(file_path)
            company_name = extract_company_name(text)
            file_name_without_extension = file_name[:-4]  # Remove ".pdf" from the file name
            data.append({'File Name': file_name_without_extension, 'Company Name': company_name})

    df = pd.DataFrame(data)
    return df

# Set the path of the directory containing the PDF files
pdf_directory = '/Users/amin/Desktop/33/test'

# Process the PDF files and get a pandas DataFrame
result_df = process_pdf_files(pdf_directory)
print(result_df)

# Save the DataFrame to a CSV file
result_df.to_csv('Name_Finder_PyPDF2_NoCln_QA2_prmt2.csv', index=False)


Xref table not zero-indexed. ID numbers for objects will be corrected.
Xref table not zero-indexed. ID numbers for objects will be corrected.
FloatObject (b'0.00-5677566') invalid; use 0.0 instead
Superfluous whitespace found in object header b'1' b'0'
Superfluous whitespace found in object header b'2' b'0'
Superfluous whitespace found in object header b'3' b'0'
Superfluous whitespace found in object header b'18' b'0'
Superfluous whitespace found in object header b'41' b'0'
Superfluous whitespace found in object header b'61' b'0'
Superfluous whitespace found in object header b'206' b'0'
Superfluous whitespace found in object header b'214' b'0'
Superfluous whitespace found in object header b'231' b'0'
Superfluous whitespace found in object header b'234' b'0'
Superfluous whitespace found in object header b'241' b'0'
Superfluous whitespace found in object header b'254' b'0'
Superfluous whitespace found in object header b'257' b'0'
Superfluous whitespace found in object header b'267' b'0'


Superfluous whitespace found in object header b'135' b'0'
Superfluous whitespace found in object header b'136' b'0'
Superfluous whitespace found in object header b'137' b'0'
Superfluous whitespace found in object header b'138' b'0'
Superfluous whitespace found in object header b'139' b'0'
Superfluous whitespace found in object header b'140' b'0'
Superfluous whitespace found in object header b'141' b'0'
Superfluous whitespace found in object header b'142' b'0'
Superfluous whitespace found in object header b'143' b'0'
Superfluous whitespace found in object header b'144' b'0'
Superfluous whitespace found in object header b'145' b'0'
Superfluous whitespace found in object header b'146' b'0'
Superfluous whitespace found in object header b'147' b'0'
Superfluous whitespace found in object header b'148' b'0'
Superfluous whitespace found in object header b'149' b'0'
Superfluous whitespace found in object header b'150' b'0'
Superfluous whitespace found in object header b'151' b'0'
Superfluous wh

Superfluous whitespace found in object header b'298' b'0'
Superfluous whitespace found in object header b'297' b'0'
Superfluous whitespace found in object header b'296' b'0'
Superfluous whitespace found in object header b'302' b'0'
Superfluous whitespace found in object header b'301' b'0'
Superfluous whitespace found in object header b'300' b'0'
Superfluous whitespace found in object header b'349' b'0'
Superfluous whitespace found in object header b'312' b'0'
Superfluous whitespace found in object header b'311' b'0'
Superfluous whitespace found in object header b'310' b'0'
Superfluous whitespace found in object header b'348' b'0'
Superfluous whitespace found in object header b'304' b'0'
Superfluous whitespace found in object header b'314' b'0'
Superfluous whitespace found in object header b'316' b'0'
Superfluous whitespace found in object header b'318' b'0'
Superfluous whitespace found in object header b'320' b'0'
Superfluous whitespace found in object header b'322' b'0'
Superfluous wh

Superfluous whitespace found in object header b'522' b'0'
Superfluous whitespace found in object header b'523' b'0'
Superfluous whitespace found in object header b'524' b'0'
Superfluous whitespace found in object header b'525' b'0'
Superfluous whitespace found in object header b'526' b'0'
Superfluous whitespace found in object header b'527' b'0'
Superfluous whitespace found in object header b'528' b'0'
Superfluous whitespace found in object header b'529' b'0'
Superfluous whitespace found in object header b'530' b'0'
Superfluous whitespace found in object header b'531' b'0'
Superfluous whitespace found in object header b'532' b'0'
Superfluous whitespace found in object header b'533' b'0'
Superfluous whitespace found in object header b'534' b'0'
Superfluous whitespace found in object header b'535' b'0'
Superfluous whitespace found in object header b'536' b'0'
Superfluous whitespace found in object header b'537' b'0'
Superfluous whitespace found in object header b'538' b'0'
Superfluous wh

Superfluous whitespace found in object header b'658' b'0'
Superfluous whitespace found in object header b'659' b'0'
Superfluous whitespace found in object header b'660' b'0'
Superfluous whitespace found in object header b'661' b'0'
Superfluous whitespace found in object header b'662' b'0'
Superfluous whitespace found in object header b'663' b'0'
Superfluous whitespace found in object header b'664' b'0'
Superfluous whitespace found in object header b'665' b'0'
Superfluous whitespace found in object header b'666' b'0'
Superfluous whitespace found in object header b'667' b'0'
Superfluous whitespace found in object header b'668' b'0'
Superfluous whitespace found in object header b'669' b'0'
Superfluous whitespace found in object header b'670' b'0'
Superfluous whitespace found in object header b'671' b'0'
Superfluous whitespace found in object header b'672' b'0'
Superfluous whitespace found in object header b'673' b'0'
Superfluous whitespace found in object header b'674' b'0'
Superfluous wh

                                            File Name  \
0                      Zehna Corporate Deck SymBiosis   
1                                 PS_TCRR_NOTE_1.5.23   
2                      Grace Sciece Platform (9.2.20)   
3                       AcuamarkDx Series A_Corp Deck   
4                                     coding.bio_deck   
5         Centurion_BioPharma-Non-Confidential_2021_2   
6                              JPM_SRZN_NOTE_11.28.22   
7                                   G_ACET_IOC_4.8.21   
8                                C_ARQT_NOTE_11.15.22   
9                                  B_RXRX_IOC_9.21.21   
10               Dianomi Non-Confidential Slides - v3   
11                           KBC_MAAT.FR_IOC_12.14.21   
12                               Engrail Therapeutics   
13                                 WB_VOR_IOC_1.25.22   
14                                 E_FNCH_IOC_8.10.21   
15                                CG_GRNA_IOC_12.8.22   
16                             


Reader: PyPDF2            
QA:pipeline (deepset/roberta-base-squad2)        
Not cleaned 