In [4]:
import os
import pandas as pd
from transformers import pipeline
from pdfminer.high_level import extract_text

def read_pdf(file_path):
    """
    Read a PDF file and return its text content using Pdfminer.

    Args:
        file_path (str): The path of the PDF file.

    Returns:
        str: The text content of the PDF file.
    """
    text = extract_text(file_path)
    return text

def extract_company_name(text):
    """
    Use a pre-trained question-answering model to extract the company name from the text.

    Args:
        text (str): The text content to analyze.

    Returns:
        str: The company name.
    """
    nlp = pipeline('question-answering')
    question = "What is the name of the company?"
    answer = nlp(question=question, context=text)
    return answer['answer']

def process_pdf_files(directory):
    """
    Read PDF files in a directory, extract company names, and build a pandas DataFrame.

    Args:
        directory (str): The path of the directory containing the PDF files.

    Returns:
        pd.DataFrame: A pandas DataFrame with the file names and company names.
    """
    data = []

    for file_name in os.listdir(directory):
        if file_name.endswith('.pdf'):
            file_path = os.path.join(directory, file_name)
            text = read_pdf(file_path)
            company_name = extract_company_name(text)
            file_name_without_extension = file_name[:-4]  # Remove ".pdf" from the file name
            data.append({'File Name': file_name_without_extension, 'Company Name': company_name})

    df = pd.DataFrame(data)
    return df


# Set the path of the directory containing the PDF files
pdf_directory = '/Users/amin/Desktop/33/test'

# Process the PDF files and get a pandas DataFrame
result_df = process_pdf_files(pdf_directory)
print(result_df)

# Save the DataFrame to a CSV file
result_df.to_csv('Name_Finder_pdfminer_NoCln_QA1_prmt1.csv', index=False)


No model was supplied, defaulted to distilbert-base-cased-distilled-squad and revision 626af31 (https://huggingface.co/distilbert-base-cased-distilled-squad).
Using a pipeline without specifying a model name and revision in production is not recommended.
No model was supplied, defaulted to distilbert-base-cased-distilled-squad and revision 626af31 (https://huggingface.co/distilbert-base-cased-distilled-squad).
Using a pipeline without specifying a model name and revision in production is not recommended.
No model was supplied, defaulted to distilbert-base-cased-distilled-squad and revision 626af31 (https://huggingface.co/distilbert-base-cased-distilled-squad).
Using a pipeline without specifying a model name and revision in production is not recommended.
No model was supplied, defaulted to distilbert-base-cased-distilled-squad and revision 626af31 (https://huggingface.co/distilbert-base-cased-distilled-squad).
Using a pipeline without specifying a model name and revision in production 

                                            File Name  \
0                      Zehna Corporate Deck SymBiosis   
1                                 PS_TCRR_NOTE_1.5.23   
2                      Grace Sciece Platform (9.2.20)   
3                       AcuamarkDx Series A_Corp Deck   
4                                     coding.bio_deck   
5         Centurion_BioPharma-Non-Confidential_2021_2   
6                              JPM_SRZN_NOTE_11.28.22   
7                                   G_ACET_IOC_4.8.21   
8                                C_ARQT_NOTE_11.15.22   
9                                  B_RXRX_IOC_9.21.21   
10               Dianomi Non-Confidential Slides - v3   
11                           KBC_MAAT.FR_IOC_12.14.21   
12                               Engrail Therapeutics   
13                                 WB_VOR_IOC_1.25.22   
14                                 E_FNCH_IOC_8.10.21   
15                                CG_GRNA_IOC_12.8.22   
16                             


Reader: pdfminer            
QA:pipeline (distilbert-base-cased-distilled-squad model)        
Not cleaned       prompt: What is the name of the company?