In [19]:
import os
import fitz  # PyMuPDF

def extract_abstract_from_pdf(file_path):
    """Return the abstract text from the first page of a PDF that mentions one.

    Pages are scanned in order. On the first page whose text contains the
    word "abstract" (case-insensitive), the text from that word up to the
    next section marker is returned. Markers are tried in priority order:
    "introduction", then "keywords", then "zusammenfassung". If none occurs
    after the abstract heading, the rest of the page is returned.

    Parameters
    ----------
    file_path : str
        Path of the PDF file to open with PyMuPDF.

    Returns
    -------
    str
        The extracted abstract, or an empty string when no page contains
        the word "abstract".
    """
    # All markers are lowercase because they are matched against lowercased text.
    end_markers = ('introduction', 'keywords', 'zusammenfassung')

    doc = fitz.open(file_path)
    try:
        for page_num in range(doc.page_count):
            text = doc.load_page(page_num).get_text()
            lowered = text.lower()

            start = lowered.find('abstract')
            if start == -1:
                continue

            # Default to the end of the page when no marker follows the heading.
            end = len(text)
            for marker in end_markers:
                marker_index = lowered.find(marker, start)
                if marker_index != -1:
                    end = marker_index
                    break

            # Both indices refer to the full page text, so slice that text directly.
            return text[start:end]
    finally:
        # Release the file handle even if text extraction raises.
        doc.close()

    return ''

def extract_abstracts_from_folder(folder_path):
    """Extract the abstract of every PDF stored directly in ``folder_path``.

    Only regular files whose name ends in ".pdf" (case-insensitive) are
    processed; sub-directories are not descended into.

    Returns a dict mapping each PDF's file name to the string returned by
    ``extract_abstract_from_pdf`` for that file.
    """
    results = {}

    for entry in os.listdir(folder_path):
        candidate = os.path.join(folder_path, entry)

        # Skip directories and anything else that is not a regular file.
        if not os.path.isfile(candidate):
            continue
        if not entry.lower().endswith('.pdf'):
            continue

        results[entry] = extract_abstract_from_pdf(candidate)

    return results

# Folder holding the PDF files; a relative path, so it is resolved against the current working directory
pdf_folder_path = 'papers'

# Build a {file name: abstract} dict for every PDF directly inside that folder
abstracts = extract_abstracts_from_folder(pdf_folder_path)

# Optional inspection (disabled): uncomment to print every extracted abstract
# for filename, abstract in abstracts.items():
#     print(f'Filename: {filename}')
#     print(f'Abstract: {abstract}')
#     print('---')

In [11]:
import os
import PyPDF2 as pdf

def extract_text_from_pdf(file_path):
    """Return the extracted text of each page of a PDF as a list of strings.

    The file is opened with PyPDF2 in non-strict mode and a progress line is
    printed before the pages are read. The result has one entry per page, in
    page order.
    """
    pdf_reader = pdf.PdfFileReader(file_path, strict=False)
    print(f"Start scanning {file_path} ")

    # One getPage/extractText round-trip per page index, in page order.
    return [
        pdf_reader.getPage(index).extractText()
        for index in range(pdf_reader.getNumPages())
    ]

def extract_text_from_folder(folder_path):
    """Extract the page texts of every PDF stored directly in ``folder_path``.

    Only regular files whose name ends in ".pdf" (case-insensitive) are read.
    Returns a dict mapping each PDF's file name to the value returned by
    ``extract_text_from_pdf`` for that file.
    """
    texts_by_file = {}

    for entry in os.listdir(folder_path):
        candidate = os.path.join(folder_path, entry)

        # Guard clauses: ignore non-files first, then non-PDF names.
        if not os.path.isfile(candidate):
            continue
        if not entry.lower().endswith('.pdf'):
            continue

        texts_by_file[entry] = extract_text_from_pdf(candidate)

    return texts_by_file

# Folder scanned by this cell; this rebinds the name pdf_folder_path, which other cells also assign
pdf_folder_path = 'test'
# Single-file variant kept for quick manual checks (disabled):
#extract_text_from_pdf('20170120 SN Sergey Nasekin DISS.pdf')


# The returned {file name: [page texts]} dict is not assigned to a name here
extract_text_from_folder(pdf_folder_path)



Multiple definitions in dictionary at byte 0x75c6fd for key /Lang


Start scanning test\20210806 DJ Daniel Jacob DISS.pdf


str

In [7]:
import PyPDF2 as pdf
# Open one thesis PDF in non-strict mode.
# NOTE(review): PdfFileReader/getNumPages/getPage/extractText look like the legacy
# PyPDF2 method names — confirm they exist in the installed PyPDF2 version.
reader = pdf.PdfFileReader('20220820 K2 K Khowaja DISS.pdf',strict=False)
# Accumulates the text of all pages as one string, with no separator between pages
fulltext = ''
for page_num in range(reader.getNumPages()):
    page = reader.getPage(page_num)
    text = page.extractText()
    # str() guards the concatenation in case extractText() returns a non-str value
    fulltext += str(text)

# Dumps the whole document text into the cell output
print(fulltext)


D i m e n s i o n F l e x i b l e a n d Ad a p t i ve
S t a t i s t i c a l L e a r n i n g
D I S S E RTAT I O N
zur Erlangung des akademischen Grades
doctor rerum politicarum
(Doktor der Wirtschaftswissenschaft)
eingereicht an der
Wirtschaftswissenschaftlichen Fakultät
der Humboldt-Universität zu Berlin
von
Kainat Khowaja
geboren am 19.09.1994 in Thatta
Präsidentin der Humboldt-Universität zu Berlin (kommissarisch):
Prof. Dr. Peter Frensch
Dekan der Wirtschaftswissenschaftlichen Fakultät:
Prof. Dr. Daniel Klapper
Gutachter: 1. Prof. Dr. Wolfgang Karl Härdle, Ph.D.
2. Prof. Dr. Weining Wang, Ph.D.
Tag des Kolloquiums:2Acknowledgments
Before I conclude this important chapter of my life, I would like to appreciate the
support of many individuals who have contributed in various ways to make this
achievement possible. First and foremost, I would like to express my heartfelt gratitude
to my first supervisor Prof. Dr. Wolfgang Karl Härdle, who guided me throughout
the course of last three ye

In [31]:
import os
import PyPDF2 as pdf
import re
from nltk.stem import PorterStemmer
ps = PorterStemmer()

def create_string(file_path):
    '''Read a PDF and return its per-page text together with the page count.

    The file is opened with PyPDF2 in non-strict mode. All page objects are
    fetched first, then the text of each is extracted.

    Returns a tuple ``(page_texts, totalpages)`` where ``page_texts`` is a
    list with one string per page, in page order.
    '''
    reader = pdf.PdfFileReader(file_path, strict=False)
    totalpages = reader.numPages

    # Phase 1: fetch every page object; phase 2: extract text from each.
    pages = list(map(reader.getPage, range(totalpages)))
    page_texts = [page.extractText() for page in pages]

    return page_texts, totalpages

def _clean_page(text):
    '''Normalise one page of raw PDF text into lower-cased, stemmed tokens.'''
    # Strip URLs. \S* stops at any whitespace; the previous "[^ ]*" ran across
    # line breaks and also deleted the words on the following lines.
    text = re.sub(r"http\S*", " ", text)

    # Keep ASCII letters only. This one substitution also covers newlines,
    # digits, punctuation and non-ASCII letters, each replaced by a space.
    text = re.sub(r"[^a-zA-Z]", " ", text)

    # Collapse runs of whitespace into a single space.
    text = re.sub(r"\s{2,}", " ", text)

    # Drop space-delimited tokens of one or two characters. Each match consumes
    # both surrounding spaces, so adjacent short tokens need repeated passes.
    for _ in range(3):
        text = re.sub(r" .{1,2} ", " ", text)

    text = text.lower()
    return ' '.join(ps.stem(word) for word in text.split(" "))


def cleaning(file_path):
    '''Initial PDF cleaning procedure.

    Reads the PDF through ``create_string`` and cleans every page: URLs are
    removed, everything except ASCII letters is replaced by spaces, whitespace
    is collapsed, one- and two-character tokens are dropped, and the remaining
    words are lower-cased and stemmed with the module-level stemmer ``ps``.

    Parameters
    ----------
    file_path : str
        Path of the PDF file to clean.

    Returns
    -------
    list of str
        One cleaned string per page, in page order.
    '''
    pdf_output, _ = create_string(file_path)
    return [_clean_page(page) for page in pdf_output]

def extract_cleantext_from_folder(folder_path):
    '''Concatenate the cleaned text of every PDF stored directly in a folder.

    Each PDF (regular file, name ending in ".pdf", case-insensitive) is passed
    to ``cleaning`` and the result is converted with ``str()``. The returned
    string is therefore the back-to-back string form of those results, with
    no separator added between files.
    '''
    pieces = []

    for entry in os.listdir(folder_path):
        candidate = os.path.join(folder_path, entry)

        if os.path.isfile(candidate) and entry.lower().endswith('.pdf'):
            pieces.append(str(cleaning(candidate)))

    return ''.join(pieces)

# Folder whose PDFs are cleaned and merged into a single text file
pdf_folder_path = 'test'

extracted_text = extract_cleantext_from_folder(pdf_folder_path)

# The context manager closes the file even if the write raises
with open("cleantext.txt", "w") as text_file:
    text_file.write(extracted_text)

Multiple definitions in dictionary at byte 0x75c6fd for key /Lang
