In [25]:
import io
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfpage import PDFPage
import pandas as pd
import json
import os

In [23]:
def extract_text_by_page(pdf_path):
    """Lazily yield the extracted text of a PDF, one string per page.

    A fresh in-memory buffer and ``TextConverter`` are created for every
    page, so each yielded string holds only what was written while that
    page was processed.

    Args:
        pdf_path: Path of the PDF file; it is opened in binary mode.

    Yields:
        str: The converter output for one page, in the order
            ``PDFPage.get_pages`` produces the pages.
    """
    # A single resource manager is shared by all pages, the same way
    # convert_pdf_to_txt uses it, instead of being rebuilt per page.
    resource_manager = PDFResourceManager()
    with open(pdf_path, 'rb') as fh:
        for page in PDFPage.get_pages(fh,
                                      caching=True,
                                      check_extractable=True):
            fake_file_handle = io.StringIO()
            converter = TextConverter(resource_manager, fake_file_handle)
            try:
                page_interpreter = PDFPageInterpreter(resource_manager, converter)
                page_interpreter.process_page(page)
                yield fake_file_handle.getvalue()
            finally:
                # Cleanup must not live after the yield: a consumer that stops
                # early (or a page that fails) would otherwise skip it.
                converter.close()
                fake_file_handle.close()
    
def extract_text(pdf_path):
    """Print the text of every page of the PDF, each followed by a blank line.

    Args:
        pdf_path: Path handed on to ``extract_text_by_page``.
    """
    page_texts = extract_text_by_page(pdf_path)
    for page_text in page_texts:
        # The double newline ends the page's line and adds the separator line.
        print(page_text, end="\n\n")

In [24]:
def convert_pdf_to_txt(path, password="", maxpages=0):
    """Extract the text of a PDF into one string.

    All pages are interpreted into a single shared buffer through a
    ``TextConverter`` configured with default ``LAParams``.

    ref number 26494211 (kept from the original note; presumably the id of
    the post this recipe was taken from -- TODO confirm).

    Args:
        path: Path of the PDF file; it is opened in binary mode.
        password: Forwarded unchanged to ``PDFPage.get_pages``.
        maxpages: Forwarded unchanged to ``PDFPage.get_pages``
            (0 presumably means "no limit" -- confirm against pdfminer docs).

    Returns:
        str: The text accumulated over all processed pages.
    """
    rsrcmgr = PDFResourceManager()
    retstr = io.StringIO()
    device = TextConverter(rsrcmgr, retstr, laparams=LAParams())
    try:
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        # The with-block closes the PDF even when a page fails to process.
        with open(path, 'rb') as fp:
            # An empty page-number set is passed, as in the original call.
            for page in PDFPage.get_pages(fp, set(), maxpages=maxpages,
                                          password=password,
                                          caching=True,
                                          check_extractable=True):
                interpreter.process_page(page)
        return retstr.getvalue()
    finally:
        # Release the converter and buffer on success and on error alike.
        device.close()
        retstr.close()

In [55]:
def export_as_json(pdf_path, json_path):
    """Write the per-page text of a PDF to a JSON file and return the same data.

    Args:
        pdf_path: Path of the PDF to read; passed to ``extract_text_by_page``.
        json_path: Destination file; it is created or overwritten.

    Returns:
        dict: ``{'Filename': <base name of pdf_path without its extension>,
        'Pages': [<text of page 1>, <text of page 2>, ...]}``.
    """
    filename = os.path.splitext(os.path.basename(pdf_path))[0]
    data = {
        'Filename': filename,
        # Page strings are stored as yielded, in page order.
        'Pages': list(extract_text_by_page(pdf_path)),
    }

    # json.dump escapes non-ASCII characters by default; the explicit encoding
    # just keeps the written bytes independent of the platform locale.
    with open(json_path, 'w', encoding='utf-8') as fh:
        json.dump(data, fh)

    return data

In [66]:
# Extract the complete report text once; later cells search and slice `text`.
path = "YLB_Nisan 2022 Portföy Da__l_m Raporu Bildirimi.pdf"
text = convert_pdf_to_txt(path)  # password="" and maxpages=0 are the defaults

In [56]:
# Write the per-page text to a JSON file and keep the returned dict in `jdata`.
pdf_path = "YLB_Nisan 2022 Portföy Da__l_m Raporu Bildirimi.pdf"
json_path = "YLB_Nisan 2022 Portföy Da__l_m Raporu Bildirimi.json"
jdata = export_as_json(pdf_path=pdf_path, json_path=json_path)

In [58]:
# Print the text of each extracted page, in the order stored under "Pages".
for p in jdata["Pages"]:
    print(p)

list

In [None]:
# Show the whole extracted text; print renders the newlines instead of escaping them.
print(text)

In [76]:
# Offset of the first occurrence of each section marker in the extracted text.
tuple(text.index(marker) for marker in ("PORTFÖYDEN", "PORTFÖYE"))

(10777, 14777)

In [84]:
# Preview the 22 characters starting at the first "PORTFÖYDEN" marker.
# The offset is looked up here rather than pasted from an earlier cell's
# output, so the slice stays aligned if the extracted text changes.
portfoyden_start = text.index("PORTFÖYDEN")
text[portfoyden_start:portfoyden_start + 22]

'PORTFÖYDEN SATIŞLAR\n\n:'

In [83]:
# Preview the 18 characters starting at the first "PORTFÖYE" marker.
# The offset is looked up here rather than pasted from an earlier cell's
# output, so the slice stays aligned if the extracted text changes.
portfoye_start = text.index("PORTFÖYE")
text[portfoye_start:portfoye_start + 18]

'PORTFÖYE ALIŞLAR\n\n'