-
Notifications
You must be signed in to change notification settings - Fork 0
/
textFrompdf.py
33 lines (26 loc) · 896 Bytes
/
textFrompdf.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
import pdfplumber
import PyPDF2
# Characters deleted from the extracted text: newline, single and double
# quotes, pipe, colon, semicolon, en dash, and the private-use code point
# U+F0B7 (presumably a bullet glyph emitted by some PDF fonts -- confirm).
_REMOVED_CHARS = str.maketrans("", "", "\n'\"|:;–\uf0b7")


def textFromPdf(pdf_file):
    """Return the text of every page of a PDF as one cleaned-up string.

    Each page is extracted with pdfplumber, the per-page strings are joined
    with a comma, and the characters listed in ``_REMOVED_CHARS`` are then
    deleted from the result.

    Args:
        pdf_file: Path (or file-like object) accepted by ``pdfplumber.open``.

    Returns:
        A single ``str`` holding the comma-joined, cleaned page texts. A page
        that yields no text contributes an empty string, so page positions
        are still separated by commas.

    Raises:
        Whatever ``pdfplumber.open`` raises for a missing or unreadable file.
    """
    # pdfplumber already knows the page list, so there is no need to open the
    # file a second time with PyPDF2 just to count pages (that extra handle
    # was never closed, and PdfFileReader is gone in PyPDF2 3.x).
    with pdfplumber.open(pdf_file) as pdf:
        # extract_text() may return None for pages without a text layer;
        # normalise to "" so the join below cannot raise TypeError.
        page_texts = [page.extract_text() or "" for page in pdf.pages]

    # One translate pass removes all unwanted characters at once.
    return ",".join(page_texts).translate(_REMOVED_CHARS)
# pdfFile = 'static/files/TestPDF.pdf'
# pdfText = textFromPdf(pdfFile)
# print(pdfText)