In [1]:
import os
import pdfplumber
import re
import glob
from nltk.tokenize import sent_tokenize
import pdfminer

## - Functions

In [27]:
def fast_scandir(dirname):
    """Recursively collect every subfolder below ``dirname``.

    Folders whose own name contains ".git" are skipped and never descended
    into. The check is made on the entry name rather than on the full path,
    so a root directory whose path happens to contain ".git" (for example a
    checkout called "project.git") is still scanned.

    Args:
        dirname: Path of the directory to scan.

    Returns:
        A list of subfolder paths: the direct children of ``dirname`` first,
        followed by the descendants of each child in turn. The order inside
        a single directory is whatever ``os.scandir`` yields.
    """
    # The context manager closes the scandir iterator as soon as we are done.
    with os.scandir(dirname) as entries:
        subfolders = [
            entry.path
            for entry in entries
            if entry.is_dir() and ".git" not in entry.name
        ]
    # Iterate over a snapshot because the list grows while we recurse.
    for child in list(subfolders):
        subfolders.extend(fast_scandir(child))
    return subfolders

def get_pdf_files(folderPath):
    """Return the paths of the ``*.pdf`` files directly inside ``folderPath``.

    Args:
        folderPath: Directory to search (not searched recursively).

    Returns:
        A list of matching paths, in the order produced by ``glob.glob``.
        Whether upper-case ``.PDF`` names match depends on the platform's
        filename case rules.
    """
    # Escape the folder part so characters such as "[", "*" or "?" in a
    # directory name are matched literally instead of acting as wildcards.
    return glob.glob(os.path.join(glob.escape(folderPath), "*.pdf"))

def load_pdf_text(filePath):
    """Extract the text of a PDF and split it into sentences.

    Every page is read with pdfplumber, runs of whitespace are collapsed to
    a single space, and the pages are joined with one space so that the last
    word of a page is not fused with the first word of the next one.

    Args:
        filePath: Path of the PDF file to read.

    Returns:
        The list of sentences produced by ``sent_tokenize`` on the full text.
    """
    page_texts = []
    # The context manager closes the file even when a page fails to parse.
    with pdfplumber.open(filePath) as pdf:
        for page in pdf.pages:
            # extract_text() may yield None for a page without text (the
            # original wrapped it in str(), which injected the word "None").
            raw_text = page.extract_text() or ""
            cleaned = re.sub(r"\s+", " ", raw_text).strip()
            if cleaned:
                page_texts.append(cleaned)
    # Split the text on periods, question marks, etc. (sentence tokenization)
    return sent_tokenize(" ".join(page_texts))

def save_to_txt(filePath):
    """Convert one PDF to a UTF-8 text file with one sentence per line.

    The output is written next to the PDF, using the same base name with a
    ``.txt`` extension.

    Args:
        filePath: Path of the PDF file to convert.
    """
    sentences = load_pdf_text(filePath)
    # splitext replaces the real extension whatever its length or case,
    # instead of assuming the path ends in exactly three extension characters.
    txtFile = os.path.splitext(filePath)[0] + ".txt"
    with open(txtFile, mode='wt', encoding='utf-8') as out_file:
        out_file.writelines(sentence + '\n' for sentence in sentences)

def convert_pdf_to_txt(folderPath, skip_errors=False):
    """Convert every PDF found by ``get_pdf_files(folderPath)`` to text.

    Args:
        folderPath: Directory whose PDF files are converted.
        skip_errors: When False (the default) the first failing file raises
            and stops the run, as before. When True the failing file is
            reported and the remaining files are still converted.

    Returns:
        A list of ``(path, exception)`` pairs for the files that were
        skipped; empty when every file converted.
    """
    failures = []
    for pdf_path in get_pdf_files(folderPath):
        try:
            save_to_txt(pdf_path)
        except Exception as exc:
            if not skip_errors:
                raise
            # Report the failure instead of hiding it, then keep going.
            print("Skipped:", pdf_path, "-", exc)
            failures.append((pdf_path, exc))
    return failures
        
def find_files_end_with_PDF(folderPath):
    """List the PDF files in ``folderPath`` whose name does not end in "pdf".

    This picks out the files returned by ``get_pdf_files`` whose extension
    is not the lower-case ``pdf`` (for example ``.PDF``).

    Args:
        folderPath: Directory to inspect.

    Returns:
        A list of the matching file paths.
    """
    # Use the parameter, not a global: the original read a module-level
    # `folder` variable and only worked when the caller happened to set it.
    return [
        pdf_path
        for pdf_path in get_pdf_files(folderPath)
        if not pdf_path.endswith("pdf")
    ]

## - Get the folder's path

In [3]:
# Cells further down address folders by position (folders[23], folders[24],
# folders[25:], ...), but os.scandir yields entries in arbitrary order.
# Sorting case-insensitively keeps those positions stable between runs.
folders = sorted(fast_scandir("./tbooks/"), key=str.lower)

In [4]:
# Show the discovered folders; later cells refer to them by list position.
folders

['./tbooks/med',
 './tbooks/nou100',
 './tbooks/nou101',
 './tbooks/nou200',
 './tbooks/nou300',
 './tbooks/nou400',
 './tbooks/nou700',
 './tbooks/nou800',
 './tbooks/nouALLED1',
 './tbooks/nouALLED2',
 './tbooks/nouBusLower1',
 './tbooks/nouBusLower2',
 './tbooks/nouBusUpper1',
 './tbooks/nouBusUpper2',
 './tbooks/nouBusUpper3',
 './tbooks/noudl01',
 './tbooks/noudl02',
 './tbooks/noudl03',
 './tbooks/nouLaw',
 './tbooks/nouSST100',
 './tbooks/nouSST200',
 './tbooks/nouSST300',
 './tbooks/nouSST301',
 './tbooks/nouSST400',
 './tbooks/nouSST401',
 './tbooks/nouSST500',
 './tbooks/nouSST700',
 './tbooks/nouSST800',
 './tbooks/pdfssailor']

In [6]:
# Check which folders sit at positions 24 and 25: position 24 is converted
# in its own cell further down and the bulk loop starts at position 25.
folders[24:26]

['./tbooks/nouSST401', './tbooks/nouSST500']

## - Iterate through all the folders in tbooks

In [28]:
# Bulk-convert the folders from position 25 to the end of the list.
# Position 24 (nouSST401) is converted in a separate cell that skips one file.
for pending_folder in folders[25:]:
    convert_pdf_to_txt(pending_folder)
    print("Sucess: ", pending_folder)

Sucess:  ./tbooks/nouSST500
Sucess:  ./tbooks/nouSST700
Sucess:  ./tbooks/nouSST800
Sucess:  ./tbooks/pdfssailor


## - Skip Damaged File: CIT 467 VISUAL PROGRAMMING LANGUAGE 2 in nouSST401

In [24]:
# Convert every PDF in folders[24] except the one at position 1, which the
# section heading identifies as a damaged file.
nouSST401 = get_pdf_files(folders[24])
readable_pdfs = [path for position, path in enumerate(nouSST401) if position != 1]
for curr_pdf in readable_pdfs:
    save_to_txt(curr_pdf)
    print("Sucess:", curr_pdf)

Sucess: ./tbooks/nouSST401\CIT 465.pdf
Sucess: ./tbooks/nouSST401\CIT 478 Artificial Intelligence(1).pdf
Sucess: ./tbooks/nouSST401\CIT 478 Artificial Intelligence.pdf
Sucess: ./tbooks/nouSST401\CIT 484 - WEBSITE DESIGN   - Content F.pdf
Sucess: ./tbooks/nouSST401\cps401 BIOTECHNOLOGY IN CROP_Olaniyi and Petu-Ibikunle_on net.pdf
Sucess: ./tbooks/nouSST401\DAM 401 Oracle Applications.pdf
Sucess: ./tbooks/nouSST401\DAM 461.pdf
Sucess: ./tbooks/nouSST401\DAM 462.pdf
Sucess: ./tbooks/nouSST401\DAM 463_Health Data Management.pdf
Sucess: ./tbooks/nouSST401\ENT 417.pdf
Sucess: ./tbooks/nouSST401\ESM 403 ENVT PERCEPTION UNEDITED.pdf
Sucess: ./tbooks/nouSST401\ESM 405 EPA.pdf
Sucess: ./tbooks/nouSST401\ESM 407 GIS.pdf
Sucess: ./tbooks/nouSST401\ESM 411 Population, Environment & Development.pdf
Sucess: ./tbooks/nouSST401\ESM 421 ELEMENTS OF LAND SURVEYING unedited.pdf
Sucess: ./tbooks/nouSST401\ESM 423 HYDROLOGY & WATER RESOURCES DEV.pdf
Sucess: ./tbooks/nouSST401\esm 424 Freshwater ecology.pdf


## - Fix nouSST400/BIO 413 MAIN TEXT page 188 problem

In [49]:
def special_load_pdf_text_188(filePath, skip_pages=(188,)):
    """Extract a PDF's text as sentences while leaving out selected pages.

    Works like the regular loader but never touches the pages listed in
    ``skip_pages``, so a page that cannot be extracted does not abort the
    whole file.

    Args:
        filePath: Path of the PDF file to read.
        skip_pages: Zero-based indexes into ``pdf.pages`` to leave out.
            Defaults to ``(188,)``, the page this helper was written for.

    Returns:
        The list of sentences produced by ``sent_tokenize`` on the text of
        the remaining pages.
    """
    page_texts = []
    # The context manager closes the file even when a page fails to parse.
    with pdfplumber.open(filePath) as pdf:
        for page_index, page in enumerate(pdf.pages):
            if page_index in skip_pages:
                continue
            # extract_text() may yield None for a page without text (the
            # original wrapped it in str(), which injected the word "None").
            raw_text = page.extract_text() or ""
            cleaned = re.sub(r"\s+", " ", raw_text).strip()
            if cleaned:
                page_texts.append(cleaned)
    # Join pages with a space so words are not fused across page breaks,
    # then split the text into sentences.
    return sent_tokenize(" ".join(page_texts))

In [50]:
# save_to_txt() takes only a path and always reads the PDF through
# load_pdf_text(), so the output of the page-skipping loader is written
# here directly (same naming and format as save_to_txt).
bio413_pdf = get_pdf_files(folders[23])[17]
bio413_txt = bio413_pdf[:-3] + "txt"
with open(bio413_txt, mode='wt', encoding='utf-8') as bio413_out:
    for sentence in special_load_pdf_text_188(bio413_pdf):
        bio413_out.write(sentence)
        bio413_out.write('\n')
print("Sucess!")

Sucess!


In [54]:
# Remaining files in nouSST400 (positions 18 onward in the folder listing).
pFiles = get_pdf_files(folders[23])[18:]
for pFile in pFiles:
    # save_to_txt() takes the PDF path and loads the text itself.
    save_to_txt(pFile)

## - Convert the files whose extension is an upper-case `.PDF`

In [94]:
# Report, for folders[10:] only, the files whose name does not end in "pdf".
# The global names `folder`, `file_list` and `file` are kept unchanged.
for folder in folders[10:]:
    file_list = find_files_end_with_PDF(folder)
    if not file_list:
        continue
    print(folder + ":")
    for file in file_list:
        print(file)
    print("\n")

./tbooks/nouBusLower1:
./tbooks/nouBusLower1\BHM 302.PDF


./tbooks/noudl03:
./tbooks/noudl03\PED 271 MAIN TEXT.PDF
./tbooks/noudl03\PED 322.PDF
./tbooks/noudl03\PED 412.PDF


./tbooks/nouSST100:
./tbooks/nouSST100\CHM 121 Introduction to Physical Chemistry II.PDF
./tbooks/nouSST100\CHM 191 INTRODUCTORY PRACTICAL CHEMISTRY I.PDF




In [93]:
# Convert the files whose name does not end in "pdf" for folders[2:10].
for folder in folders[2:10]:
    file_list = find_files_end_with_PDF(folder)
    if len(file_list) > 0:
        print("Do: ", folder)
        for file in file_list:
            # save_to_txt() takes the PDF path and loads the text itself;
            # passing pre-loaded text as a first argument raises TypeError.
            save_to_txt(file)
            print("Success:", file)
        print("\n")

Do:  ./tbooks/nou200
Success: ./tbooks/nou200\PCR 272 Concepts and Practice of Peacebuilding.PDF


Do:  ./tbooks/nouALLED1
Success: ./tbooks/nouALLED1\EDU 292 MAIN.PDF


Do:  ./tbooks/nouALLED2
Success: ./tbooks/nouALLED2\PED 110.PDF
Success: ./tbooks/nouALLED2\PED 121.PDF
Success: ./tbooks/nouALLED2\PED 130 FINAL.PDF
Success: ./tbooks/nouALLED2\PED 230.PDF
Success: ./tbooks/nouALLED2\PED 232.PDF
Success: ./tbooks/nouALLED2\PED 237 MAIN.PDF
Success: ./tbooks/nouALLED2\PED 313 HISTORY AND CULTURA.PDF
Success: ./tbooks/nouALLED2\PED 320.PDF
Success: ./tbooks/nouALLED2\PED 410.PDF
Success: ./tbooks/nouALLED2\PED 420.PDF
Success: ./tbooks/nouALLED2\PED 421 DEVPTAL GUIDANCE....PDF
Success: ./tbooks/nouALLED2\PED 422.PDF
Success: ./tbooks/nouALLED2\PED 431 OSUJI  MAIN TEXT.PDF
Success: ./tbooks/nouALLED2\PRD 104 MAIN.PDF
Success: ./tbooks/nouALLED2\PRD 110 MAIN.PDF
Success: ./tbooks/nouALLED2\PRD 208 MAIN.PDF
Success: ./tbooks/nouALLED2\PRD 235 MAIN.PDF
Success: ./tbooks/nouALLED2\PRD 433.PD

In [83]:
    # NOTE(review): this cell relies on `file_list` left over from a
    # previously executed cell; confirm which folder it refers to before
    # re-running.
    for file in file_list:
        # save_to_txt() takes the PDF path and loads the text itself.
        save_to_txt(file)

['./tbooks/nou100\\BIO 001 Access Biology.PDF', './tbooks/nou100\\CHM 001 Access Chemistry.PDF']
