In [1]:
#Modules
#Import ebooklib
import ebooklib
from ebooklib import epub
import re
import os
#Import nltk
import nltk
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import words, stopwords, names
#Import gensim
import gensim

In [2]:
#EPUB files directory
# Built with os.path.join so the literal contains no backslash escape ("\e")
# and the path separator matches the platform the notebook runs on.
epub_folder = os.path.join(".", "epubs")

In [3]:
#Limit Paragraph
def _split_sentences(text):
    """Split `text` after every '.' without adding or dropping characters.

    Joining the returned pieces always reproduces `text` exactly: a trailing
    fragment that has no terminating '.' is kept as-is, and no empty "."
    fragment is produced when `text` already ends with a period.
    """
    pieces = text.split('.')
    sentences = [piece + '.' for piece in pieces[:-1]]
    if pieces[-1]:
        sentences.append(pieces[-1])
    return sentences


def merge_strings_until_limit(strings, min_length, max_length, test_for_max = 0):
    """Merge consecutive strings into chunks of roughly bounded length.

    Strings are concatenated until the running chunk is longer than
    `min_length`. A finished chunk that is longer than `max_length` is split
    at '.' boundaries and the pieces are merged again recursively.

    Parameters:
        strings: iterable of str, concatenated in order.
        min_length: a chunk keeps absorbing strings while its length is
            less than or equal to this value.
        max_length: chunks longer than this are re-split at periods.
        test_for_max: current recursion depth; re-splitting stops once it
            reaches 5. Callers normally leave it at the default.

    Returns:
        list of str. Concatenating the list reproduces the concatenation of
        `strings`. A chunk can still exceed `max_length` when it contains no
        period to split on, when `min_length` forces sentences back together,
        or when the depth limit is reached.
    """
    merged_strings = []

    def flush(chunk):
        # Emit one finished chunk, re-splitting it first when it is too long.
        if len(chunk) > max_length and test_for_max < 5:
            sentences = _split_sentences(chunk)
            # A single piece cannot be made shorter; recursing would only
            # burn through the depth budget and return the same chunk.
            if len(sentences) > 1:
                merged_strings.extend(
                    merge_strings_until_limit(sentences, min_length, max_length, test_for_max + 1)
                )
                return
        merged_strings.append(chunk)

    merged_string = ""
    for s in strings:
        if len(merged_string) <= min_length:
            merged_string += s
        else:
            flush(merged_string)
            merged_string = s

    # The trailing chunk goes through the same length check as every other one.
    if merged_string:
        flush(merged_string)

    return merged_strings

In [4]:
#Read .epub files in Paragraphs
def read_epub_paragraphs(epub_file, epub_ID, min_length=200, max_length=1000):
    """Read an EPUB file and return its text as a list of paragraph records.

    Parameters:
        epub_file: path of the .epub file, passed to `epub.read_epub`.
        epub_ID: value stored under 'Book ID' in every record.
        min_length, max_length: length limits forwarded to
            `merge_strings_until_limit` (defaults 200 and 1000).

    Returns:
        list of dicts with keys 'TEXT', 'NR' and 'Book ID'. 'NR' is the
        position of the chunk in the merged list; the first and the last
        chunk are dropped, so the returned records start at NR 1.
    """
    book = epub.read_epub(epub_file)
    paragraphs = []

    for item in book.get_items_of_type(ebooklib.ITEM_DOCUMENT):
        content = item.get_content().decode('utf-8')
        # Strip markup tags. Raw strings keep the regex escapes out of
        # Python's own string-escape processing.
        content = re.sub(r'<[^<]+?>', '', content)
        # Collapse every whitespace run (newlines included) to one space.
        content = re.sub(r'\s+', ' ', content)

        # NOTE(review): "&#13;" is presumably the serialized carriage-return
        # entity marking paragraph ends in the document markup - confirm.
        paragraphs.extend(content.strip().split("&#13;"))

    merged = merge_strings_until_limit(paragraphs, min_length, max_length)
    records = [{'TEXT': text, 'NR': number, 'Book ID': epub_ID} for number, text in enumerate(merged)]

    return records[1:-1]

In [5]:
#Testing
# The path is assembled with os.path.join so the string literal contains no
# backslash escape ("\A") and the separator matches the current platform.
paragraphs = read_epub_paragraphs(os.path.join("epubs", "A Philosophy of Curating.epub"), 2)
paragraphs[10:11]

  for root_file in tree.findall('//xmlns:rootfile[@media-type]', namespaces={'xmlns': NAMESPACES['CONTAINERNS']}):


[{'TEXT': ' It also recognizes that all this activity is not founded on a solid intellectual basis that might empower its practitioners to have the critical courage to resist demands to simply supply more and more excitement to a market ravenous for spectacle and entertainment.',
  'NR': 11,
  'Book ID': 2}]

In [6]:
#Testing
# The path is assembled with os.path.join so the string literal contains no
# backslash escape ("\A") and the separator matches the current platform.
paragraphs = read_epub_paragraphs(os.path.join("epubs", "A Map to the Door of No Return - Dionne Brand.epub"), 1)
paragraphs[10:11]

[{'TEXT': ' He raised his Sunday Guardian newspaper to block my view. He shooed me away, telling me to find some book to read or work to do. At times it seemed as if Papa was on the brink of remembering. I imagined pulling the word off his tongue if only I knew the first syllable.',
  'NR': 11,
  'Book ID': 1}]

In [7]:
#Download from nltk
# Fetch the NLTK data packages named 'words', 'stopwords' and 'wordnet'.
# NOTE(review): these are presumably the resources behind the `words` and
# `stopwords` corpus readers and the WordNetLemmatizer imported at the top of
# the notebook - confirm.
# The value of the last call is what the cell displays as its output.
nltk.download('words')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package words to
[nltk_data]     C:\Users\Patrick\AppData\Roaming\nltk_data...
[nltk_data]   Package words is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Patrick\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Patrick\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [8]:
#Preprocess Words
# Vocabulary used to decide whether a token counts as an English word.
ENGLISH_WORDS = set(words.words())


def is_english_word(word):
    """Return True when the lower-cased `word` is a member of ENGLISH_WORDS."""
    lowered = word.lower()
    return lowered in ENGLISH_WORDS

In [9]:
#Lemmatizer, Stop Words, Stemmer
lemmatizer = WordNetLemmatizer()
STOP_WORDS = stopwords.words("english")
# Constant-time membership tests for the filter below. STOP_WORDS itself is
# left as the original list in case other cells rely on it.
_STOP_WORD_SET = frozenset(STOP_WORDS)
stemmer = PorterStemmer()


def processed_documents(words):
    """Normalise a sequence of tokens and return them as one string.

    Each token is lemmatized, then dropped if it is a stop word or is not
    accepted by `is_english_word`, and finally stemmed.

    Parameters:
        words: iterable of str tokens.

    Returns:
        str: the surviving stemmed tokens joined by single spaces (an empty
        string when no token survives).
    """
    #Lemmatize
    lemmatized_words = [lemmatizer.lemmatize(word) for word in words]
    #Stop Words
    filtered_words = [
        word for word in lemmatized_words
        if word not in _STOP_WORD_SET and is_english_word(word)
    ]
    #Stemmer
    stemmed_words = [stemmer.stem(word) for word in filtered_words]
    #Join
    return " ".join(stemmed_words)

In [15]:
def process_epub_files(epub_folder, output_folder=None):
    """Write one processed text file for every .epub file in `epub_folder`.

    For each book the paragraphs are read with `read_epub_paragraphs`,
    tokenised with `gensim.utils.simple_preprocess` and normalised with
    `processed_documents`. One line per paragraph is written to
    "<book name>_processed.txt" in the output folder.

    Parameters:
        epub_folder: directory that is scanned (non-recursively) for files
            whose name ends with '.epub'.
        output_folder: directory that receives the text files; it is created
            if missing. Defaults to "processed_texts" under the current
            working directory.

    Returns:
        None.
    """
    if output_folder is None:
        # os.path.join avoids a backslash escape inside the literal and
        # uses the separator of the current platform.
        output_folder = os.path.join(".", "processed_texts")
    os.makedirs(output_folder, exist_ok=True)

    # os.listdir returns names in arbitrary order; sorting makes runs repeatable.
    for epub_file in sorted(os.listdir(epub_folder)):
        if not epub_file.endswith('.epub'):
            continue

        epub_path = os.path.join(epub_folder, epub_file)
        epub_ID = os.path.splitext(epub_file)[0]
        # Announce the book before the work starts so the log shows
        # which file is currently being handled.
        print(f"Processing {epub_file}...")
        paragraphs = read_epub_paragraphs(epub_path, epub_ID)

        output_path = os.path.join(output_folder, f"{epub_ID}_processed.txt")
        with open(output_path, 'w', encoding='utf-8') as output_file:
            for paragraph in paragraphs:
                # Named `tokens` so the nltk `words` import is not shadowed.
                tokens = gensim.utils.simple_preprocess(paragraph['TEXT'], min_len = 3, deacc=True)
                processed_text = processed_documents(tokens)
                output_file.write(f"TEXT: {paragraph['TEXT']}, NR: {paragraph['NR']}, Book ID: {epub_ID}, Processed Text: {processed_text}\n")
                    

In [16]:
# Export the processed text of every .epub found in `epub_folder`.
process_epub_files(epub_folder)

Processing A Map to the Door of No Return - Dionne Brand.epub...
Processing A Philosophy of Curating.epub...
Processing About Looking by John Berger.epub...
Processing Anne Sexton Poems.epub...
Processing Architecture and the Body by Kim Sexton.epub...
Processing Migration Crises and the Struct.epub...
Processing Society After Money Dialogue.epub...
Processing Speculative Everything by Anthony Dunne.epub...
Processing The Age of Gold - H. W. Brands.epub...
Processing The Poetry of Architecture by John Ruskin.epub...


In [12]:
#Import Pickle
import pickle

In [13]:
def save_to_pickle(epub_folder, output_folder=None):
    """Pickle the processed paragraphs of every .epub file in `epub_folder`.

    For each book the paragraphs are read with `read_epub_paragraphs`,
    tokenised with `gensim.utils.simple_preprocess` and normalised with
    `processed_documents`. The resulting list of dicts (keys 'TEXT', 'NR',
    'Book ID', 'Processed Text') is written with `pickle.dump` to
    "<book name>_processed.pkl" in the output folder.

    Parameters:
        epub_folder: directory that is scanned (non-recursively) for files
            whose name ends with '.epub'.
        output_folder: directory that receives the pickle files; it is
            created if missing. Defaults to "pickle_file" under the current
            working directory.

    Returns:
        None.
    """
    if output_folder is None:
        # os.path.join avoids a backslash escape inside the literal and
        # uses the separator of the current platform.
        output_folder = os.path.join(".", "pickle_file")
    os.makedirs(output_folder, exist_ok=True)

    # os.listdir returns names in arbitrary order; sorting makes runs repeatable.
    for epub_file in sorted(os.listdir(epub_folder)):
        if not epub_file.endswith('.epub'):
            continue

        epub_path = os.path.join(epub_folder, epub_file)
        epub_ID = os.path.splitext(epub_file)[0]
        # Announce the book before the work starts so the log shows
        # which file is currently being handled.
        print(f"Saving {epub_file} to pickle...")
        paragraphs = read_epub_paragraphs(epub_path, epub_ID)

        processed_data = []
        for paragraph in paragraphs:
            # Named `tokens` so the nltk `words` import is not shadowed.
            tokens = gensim.utils.simple_preprocess(paragraph['TEXT'], min_len = 3, deacc=True)
            processed_text = processed_documents(tokens)
            processed_data.append({'TEXT': paragraph['TEXT'], 'NR': paragraph['NR'], 'Book ID': epub_ID, 'Processed Text': processed_text})

        # NOTE: pickle files should only be loaded back from trusted
        # locations; unpickling untrusted data can execute arbitrary code.
        with open(os.path.join(output_folder, f"{epub_ID}_processed.pkl"), 'wb') as output_file:
            pickle.dump(processed_data, output_file)
                

In [14]:
# Pickle the processed paragraphs of every .epub found in `epub_folder`.
save_to_pickle(epub_folder)

Saving A Map to the Door of No Return - Dionne Brand.epub to pickle...
Saving A Philosophy of Curating.epub to pickle...
Saving About Looking by John Berger.epub to pickle...
Saving Anne Sexton Poems.epub to pickle...
Saving Architecture and the Body by Kim Sexton.epub to pickle...
Saving Migration Crises and the Struct.epub to pickle...
Saving Society After Money Dialogue.epub to pickle...
Saving Speculative Everything by Anthony Dunne.epub to pickle...
Saving The Age of Gold - H. W. Brands.epub to pickle...
Saving The Poetry of Architecture by John Ruskin.epub to pickle...
