In [1]:
#Modules
#Import ebooklib
import ebooklib
from ebooklib import epub
#Import fitz
import fitz
import re
import os
#Import nltk
import nltk
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import words, stopwords, names
#Import gensim
import gensim

In [2]:
#PDF files directory
pdfs_folder = os.path.join("E:", "OneDrive", "Documents", "007-Study Life", "001-Urban Design", "RC11_Theory", "Philippe Morel Essays 2024")

In [3]:
# Constructing the path using double backslashes
pdfs_folder = "E:\\OneDrive\\Documents\\007-Study Life\\001-Urban Design\\RC11_Theory\\Philippe Morel Essays 2024"

# Alternatively, using a raw string
pdfs_folder = r"E:\OneDrive\Documents\007-Study Life\001-Urban Design\RC11_Theory\Philippe Morel Essays 2024"



In [11]:
if os.path.exists(pdfs_folder):
    print(f"Directory exists: {pdfs_folder}")
else:
    print(f"Directory does not exist: {pdfs_folder}")

Directory exists: E:\OneDrive\Documents\007-Study Life\001-Urban Design\RC11_Theory\Philippe Morel Essays 2024


In [4]:
#Output folders for processed texts and pickle files
output_folder_txt = os.path.join("E:", "OneDrive", "Documents", "007-Study Life", "001-Urban Design", "RC11_Resources","Processed_Texts")
output_folder_pkl = os.path.join("E:", "OneDrive", "Documents", "007-Study Life", "001-Urban Design", "RC11_Resources", "Pickle_File")
os.makedirs(output_folder_txt, exist_ok=True)
os.makedirs(output_folder_pkl, exist_ok=True)

In [5]:
#Testing
def count_pdf_files(pdfs_folder):
    pdf_count = 0
    for root, dirs, files in os.walk(pdfs_folder):
        print(f"Checking directory: {root}")
        for file in files:
            print(f"File found: {file}")
            if file.endswith('.pdf') or file.endswith('.PDF'):
                pdf_path = os.path.join(root, file)
                print(f"Found PDF: {pdf_path}")
                pdf_count += 1
    return pdf_count

In [15]:
# Specify the folder containing the PDFs
pdfs_folder = os.path.join("E:", "OneDrive", "Documents", "007-Study Life", "001-Urban Design", "RC11_Theory", "Philippe Morel Essays 2024","EXTRA Philippe Morel Essays")

In [6]:
# Count the PDF files
pdf_count = count_pdf_files(pdfs_folder)
print(f"Total number of PDF files found: {pdf_count}")

Checking directory: E:\OneDrive\Documents\007-Study Life\001-Urban Design\RC11_Theory\Philippe Morel Essays 2024
Checking directory: E:\OneDrive\Documents\007-Study Life\001-Urban Design\RC11_Theory\Philippe Morel Essays 2024\EXTRA Philippe Morel Essays
File found: 000 - EP Magazine Vol.2 Design Fiction December 2016 MR.pdf
Found PDF: E:\OneDrive\Documents\007-Study Life\001-Urban Design\RC11_Theory\Philippe Morel Essays 2024\EXTRA Philippe Morel Essays\000 - EP Magazine Vol.2 Design Fiction December 2016 MR.pdf
File found: 000 - fulcrum45_230512_computationalism.pdf
Found PDF: E:\OneDrive\Documents\007-Study Life\001-Urban Design\RC11_Theory\Philippe Morel Essays 2024\EXTRA Philippe Morel Essays\000 - fulcrum45_230512_computationalism.pdf
File found: 000 - Interview_arte-TV_final.pdf
Found PDF: E:\OneDrive\Documents\007-Study Life\001-Urban Design\RC11_Theory\Philippe Morel Essays 2024\EXTRA Philippe Morel Essays\000 - Interview_arte-TV_final.pdf
File found: 000 - Morel-Teissier Mathe

In [7]:
# Output folders for processed texts and pickle files
output_folder_txt = r"E:\OneDrive\Documents\007-Study Life\001-Urban Design\RC11_Resources\Processed_Texts"
output_folder_pkl = r"E:\OneDrive\Documents\007-Study Life\001-Urban Design\RC11_Resources\Pickle_File"

os.makedirs(output_folder_txt, exist_ok=True)
os.makedirs(output_folder_pkl, exist_ok=True)


In [8]:
#Limit Paragraph
def merge_strings_until_limit(strings, min_length, max_length, test_for_max = 0):
    merged_string = ""
    merged_strings = []
    
    for s in strings:
        if len(merged_string) <= min_length:
            merged_string += s
        
        elif len(merged_string) > max_length and test_for_max<5:
                splitParagraph = merged_string.split('.')
                splitParagraphRePoint = []
                for sp in splitParagraph:
                    splitParagraphRePoint.append(sp+'.')
                
                merged = merge_strings_until_limit(splitParagraphRePoint, min_length, max_length, test_for_max+1)
                merged_strings.extend(merged)
                merged_string = s
        else:
            merged_strings.append(merged_string)
            merged_string = s
    
    if merged_string:
        merged_strings.append(merged_string)
    
    return merged_strings

In [9]:
#Clean up text for PDF
def clean_text(text):
    text = re.sub(r'\n', ' ', text)  # Replace newlines with space
    text = re.sub(r'\s+', ' ', text)  # Collapse multiple spaces
    return text.strip()

In [10]:
#Extract paragraphs from each PDF file with metadata
def read_pdf_paragraphs(pdf_path, book_id):
    paragraphs = []
    pdf_document = fitz.open(pdf_path)
    
    for page_num in range(pdf_document.page_count):
        page = pdf_document.load_page(page_num)
        text_info = page.get_text("blocks", flags=1+2+8)
        
        for t in text_info:
            cleaned_text = clean_text(t[4])
            if cleaned_text:
                paragraphs.append(cleaned_text)

    pdf_document.close()
    
    paragraphs = merge_strings_until_limit(paragraphs, 200, 1000)
    paragraphs = [{'TEXT': paragraphs[i], 'LINE': i, 'BOOK': book_id} for i in range(len(paragraphs))]
    return paragraphs

In [11]:
#Download from nltk
nltk.download('words')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package words to
[nltk_data]     C:\Users\Patrick\AppData\Roaming\nltk_data...
[nltk_data]   Package words is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Patrick\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Patrick\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [12]:
#Preprocess Words
ENGLISH_WORDS = set(words.words())

def is_english_word(word):
    return (word.lower() in ENGLISH_WORDS)

In [13]:
#Lemmatizer, Stop Words, Stemmer
lemmatizer = WordNetLemmatizer()
STOP_WORDS = stopwords.words("english")
stemmer = PorterStemmer()


def processed_documents(words): 
    #Lemmatize
    lemmatized_words = [lemmatizer.lemmatize(word) for word in words]
    #Stop Words
    filtered_words = [word for word in lemmatized_words if ((word not in STOP_WORDS) and is_english_word(word))]
    #Stemmer
    stemmed_words = [stemmer.stem(word) for word in filtered_words]
    #Join
    return " ".join(stemmed_words)

In [14]:
def process_pdf_files(pdfs_folder, output_folder_txt):
    for root, dirs, files in os.walk(pdfs_folder):
        for pdf_file in files:
            if pdf_file.endswith('.pdf'):
                pdf_path = os.path.join(root, pdf_file)
                book_id = os.path.splitext(pdf_file)[0]
                paragraphs = read_pdf_paragraphs(pdf_path, book_id)
                print(f"Processing {pdf_file}...")

                # Define the path for the output text file
                output_file_path = os.path.join(output_folder_txt, f"{book_id}_processed.txt")
                
                with open(output_file_path, 'w', encoding='utf-8') as output_file:
                    for paragraph in paragraphs:
                        words = gensim.utils.simple_preprocess(paragraph['TEXT'], min_len=3, deacc=True)
                        processed_text = processed_documents(words)  # Apply full processing pipeline
                        output_file.write(f"TEXT: {paragraph['TEXT']}, LINE: {paragraph['LINE']}, BOOK: {book_id}, KEYWORD: {processed_text}\n")

In [16]:
process_pdf_files(pdfs_folder, output_folder_txt)

Processing 000 - EP Magazine Vol.2 Design Fiction December 2016 MR.pdf...
Processing 000 - fulcrum45_230512_computationalism.pdf...
Processing 000 - Interview_arte-TV_final.pdf...
Processing 000 - Morel-Teissier Mathematica IMS04 Banff.pdf...
Processing 000 - Philippe Morel DETAIL Arch&Automation October 2019 - SCAN.pdf...
Processing 000 - Philippe MOREL Supreme Achievement Scan MR.pdf...
Processing 000 - Tracés 12 SCAN LR.pdf...
Processing 001 - Philippe Morel légitimité_archi_postmoderne SCAN Original LR.pdf...
Processing 002 - Kritische Berichte 3-1999 Cover and Content Page SCAN.pdf...
Processing 003 - Notes sur la technologie SCAN Original LR.pdf...
Processing 006 - Philippe Morel Reading MVRDV LOWRES.pdf...
Processing 009 - Notes_on_computational_architecture_Optimization.pdf...
Processing 011 - Philippe_Morel_EZCT_Archilab_ENGLISH_FINAL.pdf...
Processing 014 - n extensions a Extensions_Ph-Morel_Final_Multitudes FR.pdf...
Processing 014 B - n extensions a Extensions_Ph-Morel_Fina

FileDataError: Failed to open file 'E:\\OneDrive\\Documents\\007-Study Life\\001-Urban Design\\RC11_Theory\\Philippe Morel Essays 2024\\SYLLABUS 2 History and Theory of Architecture & Art\\Sanford Kwinter_Landscapes of Change.pdf'.

In [17]:
#Import Pickle
import pickle

In [18]:
def save_to_pickle_for_pdfs(pdfs_folder, output_folder_pkl):
    for root, dirs, files in os.walk(pdfs_folder):
        for pdf_file in files:
            if pdf_file.endswith('.pdf'):
                pdf_path = os.path.join(root, pdf_file)
                book_id = os.path.splitext(pdf_file)[0]
                paragraphs = read_pdf_paragraphs(pdf_path, book_id)
                print(f"Saving {pdf_file} to pickle...")

                processed_data = []
                for paragraph in paragraphs:
                    words = gensim.utils.simple_preprocess(paragraph['TEXT'], min_len=3, deacc=True)
                    processed_text = processed_documents(words)  # Apply full processing pipeline
                    processed_data.append({'TEXT': paragraph['TEXT'], 'LINE': paragraph['LINE'], 'BOOK': book_id, 'KEYWORD': processed_text})

                # Define the path for the output pickle file
                output_file_path = os.path.join(output_folder_pkl, f"{book_id}_processed.pkl")
                
                # Save the processed data to the pickle file
                with open(output_file_path, 'wb') as output_file:
                    pickle.dump(processed_data, output_file)

In [19]:
for root, dirs, files in os.walk(pdfs_folder):
    print("Current Directory:", root)
    print("Directories:", dirs)
    print("Files:", files)
    print()

Current Directory: E:\OneDrive\Documents\007-Study Life\001-Urban Design\RC11_Theory\Philippe Morel Essays 2024
Directories: ['EXTRA Philippe Morel Essays', 'SYLLABUS 1 Epistemology of Computation CA & AI', 'SYLLABUS 10 Life Science Physics Etc', 'SYLLABUS 2 History and Theory of Architecture & Art', 'SYLLABUS 3 Economics Networks Computational Socialism', 'SYLLABUS 4 Ethics of Computation Nanotech Robotics', 'SYLLABUS 5 General Reports Global Politics and Climate', 'SYLLABUS 6 Selected Writings of Giuseppe Longo', 'SYLLABUS 7 General History & Philosophy of Math & Computation', 'SYLLABUS 8 Politics Philosophy Critique of Capitalism', 'SYLLABUS 9 General Epistemology & Philosophy of Perception']
Files: []

Current Directory: E:\OneDrive\Documents\007-Study Life\001-Urban Design\RC11_Theory\Philippe Morel Essays 2024\EXTRA Philippe Morel Essays
Directories: []
Files: ['000 - EP Magazine Vol.2 Design Fiction December 2016 MR.pdf', '000 - fulcrum45_230512_computationalism.pdf', '000 - Int

In [20]:
#Saving to Pickle
save_to_pickle_for_pdfs(pdfs_folder, output_folder_pkl)

Saving 000 - EP Magazine Vol.2 Design Fiction December 2016 MR.pdf to pickle...
Saving 000 - fulcrum45_230512_computationalism.pdf to pickle...
Saving 000 - Interview_arte-TV_final.pdf to pickle...
Saving 000 - Morel-Teissier Mathematica IMS04 Banff.pdf to pickle...
Saving 000 - Philippe Morel DETAIL Arch&Automation October 2019 - SCAN.pdf to pickle...
Saving 000 - Philippe MOREL Supreme Achievement Scan MR.pdf to pickle...
Saving 000 - Tracés 12 SCAN LR.pdf to pickle...
Saving 001 - Philippe Morel légitimité_archi_postmoderne SCAN Original LR.pdf to pickle...
Saving 002 - Kritische Berichte 3-1999 Cover and Content Page SCAN.pdf to pickle...
Saving 003 - Notes sur la technologie SCAN Original LR.pdf to pickle...
Saving 006 - Philippe Morel Reading MVRDV LOWRES.pdf to pickle...
Saving 009 - Notes_on_computational_architecture_Optimization.pdf to pickle...
Saving 011 - Philippe_Morel_EZCT_Archilab_ENGLISH_FINAL.pdf to pickle...
Saving 014 - n extensions a Extensions_Ph-Morel_Final_Multi

FileDataError: Failed to open file 'E:\\OneDrive\\Documents\\007-Study Life\\001-Urban Design\\RC11_Theory\\Philippe Morel Essays 2024\\SYLLABUS 2 History and Theory of Architecture & Art\\Sanford Kwinter_Landscapes of Change.pdf'.

In [None]:
#Read .epub files in Paragraphs
def read_epub_paragraphs(epub_file, epub_ID):
    #Read the .epub file
    book = epub.read_epub(epub_file)
    paragraphs = []
    
    #Extract and clean text from each document item in the epub
    for item in book.get_items_of_type(ebooklib.ITEM_DOCUMENT):
        content = item.get_content().decode('utf-8')
        content = re.sub(r'<[^<]+?>', '', content)  # Remove HTML tags
        content = re.sub(r'\s+', ' ', content)      # Collapse multiple spaces
        content = re.sub(r'\n', ' ', content)       # Remove newlines
        
        #Split the content by paragraph marker and add to paragraphs list
        paragraphs.extend(content.strip().split("&#13;"))
    
    #Combine strings into paragraphs within length limits
    paragraphs = merge_strings_until_limit(paragraphs, 200, 1000)
    
    #Format each paragraph with metadata
    paragraphs = [{'TEXT': paragraphs[i], 'LINE': i, 'BOOK': epub_ID} for i in range(len(paragraphs))]
    
    #Return paragraphs excluding the first and last elements
    return paragraphs[1:-1]

In [8]:
#Testing
#Build file path using os.path.join() to ensure cross-platform compatibility (Windows & MacOS)
file_path = os.path.join("epubs", "A Philosophy of Curating.epub")

#Call the function with the file path and the epub ID
paragraphs = read_epub_paragraphs(file_path, 2)
print(paragraphs[10:11])


[{'TEXT': ' It also recognizes that all this activity is not founded on a solid intellectual basis that might empower its practitioners to have the critical courage to resist demands to simply supply more and more excitement to a market ravenous for spectacle and entertainment.', 'LINE': 11, 'BOOK': 2}]


  for root_file in tree.findall('//xmlns:rootfile[@media-type]', namespaces={'xmlns': NAMESPACES['CONTAINERNS']}):


In [10]:
#Testing
paragraphs = read_epub_paragraphs(file_path, 1)
paragraphs[10:11]

[{'TEXT': ' It also recognizes that all this activity is not founded on a solid intellectual basis that might empower its practitioners to have the critical courage to resist demands to simply supply more and more excitement to a market ravenous for spectacle and entertainment.',
  'LINE': 11,
  'BOOK': 1}]

In [None]:
def process_epub_files(epub_folder):
    for epub_file in os.listdir(epub_folder):
        if epub_file.endswith('.epub'):
            #Build the file paths using os.path.join()
            epub_path = os.path.join(epub_folder, epub_file)
            epub_ID = os.path.splitext(epub_file)[0]
            paragraphs = read_epub_paragraphs(epub_path, epub_ID)    
            print(f"Processing {epub_file}...")

            #Create the output file path using os.path.join()
            output_file_path = os.path.join(output_folder_txt, f"{epub_ID}_processed.txt")
            
            with open(output_file_path, 'w', encoding='utf-8') as output_file:
                for paragraph in paragraphs:
                    #Process the text and write it to the output file
                    words = gensim.utils.simple_preprocess(paragraph['TEXT'], min_len=3, deacc=True)
                    processed_text = processed_documents(words)
                    output_file.write(f"TEXT: {paragraph['TEXT']}, LINE: {paragraph['LINE']}, BOOK: {epub_ID}, KEYWORD: {processed_text}\n")

In [None]:
process_epub_files(epub_folder)

In [None]:
def save_to_pickle(epub_folder):
    for epub_file in os.listdir(epub_folder):
        if epub_file.endswith('.epub'):
            #Build the file paths using os.path.join()
            epub_path = os.path.join(epub_folder, epub_file)
            epub_ID = os.path.splitext(epub_file)[0]
            paragraphs = read_epub_paragraphs(epub_path, epub_ID)
            print(f"Saving {epub_file} to pickle...")

            processed_data = []
            for paragraph in paragraphs:
                words = gensim.utils.simple_preprocess(paragraph['TEXT'], min_len=3, deacc=True)
                processed_text = processed_documents(words)
                processed_data.append({'TEXT': paragraph['TEXT'], 'LINE': paragraph['LINE'], 'BOOK': epub_ID, 'KEYWORD': processed_text})

            #Use os.path.join() to create the output pickle file path
            output_file_path = os.path.join(output_folder_pkl, f"{epub_ID}_processed.pkl")

            #Save the processed data to pickle file
            with open(output_file_path, 'wb') as output_file:
                pickle.dump(processed_data, output_file)

In [None]:
#Saving to Pickle
save_to_pickle(epub_folder)