**This code is designed to preprocess and analyze textual data from fanfiction using the BookNLP framework.**

The primary goal is to prepare text data for advanced natural language processing tasks such as entity recognition, coreference resolution, event detection, and quote attribution.

The code integrates BookNLP with a custom preprocessing step for its pretrained model files to enhance compatibility and performance.

**The methodology involves three key steps:**

    1. Model Preprocessing: Custom models are loaded, and unnecessary parameters like position IDs
    are removed to optimize their structure.

    2. Text Processing: The script scans a designated directory for .txt files, assigns unique identifiers to each file,
    and sets up corresponding output directories for processed data.
    
    3. NLP Analysis: Each text file is analyzed by BookNLP, with outputs saved systematically for downstream analysis.
    The framework efficiently handles multiple files and ensures compatibility with GPU acceleration
    for improved processing speed.

In [None]:
# Mount Google Drive
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
!pip install booknlp
!python -m spacy download en_core_web_sm

Collecting en-core-web-sm==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m117.3 MB/s[0m eta [36m0:00:00[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [None]:
from booknlp.booknlp import BookNLP
from pathlib import Path
import os
import torch

# Input/output locations on the mounted Google Drive.
# In this scenario, .txt files are stored directly in "fanfiction project_texts";
# BookNLP results go into a "booknlp_outputs" subfolder of that same folder.
input_root = Path("/content/drive/MyDrive/fanfiction project_texts")
output_root = Path("/content/drive/MyDrive/fanfiction project_texts/booknlp_outputs")
# Create the output folder (and any missing parents); no error if it already exists.
output_root.mkdir(parents=True, exist_ok=True)

def remove_position_ids_and_save(model_file, device, save_path):
    """Copy a saved state dict to save_path without its position-ids entry.

    The object stored in ``model_file`` is loaded with ``torch.load`` (mapped
    onto ``device``). If it contains the key
    ``'bert.embeddings.position_ids'`` that entry is dropped. The result is
    always written to ``save_path``, whether or not anything was removed.

    Args:
        model_file: Path of the saved state dict to read.
        device: Value forwarded to ``torch.load`` as ``map_location``.
        save_path: Path the (possibly trimmed) state dict is written to.
    """
    stale_key = 'bert.embeddings.position_ids'
    weights = torch.load(model_file, map_location=device)
    if stale_key in weights:
        print(f'Removing "position_ids" from the state dictionary of {model_file}')
        weights.pop(stale_key)
    torch.save(weights, save_path)
    print(f'Modified state dict saved to {save_path}')

def process_model_files(model_params, device):
    """Return a copy of model_params with model-file entries pointing at cleaned copies.

    Every value that is a string naming an existing file ending in ``.model``
    is passed through ``remove_position_ids_and_save`` and replaced by the
    path of the resulting ``*_modified.model`` file. All other entries
    (pipeline names, directories, paths that do not exist, non-string values)
    are copied unchanged.

    Args:
        model_params: Mapping of BookNLP parameter names to values.
        device: Forwarded to ``remove_position_ids_and_save`` as the load device.

    Returns:
        A new dict; the input mapping itself is not modified.
    """
    suffix = '.model'
    updated_params = {}
    for key, path in model_params.items():
        if isinstance(path, str) and path.endswith(suffix) and os.path.isfile(path):
            # Swap only the trailing extension. str.replace would also rewrite
            # any earlier ".model" in the path (e.g. a ".models/" directory),
            # yielding a save location inside a directory that may not exist.
            save_path = path[:-len(suffix)] + '_modified' + suffix
            remove_position_ids_and_save(path, device, save_path)
            updated_params[key] = save_path
        else:
            updated_params[key] = path
    return updated_params

def process_books_with_booknlp(input_root, output_root):
    """Run the BookNLP pipeline on every .txt file directly under input_root.

    Model paths are built relative to the current user's home directory and
    passed through ``process_model_files`` before BookNLP is initialised.
    Each text file gets a book ID (its file stem with spaces replaced by
    underscores) and its own ``<book_id>_BookNLP_output`` folder under
    ``output_root``. A failure on one file is printed and does not stop the
    remaining files.

    Args:
        input_root: ``Path`` of the folder that holds the .txt files.
        output_root: ``Path`` of the folder that receives per-book output folders.
    """
    home = Path.home()
    models_dir = f'{home}/booknlp_models'

    # BookNLP configuration: pipeline stages plus the custom model locations.
    model_params = dict(
        pipeline='entity,quote,supersense,event,coref',
        model='custom',
        entity_model_path=f'{models_dir}/entities_google_bert_uncased_L-6_H-768_A-12-v1.0.model',
        coref_model_path=f'{models_dir}/coref_google_bert_uncased_L-12_H-768_A-12-v1.0.model',
        quote_attribution_model_path=f'{models_dir}/speaker_google_bert_uncased_L-12_H-768_A-12-v1.0.1.model',
        bert_model_path=f'{home}/.cache/huggingface/hub/',
    )

    if torch.cuda.is_available():
        device = torch.device('cuda')
    else:
        device = torch.device('cpu')

    # Swap in the cleaned "*_modified.model" copies where the files exist.
    model_params = process_model_files(model_params, device)

    booknlp = BookNLP('en', model_params)

    # Only files sitting directly in input_root are considered (no subfolders).
    candidates = list(input_root.glob("*.txt"))
    if len(candidates) == 0:
        print(f"No TXT files found directly under {input_root}")
        return

    for text_path in candidates:
        # A directory whose name ends in ".txt" would also match the glob.
        if not text_path.is_file():
            continue

        book_id = text_path.stem.replace(' ', '_')

        # One output folder per text file.
        target_dir = output_root / f"{book_id}_BookNLP_output"
        target_dir.mkdir(parents=True, exist_ok=True)

        try:
            print(f"Processing {text_path} with BookNLP...")
            booknlp.process(text_path, target_dir, book_id)
            print(f"Processed {text_path}; output saved in {target_dir}")
        except Exception as err:
            # Best effort: report the failure and move on to the next file.
            print(f"Error processing {text_path}: {err}")

# Run BookNLP over the .txt files in input_root, writing results under output_root
process_books_with_booknlp(input_root, output_root)

# The same code with an added step that parses books in EPUB format and converts them to TXT before the texts are processed with the BookNLP pipeline (BookNLP does not work with any text format except TXT)

In [None]:
! pip install ebooklib
! pip install beautifulsoup4

import os
from pathlib import Path
from ebooklib import epub
from ebooklib import ITEM_DOCUMENT
from bs4 import BeautifulSoup

def convert_epub_to_txt(epub_path, txt_output_folder):
    """
    Extract the text of one EPUB and write it to a UTF-8 encoded TXT file.

    The text of every document item in the EPUB is pulled out of its HTML body,
    the pieces are joined with newlines, and the result is written to
    "<EPUB file stem>.txt" inside txt_output_folder (created if missing).

    Args:
        epub_path (str): Path to the EPUB file.
        txt_output_folder (str): Directory where the converted TXT file will be saved.
    """
    parsed_book = epub.read_epub(epub_path)

    # One plain-text chunk per document item; other item types are ignored.
    chunks = [
        BeautifulSoup(entry.get_body_content(), 'html.parser').get_text()
        for entry in parsed_book.get_items()
        if entry.get_type() == ITEM_DOCUMENT
    ]

    # The output folder is only created once the EPUB has been read successfully.
    os.makedirs(txt_output_folder, exist_ok=True)
    txt_file_path = os.path.join(txt_output_folder, f"{Path(epub_path).stem}.txt")

    with open(txt_file_path, 'w', encoding='utf-8') as out_file:
        out_file.write('\n'.join(chunks))

    print(f"Converted {epub_path} to {txt_file_path}")

def convert_all_epubs_to_txt(input_root, txt_output_root):
    """
    Converts all EPUB files under a root directory to TXT files.

    The root is searched recursively for "*.epub". Each match is converted with
    convert_epub_to_txt into its own subfolder of txt_output_root, named after
    the EPUB file stem. A failure on one EPUB is reported and the remaining
    files are still converted, matching the per-file error handling of the
    BookNLP processing loops in this notebook.

    Args:
        input_root (str): Root directory containing EPUB files to convert.
        txt_output_root (str): Root directory to store the converted TXT files.
    """
    # sorted(): rglob yields entries in arbitrary order; sorting makes runs
    # reproducible and easier to follow in the log.
    for epub_path in sorted(Path(input_root).rglob("*.epub")):
        # One subfolder per book, named after the EPUB file (without extension)
        book_output_folder = os.path.join(txt_output_root, epub_path.stem)
        try:
            convert_epub_to_txt(epub_path, book_output_folder)
        except Exception as e:
            # Best effort: a single unreadable EPUB should not abort the batch.
            print(f"Error converting {epub_path}: {e}")

# Driver for the EPUB -> TXT conversion step.
# NOTE: this rebinds the module-level name `input_root` (here a plain string,
# not a Path) to the novels folder; the BookNLP cells assign it again before use.
if __name__ == "__main__":
    # Input root directory containing EPUB files (searched recursively)
    input_root = '/content/drive/MyDrive/full_list_of_romantic_novels'
    # Output root directory where TXT files will be saved; it is a "txt_books"
    # subfolder of the input root itself
    txt_output_root = os.path.join(input_root, "txt_books")

    # Ensure the root output directory exists (no error if it already does)
    os.makedirs(txt_output_root, exist_ok=True)

    # Convert all EPUB files to TXT, one subfolder per book under txt_output_root
    convert_all_epubs_to_txt(input_root, txt_output_root)

In [None]:
from booknlp.booknlp import BookNLP
from pathlib import Path
import os
import torch

# Set up your input and output directories here.
# Alternative configuration (disabled): process the TXT books produced by the
# EPUB conversion step and write results to a separate "booknlp_outputs" folder.
#input_root = Path("/content/drive/MyDrive/full_list_of_romantic_novels/txt_books")
#output_root = Path("/content/drive/MyDrive/full_list_of_romantic_novels/booknlp_outputs")
#output_root.mkdir(parents=True, exist_ok=True)

# Active configuration: input and output share the same folder, so the per-book
# "*_BookNLP_output" folders are created next to the book folders themselves.
input_root = Path("/content/drive/MyDrive/fanfiction project_texts")
output_root = Path("/content/drive/MyDrive/fanfiction project_texts")
# Create the folder (and any missing parents); no error if it already exists.
output_root.mkdir(parents=True, exist_ok=True)


def remove_position_ids_and_save(model_file, device, save_path):
    """Write the state dict from model_file to save_path, minus position ids.

    Loads ``model_file`` via ``torch.load`` with ``map_location=device``,
    discards the ``'bert.embeddings.position_ids'`` entry when it is present,
    and saves whatever remains to ``save_path``. The save happens even when
    the key was absent, so ``save_path`` always exists afterwards.

    Args:
        model_file: Path of the saved state dict to read.
        device: Value forwarded to ``torch.load`` as ``map_location``.
        save_path: Destination path for the resulting state dict.
    """
    position_key = 'bert.embeddings.position_ids'
    loaded = torch.load(model_file, map_location=device)
    has_position_ids = position_key in loaded
    if has_position_ids:
        print(f'Removing "position_ids" from the state dictionary of {model_file}')
        loaded.pop(position_key)
    torch.save(loaded, save_path)
    print(f'Modified state dict saved to {save_path}')

def process_model_files(model_params, device):
    """Return a copy of model_params with model-file entries pointing at cleaned copies.

    Every value that is a string naming an existing file ending in ``.model``
    is passed through ``remove_position_ids_and_save`` and replaced by the
    path of the resulting ``*_modified.model`` file. All other entries
    (pipeline names, directories, paths that do not exist, non-string values)
    are copied unchanged.

    Args:
        model_params: Mapping of BookNLP parameter names to values.
        device: Forwarded to ``remove_position_ids_and_save`` as the load device.

    Returns:
        A new dict; the input mapping itself is not modified.
    """
    suffix = '.model'
    updated_params = {}
    for key, path in model_params.items():
        if isinstance(path, str) and path.endswith(suffix) and os.path.isfile(path):
            # Swap only the trailing extension. str.replace would also rewrite
            # any earlier ".model" in the path (e.g. a ".models/" directory),
            # yielding a save location inside a directory that may not exist.
            save_path = path[:-len(suffix)] + '_modified' + suffix
            remove_position_ids_and_save(path, device, save_path)
            updated_params[key] = save_path
        else:
            updated_params[key] = path
    return updated_params

def process_books_with_booknlp(input_root, output_root):
    """Run the BookNLP pipeline on one TXT file from each book folder.

    Every subfolder of ``input_root`` is treated as one book. The
    alphabetically first ``*.txt`` file in the folder is processed and its
    results are written to ``<folder name>_BookNLP_output`` under
    ``output_root``. A failure on one book is printed and does not stop the
    remaining books.

    Args:
        input_root: ``Path`` of the folder that holds one subfolder per book.
        output_root: ``Path`` of the folder that receives per-book output
            folders. May be the same folder as ``input_root``.
    """
    user_dir = Path.home()

    # Set up model parameters with paths to BookNLP models
    model_params = {
        'pipeline': 'entity,quote,supersense,event,coref',
        'model': 'custom',
        'entity_model_path': f'{user_dir}/booknlp_models/entities_google_bert_uncased_L-6_H-768_A-12-v1.0.model',
        'coref_model_path': f'{user_dir}/booknlp_models/coref_google_bert_uncased_L-12_H-768_A-12-v1.0.model',
        'quote_attribution_model_path': f'{user_dir}/booknlp_models/speaker_google_bert_uncased_L-12_H-768_A-12-v1.0.1.model',
        'bert_model_path': f'{user_dir}/.cache/huggingface/hub/'
    }

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model_params = process_model_files(model_params, device)  # Modify model files if needed

    # Initialize BookNLP
    booknlp = BookNLP('en', model_params)

    output_suffix = "_BookNLP_output"

    # sorted() fixes the processing order (iterdir order is arbitrary) and
    # snapshots the listing before any output folders are created.
    for book_folder in sorted(input_root.iterdir()):
        if not book_folder.is_dir():
            continue

        # When output_root is the same folder as input_root, output folders
        # from an earlier run sit next to the book folders; they are results,
        # not books, so do not treat them as input.
        if book_folder.name.endswith(output_suffix):
            continue

        # Sort so the file picked below does not depend on filesystem order.
        txt_files = sorted(book_folder.glob("*.txt"))
        if not txt_files:
            print(f"No TXT file found in {book_folder}, skipping.")
            continue

        # Only one TXT file per folder is processed; say so when others exist
        # instead of ignoring them silently.
        input_file = txt_files[0]
        if len(txt_files) > 1:
            print(f"Multiple TXT files found in {book_folder}; using {input_file.name}.")
        book_id = input_file.stem.replace(' ', '_')

        # Set up the output directory for the book
        book_output_dir = output_root / f"{book_folder.name}{output_suffix}"
        book_output_dir.mkdir(parents=True, exist_ok=True)

        # Process the book with BookNLP
        try:
            print(f"Processing {input_file} with BookNLP...")
            booknlp.process(input_file, book_output_dir, book_id)
            print(f"Processed {input_file} with BookNLP; output saved in {book_output_dir}")
        except Exception as e:
            # Best effort: report the failure and continue with the next book.
            print(f"Error processing {input_file}: {e}")

# Run BookNLP over the book folders in input_root, writing results under output_root
process_books_with_booknlp(input_root, output_root)