In [3]:
!pip install booknlp
!pip install transformers==4.30.0
!python -m spacy download en_core_web_sm

!wget https://www.gutenberg.org/files/1342/1342-0.txt -O pride_and_prejudice.txt

import re
from booknlp.booknlp import BookNLP
import json
import os
from collections import defaultdict
import multiprocessing as mp

# Preprocess function to remove unwanted sections from the text
def preprocess_text_remove_intro_ending(input_file, output_file):
    """
    Clean a raw book text file and write the result to ``output_file``.

    The steps, in order:
      1. Drop italic (underscore-delimited) spans that end in sentence
         punctuation; shorter emphasised words are kept.
      2. Drop everything enclosed in square brackets (may span lines).
      3. Drop quotes, brackets, parentheses and braces.
      4. Drop every line containing six consecutive spaces, printing each one.
      5. Cut everything before the first paragraph that looks like a single
         complete sentence of more than five words.
      6. Cut everything after the first large block of blank lines, unless
         the word "chapter" appears within 100 characters of it.
      7. Drop ``*** ... ***`` markers, normalise blank lines and spaces, and
         strip the remaining underscores and hyphens.

    Args:
        input_file: Path of the UTF-8 text file to read.
        output_file: Path the cleaned text is written to (UTF-8).

    Returns:
        ``output_file``, unchanged, so the call can be chained.
    """
    with open(input_file, 'r', encoding='utf-8') as f:
        text = f.read()

    def _drop_italic_sentence(match):
        # A span whose content ends in ., ? or ! is treated as a full italic
        # sentence and removed; anything else is an emphasised word/phrase
        # and is left in place (its underscores are stripped further down).
        if match.group(1).rstrip().endswith(('.', '?', '!')):
            return ''
        return match.group(0)

    # Consume underscores strictly in opening/closing pairs. Matching a bare
    # "_ ... [.?!] _" pattern instead lets a match begin at the *closing*
    # underscore of one word and end at the *opening* underscore of the next,
    # which deletes the ordinary text in between.
    text = re.sub(r'_([^_]*)_', _drop_italic_sentence, text)

    # Remove text enclosed in square brackets, including multiline content (e.g., [Illustration: ...])
    text = re.sub(r'\[.*?\]', '', text, flags=re.DOTALL)

    # Remove unwanted punctuation but keep sentence-ending punctuation (., !, ?)
    text = re.sub(r"[\"'“”‘’\]\[\(\){}]", '', text)

    # Print and remove formatted (non-content) lines in a single pass.
    # NOTE: the test is "six consecutive spaces anywhere in the line", so it
    # also catches column-aligned lines that do not start with spaces.
    kept_lines = []
    print("Lines being removed due to leading spaces:")
    for line in text.splitlines():
        if '      ' in line:
            print(line)
        else:
            kept_lines.append(line)
    text = '\n'.join(kept_lines)

    # Identify the start of the main content by looking for the first meaningful paragraph
    paragraphs = text.split('\n\n')
    main_content_index = -1

    for i, paragraph in enumerate(paragraphs):
        # A valid start begins with a capital letter, has its first sentence-ending
        # mark at the end of a line, and contains more than five words.
        if re.match(r'^[A-Z][^?!.]*[.?!]$', paragraph.strip(), re.MULTILINE) and len(paragraph.split()) > 5:
            main_content_index = i
            break

    if main_content_index != -1:
        # Retain only the content from the first main paragraph onward
        text = '\n\n'.join(paragraphs[main_content_index:])

    # Identify the ending marker using non-content blocks (a long run of blank lines).
    # The block is ignored when "chapter" appears nearby, so chapter breaks survive.
    match = re.search(r'(\n\s*\n\s*){3,}', text, flags=re.DOTALL)
    if match:
        surrounding_text = text[max(0, match.start() - 100):match.start() + 100].lower()
        if 'chapter' not in surrounding_text:
            text = text[:match.start()]  # Remove content after the non-content block

    # Remove specific project markers (e.g., "*** END OF THE PROJECT GUTENBERG")
    text = re.sub(r'\*\*\*.*?\*\*\*', '', text, flags=re.DOTALL)

    # Trim empty lines at the start and end of the book
    text = text.strip()

    # Clean up any extra newlines or spaces for a cleaner output
    text = re.sub(r'\n\s*\n', '\n\n', text)  # Maintain paragraphs with double newlines
    text = re.sub(r'[ ]+', ' ', text)  # Normalize spaces
    text = re.sub(r'_', '', text)  # Remove just the "_" symbols while keeping the words intact
    text = re.sub(r'-', '', text)  # Remove hyphens

    with open(output_file, 'w', encoding='utf-8') as out_f:
        out_f.write(text)

    return output_file

class SequentialCharacterTracker:
    """Run BookNLP on a book and list character mentions in reading order.

    The tracker preprocesses the text, runs the BookNLP pipeline, then reads
    the generated ``.book``, ``.entities`` and ``.tokens`` files to write a
    paragraph-by-paragraph, sentence-by-sentence list of character mentions.
    """

    def __init__(self):
        """Create the BookNLP English pipeline with the configured parameters."""
        self.model_params = {
            "pipeline": "entity,quote,supersense,event,coref",
            "model": "big"
        }
        self.booknlp = BookNLP("en", self.model_params)

    def process_book(self, input_file, output_dir, book_id):
        """Run BookNLP on ``input_file``, creating ``output_dir`` if needed."""
        os.makedirs(output_dir, exist_ok=True)
        self.booknlp.process(input_file, output_dir, book_id)

    def get_canonical_names(self, book_data):
        """Map each character id to its most frequent proper-name mention.

        Args:
            book_data: Parsed ``.book`` JSON. Each entry of
                ``book_data["characters"]`` must provide ``"id"`` and
                ``"mentions"]["proper"]``, a list of dicts with a name
                ``'n'`` and a count ``'c'``.

        Returns:
            Dict of character id -> name. Characters without any proper
            mention are omitted.
        """
        canonical_names = {}
        for character in book_data["characters"]:
            proper_mentions = character["mentions"]["proper"]
            if proper_mentions:
                main_name = max(proper_mentions, key=lambda x: x['c'])['n']
                canonical_names[character["id"]] = main_name
        return canonical_names

    def track_sequential_mentions(self, input_file, output_dir, book_id):
        """Preprocess the book, run BookNLP and write ``sequential_mentions.txt``.

        Args:
            input_file: Path of the raw book text.
            output_dir: Directory for all BookNLP output and the final
                report; a trailing separator is optional.
            book_id: Base name BookNLP uses for its output files.
        """
        # Put the preprocessed copy next to the input. Prefixing the whole
        # path string would produce "preprocessed_dir/file.txt" for an input
        # that lives in a directory.
        input_dir, input_name = os.path.split(input_file)
        preprocessed_path = os.path.join(input_dir, f"preprocessed_{input_name}")
        preprocessed_file = preprocess_text_remove_intro_ending(input_file, preprocessed_path)

        # Process the preprocessed book
        self.process_book(preprocessed_file, output_dir, book_id)

        # os.path.join works whether or not output_dir ends with a separator.
        with open(os.path.join(output_dir, f"{book_id}.book"), 'r', encoding='utf-8') as f:
            book_data = json.load(f)

        canonical_names = self.get_canonical_names(book_data)
        entities_file = os.path.join(output_dir, f"{book_id}.entities")
        tokens_file = os.path.join(output_dir, f"{book_id}.tokens")

        chronological_mentions = self.process_entities(entities_file, tokens_file, canonical_names)
        self.write_output(
            chronological_mentions,
            os.path.join(output_dir, "sequential_mentions.txt"),
        )

    def process_entities(self, entities_file, tokens_file, canonical_names):
        """Group character mentions by paragraph and sentence.

        Both files are read as tab-separated text with one header row. An
        entity row needs at least six fields: field 0 is the coreference id,
        field 1 the start-token index and field 5 the mention text. The
        start-token index is used as a row index into the tokens file, whose
        fields 0 and 1 are read as paragraph and sentence numbers.

        Args:
            entities_file: Path of the ``.entities`` file.
            tokens_file: Path of the ``.tokens`` file.
            canonical_names: Character id -> name; other ids are skipped.

        Returns:
            Nested mapping ``paragraph -> sentence -> list`` of
            ``{'character': name, 'mention': text}`` dicts, in file order.
            Empty when either file has no data rows.
        """
        chronological_mentions = defaultdict(lambda: defaultdict(list))

        with open(entities_file, 'r', encoding='utf-8') as entities_f, \
             open(tokens_file, 'r', encoding='utf-8') as tokens_f:
            # Skip the header rows; the default keeps an empty file from
            # raising StopIteration.
            next(entities_f, None)
            next(tokens_f, None)

            # Token rows are looked up by index, so they are held in memory.
            tokens_lines = tokens_f.readlines()

            for entity_line in entities_f:
                parts = entity_line.strip().split('\t')
                if len(parts) < 6:
                    continue

                coref_id = int(parts[0])
                start_token = int(parts[1])
                mention_text = parts[5]

                if coref_id not in canonical_names:
                    continue

                token_parts = tokens_lines[start_token].strip().split('\t')
                paragraph_num = int(token_parts[0])
                sentence_num = int(token_parts[1])

                chronological_mentions[paragraph_num][sentence_num].append({
                    'character': canonical_names[coref_id],
                    'mention': mention_text
                })

        return chronological_mentions

    def write_output(self, chronological_mentions, output_file):
        """Write the mentions to ``output_file``, sorted by paragraph then sentence.

        A mention identical to its character's name is written as-is; any
        other mention is followed by the character name in square brackets.
        """
        with open(output_file, 'w', encoding='utf-8') as f:
            for paragraph, sentences in sorted(chronological_mentions.items()):
                f.write(f"\n[Paragraph {paragraph}]\n")
                for sentence, mentions in sorted(sentences.items()):
                    f.write(f"[Sentence {sentence}]: ")
                    for mention in mentions:
                        if mention['mention'] == mention['character']:
                            f.write(f"{mention['mention']}, ")
                        else:
                            f.write(f"{mention['mention']}[{mention['character']}], ")
                    f.write("\n")

if __name__ == "__main__":
    # Run the full pipeline on the Pride and Prejudice text downloaded above.
    run_config = {
        "input_file": "pride_and_prejudice.txt",
        "output_dir": "pride_and_prejudice/",
        "book_id": "pride_and_prejudice",
    }
    SequentialCharacterTracker().track_sequential_mentions(**run_config)


Collecting en-core-web-sm==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m98.7 MB/s[0m eta [36m0:00:00[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.
--2024-11-11 09:55:47--  https://www.gutenberg.org/files/1342/1342-0.txt
Resolving www.gutenberg.org (www.gutenberg.org)... 152.19.134.47, 2610:28:3090:3000:0:bad:cafe:47
Connecting to www.gutenberg.org (www.gutenberg.org)|152.19.134.47|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 752575 (735K) [text/plain]
Sav



--- startup: 8.681 seconds ---
Lines being removed due to leading spaces:
                            
                            
                                PRIDE.
                                  and
                               PREJUDICE
                                  by
                             Jane Austen,
                           with a Preface by
                           George Saintsbury
                                  and
                           Illustrations by
                             Hugh Thomson
                         
                       Ruskin       156. Charing
                       House.        Cross Road.
                                London
                             George Allen.
             CHISWICK PRESS:--CHARLES WHITTINGHAM AND CO.
                  TOOKS COURT, CHANCERY LANE, LONDON.
                            
                                                                    PAGE
Frontispiece                         