In [1]:
!pip install booknlp
!pip install transformers==4.30.0
!python -m spacy download en_core_web_sm

!wget https://www.gutenberg.org/files/1342/1342-0.txt -O pride_and_prejudice.txt


from booknlp.booknlp import BookNLP
import json
import os

from collections import defaultdict
import multiprocessing as mp

class SequentialCharacterTracker:
    """Track character mentions through a book in reading order using BookNLP.

    The tracker runs BookNLP over a text file, maps every character id that has
    at least one proper-name mention to a single canonical name, and writes the
    mentions grouped by paragraph and sentence to a plain-text report.
    """

    def __init__(self, pipeline="entity,quote,supersense,event,coref", model="big"):
        """Build the BookNLP English pipeline.

        Args:
            pipeline: Comma-separated list of BookNLP stages, passed through
                unchanged as the "pipeline" model parameter.
            model: Passed through unchanged as the "model" parameter
                (defaults to "big", the value this notebook was run with).
        """
        self.model_params = {
            "pipeline": pipeline,
            "model": model,
        }
        self.booknlp = BookNLP("en", self.model_params)

    def process_book(self, input_file, output_dir, book_id):
        """Run BookNLP on ``input_file``, creating ``output_dir`` if needed."""
        os.makedirs(output_dir, exist_ok=True)
        self.booknlp.process(input_file, output_dir, book_id)

    def get_canonical_names(self, book_data):
        """Map character ids to their canonical proper name.

        Args:
            book_data: Parsed ``.book`` JSON. Each entry of
                ``book_data["characters"]`` must provide ``"id"`` and
                ``"mentions"["proper"]``, a list of dicts with keys ``"n"``
                and ``"c"``.

        Returns:
            dict mapping character id -> the ``"n"`` value of the proper
            mention with the largest ``"c"`` value. Characters without any
            proper mention are omitted.
        """
        canonical_names = {}
        for character in book_data["characters"]:
            proper_mentions = character["mentions"]["proper"]
            if proper_mentions:
                # On ties max() keeps the first entry, same as before.
                best = max(proper_mentions, key=lambda mention: mention["c"])
                canonical_names[character["id"]] = best["n"]
        return canonical_names

    def track_sequential_mentions(self, input_file, output_dir, book_id):
        """Process a book and write ``sequential_mentions.txt`` to ``output_dir``.

        Args:
            input_file: Path of the plain-text book.
            output_dir: Directory for BookNLP's output and the report. A
                trailing path separator is accepted but no longer required.
            book_id: Base name BookNLP uses for its output files.
        """
        self.process_book(input_file, output_dir, book_id)

        # os.path.join keeps the paths inside output_dir whether or not the
        # caller supplied a trailing separator.
        book_file = os.path.join(output_dir, f"{book_id}.book")
        with open(book_file, 'r', encoding='utf-8') as f:
            book_data = json.load(f)

        canonical_names = self.get_canonical_names(book_data)

        entities_file = os.path.join(output_dir, f"{book_id}.entities")
        tokens_file = os.path.join(output_dir, f"{book_id}.tokens")

        chronological_mentions = self.process_entities(
            entities_file, tokens_file, canonical_names
        )

        self.write_output(
            chronological_mentions,
            os.path.join(output_dir, "sequential_mentions.txt"),
        )

    def process_entities(self, entities_file, tokens_file, canonical_names):
        """Group entity mentions of known characters by paragraph and sentence.

        Both files are tab-separated with one header line. An entity row is
        used only if it has at least six columns: column 0 is the character
        id, column 1 the start-token index and column 5 the mention text.
        The start-token index is used as a 0-based row index into the tokens
        file, whose columns 0 and 1 give the paragraph and sentence numbers.

        Args:
            entities_file: Path to the ``.entities`` file.
            tokens_file: Path to the ``.tokens`` file.
            canonical_names: Mapping of character id -> canonical name; rows
                whose id is not in this mapping are ignored.

        Returns:
            Nested defaultdict ``{paragraph: {sentence: [mention, ...]}}``
            where each mention is ``{"character": ..., "mention": ...}`` in
            file order.

        Raises:
            IndexError: If a used entity row points outside the tokens file.
        """
        chronological_mentions = defaultdict(lambda: defaultdict(list))

        with open(entities_file, 'r', encoding='utf-8') as entities_f, \
             open(tokens_file, 'r', encoding='utf-8') as tokens_f:
            # Skip the headers; the default avoids StopIteration on empty files.
            next(entities_f, None)
            next(tokens_f, None)

            # Tokens need random access; entities are streamed line by line.
            tokens_lines = tokens_f.readlines()

            for entity_line in entities_f:
                parts = entity_line.strip().split('\t')
                if len(parts) < 6:
                    continue

                coref_id = int(parts[0])
                start_token = int(parts[1])
                mention_text = parts[5]

                if coref_id not in canonical_names:
                    continue

                # Reject negative indices explicitly: Python would otherwise
                # wrap around and silently pick a token from the end.
                if not 0 <= start_token < len(tokens_lines):
                    raise IndexError(
                        f"start_token {start_token} is outside the tokens file "
                        f"({len(tokens_lines)} rows)"
                    )

                token_parts = tokens_lines[start_token].strip().split('\t')
                paragraph_num = int(token_parts[0])
                sentence_num = int(token_parts[1])

                chronological_mentions[paragraph_num][sentence_num].append({
                    'character': canonical_names[coref_id],
                    'mention': mention_text
                })

        return chronological_mentions

    @staticmethod
    def _format_mention(mention):
        """Render one mention; the canonical name is appended only if it differs."""
        if mention['mention'] == mention['character']:
            return f"{mention['mention']}, "
        return f"{mention['mention']}[{mention['character']}], "

    def write_output(self, chronological_mentions, output_file):
        """Write the grouped mentions to ``output_file`` in ascending order.

        Each paragraph starts with a blank line and a ``[Paragraph N]`` header,
        followed by one ``[Sentence M]: ...`` line per sentence. Every mention
        is followed by ", " (including the last one), matching the format this
        method has always produced.
        """
        with open(output_file, 'w', encoding='utf-8') as f:
            for paragraph, sentences in sorted(chronological_mentions.items()):
                f.write(f"\n[Paragraph {paragraph}]\n")
                for sentence, mentions in sorted(sentences.items()):
                    f.write(f"[Sentence {sentence}]: ")
                    f.write("".join(self._format_mention(m) for m in mentions))
                    f.write("\n")

if __name__ == "__main__":
    # Run the whole pipeline on the downloaded Pride and Prejudice text;
    # BookNLP's files and the mention report land in pride_and_prejudice/.
    tracker = SequentialCharacterTracker()
    tracker.track_sequential_mentions(
        "pride_and_prejudice.txt",
        "pride_and_prejudice/",
        "pride_and_prejudice",
    )

Collecting booknlp
  Downloading booknlp-1.0.8-py3-none-any.whl.metadata (345 bytes)
Downloading booknlp-1.0.8-py3-none-any.whl (2.4 MB)
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 2.4/2.4 MB 20.5 MB/s eta 0:00:00
Installing collected packages: booknlp
Successfully installed booknlp-1.0.8
Collecting transformers==4.30.0
  Downloading transformers-4.30.0-py3-none-any.whl.metadata (113 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 113.6/113.6 kB 6.4 MB/s eta 0:00:00
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers==4.30.0)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.7 kB)
Downloading transformers-4.30.0-py3-none-any.whl (7.2 MB)
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 7.2/7.2 MB 98.6 MB/s eta 0:00:00
Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/384 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/270M [00:00<?, ?B/s]

  return torch.load(checkpoint_file, map_location="cpu")
  self.model.load_state_dict(torch.load(model_file, map_location=device))


vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/385 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/440M [00:00<?, ?B/s]

  self.model.load_state_dict(torch.load(modelFile, map_location=device))
  self.model.load_state_dict(torch.load(modelFile, map_location=device))


--- startup: 97.297 seconds ---
--- spacy: 21.492 seconds ---
--- entities: 129.385 seconds ---
--- quotes: 0.270 seconds ---
--- attribution: 48.895 seconds ---
--- name coref: 0.535 seconds ---
--- coref: 53.557 seconds ---
--- TOTAL (excl. startup): 254.890 seconds ---, 152569 words
