In [512]:
import os
import re
import json
import nltk
import sys

In [513]:
os.environ['KMP_DUPLICATE_LIB_OK']='True'

In [514]:
import spacy

In [515]:
def load_book(book_path):
    """
    Function to load a book from a text file
    :param book_path: Path to the book text file
    :return: The book text as a string 
    """
    
    with open(book_path, 'r', encoding='utf-8') as book_file:
        book_text = book_file.read()
    return book_text

In [614]:
CHAPTERS = ["Chapter", "CHAPTER", "ACT"]

def split_book_by_chapter(cleaned_text, spacy_model):
    """
    Split the book into chapters
    :param cleaned_text: the text of the book with the headers and footers removed
    :return: a list of chapters
    """
    for title in CHAPTERS:
        cleaned_text = cleaned_text.replace(title, "Chapter")
    chapters = re.split(r'\bChapter\b', cleaned_text)
    add_chapter = []

    for c in chapters:
        new_chapter = "Chapter " + c[1:]
        add_chapter.append(new_chapter)
    
    chapters = [spacy_model(chapter) for chapter in add_chapter]

    return chapters

In [615]:
def perform_ner(chapters):
    """
    Function to perform named entity recognition on a text
    :param text: The text to perform NER on
    :param spacy_model: The spaCy model to use for NER
    :return: A list of named entities
    """
    
    entities = []
    for chapter in chapters:
        for ent in chapter.ents:
            if ent.label_ == "PERSON":
                entities.append(ent.text)

    return entities

In [646]:
def count_entities(entities:list)->list:
    """
    Counts the number of times each entity appears in the text
    :param text: 
    :param words: list of words to count
    :return: dictionary with words as keys and number of times they appear in the text as values
    """
    set_entities = set(entities)
    entities_tuples = {}

    for word in set_entities:
        count = entities.count(word)
        if count > 5:
            entities_tuples[word] = count

    return entities_tuples

In [617]:
def extract_chapter_numbers(text):
    match = re.search(r'\b([IVXLCDM]+|\d+(\.\d+)?)\.', text)
    if match:
        return match.group(1)
    return None

In [618]:
def extract_character_lines_with_chapters(chapters, characters):
    current_chapter = None
    character_lines = {}

    for chapter in chapters:
        for line in chapter.sents:
            line = line.text.strip()
            if line.startswith("Chapter "):
                current_chapter = extract_chapter_numbers(line)
                continue

            for character in characters:
                if character in line:
                    # Check if the character already has an entry for the current sentence
                    existing_entry = next((entry for entry in character_lines.get(character, []) if entry[1] == line), None)

                    if existing_entry is None:
                        if character not in character_lines:
                            character_lines[character] = []
                        character_lines[character].append((current_chapter, line.replace("\n", "")))
    return character_lines

In [626]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import math

def extract_features(character_sentences: dict) -> list:
    """
    Calculates the TF-IDF score for each entity
    :param character_sentences: dictionary containing character sentences
    :return: list of TF-IDF vectors
    """
    # Tokenize and preprocess the sentences
    stop_words = set(stopwords.words('english'))

    def preprocess_sentence(sentence):
        tokens = re.sub(r'[^\w\s]', '', sentence)
        tokens = word_tokenize(tokens.lower())
        tokens = [word for word in tokens if word.isalpha()]
        tokens = [word for word in tokens if word not in stop_words]
        return tokens

    document_frequency = {}
    total_documents = len(character_sentences)

    # Count document frequency for each term
    for _, sentences in character_sentences.items():
        term_set = set()
        for _, sent in sentences:
            terms = preprocess_sentence(sent)
            term_set.update(terms)
        for term in term_set:
            document_frequency[term] = document_frequency.get(term, 0) + 1

    # Calculate TF-IDF score for each term
    tfidf_vectors = []
    term_list = list(document_frequency.keys())

    for _, sentences in character_sentences.items():
        term_frequency = {}
        terms_in_character = 0

        for _, sentence in sentences:
            terms = preprocess_sentence(sentence)
            terms_in_character += len(terms)

            for term in terms:
                term_frequency[term] = term_frequency.get(term, 0) + 1

        tfidf_vector = [term_frequency.get(term, 0) / terms_in_character * math.log(total_documents / (1 + document_frequency[term]))
                        for term in term_list]
        tfidf_vectors.append(tfidf_vector)

    return tfidf_vectors, term_list


def clustering_aliases(entities:list, character_sentences:dict):
    # Perform the TF-IDF feature extraction
    features, term_list = extract_features(character_sentences)

    # Create a KMeansClusterer object with the number of clusters, cosine distance, and 10 repeats
    n_clusters = len(set(entities))
    kclusterer = nltk.cluster.KMeansClusterer(n_clusters, distance=nltk.cluster.util.cosine_distance, repeats=10)

    # Cluster the feature vectors and assign a cluster label to each entity
    clusters = kclusterer.cluster(features, assign_clusters=True)

    # Create a dictionary entities as keys and all their aliases as values
    cluster_dict = {character: [] for character in entities}

    # Create a dictionary with the cluster label as key and the entities in the cluster as values
    cluster_dict = {}
    for i, cluster in enumerate(clusters):
        if cluster not in cluster_dict:
            cluster_dict[cluster] = []
        cluster_dict[cluster].append(entities[i])

    # Convert the dictionary values to lists when keys are equal
    result_dict = {}
    for key, value in cluster_dict.items():
        if len(value) > 1:
            result_dict[key] = value
        else:
            result_dict[key] = value[0]

    return result_dict


In [529]:
nlp = spacy.load("en_core_web_lg")

In [659]:
import json

def find_start_end_indices(substring, full_text):
    start_index = full_text.find(substring)
    end_index = start_index + len(substring)
    return start_index, end_index

def create_json_structure(characters_frequency, characters_aliases, characters_occurrences):
    get_aliases ={}
    for name in characters_occurrences.keys():
        for _, value in characters_aliases.items():
            if name in value:
                get_aliases[name] = value

    json_structure = {"main_characters": []}

    for character in characters_frequency:
        aliases = get_aliases.get(character, [])
        occurrences = characters_occurrences.get(character, [])
        frequency = characters_frequency.get(character, 0)       

        character_data = {
            "name": character,
            "aliases": aliases,
            "frequency": frequency,
            "occurrences": []
        }

        for chapter, context in occurrences:
            # Assuming you have start and end indices available
            start_index, end_index = find_start_end_indices(character, context)

            occurrence_data = {
                "sentence": context,
                "chapter": chapter,
                "position": {"start": start_index, "end": end_index}
            }

            character_data["occurrences"].append(occurrence_data)

        json_structure["main_characters"].append(character_data)

    return json_structure


In [630]:
def write_as_json(book_path, resulting_json, directory_path):
    """
    Write the data to a JSON file.
    """
    # Get the book title from the file path
    book_title = os.path.basename(book_path).replace("_clean.txt", "")
    result = json.dumps(resulting_json, ensure_ascii=False, indent=2)

    # Create the output file path
    file_name = f"{book_title}_MainCharacters_NER.json"
    output_file_path = os.path.join(directory_path, file_name)

    with open(output_file_path, "w", encoding="utf-8") as json_file:
        json_file.write(result)

In [660]:

book_path = "/Users/dariastetsenko/Desktop/pcl1/Programming-for-Linguists-Project/Alice_in_wonderland/Alice_in_wonderland_clean.txt"

book_text = load_book(book_path)
chapters = [chapter for chapter in split_book_by_chapter(book_text, nlp) if len(chapter) > 1000]
# book_cleaned = "".join(chapters)

# Characters in the book
extract_entities = perform_ner(chapters)
# # # Characters frequency in the book
characters_frequency = count_entities(extract_entities)
# # # Get the most frequent characters
# main_characters_frequencies = sorted(characters_frequency.items(), key=lambda x: x[1], reverse=True)[:5]
main_characters = list(characters_frequency.keys())

# # # # Characters with their lines and chapters
characters_occurrences = extract_character_lines_with_chapters(chapters, main_characters)
# # # Characters with their aliases
characters_aliases = clustering_aliases(main_characters, characters_occurrences)

resulting_json = create_json_structure(characters_frequency, characters_aliases, characters_occurrences)


In [661]:
resulting_json

{'main_characters': [{'name': 'Hatter',
   'aliases': 'Hatter',
   'frequency': 55,
   'occurrences': [{'sentence': '“What sort of people live about here?”“In _that_ direction,” the Cat said, waving its right paw round, “livesa Hatter: and in _that_ direction,” waving the other paw, “lives aMarch Hare.',
     'chapter': 'VI',
     'position': {'start': 111, 'end': 117}},
    {'sentence': 'I almost wish I’d gone to see the Hatter instead!”',
     'chapter': 'VI',
     'position': {'start': 34, 'end': 40}},
    {'sentence': 'A Mad Tea-PartyThere was a table set out under a tree in front of the house, and theMarch Hare and the Hatter were having tea at it: a Dormouse was sittingbetween them, fast asleep, and the other two were using it as acushion, resting their elbows on it, and talking over its head.',
     'chapter': 'VII',
     'position': {'start': 103, 'end': 109}},
    {'sentence': '“Your hair wants cutting,” said the Hatter.',
     'chapter': 'VII',
     'position': {'start': 36, 