In [1]:
import os 

In [3]:
# Vocabularies used for counting. Every list is lower-cased at definition time
# so its entries can match the lower-cased tokens produced by read_files.

# --- Named entities (characters, places, roles) per author -----------------
NER_CARROLL = list(map(str.lower, [
    "Alice", "Wonderland", "Rabbit", "Hatter", "Cat",
    "Queen", "King", "Turtle", "Duchess", "Caterpillar",
]))

NER_SHAKESPEAR = list(map(str.lower, [
    "BENEDICK", "LEONATO", "ANTONIO", "BALTHASAR",
    "BORACHIO", "CONRADE", "DOGBERRY", "VERGES",
    "FRIAR", "FRANCIS", "Sexton", "Boy",
    "HERO", "BEATRICE", "MARGARET", "URSULA",
]))

NER_WILDE = list(map(str.lower, [
    "John", "Worthing", "Algernon", "Moncrieff",
    "Chasuble", "Canon", "Merriman", "Butler",
    "Lane", "Manservant", "Bracknell", "Gwendolen",
    "Fairfax", "Cardew", "Cecily", "Prism", "Governess",
]))

# --- Sentiment-bearing words per author ------------------------------------
SENT_CARROL = list(map(str.lower, [
    "joy", "happy", "amusing", "loving", "funny", "dear",
    "rude", "angrily", "angry", "dull", "cry", "sadly", "sad",
]))

SENT_SHAKESPEAR = list(map(str.lower, [
    "joy", "love", "loving", "happy", "modest", "kindness",
    "mock", "hate", "torture", "death", "bastard",
]))

SENT_WILDE = list(map(str.lower, [
    "love", "loving", "interesting", "darling", "attractive",
    "unpleasant", "hate", "vulgar", "terrible", "painful",
]))

In [9]:
def count_words(text:list, words:list)->dict:
    """
    Counts the number of times each word appears in the text
    :param text: list of words (tokens) to scan
    :param words: list of words to count; matching is exact, so the casing of
                  these words must agree with the casing of the tokens
    :return: dictionary with words as keys and number of times they appear in the text as values;
             words that never occur are omitted and keys follow the order of
             first occurrence in the text
    """

    # Build the lookup once: set membership is O(1) per token, whereas testing
    # against the list costs O(len(words)) per token. It also makes the
    # function safe for one-shot iterables passed as `words`.
    vocabulary = frozenset(words)

    words_count_per_text = {}

    for word in text:
        if word in vocabulary:
            words_count_per_text[word] = words_count_per_text.get(word, 0) + 1

    return words_count_per_text

In [13]:
# Read every .txt file in the directory and return a dict mapping text number -> list of tokens
import re
def read_files(directory)->dict:
    """
    Reads all the ``.txt`` files in the directory and tokenises each of them.

    Every text is lower-cased, stripped of each character that is neither a
    word character nor whitespace, and split on whitespace.
    :param directory: directory with the texts
    :return: dictionary mapping the number found in each file name to the list
             of tokens of that file, ordered by ascending number
    :raises ValueError: if the name of a ``.txt`` file contains no digits
    """

    texts_vocab = {}
    number_pattern = re.compile(r'\d+')

    for filename in os.listdir(directory):
        if not filename.endswith(".txt"):
            continue

        # All digit groups of the name are concatenated ("part1_ch2.txt" -> 12).
        digits = "".join(number_pattern.findall(filename))
        if not digits:
            raise ValueError(
                f"Cannot determine the text number: no digits in file name {filename!r}"
            )

        # Explicit encoding so the result does not depend on the platform default.
        with open(os.path.join(directory, filename), 'r', encoding="utf-8") as f:
            text = f.read().lower()

        texts_vocab[int(digits)] = re.sub(r'[^\w\s]', '', text).split()

    return {key: texts_vocab[key] for key in sorted(texts_vocab)}

In [45]:
# Root of the local project checkout. Kept in a single constant so the corpus
# location can be changed with one edit instead of three.
PROJECT_DIR = "/Users/dariastetsenko/Desktop/pcl1/Programming-for-Linguists-Project"

# Tokenised texts of each work, keyed by the number found in the file name.
alice = read_files(f"{PROJECT_DIR}/Alice_in_wonderland/Sentiment")
ado = read_files(f"{PROJECT_DIR}/Much_ado_about_nothing/Sentiment")
earnest = read_files(f"{PROJECT_DIR}/Importance_of_being_earnest/Sentiment")

In [186]:
import json
def json_conversion(data, count_words=count_words, words=NER_CARROLL):
    """
    Build one JSON string per chapter holding that chapter's word counts.

    :param data: mapping of chapter key -> chapter content handed to ``count_words``
    :param count_words: callable invoked as ``count_words(content, words)``
    :param words: words to count (defaults to NER_CARROLL)
    :return: dictionary mapping each chapter key to a JSON string (2-space
             indent) of the form ``{"Chapter <key>": <counts>}``
    """
    return {
        chapter: json.dumps(
            {f"Chapter {chapter}": count_words(content, words)},
            indent=2,
        )
        for chapter, content in data.items()
    }

In [216]:
def write_as_json(data, file_path, write=False):
    """
    Derive one output file name per chapter and, on request, write the
    chapter's JSON string to it.

    File names have the form ``<book_title>_Chapter<chapter>_Entities.json``,
    where ``book_title`` is the base name of ``file_path`` without its
    extension. The names are relative, so files are created in the current
    working directory.

    :param data: mapping of chapter key -> JSON string
    :param file_path: path whose base name (minus extension) is used as the book title
    :param write: when True, each JSON string is written to its file. Defaults
                  to False, in which case only the book title is printed and no
                  file is created (the behaviour this function had while the
                  write was commented out).
    :return: None
    """
    # Get the book title from the file path
    book_title = os.path.splitext(os.path.basename(file_path))[0]
    print(book_title)

    for chapter, json_string in data.items():
        output_file_path = f"{book_title}_Chapter{chapter}_Entities.json"
        if write:
            with open(output_file_path, "w", encoding="utf-8") as f:
                f.write(json_string)

In [50]:
# Show, chapter by chapter, how often each Shakespeare sentiment word occurs in `ado`.
for key, value in ado.items():
    sentiment_counts = count_words(value, SENT_SHAKESPEAR)
    print(f"Chapter{key}", sentiment_counts)

Chapter1 {'joy': 4, 'modest': 2, 'kindness': 1, 'happy': 1, 'love': 12, 'loving': 1, 'mock': 2, 'death': 1}
Chapter2 {'modest': 2, 'love': 27, 'joy': 4, 'happy': 2, 'death': 1, 'loving': 2}
Chapter3 {'love': 20, 'mock': 1, 'death': 2, 'loving': 2, 'kindness': 1, 'joy': 1}
Chapter4 {'modest': 1, 'love': 12, 'death': 3, 'hate': 1, 'torture': 1, 'bastard': 1, 'happy': 1}
Chapter5 {'joy': 1, 'death': 10, 'hate': 1, 'love': 19, 'bastard': 1, 'loving': 1}


In [20]:
# plot chapters against the number of times each word appears in the chapter
import matplotlib.pyplot as plt
def plot_chapters(data, words=NER_CARROLL):
    """
    Plot the number of times each word appears in each chapter.

    One subplot is drawn per word, stacked vertically in a single figure.

    :param data: mapping of chapter -> {word: count}
    :param words: words to plot, one subplot each (defaults to NER_CARROLL)
    :return: None; the figure is displayed with ``plt.show()``
    """
    # Use the chapter keys as x values so the "Chapter" axis shows the actual
    # chapters rather than 0-based positions.
    chapters = list(data.keys())

    # squeeze=False always returns a 2-D array of Axes, so indexing also works
    # when only a single word is requested.
    fig, axes = plt.subplots(nrows=len(words), ncols=1, figsize=(10, 10), squeeze=False)

    for ax, word in zip(axes[:, 0], words):
        # Count dictionaries built by count_words omit words that never occur
        # in a chapter, so a missing word is plotted as 0 instead of raising KeyError.
        word_data = [chapter_data.get(word, 0) for chapter_data in data.values()]

        ax.plot(chapters, word_data)
        ax.set_title(word)
        ax.set_xlabel("Chapter")
        ax.set_ylabel("Number of times")

    # Adjust the spacing between subplots
    fig.tight_layout()

    # Show the plot
    plt.show()

In [51]:
# Add up the per-chapter Shakespeare sentiment-word counts of `ado` into one overall tally.
total = {}
for key, value in ado.items():
    counts = count_words(value, SENT_SHAKESPEAR)
    for k, v in counts.items():
        total[k] = total.get(k, 0) + v
print(total)

{'joy': 10, 'modest': 5, 'kindness': 2, 'happy': 4, 'love': 90, 'loving': 6, 'mock': 3, 'death': 17, 'hate': 2, 'torture': 1, 'bastard': 2}
