In [1]:
import numpy as np
import pandas as pd
import re

import nltk
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.corpus import stopwords
from collections import Counter

# nltk.download('stopwords')
# nltk.download("wordnet")
# nltk.download("punkt")
# nltk.download("averaged_perceptron_tagger_eng")


### Preprocessing

In [40]:
file_name = '11-0.txt'
with open(file_name, 'r') as file:
    content = file.read()
    #print(content)

text = content.splitlines()[31:3380]
text = '\n'.join(text)

In [3]:
def clean_text(text):
    '''
    preprocessing: 
    - converting to lower case
    - removing stop words and special symbols 
    - lemmatization
    '''
    lemmatizer = WordNetLemmatizer()
    stop_words = stopwords.words("english")

    text = text.replace("\r", " ").replace("\n", " ").replace("\t", " ")
    text = re.sub(r"\s+", " ", text).strip()

    text = re.sub(r"[^\w\s]|_", "", text, flags=re.UNICODE)
    text = text.lower()
    text = [lemmatizer.lemmatize(token) for token in text.split(" ")]
    text = [word for word in text if not word in stop_words]
    text = " ".join(text)
    text = text.strip()
    return text

In [4]:
def split_by_chapter(chapter_titles: list, text):
    '''
    splitting the text into a list in chapters
    '''
    current_text = text

    chapters = []
    for title in chapter_titles[::-1]: 
        current_text, current_chapter = current_text.split(title, 1)
        chapters.append(current_chapter)
        
    return chapters[::-1]

chapter_titles = ['CHAPTER ' + i for i in ['I', 'II', 'III', 'IV', 'V', 'VI', 'VII', 'VIII', 'IX', 'X', 'XI', 'XII']]

chapter_content = split_by_chapter(chapter_titles, text)
chapter_content = [clean_text(chapter) for chapter in chapter_content]

### Top-10 most important words with TF-IDF
#### And Title as top-4 words

In [5]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer_tftidf = TfidfVectorizer(max_features=5000, ngram_range=(1, 1))
vectorizer_tftidf.fit(chapter_content)

0,1,2
,input,'content'
,encoding,'utf-8'
,decode_error,'strict'
,strip_accents,
,lowercase,True
,preprocessor,
,tokenizer,
,analyzer,'word'
,stop_words,
,token_pattern,'(?u)\\b\\w\\w+\\b'


In [6]:
X = vectorizer_tftidf.transform(chapter_content)
feature_names = vectorizer_tftidf.get_feature_names_out()

def top_n_words(row_sparse, feature_names, n=10, exclude_set=set()):
    row = row_sparse.toarray().ravel()
    
    sorted_idx = np.argsort(row)[::-1]
    top = []
    for i in sorted_idx:
        token = feature_names[i]
        if token in exclude_set:
            continue
        top.append((token, row[i]))
        if len(top) >= n:
            break
    return top

exclude = {'alice', 'wa'}
for i, chapter in enumerate(chapter_content):
    top = top_n_words(X[i], feature_names, n=10, exclude_set=exclude)
    print(f"Chapter {i+1} top words:", top)
    
    title_candidate = " ".join([w for w,_ in top[:4]])
    print("Candidate title:", title_candidate)
    print("---")

Chapter 1 top words: [('little', 0.1422412528086238), ('bat', 0.1403259498054104), ('door', 0.12678029566427484), ('key', 0.12395784644696138), ('eat', 0.11770301721262655), ('think', 0.10431025205965744), ('way', 0.10431025205965744), ('like', 0.10431025205965744), ('either', 0.10088830046796561), ('see', 0.09482750187241586)]
Candidate title: little bat door key
---
Chapter 2 top words: [('mouse', 0.2767937008398083), ('pool', 0.1700432602543522), ('little', 0.1658555065612759), ('im', 0.14801826489122372), ('swam', 0.140089471998173), ('cat', 0.13839685041990415), ('dear', 0.1354747476487759), ('said', 0.11707447521972417), ('foot', 0.11386020376247978), ('mabel', 0.11207157759853842)]
Candidate title: mouse pool little im
---
Chapter 3 top words: [('mouse', 0.3785870765843814), ('said', 0.3456767428088514), ('dodo', 0.3009018933914972), ('prize', 0.1751850392651843), ('lory', 0.1504509466957486), ('dry', 0.13290182873640183), ('thimble', 0.11679002617678953), ('know', 0.11183659326

### Top-10 most used verbs

In [None]:
def is_alphabetic(token):
    return re.match(r"^[a-zA-Z]+$", token) is not None

# verbs to exclude
aux_verbs = {
    "be", #"am", "is", "are", "was", "were", "been", "being",
    "do", #"does", "did", 
    "have", #"has", "had", 
    #"’", "“", "”", "s"
}

sentences = nltk.sent_tokenize(text)
alice_sentences = [
    s for s in sentences 
    if "alice" in s.lower()
]

verbs = []

lemmatizer = nltk.WordNetLemmatizer()

for sent in alice_sentences:
    tokens = nltk.word_tokenize(sent)
    tokens = [t for t in tokens if is_alphabetic(t)]
    tagged = nltk.pos_tag(tokens)
    
    for word, tag in tagged:
        if tag.startswith("VB"):
            lemma = lemmatizer.lemmatize(word.lower(), pos='v')
            if lemma not in aux_verbs:
                verbs.append(lemma)

verb_count = Counter(verbs)

print('Top-10 mostly common verbs in sentences with Alice')
print(verb_count.most_common(10))

print(f'Mostly often Alice : {verb_count.most_common(1)[0][0]}')

Top-10 mostly common verbs in sentences with Alice
[('say', 294), ('go', 93), ('think', 68), ('get', 66), ('look', 53), ('know', 47), ('begin', 42), ('see', 41), ('come', 37), ('make', 35)]
Mostly often Alice : say
