In [86]:
import numpy as np
import pandas as pd
import re

import nltk
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.corpus import stopwords

# nltk.download('stopwords')
# nltk.download("wordnet")

### Preprocessing

In [87]:
file_name = '11-0.txt'
with open(file_name, 'r') as file:
    content = file.read()
    print(content)

text = content.splitlines()[31:3380]
text = '\n'.join(text)

*** START OF THE PROJECT GUTENBERG EBOOK 11 ***

[Illustration]




Alice’s Adventures in Wonderland

by Lewis Carroll

THE MILLENNIUM FULCRUM EDITION 3.0

Contents

 CHAPTER I.     Down the Rabbit-Hole
 CHAPTER II.    The Pool of Tears
 CHAPTER III.   A Caucus-Race and a Long Tale
 CHAPTER IV.    The Rabbit Sends in a Little Bill
 CHAPTER V.     Advice from a Caterpillar
 CHAPTER VI.    Pig and Pepper
 CHAPTER VII.   A Mad Tea-Party
 CHAPTER VIII.  The Queen’s Croquet-Ground
 CHAPTER IX.    The Mock Turtle’s Story
 CHAPTER X.     The Lobster Quadrille
 CHAPTER XI.    Who Stole the Tarts?
 CHAPTER XII.   Alice’s Evidence




CHAPTER I.
Down the Rabbit-Hole


Alice was beginning to get very tired of sitting by her sister on the
bank, and of having nothing to do: once or twice she had peeped into
the book her sister was reading, but it had no pictures or
conversations in it, “and what is the use of a book,” thought Alice
“without pictures or conversations?”

So she was considering in her

In [88]:
def clean_text(text):
    lemmatizer = WordNetLemmatizer()
    stop_words = stopwords.words("english")

    text = text.replace("\r", " ").replace("\n", " ").replace("\t", " ")
    text = re.sub(r"\s+", " ", text).strip()

    text = re.sub(r"[^\w\s]|_", "", text, flags=re.UNICODE)
    text = text.lower()
    text = [lemmatizer.lemmatize(token) for token in text.split(" ")]
    text = [word for word in text if not word in stop_words]
    text = " ".join(text)
    text = text.strip()
    return text

In [89]:
def split_by_chapter(chapter_titles: list, text):
    current_text = text

    chapters = []
    for title in chapter_titles[::-1]: 
        current_text, current_chapter = current_text.split(title, 1)
        chapters.append(current_chapter)
        
    return chapters[::-1]

chapter_titles = ['CHAPTER ' + i for i in ['I', 'II', 'III', 'IV', 'V', 'VI', 'VII', 'VIII', 'IX', 'X', 'XI', 'XII']]

chapter_content = split_by_chapter(chapter_titles, text)
chapter_content = [clean_text(chapter) for chapter in chapter_content]

chapter_content

['rabbithole alice wa beginning get tired sitting sister bank nothing twice peeped book sister wa reading picture conversation use book thought alice without picture conversation wa considering mind well could hot day made feel sleepy stupid whether pleasure making daisychain would worth trouble getting picking daisy suddenly white rabbit pink eye ran close wa nothing remarkable alice think much way hear rabbit say oh dear oh dear shall late thought afterwards occurred ought wondered time seemed quite natural rabbit actually took watch waistcoatpocket looked hurried alice started foot flashed across mind never seen rabbit either waistcoatpocket watch take burning curiosity ran across field fortunately wa time see pop large rabbithole hedge another moment went alice never considering world wa get rabbithole went straight like tunnel way dipped suddenly suddenly alice moment think stopping found falling deep well either well wa deep fell slowly plenty time went look wonder wa going happe

### Top-10 most important words with TF-IDF

In [92]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer_tftidf = TfidfVectorizer(max_features=5000, ngram_range=(1, 3))
vectorizer_tftidf.fit(chapter_content)

0,1,2
,input,'content'
,encoding,'utf-8'
,decode_error,'strict'
,strip_accents,
,lowercase,True
,preprocessor,
,tokenizer,
,analyzer,'word'
,stop_words,
,token_pattern,'(?u)\\b\\w\\w+\\b'


Сейчас происходит отбор лучших слов в документе, может стоит ввести такую метрику и по ней отбирать слова:
$$f(word, document) = \frac{\text{tf-idf(word, document)}}{\prod_{doc=1}^{n}\text{tf-idf(word, doc)}}$$

In [93]:
X = vectorizer_tftidf.transform(chapter_content)
feature_names = vectorizer_tftidf.get_feature_names_out()

def top_n_words(row_sparse, feature_names, n=10, exclude_set=set()):
    row = row_sparse.toarray().ravel()
    
    sorted_idx = np.argsort(row)[::-1]
    top = []
    for i in sorted_idx:
        token = feature_names[i]
        if token in exclude_set:
            continue
        top.append((token, row[i]))
        if len(top) >= n:
            break
    return top

exclude = {'alice', 'wa'}
for i, chapter in enumerate(chapter_content):
    top = top_n_words(X[i], feature_names, n=10, exclude_set=exclude)
    print(f"Chapter {i+1} top words:", top)
    
    title_candidate = " ".join([w for w,_ in top[:4]])
    print("Candidate title:", title_candidate)
    print("---")

Chapter 1 top words: [('little', 0.12662262024425627), ('bat', 0.12491762482246493), ('door', 0.11285933521656055), ('key', 0.11034680169800794), ('eat', 0.10477877659141359), ('eat bat', 0.09696936438998366), ('think', 0.09285658817912128), ('way', 0.09285658817912128), ('like', 0.09285658817912128), ('either', 0.08981037993549737)]
Candidate title: little bat door key
---
Chapter 2 top words: [('mouse', 0.24051033167086375), ('pool', 0.14775322125498114), ('little', 0.1441144172409383), ('im', 0.12861536181756208), ('swam', 0.12172591092806789), ('cat', 0.12025516583543187), ('dear', 0.11771610550086471), ('said', 0.10172782393477997), ('foot', 0.09893489370581698), ('mabel', 0.09738072874245432)]
Candidate title: mouse pool little im
---
Chapter 3 top words: [('mouse', 0.3268715015719819), ('said', 0.2984567698397303), ('dodo', 0.2597982334899088), ('said mouse', 0.22688174153031745), ('prize', 0.15125449435354496), ('lory', 0.1298991167449544), ('said dodo', 0.12604541196128746), (

### Top-10 most used verbs