In [1]:
import warnings
warnings.filterwarnings('ignore')

In [1]:
import requests
import nltk
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

import numpy as np
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
import re

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/aleksandrerofeevskij/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/aleksandrerofeevskij/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/aleksandrerofeevskij/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


# Download text

In [2]:
link = "https://www.gutenberg.org/files/11/11-0.txt"
response = requests.get(link)
original_text = response.text

# Preprocessing

## Convert to lower case & remove non-alphabetic characters

In [3]:
text = original_text.lower()
text = re.sub(r"[^a-z ]", "", text)

## Remove stop words

In [4]:
token_list = word_tokenize(text)

In [5]:
token_list = [word for word in token_list if word not in stopwords.words('english')]

## Lemmatize

In [6]:
lemmatizer = WordNetLemmatizer()
token_list = [lemmatizer.lemmatize(word) for word in token_list]

In [7]:
text = " ".join(token_list)

# Top 10 most important words

In [8]:
text_splitted = text.split('chapter')
text_split_by_chapters = text_splitted[13:]

In [14]:
vectorizer = TfidfVectorizer(input='content', analyzer='word').fit(text_split_by_chapters)

In [15]:
top_words = []
for chapter_text in text_split_by_chapters:
    matr = vectorizer.transform([chapter_text])
    features = vectorizer.get_feature_names_out()
    top_words_by_chapter = [features[i] for i in matr.sum(axis=0).argsort()[0, -11:][0]]
    top_words.append(top_words_by_chapter[0])

In [17]:
for i, chapter_top_words in enumerate(top_words):
    print(i+1, ", ".join([word for word in chapter_top_words[0] if word != 'alice'][:10]))

1 rabbit, way, one, see, door, eat, like, little, key, bat
2 mabel, oh, cried, said, cat, dear, swam, pool, little, mouse
3 dinah, know, course, thimble, dry, prize, lory, dodo, said, mouse
4 window, ann, one, fan, said, bottle, little, bill, rabbit, puppy
5 im, caterpillarwell, father, size, egg, youth, serpent, pigeon, caterpillar, said
6 much, mad, cook, like, pig, duchess, baby, footman, cat, said
7 time, asleep, draw, tea, twinkle, hare, march, said, hatter, dormouse
8 five, head, executioner, hedgehog, gardener, soldier, cat, king, said, queen
9 school, dont, went, queen, moral, duchess, gryphon, turtle, said, mock
10 soup, wont, whiting, beautiful, join, lobster, said, gryphon, turtle, mock
11 march, rabbit, thecourt, juror, queen, dormouse, court, witness, hatter, said
12 copyright, said, copy, state, term, gutenberg, electronic, foundation, work, gutenbergtm


## Predicted names
1. The rabbit sees the door
2. Cat speak with little mouse
3. Dodo and the mouse
4. The window
5. Speaking animals
6. Baby pig
7. Time to sleep
8. Executioner with five heads
9. It's better than school
10. Lobster's soup
11. The rabbit's march
12. Conversations

## Original names

In [117]:
chapters_names = original_text.split('CHAPTER')[1:13]

for i, chapter_name in enumerate(chapters_names):
    print(f"{i+1}. {' '.join((chapter_name.split()[1:]))}")

1. Down the Rabbit-Hole
2. The Pool of Tears
3. A Caucus-Race and a Long Tale
4. The Rabbit Sends in a Little Bill
5. Advice from a Caterpillar
6. Pig and Pepper
7. A Mad Tea-Party
8. The Queenâs Croquet-Ground
9. The Mock Turtleâs Story
10. The Lobster Quadrille
11. Who Stole the Tarts?
12. Aliceâs Evidence


# Top 10 verbs with Alice

In [18]:
all_sentences = original_text.lower().split(".")
all_sentences = [re.sub(r"[^a-z ]", "", sentence) for sentence in all_sentences]

In [19]:
prepared_sentences = []

In [20]:
for sentence in all_sentences:
    token_list = word_tokenize(sentence)
    token_list = [word for word in token_list if word not in stopwords.words('english')]
    token_list = [lemmatizer.lemmatize(word) for word in token_list]
    prepared_sentences.append(" ".join(token_list))

In [21]:
alice_verbs = {}
for sentence in prepared_sentences:
    if 'alice' in sentence:
        tokens = word_tokenize(sentence)
        words_with_type = nltk.pos_tag(tokens)
        verbs = [word for word, type_word in words_with_type if type_word[:2] == 'VB']
        for verb in verbs:
            if verb in alice_verbs:
                alice_verbs[verb] += 1
            else:
                alice_verbs[verb] = 1

In [28]:
sorted_alice_verbs = {k: v for k, v in sorted(alice_verbs.items(), key=lambda item: item[1], reverse=True)}

In [31]:
for i, (k, v) in enumerate(sorted_alice_verbs.items()):
    if i == 10:
        break
    print(f"Alice {k} {v} times")

Alice said 199 times
Alice went 39 times
Alice thought 36 times
Alice say 31 times
Alice go 26 times
Alice know 25 times
Alice see 24 times
Alice got 23 times
Alice began 23 times
Alice looked 22 times


Usually Alice speaks and goes