In [35]:
import warnings
warnings.filterwarnings('ignore')

In [84]:
import requests
import nltk
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

import numpy as np
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
import re

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/aleksandrerofeevskij/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/aleksandrerofeevskij/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/aleksandrerofeevskij/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


# Download text

In [104]:
link = "https://www.gutenberg.org/files/11/11-0.txt"
response = requests.get(link)
original_text = response.text

# Preprocessing

## Convert to lower case & remove non-alphabetic characters

In [30]:
text = original_text.lower()
text = re.sub(r"[^a-z ]", "", text)

## Remove stop words

In [44]:
token_list = word_tokenize(text)

In [49]:
token_list = [word for word in token_list if word not in stopwords.words('english')]

## Lemmatize

In [52]:
lemmatizer = WordNetLemmatizer()
token_list = [lemmatizer.lemmatize(word) for word in token_list]

In [54]:
text = " ".join(token_list)

# Top 10 most important words

In [130]:
text_splitted = text.split('chapter')
text_split_by_chapters = text_splitted[13:]

In [131]:
vectorizer = TfidfVectorizer(input='content', analyzer='word').fit(text_split_by_chapters)

In [165]:
top_words = []
for chapter_text in text_split_by_chapters:
    vectorizer = TfidfVectorizer(input='content', analyzer='word')
        
    matr = vectorizer.fit_transform([chapter_text])
    features = vectorizer.get_feature_names_out()
    top_words_by_chapter = [features[i] for i in matr.sum(axis=0).argsort()[0, -11:][0]]
    top_words.append(top_words_by_chapter[0])

In [184]:
for i, chapter_top_words in enumerate(top_words):
    print(i+1, ", ".join([word for word in chapter_top_words[0] if word != 'alice'][:10]))

1 could, nothing, door, thought, think, way, one, see, like, little
2 im, one, must, went, foot, dear, thing, said, mouse, little
3 question, lory, one, soon, long, thing, know, dodo, mouse, said
4 voice, get, bill, thought, heard, quite, one, rabbit, said, little
5 bit, dont, serpent, ive, size, pigeon, im, little, caterpillar, said
6 would, baby, went, little, much, footman, duchess, like, cat, said
7 say, went, thing, know, time, hare, march, dormouse, hatter, said
8 began, went, three, two, see, cat, king, head, queen, said
9 moral, say, dont, queen, went, gryphon, duchess, turtle, mock, said
10 could, join, lobster, beautiful, wont, would, gryphon, turtle, mock, said
11 witness, thought, court, rabbit, dormouse, queen, one, hatter, king, said
12 king, state, copy, term, electronic, gutenberg, foundation, gutenbergtm, said, work


## Predicted names
1. The thinking door
2. Little mouse must go
3. Lory's soon long thing
4. Talking rabbit
5. Serpent, pigeon and caterpilalr
6. Little duchess's footman
7. The hatter teaches the hare
8. The king and the queen
9. The moral
10. The gryphon and the turtle
11. Hatter-witness
12. King's copy

## Original names

In [117]:
chapters_names = original_text.split('CHAPTER')[1:13]

for i, chapter_name in enumerate(chapters_names):
    print(f"{i+1}. {' '.join((chapter_name.split()[1:]))}")

1. Down the Rabbit-Hole
2. The Pool of Tears
3. A Caucus-Race and a Long Tale
4. The Rabbit Sends in a Little Bill
5. Advice from a Caterpillar
6. Pig and Pepper
7. A Mad Tea-Party
8. The Queenâs Croquet-Ground
9. The Mock Turtleâs Story
10. The Lobster Quadrille
11. Who Stole the Tarts?
12. Aliceâs Evidence


# Top 10 verbs with Alice

In [193]:
all_sentences = original_text.lower().split(".")
all_sentences = [re.sub(r"[^a-z ]", "", sentence) for sentence in all_sentences]

In [198]:
prepared_sentences = []

In [199]:
for sentence in all_sentences:
    token_list = word_tokenize(sentence)
    token_list = [word for word in token_list if word not in stopwords.words('english')]
    token_list = [lemmatizer.lemmatize(word) for word in token_list]
    prepared_sentences.append(" ".join(token_list))

In [214]:
alice_verbs = {}
for sentence in prepared_sentences:
    if 'alice' in sentence:
        tokens = word_tokenize(sentence)
        words_with_type = nltk.pos_tag(tokens)
        verbs = [word for word, type_word in words_with_type if type_word == 'VB']
        for verb in verbs:
            if verb in alice_verbs:
                alice_verbs[verb] += 1
            else:
                alice_verbs[verb] = 1

In [215]:
alice_verbs

{'restrictionswhatsoever': 1,
 'get': 11,
 'thehot': 1,
 'worth': 1,
 'take': 3,
 'see': 16,
 'let': 5,
 'toget': 1,
 'belong': 1,
 'open': 1,
 'go': 15,
 'little': 1,
 'find': 3,
 'drink': 2,
 'say': 7,
 'wise': 1,
 'sort': 1,
 'shrink': 1,
 'end': 1,
 'reachit': 1,
 'come': 14,
 'use': 4,
 'make': 3,
 'feel': 2,
 'manage': 1,
 'bekind': 1,
 'ill': 3,
 'give': 3,
 'tear': 1,
 'glove': 3,
 'beenchanged': 2,
 'improve': 1,
 'tailand': 1,
 'clawsand': 1,
 'mabel': 1,
 'andi': 1,
 'stay': 2,
 'look': 2,
 'im': 1,
 'put': 1,
 'declare': 1,
 'rightway': 1,
 'offended': 1,
 'talk': 1,
 'hear': 5,
 'kill': 1,
 'tell': 8,
 'know': 3,
 'allowwithout': 1,
 'catch': 1,
 'tospeak': 1,
 'think': 3,
 'please': 1,
 'happen': 4,
 'dinah': 1,
 'stop': 1,
 'fanciedshe': 1,
 'thought': 1,
 'roof': 1,
 'burn': 1,
 'found': 1,
 'eat': 2,
 'keep': 5,
 'hold': 2,
 'seehow': 1,
 'begin': 1,
 'knowand': 1,
 'shecould': 1,
 'wait': 1,
 'remember': 5,
 'injure': 1,
 'lookout': 1,
 'need': 1,
 'neck': 1,
 'kept':

Usually Alice goes and sees