# Lab 2. NLP 

In [1]:
import nltk
from nltk.corpus import stopwords
from nltk.corpus import wordnet
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import string

from sklearn.feature_extraction.text import TfidfVectorizer
import re
import os

from nltk import pos_tag
from collections import Counter

# Part I & II

1. Download Alice in Wonderland by Lewis Carroll from Project Gutenberg's website http://www.gutenberg.org/files/11/11-0.txt
2. Perform any necessary preprocessing on the text, including converting to lower case, removing stop words, numbers / non-alphabetic characters, lemmatization.

In [2]:
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')

def get_wordnet_pos(treebank_tag):
    if treebank_tag.startswith('J'):
        return wordnet.ADJ
    elif treebank_tag.startswith('V'):
        return wordnet.VERB
    elif treebank_tag.startswith('N'):
        return wordnet.NOUN
    elif treebank_tag.startswith('R'):
        return wordnet.ADV
    else:
        return None

# Function to preprocess text
def preprocess_text(file_path):
    # Load the stopwords
    stop_words = set(stopwords.words('english'))
    
    # Initialize the WordNetLemmatizer
    lemmatizer = WordNetLemmatizer()

    with open(file_path, 'r', encoding='utf-8') as file:
        text = file.read()

    # Convert text to lower case
    text = text.lower()

    # Remove non-alphabetic characters
    text = ''.join([char for char in text if char.isalpha() or char.isspace()])

    # Tokenize the text
    word_tokens = word_tokenize(text)

    # Perform POS tagging
    pos_tagged = nltk.pos_tag(word_tokens)

    # Lemmatize words with POS tags
    lemmatized_words = [
        lemmatizer.lemmatize(word, get_wordnet_pos(pos_tag)) if get_wordnet_pos(pos_tag) else word
        for word, pos_tag in pos_tagged
        if word not in stop_words
    ]

    # Join the words back into one string separated by space
    preprocessed_text = ' '.join(lemmatized_words)

    return preprocessed_text

file_path = '/Users/andreiivlev/Desktop/ITMO/MLT/task2/Alice.txt'

preprocessed_text = preprocess_text(file_path)

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/andreiivlev/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/andreiivlev/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/andreiivlev/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/andreiivlev/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [3]:
preprocessed_text

'project gutenberg ebook alices adventure wonderland lewis carroll ebook use anyone anywhere united state part world cost almost restriction whatsoever may copy give away reuse term project gutenberg license include ebook online wwwgutenbergorg locate united state check law country locate use ebook title alices adventure wonderland author lewis carroll release date january ebook recently update october language english character set encode utf produce arthur dibianca david widger start project gutenberg ebook alices adventure wonderland illustration alices adventure wonderland lewis carroll millennium fulcrum edition content chapter rabbithole chapter ii pool tear chapter iii caucusrace long tale chapter iv rabbit send little bill chapter v advice caterpillar chapter vi pig pepper chapter vii mad teaparty chapter viii queen croquetground chapter ix mock turtle story chapter x lobster quadrille chapter xi steal tart chapter xii alices evidence chapter rabbithole alice begin get tired si

# Part III

3. Find Top 10 most important (for example, in terms of TF-IDF metric) words from each chapter in the text (not "Alice"); how would you name each chapter according to the identified tokens?

In [4]:
preprocessed_text.count('chapter')

24

Word 'chapter' occurrs 24 times: 12 times for the table of contents and 12 times at the beginning of each chapter. 

For the chapter analysis we also need to drop everything after the end of the last chapter

The end of the last chapter can be found by locating two consecutive 'end' words: 'end end' in the preprocessed text. 

In [5]:
preprocessed_text.count('end end')

1

In [6]:
preprocessed_text_ch = preprocessed_text.split('end end')[0]

In [7]:
# Split to chapters
chapters = preprocessed_text_ch.split('chapter')

Now delete first 13 elements corresponding to the table of contents and everything prior to it

In [8]:
chapters = chapters[13:]
chapters

[' rabbithole alice begin get tired sit sister bank nothing twice peep book sister read picture conversation use book thought alice without picture conversation consider mind well could hot day make feel sleepy stupid whether pleasure make daisychain would worth trouble get pick daisy suddenly white rabbit pink eye run close nothing remarkable alice think much way hear rabbit say oh dear oh dear shall late think afterwards occur ought wonder time seem quite natural rabbit actually take watch waistcoatpocket look hurry alice start foot flash across mind never see rabbit either waistcoatpocket watch take burn curiosity run across field fortunately time see pop large rabbithole hedge another moment go alice never consider world get rabbithole go straight like tunnel way dip suddenly suddenly alice moment think stop find fall deep well either well deep fell slowly plenty time go look wonder go happen next first try look make come dark see anything look side well notice fill cupboard booksh

In [9]:
exclude_words = ['alice', 'im']

vectorizer = TfidfVectorizer(stop_words=exclude_words)

tfidf_matrix = vectorizer.fit_transform(chapters)

feature_names = vectorizer.get_feature_names_out()

top_words_per_chapter = {}

for chapter_idx in range(tfidf_matrix.shape[0]):
    row = tfidf_matrix.getrow(chapter_idx)
    
    scores = row.toarray().flatten()
    sorted_indices = scores.argsort()[-10:][::-1]  # Indices of the top 10 scores
    
    top_words_per_chapter[chapter_idx+1] = [(feature_names[idx], scores[idx]) for idx in sorted_indices]

for chapter, words in top_words_per_chapter.items():
    print(f"Chapter {chapter}:")
    for word, score in words:
        print(f"  {word}: {score:.5f}")
    print("\n")

Chapter 1:
  say: 0.17505
  eat: 0.17460
  think: 0.16411
  go: 0.16411
  little: 0.16411
  bat: 0.16190
  get: 0.15317
  door: 0.14627
  key: 0.14302
  see: 0.14223


Chapter 2:
  mouse: 0.30017
  go: 0.22219
  say: 0.20102
  pool: 0.18441
  little: 0.17986
  cat: 0.15009
  cry: 0.14145
  dear: 0.13356
  fan: 0.13047
  foot: 0.12348


Chapter 3:
  say: 0.41651
  mouse: 0.38774
  dodo: 0.30817
  lory: 0.15409
  prize: 0.14952
  dry: 0.14253
  know: 0.13536
  thimble: 0.11961
  bird: 0.11078
  cause: 0.10272


Chapter 4:
  bill: 0.20510
  little: 0.20102
  window: 0.20080
  rabbit: 0.18177
  puppy: 0.17570
  say: 0.16606
  grow: 0.15540
  fan: 0.15089
  go: 0.14858
  get: 0.13984


Chapter 5:
  say: 0.45774
  caterpillar: 0.45489
  serpent: 0.27675
  pigeon: 0.27675
  egg: 0.13837
  youth: 0.13837
  size: 0.10993
  father: 0.09903
  think: 0.09637
  little: 0.08834


Chapter 6:
  say: 0.39907
  cat: 0.32044
  footman: 0.25949
  baby: 0.20428
  mad: 0.18045
  go: 0.17318
  pig: 0.16714
 

**Chapter's names**

1. {say, eat, think, go, little, bat, get, door, key, see} -- 'Bat shows the way to the little door'
2. {mouse, go, say, pool, little, cat, cry, deer, fan, foot} -- 'Cat chases little mouse by the pool'
3. {say, mouse, dodo, lory, prize, dry, know, thimble, bird, cause} -- 'Lory plays thimbles to win a prize from dodo'
4. {bill, little, window, rabbit, puppy, say, grow, fan, go, get} -- 'Bill and little rabbit's adventure'
5. {say, caterpillar, serpent, pigeon, egg, youth, size, father, think, little} -- 'Pigeon fights caterpillar to protect an egg'
6. {say, cat, footman, baby, mad, go, pig, duchess, grin, wow} -- 'Duchess, footman and the baby cook the big'
7. {hatter, say, doormouse, hare, march, go, twinkle, time, tea, well} -- 'Tea-party with the Hatter and Dormouse'
8. {queen, say, hedgehog, king, go, gardener, look, soldier, cat, five} -- 'Royal Tangles in the Garden'
9. {say, turtle, mock, gryphon, duchess, moral, queen, go, think, day} -- 'Tales of the Mock Turtle'
10. {turtle, mock, gryphon, say, dance, lobster, soup, beautiful, join} -- 'Beatiful lobster soup from the Mock Turtle'
11. {king, hatter, say, court, dormouse, witness, queen, officer, juror, begin} -- 'Hatter's trial: Royal Family'
12. {say, king, jury, dream, write, queen, sister, would, slate, rabbit} -- 'Vivid dream: King vs Queen and her sister'

# Part IV

Find the Top 10 most used verbs in sentences with Alice. What does Alice do most often?

In [10]:
preprocessed_text_verbs = preprocessed_text_ch.split('chapter xii')[1]

In [11]:
nltk.download('averaged_perceptron_tagger')

words = preprocessed_text_verbs.split()

alice_verbs = []

window_size = 5 

for i, word in enumerate(words):
    if word == 'alice':
        # Define the context window
        start = max(i - window_size, 0)
        end = min(i + window_size + 1, len(words))
        context = words[start:end]

        pos_tags = pos_tag(context)
        for context_word, tag in pos_tags:
            if tag.startswith('V') and context_word != 'alice':
                alice_verbs.append(context_word)

verb_freq = Counter(alice_verbs)

# Get the top 10 most frequent verbs
top_verbs = verb_freq.most_common(10)

print("Top 10 verbs used in the vicinity of 'Alice':")
for verb, freq in top_verbs:
    print(f"{verb}: {freq}")


Top 10 verbs used in the vicinity of 'Alice':
say: 226
go: 50
think: 43
get: 26
come: 26
know: 25
see: 20
begin: 19
take: 13
make: 13


[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/andreiivlev/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


Alice **says** a lot of things, **goes** places, **thinks** a lot. She **gets** what she wants, **comes** to several locations. Alice **knows** and **sees** things, she also **begins**, **takes** and **makes** somehting. 