In [1]:
# request the raw text of Alice's Adventures in Wonderland
import requests
r = requests.get(r'https://www.gutenberg.org/cache/epub/11/pg11.txt')
alice = r.text

# first, remove unwanted new line and tab characters from the text
for char in ["\n", "\r", "\d", "\t"]:
    alice = alice.replace(char, " ")

# (removing the project gutenburg introduction/footnotes)
alice = alice[1453:148810] # from "Chapter 1" to "The end"
print(alice)



In [3]:
# print out some information about the text

# what's the data type of your text
print(f"the type of your data: {type(alice)}")

# how long is your text (in characters)?
print(f"length = {len(alice)} characters")


the type of your data: <class 'str'>
length = 147357 characters


In [52]:
#pip install nltk


# Sentence Tokenization

In [4]:
import nltk
from nltk import sent_tokenize, word_tokenize
nltk.download('punkt_tab')

sent_tokenize(alice)

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


['CHAPTER I.',
 'Down the Rabbit-Hole      Alice was beginning to get very tired of sitting by her sister on the  bank, and of having nothing to do: once or twice she had peeped into  the book her sister was reading, but it had no pictures or  conversations in it, “and what is the use of a book,” thought Alice  “without pictures or conversations?”    So she was considering in her own mind (as well as she could, for the  hot day made her feel very sleepy and stupid), whether the pleasure of  making a daisy-chain would be worth the trouble of getting up and  picking the daisies, when suddenly a White Rabbit with pink eyes ran  close by her.',
 'There was nothing so _very_ remarkable in that; nor did Alice think it  so _very_ much out of the way to hear the Rabbit say to itself, “Oh  dear!',
 'Oh dear!',
 'I shall be late!” (when she thought it over afterwards,  it occurred to her that she ought to have wondered at this, but at the  time it all seemed quite natural); but when the Rabbit a

In [5]:
for sent in sent_tokenize(alice):
    print(word_tokenize(sent))

['CHAPTER', 'I', '.']
['Down', 'the', 'Rabbit-Hole', 'Alice', 'was', 'beginning', 'to', 'get', 'very', 'tired', 'of', 'sitting', 'by', 'her', 'sister', 'on', 'the', 'bank', ',', 'and', 'of', 'having', 'nothing', 'to', 'do', ':', 'once', 'or', 'twice', 'she', 'had', 'peeped', 'into', 'the', 'book', 'her', 'sister', 'was', 'reading', ',', 'but', 'it', 'had', 'no', 'pictures', 'or', 'conversations', 'in', 'it', ',', '“', 'and', 'what', 'is', 'the', 'use', 'of', 'a', 'book', ',', '”', 'thought', 'Alice', '“', 'without', 'pictures', 'or', 'conversations', '?', '”', 'So', 'she', 'was', 'considering', 'in', 'her', 'own', 'mind', '(', 'as', 'well', 'as', 'she', 'could', ',', 'for', 'the', 'hot', 'day', 'made', 'her', 'feel', 'very', 'sleepy', 'and', 'stupid', ')', ',', 'whether', 'the', 'pleasure', 'of', 'making', 'a', 'daisy-chain', 'would', 'be', 'worth', 'the', 'trouble', 'of', 'getting', 'up', 'and', 'picking', 'the', 'daisies', ',', 'when', 'suddenly', 'a', 'White', 'Rabbit', 'with', 'pin

# Lowercasing

In [7]:
for sent in sent_tokenize(alice):

    print([word.lower() for word in word_tokenize(sent)])

['chapter', 'i', '.']
['down', 'the', 'rabbit-hole', 'alice', 'was', 'beginning', 'to', 'get', 'very', 'tired', 'of', 'sitting', 'by', 'her', 'sister', 'on', 'the', 'bank', ',', 'and', 'of', 'having', 'nothing', 'to', 'do', ':', 'once', 'or', 'twice', 'she', 'had', 'peeped', 'into', 'the', 'book', 'her', 'sister', 'was', 'reading', ',', 'but', 'it', 'had', 'no', 'pictures', 'or', 'conversations', 'in', 'it', ',', '“', 'and', 'what', 'is', 'the', 'use', 'of', 'a', 'book', ',', '”', 'thought', 'alice', '“', 'without', 'pictures', 'or', 'conversations', '?', '”', 'so', 'she', 'was', 'considering', 'in', 'her', 'own', 'mind', '(', 'as', 'well', 'as', 'she', 'could', ',', 'for', 'the', 'hot', 'day', 'made', 'her', 'feel', 'very', 'sleepy', 'and', 'stupid', ')', ',', 'whether', 'the', 'pleasure', 'of', 'making', 'a', 'daisy-chain', 'would', 'be', 'worth', 'the', 'trouble', 'of', 'getting', 'up', 'and', 'picking', 'the', 'daisies', ',', 'when', 'suddenly', 'a', 'white', 'rabbit', 'with', 'pin

In [8]:
# Treats the whole line as one document
print(word_tokenize(alice))



# Tokenize and lowercase

In [9]:
# Treat the multiple sentences as one document (no need to sent_tokenize)
# Tokenize and lowercase
alice_tokenized_lowered = list(map(str.lower, word_tokenize(alice)))
print(alice_tokenized_lowered)



# Stopwords

In [10]:
nltk.download('stopwords')


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [11]:
from nltk.corpus import stopwords
stopwords_en = set(stopwords.words('english')) # Set checking is faster in Python than list.

# List comprehension.
print([word for word in alice_tokenized_lowered if word not in stopwords_en])



# Punctuation

In [12]:
from string import punctuation
# It's a string so we have to them into a set type
print('From string.punctuation:', type(punctuation), punctuation)

From string.punctuation: <class 'str'> !"#$%&'()*+,-./:;<=>?@[\]^_`{|}~


In [13]:
stopwords_en_withpunct = stopwords_en.union(set(punctuation))
print(stopwords_en_withpunct)

{'below', 'should', ']', 'am', ';', 'in', "needn't", 'too', 'hers', 'your', '\\', 'down', 'you', 'against', 'aren', '?', 'being', 'because', 'very', 'for', 'then', 'needn', 'he', 'and', 'before', 'such', '}', 'don', "shouldn't", 'which', "don't", 'won', 'now', 'only', '(', 'doing', 'than', 'some', 'him', 's', 'do', "hadn't", 'had', '*', 'the', 'why', "couldn't", '_', '~', "you'll", 'as', 'his', 'once', "haven't", 'their', 're', '`', 'mightn', 'so', 'yourself', '+', 't', 'through', 'above', "wouldn't", '/', 'was', 'haven', 'wasn', 'yourselves', "it's", 'what', 'when', 'will', 'be', 'can', '<', '=', 'with', 'here', ')', 'll', 'from', 'there', 'hasn', 'ours', '>', 'on', 'were', 'further', 'ma', 'having', 'both', 'how', "you're", 'while', 'they', 'again', 'it', 'her', "mightn't", "she's", ',', 'themselves', 'himself', 'didn', '|', 'itself', '$', 'off', "that'll", 'herself', '-', 'of', "hasn't", "wasn't", 'mustn', ':', '.', 'those', '#', 'our', "you've", 'same', 'that', 'ain', 'who', "weren

In [14]:
print([word for word in alice_tokenized_lowered if word not in stopwords_en_withpunct])



# Using a strong list of stopwords

In [15]:
# Stopwords from stopwords-json
stopwords_json = {"en":["a","a's","able","about","above","according","accordingly","across","actually","after","afterwards","again","against","ain't","all","allow","allows","almost","alone","along","already","also","although","always","am","among","amongst","an","and","another","any","anybody","anyhow","anyone","anything","anyway","anyways","anywhere","apart","appear","appreciate","appropriate","are","aren't","around","as","aside","ask","asking","associated","at","available","away","awfully","b","be","became","because","become","becomes","becoming","been","before","beforehand","behind","being","believe","below","beside","besides","best","better","between","beyond","both","brief","but","by","c","c'mon","c's","came","can","can't","cannot","cant","cause","causes","certain","certainly","changes","clearly","co","com","come","comes","concerning","consequently","consider","considering","contain","containing","contains","corresponding","could","couldn't","course","currently","d","definitely","described","despite","did","didn't","different","do","does","doesn't","doing","don't","done","down","downwards","during","e","each","edu","eg","eight","either","else","elsewhere","enough","entirely","especially","et","etc","even","ever","every","everybody","everyone","everything","everywhere","ex","exactly","example","except","f","far","few","fifth","first","five","followed","following","follows","for","former","formerly","forth","four","from","further","furthermore","g","get","gets","getting","given","gives","go","goes","going","gone","got","gotten","greetings","h","had","hadn't","happens","hardly","has","hasn't","have","haven't","having","he","he's","hello","help","hence","her","here","here's","hereafter","hereby","herein","hereupon","hers","herself","hi","him","himself","his","hither","hopefully","how","howbeit","however","i","i'd","i'll","i'm","i've","ie","if","ignored","immediate","in","inasmuch","inc","indeed","indicate","indicated","indicates","inner","insofar","instead","into","inward","is","isn't","it","it'd","it'll","it's","its","itself","j","just","k","keep","keeps","kept","know","known","knows","l","last","lately","later","latter","latterly","least","less","lest","let","let's","like","liked","likely","little","look","looking","looks","ltd","m","mainly","many","may","maybe","me","mean","meanwhile","merely","might","more","moreover","most","mostly","much","must","my","myself","n","name","namely","nd","near","nearly","necessary","need","needs","neither","never","nevertheless","new","next","nine","no","nobody","non","none","noone","nor","normally","not","nothing","novel","now","nowhere","o","obviously","of","off","often","oh","ok","okay","old","on","once","one","ones","only","onto","or","other","others","otherwise","ought","our","ours","ourselves","out","outside","over","overall","own","p","particular","particularly","per","perhaps","placed","please","plus","possible","presumably","probably","provides","q","que","quite","qv","r","rather","rd","re","really","reasonably","regarding","regardless","regards","relatively","respectively","right","s","said","same","saw","say","saying","says","second","secondly","see","seeing","seem","seemed","seeming","seems","seen","self","selves","sensible","sent","serious","seriously","seven","several","shall","she","should","shouldn't","since","six","so","some","somebody","somehow","someone","something","sometime","sometimes","somewhat","somewhere","soon","sorry","specified","specify","specifying","still","sub","such","sup","sure","t","t's","take","taken","tell","tends","th","than","thank","thanks","thanx","that","that's","thats","the","their","theirs","them","themselves","then","thence","there","there's","thereafter","thereby","therefore","therein","theres","thereupon","these","they","they'd","they'll","they're","they've","think","third","this","thorough","thoroughly","those","though","three","through","throughout","thru","thus","to","together","too","took","toward","towards","tried","tries","truly","try","trying","twice","two","u","un","under","unfortunately","unless","unlikely","until","unto","up","upon","us","use","used","useful","uses","using","usually","uucp","v","value","various","very","via","viz","vs","w","want","wants","was","wasn't","way","we","we'd","we'll","we're","we've","welcome","well","went","were","weren't","what","what's","whatever","when","whence","whenever","where","where's","whereafter","whereas","whereby","wherein","whereupon","wherever","whether","which","while","whither","who","who's","whoever","whole","whom","whose","why","will","willing","wish","with","within","without","won't","wonder","would","wouldn't","x","y","yes","yet","you","you'd","you'll","you're","you've","your","yours","yourself","yourselves","z","zero"]}
stopwords_json_en = set(stopwords_json['en'])
stopwords_nltk_en = set(stopwords.words('english'))
stopwords_punct = set(punctuation)
# Combine the stopwords.
stoplist_combined = set.union(stopwords_json_en, stopwords_nltk_en, stopwords_punct)

# Remove the stopwords from `alice`.
print('With combined stopwords:')
print([word for word in alice_tokenized_lowered if word not in stoplist_combined])

With combined stopwords:


# Stemming and Lemmatization

In [16]:
from nltk import pos_tag
from nltk.stem import WordNetLemmatizer

wnl = WordNetLemmatizer()

def penn2morphy(penntag):
    """ Converts Penn Treebank tags to WordNet. """
    morphy_tag = {'NN':'n', 'JJ':'a',
                  'VB':'v', 'RB':'r'}
    try:
        return morphy_tag[penntag[:2]]
    except:
        return 'n'

def lemmatize_sent(text):
    # Text input is string, returns lowercased strings.
    return [wnl.lemmatize(word.lower(), pos=penn2morphy(tag))
            for word, tag in pos_tag(word_tokenize(text))]

In [17]:
import nltk
nltk.download('averaged_perceptron_tagger_eng')
nltk.download('wordnet')

alice2 = [word for word in lemmatize_sent(alice)
       if word not in stoplist_combined
       and not word.isdigit() ]

print('Lemmatized and removed stopwords:')
print(alice2)

[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger_eng.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


Lemmatized and removed stopwords:
['chapter', 'rabbit-hole', 'alice', 'begin', 'tired', 'sit', 'sister', 'bank', 'peep', 'book', 'sister', 'read', 'picture', 'conversation', '“', 'book', '”', 'alice', '“', 'picture', 'conversation', '”', 'mind', 'hot', 'day', 'make', 'feel', 'sleepy', 'stupid', 'pleasure', 'make', 'daisy-chain', 'worth', 'trouble', 'pick', 'daisy', 'suddenly', 'white', 'rabbit', 'pink', 'eye', 'run', 'close', '_very_', 'remarkable', 'alice', '_very_', 'hear', 'rabbit', '“', 'dear', 'dear', 'late', '”', 'occur', 'time', 'natural', 'rabbit', '_took', 'watch', 'waistcoat-pocket_', 'hurry', 'alice', 'start', 'foot', 'flash', 'mind', 'rabbit', 'waistcoat-pocket', 'watch', 'burn', 'curiosity', 'run', 'field', 'fortunately', 'time', 'pop', 'large', 'rabbit-hole', 'hedge', 'moment', 'alice', 'world', 'rabbit-hole', 'straight', 'tunnel', 'dip', 'suddenly', 'suddenly', 'alice', 'moment', 'stop', 'find', 'fall', 'deep', 'deep', 'fell', 'slowly', 'plenty', 'time', 'happen', 'make'

# Vectorization

In [18]:
def preprocess_text(text):
    # Input: str, i.e. document/sentence
    # Output: list(str) , i.e. list of lemmas
    return [word for word in lemmatize_sent(text)
            if word not in stoplist_combined
            and not word.isdigit()]

In [19]:
from collections import Counter

sent = alice


# Lemmatize and remove stopwords
processed_sent = preprocess_text(sent)

print('Processed sentence:')
print(processed_sent)
print()
print('Word counts:')
print(Counter(processed_sent))

Processed sentence:
['chapter', 'rabbit-hole', 'alice', 'begin', 'tired', 'sit', 'sister', 'bank', 'peep', 'book', 'sister', 'read', 'picture', 'conversation', '“', 'book', '”', 'alice', '“', 'picture', 'conversation', '”', 'mind', 'hot', 'day', 'make', 'feel', 'sleepy', 'stupid', 'pleasure', 'make', 'daisy-chain', 'worth', 'trouble', 'pick', 'daisy', 'suddenly', 'white', 'rabbit', 'pink', 'eye', 'run', 'close', '_very_', 'remarkable', 'alice', '_very_', 'hear', 'rabbit', '“', 'dear', 'dear', 'late', '”', 'occur', 'time', 'natural', 'rabbit', '_took', 'watch', 'waistcoat-pocket_', 'hurry', 'alice', 'start', 'foot', 'flash', 'mind', 'rabbit', 'waistcoat-pocket', 'watch', 'burn', 'curiosity', 'run', 'field', 'fortunately', 'time', 'pop', 'large', 'rabbit-hole', 'hedge', 'moment', 'alice', 'world', 'rabbit-hole', 'straight', 'tunnel', 'dip', 'suddenly', 'suddenly', 'alice', 'moment', 'stop', 'find', 'fall', 'deep', 'deep', 'fell', 'slowly', 'plenty', 'time', 'happen', 'make', 'dark', 'sid

In [21]:
from io import StringIO
from sklearn.feature_extraction.text import CountVectorizer

# Convert `stoplist_combined` to a list
stoplist_combined = list(stoplist_combined)

with StringIO('\n'.join([sent])) as fin:
    # Create the vectorizer
    count_vect = CountVectorizer(stop_words=stoplist_combined, tokenizer=word_tokenize)
    count_vect.fit_transform(fin)

# Check the vocabulary in our vectorizer, the values are the IDs given to each word.
count_vect.vocabulary_

{'chapter': 408,
 'i.': 1138,
 'rabbit-hole': 1776,
 'alice': 149,
 'beginning': 262,
 'tired': 2311,
 'sitting': 2039,
 'sister': 2034,
 'bank': 229,
 'peeped': 1620,
 'book': 300,
 'reading': 1800,
 'pictures': 1642,
 'conversations': 501,
 '“': 2634,
 '”': 2635,
 'thought': 2280,
 'mind': 1432,
 'hot': 1117,
 'day': 585,
 'made': 1376,
 'feel': 839,
 'sleepy': 2054,
 'stupid': 2178,
 'pleasure': 1678,
 'making': 1383,
 'daisy-chain': 575,
 'worth': 2566,
 'trouble': 2363,
 'picking': 1639,
 'daisies': 574,
 'suddenly': 2187,
 'white': 2519,
 'rabbit': 1775,
 'pink': 1655,
 'eyes': 793,
 'ran': 1784,
 'close': 455,
 '_very_': 84,
 'remarkable': 1825,
 'hear': 1067,
 'dear': 593,
 'late': 1270,
 'occurred': 1541,
 'wondered': 2548,
 'time': 2302,
 'natural': 1494,
 '_took': 79,
 'watch': 2494,
 'waistcoat-pocket_': 2474,
 'looked': 1349,
 'hurried': 1132,
 'started': 2142,
 'feet': 842,
 'flashed': 890,
 'waistcoat-pocket': 2473,
 'burning': 346,
 'curiosity': 556,
 'field': 851,
 'fo

In [22]:
from operator import itemgetter

# Print the words sorted by their index
words_sorted_by_index, _ = zip(*sorted(count_vect.vocabulary_.items(), key=itemgetter(1)))

print(preprocess_text(sent))
print()
print('Vocab:', words_sorted_by_index)
print()
print('Matrix/Vectors:\n', count_vect.transform([sent]).toarray())

['chapter', 'rabbit-hole', 'alice', 'begin', 'tired', 'sit', 'sister', 'bank', 'peep', 'book', 'sister', 'read', 'picture', 'conversation', '“', 'book', '”', 'alice', '“', 'picture', 'conversation', '”', 'mind', 'hot', 'day', 'make', 'feel', 'sleepy', 'stupid', 'pleasure', 'make', 'daisy-chain', 'worth', 'trouble', 'pick', 'daisy', 'suddenly', 'white', 'rabbit', 'pink', 'eye', 'run', 'close', '_very_', 'remarkable', 'alice', '_very_', 'hear', 'rabbit', '“', 'dear', 'dear', 'late', '”', 'occur', 'time', 'natural', 'rabbit', '_took', 'watch', 'waistcoat-pocket_', 'hurry', 'alice', 'start', 'foot', 'flash', 'mind', 'rabbit', 'waistcoat-pocket', 'watch', 'burn', 'curiosity', 'run', 'field', 'fortunately', 'time', 'pop', 'large', 'rabbit-hole', 'hedge', 'moment', 'alice', 'world', 'rabbit-hole', 'straight', 'tunnel', 'dip', 'suddenly', 'suddenly', 'alice', 'moment', 'stop', 'find', 'fall', 'deep', 'deep', 'fell', 'slowly', 'plenty', 'time', 'happen', 'make', 'dark', 'side', 'notice', 'fill'

# Classification

In [23]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
import requests
import re

# Step 1: Fetch and preprocess the text
r = requests.get(r'https://www.gutenberg.org/cache/epub/11/pg11.txt')
alice = r.text

# Remove unnecessary characters and keep only the main book content
alice = re.sub(r'[\n\r\t]', ' ', alice)
alice = alice[alice.find("CHAPTER I"):alice.find("THE END")]

# Step 2: Split text into chapters based on "CHAPTER" with Roman numerals
chapters = re.split(r'(CHAPTER [IVXLCDM]+)', alice)[1:]
chapter_titles = chapters[::2]  # Titles: CHAPTER I, CHAPTER II, ...
chapter_texts = chapters[1::2]  # Corresponding text for each chapter

# Ensure non-empty chapters
data = [(title, text.strip()) for title, text in zip(chapter_titles, chapter_texts) if text.strip()]

# Debugging step: Print the first two chapters for verification
print(f"Number of chapters: {len(data)}")
for title, text in data[:2]:
    print(f"\nTitle: {title}\nSample Text: {text[:200]}...\n")

# Step 3: Convert text to feature vectors using CountVectorizer
stoplist_combined = list(stoplist_combined)
vectorizer = CountVectorizer(stop_words=stoplist_combined, tokenizer=word_tokenize)
X = vectorizer.fit_transform([text for _, text in data])
y = [title for title, _ in data]

# Step 4: Split into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Step 5: Train a Naive Bayes classifier
model = MultinomialNB()
model.fit(X_train, y_train)

# Step 6: Make predictions and evaluate
y_pred = model.predict(X_test)
print("\nAccuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))


Number of chapters: 24

Title: CHAPTER I
Sample Text: .     Down the Rabbit-Hole...


Title: CHAPTER II
Sample Text: .    The Pool of Tears...


Accuracy: 0.375

Classification Report:
               precision    recall  f1-score   support

   CHAPTER I       1.00      1.00      1.00         1
  CHAPTER II       0.00      0.00      0.00         2
  CHAPTER IV       0.00      0.00      0.00         0
  CHAPTER IX       0.50      1.00      0.67         1
   CHAPTER V       0.00      0.00      0.00         1
  CHAPTER VI       0.00      0.00      0.00         0
 CHAPTER VII       0.00      0.00      0.00         1
   CHAPTER X       1.00      1.00      1.00         1
  CHAPTER XI       0.00      0.00      0.00         0
 CHAPTER XII       0.00      0.00      0.00         1

    accuracy                           0.38         8
   macro avg       0.25      0.30      0.27         8
weighted avg       0.31      0.38      0.33         8



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
