# Import Libraries

In [3]:
import re
from collections import Counter

# NLTK
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize, word_tokenize
# Make sure the NLTK data used by this notebook is installed, downloading it
# on first use. Each probe raises LookupError when its resource is missing.
try:
    stopwords.words('english')
except LookupError:
    nltk.download('stopwords')

try:
    word_tokenize("test")
except LookupError:
    # Older NLTK releases load the Punkt tokenizer from the 'punkt' package,
    # while recent releases (3.9+) look for 'punkt_tab' instead. Fetch both so
    # the tokenizer works regardless of the installed NLTK version.
    for punkt_resource in ('punkt', 'punkt_tab'):
        nltk.download(punkt_resource)

# TextBlob
from textblob import TextBlob

# spaCy
import spacy
# Load the small English spaCy pipeline. When the model is not installed,
# spacy.load raises OSError; fall back to None and tell the user how to get it.
try:
    nlp_spacy = spacy.load("en_core_web_sm")
except OSError:
    nlp_spacy = None
    print("Warning: spaCy model 'en_core_web_sm' not found. Please run 'python -m spacy download en_core_web_sm'")

# Load text from alice29.txt

In [2]:
# Path of the input corpus. It is defined once and actually passed to open()
# so that changing the input file only requires editing this line.
file_path = "alice29.txt"
with open(file_path, 'r', encoding='utf-8') as file:
    text = file.read()

# Text Cleaning

In [17]:
def clean_text_and_remove_stopwords(input_text, stop_words=None):
    """Normalize text and drop stop words.

    Processing steps, in order:
      1. Every character that is not an ASCII letter, a digit or whitespace
         is replaced by a space.
      2. The text is lowercased.
      3. Runs of whitespace are collapsed to a single space and the ends
         are trimmed.
      4. The text is split into words and stop words are removed.

    Args:
        input_text: The raw string to clean.
        stop_words: Optional iterable of words to remove. The words are
            compared against the lowercased tokens as-is. When omitted
            (None), NLTK's English stop-word list is used.

    Returns:
        A tuple ``(final_cleaned_text, filtered_words)`` where
        ``filtered_words`` is the list of kept words in their original order
        and ``final_cleaned_text`` is those words joined by single spaces.
    """
    if stop_words is None:
        stop_words = stopwords.words('english')
    # A set gives constant-time membership checks in the filter below.
    stop_word_set = set(stop_words)

    cleaned_text = re.sub(r'[^a-zA-Z0-9\s]', ' ', input_text)
    cleaned_text = cleaned_text.lower()
    cleaned_text = re.sub(r'\s+', ' ', cleaned_text).strip()

    # str.split() with no arguments never yields empty strings, so no extra
    # emptiness check is needed on each word.
    filtered_words = [word for word in cleaned_text.split() if word not in stop_word_set]

    final_cleaned_text = ' '.join(filtered_words)

    return final_cleaned_text, filtered_words

# Clean the whole corpus once; the token list is reused by later cells.
cleaned_text, all_filtered_words = clean_text_and_remove_stopwords(text)

# Persist the cleaned corpus to disk.
with open('cleaned.txt', 'w', encoding='utf-8') as cleaned_file:
    cleaned_file.write(cleaned_text)

# Show only the first 500 characters so the cell output stays short.
preview = cleaned_text[:500]
print(preview + "...")

alice adventures wonderland lewis carroll millennium fulcrum edition 2 9 chapter rabbit hole alice beginning get tired sitting sister bank nothing twice peeped book sister reading pictures conversations use book thought alice without pictures conversation considering mind well could hot day made feel sleepy stupid whether pleasure making daisy chain would worth trouble getting picking daisies suddenly white rabbit pink eyes ran close nothing remarkable alice think much way hear rabbit say oh dea...


# Tokenization

In [18]:
# Tokenize the raw (uncleaned) text into sentences and into words.
sentences = sent_tokenize(text)
words_from_original_text = word_tokenize(text)

# Write three sections to words.txt: numbered sentences, the raw word tokens,
# and the cleaned/filtered words produced by the text-cleaning step.
with open('words.txt', 'w', encoding='utf-8') as words_file:
    words_file.write("--- Tokenized Sentences ---\n")
    words_file.writelines(
        f"Sentence {number}: {sentence}\n"
        for number, sentence in enumerate(sentences, start=1)
    )

    words_file.write("\n--- Tokenized Words ---\n")
    words_file.write(', '.join(words_from_original_text))

    words_file.write("\n\n--- Filtered Words (Lowercase, No Stopwords, No Punctuation) ---\n")
    words_file.write(', '.join(all_filtered_words))

# Summary counts for each tokenization result.
sentence_total = len(sentences)
raw_word_total = len(words_from_original_text)
filtered_word_total = len(all_filtered_words)
print(f"Total Sentences (NLTK): {sentence_total}")
print(f"Total Words (NLTK, including punctuation): {raw_word_total}")
print(f"Total Filtered Words (Cleaned): {filtered_word_total}")

Total Sentences (NLTK): 1614
Total Words (NLTK, including punctuation): 34428
Total Filtered Words (Cleaned): 12244


# Frequency Analysis

In [7]:
# Count the cleaned words and keep the ten most frequent ones.
word_counts = Counter(all_filtered_words)
top_10_words = word_counts.most_common(10)

# Build the table as a list of lines and join once: title, column header,
# a 20-dash rule, then one left-aligned row per (word, count) pair.
row_template = "{:<15} {:<5}\n"
table_lines = [
    "--- Top 10 Most Frequent Words ---\n",
    row_template.format("Word", "Count"),
    "-" * 20 + "\n",
]
table_lines.extend(row_template.format(word, count) for word, count in top_10_words)
top_10_table = "".join(table_lines)

# Save the table and also show it in the cell output.
with open('top10words.txt', 'w', encoding='utf-8') as table_file:
    table_file.write(top_10_table)

print(top_10_table)

--- Top 10 Most Frequent Words ---
Word            Count
--------------------
said            462  
alice           398  
little          128  
one             104  
know            88   
like            85   
would           83   
went            83   
could           77   
queen           75   



# Framework Performance Comparison

In [16]:
import timeit
import spacy

# Reload the spaCy model with the tagger, parser and NER components disabled.
# This rebinds nlp_spacy, replacing the pipeline loaded in the import cell.
# A missing model surfaces as OSError; nlp_spacy is then set to None.
try:
    nlp_spacy = spacy.load("en_core_web_sm", disable=["tagger", "parser", "ner"])
except OSError:
    nlp_spacy = None
    print("Error: spaCy model 'en_core_web_sm' not found.")
else:
    print("spaCy model loaded (tokenization-only) successfully.")

# Benchmark configuration: each framework is timed for NUM_REPEATS rounds of
# NUM_EXECUTIONS calls.
NUM_EXECUTIONS = 50
NUM_REPEATS = 5


def _best_seconds_per_call(stmt, setup, namespace):
    """Return the best observed per-call time, in seconds, for ``stmt``.

    ``timeit.repeat`` runs ``stmt`` NUM_EXECUTIONS times in each of
    NUM_REPEATS rounds. The fastest round's total is divided by
    NUM_EXECUTIONS to get a per-call figure.

    Args:
        stmt: Source code of the statement to time.
        setup: Source code executed before each timing round (not timed).
        namespace: Dict used as the global namespace for ``setup`` and
            ``stmt``; every name the statement reads must be in it or be
            created by ``setup``.

    Returns:
        float: ``min(round_totals) / NUM_EXECUTIONS``.
    """
    round_totals = timeit.repeat(
        stmt=stmt,
        setup=setup,
        repeat=NUM_REPEATS,
        number=NUM_EXECUTIONS,
        globals=namespace,
    )
    return min(round_totals) / NUM_EXECUTIONS


# One entry per framework: (label, statement, setup, namespace).
# `text` (and `nlp_spacy`) are supplied through the namespace dict, so the
# setup code only has to import the tokenizer being measured.
benchmarks = [
    (
        "NLTK",
        "word_tokenize(text)",
        "from nltk.tokenize import word_tokenize",
        {'text': text},
    ),
    (
        "TextBlob",
        "TextBlob(text).words",
        "from textblob import TextBlob",
        {'text': text},
    ),
]

# spaCy is only benchmarked when its model could be loaded.
if nlp_spacy is not None:
    benchmarks.append(
        (
            "spaCy",
            "doc = nlp_spacy.make_doc(text); [t.text for t in doc]",
            "pass",
            {'text': text, 'nlp_spacy': nlp_spacy},
        )
    )

print("\n--- Comparing Framework Performance (Word Tokenization) using timeit ---")

# Maps framework label -> best per-call time in seconds, in benchmark order.
results = {}
for framework, stmt, setup, namespace in benchmarks:
    seconds_per_call = _best_seconds_per_call(stmt, setup, namespace)
    results[framework] = seconds_per_call
    print(f"{framework} Time: {seconds_per_call:.6f} seconds ")


# Assemble the comparison table: title, column header, a 25-dash rule, one
# row per framework in `results`, and a closing rule.
table_title = (
    f"--- Framework Performance Comparison "
    f"(Avg. Word Tokenization Time over {NUM_EXECUTIONS} executions) ---\n"
)
column_header = "{:<10} {:<15}\n".format("Framework", "Time (Seconds)")
rule = "-" * 25 + "\n"
table_rows = "".join(
    "{:<10} {:<15.6f}\n".format(framework, seconds)
    for framework, seconds in results.items()
)
compare_table = table_title + column_header + rule + table_rows + rule

# Save the table to disk (it is not printed in the cell output).
with open('time_compare.txt', 'w', encoding='utf-8') as compare_file:
    compare_file.write(compare_table)



spaCy model loaded (tokenization-only) successfully.

--- Comparing Framework Performance (Word Tokenization) using timeit ---
NLTK Time: 0.049032 seconds 
TextBlob Time: 0.070582 seconds 
spaCy Time: 0.100722 seconds 
