In [1]:
import nltk
import re
import time
import os
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize
from collections import Counter

# Setup
# Fetch the NLTK resources used below. Newer NLTK releases load the Punkt
# sentence model from 'punkt_tab' instead of 'punkt', so both are requested to
# keep this cell runnable on a fresh environment.
nltk.download('stopwords', quiet=True)
nltk.download('punkt', quiet=True)
nltk.download('punkt_tab', quiet=True)

# NOTE(review): the directory name is spelled "ntlk" (not "nltk"); it is kept
# as-is so existing output paths do not move.
output_dir = "ntlk_output"
os.makedirs(output_dir, exist_ok=True)

# Read and Clean Text
with open("alice29.txt", "r", encoding="utf-8", errors="ignore") as f:
    raw_text = f.read()

# Keep only letters and normalize to lowercase (input for word tokenization)
text = re.sub(r"[^A-Za-z\s]", " ", raw_text)
text = re.sub(r"\s+", " ", text).strip().lower()

# Process and Measure Time
# Sentence tokenization runs on the raw text: the cleaned text has had all
# punctuation removed, so it no longer contains any sentence boundaries and
# timing the tokenizer on it would not measure real sentence splitting.
# perf_counter() is used because it is a monotonic, high-resolution clock.
start_sent = time.perf_counter()
sentences = sent_tokenize(raw_text)
time_sent = time.perf_counter() - start_sent

# Measure Word Tokenization (on the cleaned, lowercase text)
start_word = time.perf_counter()
words = word_tokenize(text)
time_word = time.perf_counter() - start_word

# Filter words (remove stopwords and words of one or two characters)
stop_words = set(stopwords.words("english"))
clean_words = [w for w in words if w not in stop_words and len(w) > 2]

# Outputs
# Save cleaned text (space-separated filtered words)
with open(f"{output_dir}/nltk_cleaned.txt", "w", encoding="utf-8") as f:
    f.write(" ".join(clean_words))
print("Saved cleaned.txt")

# Save word list (one filtered word per line)
with open(f"{output_dir}/nltk_words.txt", "w", encoding="utf-8") as f:
    f.write("\n".join(clean_words))
print("Saved words.txt")

# Save Top 10 words as a tab-separated table
top10 = Counter(clean_words).most_common(10)
with open(f"{output_dir}/nltk_top10words.txt", "w", encoding="utf-8") as f:
    f.write("Word\tCount\n")
    for word, count in top10:
        f.write(f"{word}\t{count}\n")
print("Saved top10words.txt")

# Save Time Comparison
# This file lives in the working directory (not in output_dir) and is opened
# in "w" mode, so running this cell starts a fresh comparison table.
with open("timecompared.txt", "w", encoding="utf-8") as f:
    f.write("Framework\tOperation\tSeconds\n")
    f.write(f"NLTK\tsent_tokenize\t{time_sent:.6f}\n")
    f.write(f"NLTK\tword_tokenize\t{time_word:.6f}\n")
print("Saved timecompared.txt")

Saved cleaned.txt
Saved words.txt
Saved top10words.txt
Saved timecompared.txt


In [2]:
import spacy
import re
import time
import os
from collections import Counter
from spacy.lang.en.stop_words import STOP_WORDS

# Setup

nlp = spacy.load("en_core_web_sm")


# Increase max_length for large text
nlp.max_length = 2000000

output_dir = "spacy_output"
os.makedirs(output_dir, exist_ok=True)

# Read and Clean Text (Same as before)
with open("alice29.txt", "r", encoding="utf-8", errors="ignore") as f:
    raw_text = f.read()

# Keep only letters and normalize to lowercase (input for word tokenization)
text = re.sub(r"[^A-Za-z\s]", " ", raw_text)
text = re.sub(r"\s+", " ", text).strip().lower()

# Measure Sentence Tokenization
# A blank English pipeline with only the rule-based sentencizer is used so the
# timing covers sentence splitting alone.
nlp_sent = spacy.blank("en")
nlp_sent.add_pipe("sentencizer")
# Apply the same length limit as the main pipeline, since both process the
# same document.
nlp_sent.max_length = nlp.max_length

# Sentence splitting runs on the raw text: the cleaned text has had all
# punctuation removed, so it no longer contains any sentence boundaries.
# perf_counter() is used because it is a monotonic, high-resolution clock.
start_sent = time.perf_counter()
doc_sent = nlp_sent(raw_text)
sentences = list(doc_sent.sents)
time_sent = time.perf_counter() - start_sent

# Measure Word Tokenization (on the cleaned, lowercase text)
start_word = time.perf_counter()
doc = nlp.make_doc(text) # Tokenizer only
words = [token.text for token in doc]
time_word = time.perf_counter() - start_word

# Filter words (remove spaCy stop words and words of one or two characters)
clean_words = [w for w in words if w not in STOP_WORDS and len(w) > 2]

# Outputs

# Save cleaned text (space-separated filtered words)
with open(f"{output_dir}/spacy_cleaned.txt", "w", encoding="utf-8") as f:
    f.write(" ".join(clean_words))
print("Saved spacy_cleaned.txt")

# Save word list (one filtered word per line)
with open(f"{output_dir}/spacy_words.txt", "w", encoding="utf-8") as f:
    f.write("\n".join(clean_words))
print("Saved spacy_words.txt")

# Save Top 10 words as a tab-separated table
top10 = Counter(clean_words).most_common(10)
with open(f"{output_dir}/spacy_top10words.txt", "w", encoding="utf-8") as f:
    f.write("Word\tCount\n")
    for word, count in top10:
        f.write(f"{word}\t{count}\n")
print("Saved spacy_top10words.txt")

# Add Time Comparison to the shared file
# The comparison table is 'timecompared.txt' in the working directory (it is
# not inside any of the per-framework output directories). Rows from other
# frameworks are preserved; any SpaCy rows left by an earlier run of this cell
# are replaced, so re-running the cell does not duplicate entries.
compare_file = "timecompared.txt"
header_row = "Framework\tOperation\tSeconds"
file_existed = os.path.exists(compare_file)

kept_rows = []
if file_existed:
    with open(compare_file, "r", encoding="utf-8") as f:
        kept_rows = [
            row for row in f.read().splitlines()
            if row and row != header_row and not row.startswith("SpaCy\t")
        ]

new_rows = [
    f"SpaCy\tsent_tokenize\t{time_sent:.6f}",
    f"SpaCy\tword_tokenize\t{time_word:.6f}",
]

with open(compare_file, "w", encoding="utf-8") as f:
    for row in [header_row, *kept_rows, *new_rows]:
        f.write(row + "\n")

if file_existed:
    print(f"Appended SpaCy performance to {compare_file}")
else:
    print("Saved timecompared.txt")

Saved spacy_cleaned.txt
Saved spacy_words.txt
Saved spacy_top10words.txt
Appended SpaCy performance to timecompared.txt


In [3]:
from textblob import TextBlob
import re
import time
import os
from collections import Counter
import nltk
from nltk.corpus import stopwords

# Setup
# TextBlob relies on NLTK corpora. Newer NLTK releases load the Punkt sentence
# model from 'punkt_tab' instead of 'punkt', so both are requested to keep this
# cell runnable on a fresh environment.
nltk.download('stopwords', quiet=True)
nltk.download('punkt', quiet=True)
nltk.download('punkt_tab', quiet=True)

output_dir = "textblob_output"
os.makedirs(output_dir, exist_ok=True)

# Read and Clean Text
with open("alice29.txt", "r", encoding="utf-8", errors="ignore") as f:
    raw_text = f.read()

# Keep only letters and normalize to lowercase (input for word tokenization)
text = re.sub(r"[^A-Za-z\s]", " ", raw_text)
text = re.sub(r"\s+", " ", text).strip().lower()

# Measure Sentence Tokenization
# Sentence splitting runs on the raw text: the cleaned text has had all
# punctuation removed, so it no longer contains any sentence boundaries.
# perf_counter() is used because it is a monotonic, high-resolution clock.
start_sent = time.perf_counter()
blob_sent = TextBlob(raw_text)
sentences = blob_sent.sentences
time_sent = time.perf_counter() - start_sent

# Measure Word Tokenization (on the cleaned, lowercase text)
# A separate TextBlob is built so this timing does not reuse work done for the
# sentence measurement above.
start_word = time.perf_counter()
blob_word = TextBlob(text)
words = blob_word.words
time_word = time.perf_counter() - start_word

# Filter words (remove stopwords and words of one or two characters)
stop_words = set(stopwords.words("english"))
# Convert TextBlob Word objects to string for consistent filtering
clean_words = [str(w) for w in words if str(w) not in stop_words and len(w) > 2]

# Outputs

# Save cleaned text (space-separated filtered words)
with open(f"{output_dir}/textblob_cleaned.txt", "w", encoding="utf-8") as f:
    f.write(" ".join(clean_words))
print("Saved textblob_cleaned.txt")

# Save word list (one filtered word per line)
with open(f"{output_dir}/textblob_words.txt", "w", encoding="utf-8") as f:
    f.write("\n".join(clean_words))
print("Saved textblob_words.txt")

# Save Top 10 words as a tab-separated table
top10 = Counter(clean_words).most_common(10)
with open(f"{output_dir}/textblob_top10words.txt", "w", encoding="utf-8") as f:
    f.write("Word\tCount\n")
    for word, count in top10:
        f.write(f"{word}\t{count}\n")
print("Saved textblob_top10words.txt")

# Add Time Comparison to the shared file
# The comparison table is 'timecompared.txt' in the working directory (it is
# not inside any of the per-framework output directories). Rows from other
# frameworks are preserved; any TextBlob rows left by an earlier run of this
# cell are replaced, so re-running the cell does not duplicate entries.
compare_file = "timecompared.txt"
header_row = "Framework\tOperation\tSeconds"
file_existed = os.path.exists(compare_file)

kept_rows = []
if file_existed:
    with open(compare_file, "r", encoding="utf-8") as f:
        kept_rows = [
            row for row in f.read().splitlines()
            if row and row != header_row and not row.startswith("TextBlob\t")
        ]

new_rows = [
    f"TextBlob\tsent_tokenize\t{time_sent:.6f}",
    f"TextBlob\tword_tokenize\t{time_word:.6f}",
]

with open(compare_file, "w", encoding="utf-8") as f:
    for row in [header_row, *kept_rows, *new_rows]:
        f.write(row + "\n")

if file_existed:
    print(f"Appended TextBlob performance to {compare_file}")
else:
    print("Saved timecompared.txt")

Saved textblob_cleaned.txt
Saved textblob_words.txt
Saved textblob_top10words.txt
Appended TextBlob performance to timecompared.txt
