In [None]:
import urllib.request

# Address of the document analysed by the cells below: a plain-text file
# served from Project Gutenberg's ebook cache (presumably ebook #10, going
# by the path -- confirm before relying on it).
URL = "https://www.gutenberg.org/cache/epub/10/pg10.txt"

def fetch_text(url: str, timeout: float = 30.0) -> str:
    """Download the resource at ``url`` and return it as a decoded string.

    Uses only ``urllib`` (no external programs) and sends a browser-like
    ``User-Agent`` header with the request.

    Args:
        url: Address of the resource to download.
        timeout: Seconds to wait on a blocking network operation before
            giving up, so an unresponsive server cannot hang the caller
            forever.

    Returns:
        The response body decoded as UTF-8 with a leading byte-order mark
        removed if one is present. Bodies that are not valid UTF-8 are
        decoded as Latin-1 instead.

    Raises:
        urllib.error.URLError: If the request cannot be completed.
    """
    req = urllib.request.Request(url, headers={"User-Agent": "Mozilla/5.0"})
    with urllib.request.urlopen(req, timeout=timeout) as resp:
        raw = resp.read()
    try:
        # "utf-8-sig" is ordinary UTF-8 that additionally drops a leading BOM,
        # so the returned text never starts with a stray "\ufeff" character.
        return raw.decode("utf-8-sig")
    except UnicodeDecodeError:
        # Latin-1 maps every possible byte to a character, so this fallback
        # always succeeds (at the cost of mis-rendering non-Latin-1 data).
        return raw.decode("latin-1")

# Download the document once; the remaining cells all work on `pg10`.
pg10 = fetch_text(URL)
print("Downloaded characters:", len(pg10))
# Show the first 200 characters on a single output line by rendering each
# newline as the two visible characters backslash + n.
print("Preview:", "\\n".join(pg10[:200].split("\n")))

In [None]:
def iter_words(s: str):
    """Yield each maximal run of alphabetic characters found in ``s``.

    A character belongs to a word when ``str.isalpha()`` is true for it, so
    letters outside A-Z/a-z (for example accented ones) count as well.
    Digits, punctuation and whitespace all act as separators and are never
    part of a yielded word.
    """
    start = None  # index where the current word began, or None between words
    for pos, ch in enumerate(s):
        if ch.isalpha():
            if start is None:
                start = pos
        elif start is not None:
            # First non-letter after a word: emit the word that just ended.
            yield s[start:pos]
            start = None
    # The text may end in the middle of a word; emit that final word too.
    if start is not None:
        yield s[start:]

# Every word found in pg10, lowercased so that later counts and comparisons
# are case-insensitive.
words_lower = list(map(str.lower, iter_words(pg10)))
print("Sample words:", words_lower[:20])

In [None]:
# Count lines with str.splitlines(): it drops the line terminators and treats
# "\r\n" as a single break, so the file's newline convention does not matter.
# Note that splitlines() also breaks on other line boundaries Python
# recognises (e.g. form feed), not only on "\n".
line_count = len(pg10.splitlines())
print("Line count:", line_count)

In [None]:
# Total number of words in the text, duplicates included. Each entry of
# words_lower is one run of alphabetic characters produced by iter_words().
word_count = len(words_lower)
print("Word count (alpha sequences):", word_count)

In [None]:
# words_lower is already lowercased, so an exact match against "apostle" is a
# case-insensitive whole-word count (it does not match "apostles").
apostle_count = words_lower.count("apostle")
print('Occurrences of "apostle" (case-insensitive):', apostle_count)

In [None]:
from collections import Counter

# Tally how often each lowercased word occurs. Counter is a dict subclass, so
# `freq` still supports the usual dict operations (freq[word], .items(), ...).
freq = Counter(words_lower)

# most_common(1) yields the top (word, count) pair; words with equal counts
# are ordered by first appearance, so a tie goes to the word seen first in
# the text. It returns an empty list when there are no words at all, in which
# case we report (None, 0).
most_common_word, most_common_count = (freq.most_common(1) or [(None, 0)])[0]

print("Most common word (case-insensitive):", most_common_word)
print("Its count:", most_common_count)

In [None]:
# Final summary of the results computed in the cells above. The string must
# be an f-string: without the `f` prefix the {placeholders} are printed
# literally instead of being replaced by the values.
print(f"""
ANSWERS
-------
Lines: {line_count}
Words (alpha sequences): {word_count}
"apostle" count (case-insensitive): {apostle_count}
Most common word (case-insensitive): {most_common_word} ({most_common_count} times)
""")