In [None]:
import re
from collections import Counter

import nltk as nl
import spacy as sp

# Text Tokenization Exercise

This exercise explores the challenges of splitting text into sentences and words when dealing with complex real-world text containing dates, amounts, URLs, emails, acronyms, and multi-word expressions.

## The Challenge

Given a text variable, split it into:
1. **Sentences** - logical units of meaning ending with terminal punctuation
2. **Words (tokens)** - individual meaningful units

In [1]:
# Sample text with challenging elements
text = """Dr. John Smith, Ph.D., earned $1,250.50 on Jan. 15, 2024, for his work at A.I. Corp. You can reach him at j.smith@ai-corp.co.uk or visit https://www.ai-corp.co.uk/team/dr-smith for more info. The U.S.A.-based company reported a 23.5% increase in Q3 revenue, totaling €2.5M."""

print("Original text:")
print(text)


Original text:
Dr. John Smith, Ph.D., earned $1,250.50 on Jan. 15, 2024, for his work at A.I. Corp. You can reach him at j.smith@ai-corp.co.uk or visit https://www.ai-corp.co.uk/team/dr-smith for more info. The U.S.A.-based company reported a 23.5% increase in Q3 revenue, totaling €2.5M.


In [30]:
# 1. Sentences

print("\nSENTENCES\n")
sentences = re.split(r"(?<!A\.I)(?<!Jan)(?:\.$|\. )", text)
for sentence in sentences:
    print(sentence)

print("\nWORDS\n")
words = text.split()
for word in words:
    print(word)


SENTENCES

Dr
John Smith, Ph.D., earned $1,250.50 on Jan. 15, 2024, for his work at A.I. Corp
You can reach him at j.smith@ai-corp.co.uk or visit https://www.ai-corp.co.uk/team/dr-smith for more info
The U.S.A.-based company reported a 23.5% increase in Q3 revenue, totaling €2.5M


WORDS

Dr.
John
Smith,
Ph.D.,
earned
$1,250.50
on
Jan.
15,
2024,
for
his
work
at
A.I.
Corp.
You
can
reach
him
at
j.smith@ai-corp.co.uk
or
visit
https://www.ai-corp.co.uk/team/dr-smith
for
more
info.
The
U.S.A.-based
company
reported
a
23.5%
increase
in
Q3
revenue,
totaling
€2.5M.


# Corpus Tokenization Exercise

This exercise explores the challenges of splitting words in large corpuses and find the most common words. 

## The Challenge

Given a file `shakes.txt` in the book folder. Find the words that are more common in Shakespeare's book. 

In [38]:
with open("book\shakes.txt", "r") as f:
    content = f.read()

words = []
for w in content.split():
    w.lstrip('\ufeff')
    w.strip(r"[\.,]") 
    
    words.append(w)


words_counts = Counter(words)

n = 10
most_common_words = words_counts.most_common(n)

for word, count in most_common_words:
    print(f"{word}: {count}")

  with open("book\shakes.txt", "r") as f:


the: 23407
I: 19540
and: 18358
to: 15682
of: 15649
a: 12586
my: 10824
in: 9633
you: 9129
is: 7874
