In [None]:
# Download the NLTK resources used in this notebook ('popular' collection plus the Punkt tokenizer data)
import os
import pathlib
import nltk
import json


# Store NLTK resources in an "nltk_data" folder inside the notebook's
# current working directory (created on first run, reused afterwards).
NOTEBOOK_DIR = pathlib.Path(".").resolve()
NLTK_DIR = NOTEBOOK_DIR.joinpath("nltk_data")
NLTK_DIR.mkdir(exist_ok=True)

# Put that folder at the front of NLTK's search path so it is checked first.
nltk.data.path.insert(0, str(NLTK_DIR))

# Fetch the resources into NLTK_DIR.
nltk.download("popular", download_dir=NLTK_DIR)
# Tokenizer models
nltk.download("punkt", download_dir=NLTK_DIR)
# Tokenizer lookup tables (needed in newer NLTK)
nltk.download("punkt_tab", download_dir=NLTK_DIR)


## Common nltk functions

### nltk.sent_tokenize
Splits a text into individual sentences using the Punkt tokenizer, which relies on trained punctuation and language rules.

In [16]:
# Sample paragraph, written as one single-line string literal.
text = "Backgammon is one of the oldest known board games. Its history goes back almost 5,000 years to archaeological discoveries in the Middle East. It is a two-player game in which each player has fifteen checkers that move between twenty-four points according to the roll of two dice."

# Split the paragraph into a list of sentences.
sentences = nltk.sent_tokenize(text)

# Pretty-printed JSON array of the sentences, then the sentence count on its own line.
print(
    json.dumps(sentences, ensure_ascii=False, indent=2),
    f"Number of sentences: {len(sentences)}",
    sep="\n",
)

[
  "Backgammon is one of the oldest known board games.",
  "Its history goes back almost 5,000 years to archaeological discoveries in the Middle East.",
  "It is a two-player game in which each player has fifteen checkers that move between twenty-four points according to the roll of two dice."
]
Number of sentences: 3


In [9]:
# Despite its name, text_line_list is a single str: the adjacent string
# literals inside the parentheses are concatenated into one value.
text_line_list = (
    "Backgammon is one of the oldest known board games. "
    "Its history goes back almost 5,000 years to archaeological discoveries in the Middle East. "
    "It is a two-player game in which each player has fifteen checkers that move between twenty-four points "
    "according to the roll of two dice."
)

# Split the paragraph into a list of sentences.
sentences_line_list = nltk.sent_tokenize(text_line_list)

# Pretty-printed JSON array of the sentences, then the sentence count on its own line.
print(
    json.dumps(sentences_line_list, ensure_ascii=False, indent=2),
    f"Number of sentences: {len(sentences_line_list)}",
    sep="\n",
)

[
  "Backgammon is one of the oldest known board games.",
  "Its history goes back almost 5,000 years to archaeological discoveries in the Middle East.",
  "It is a two-player game in which each player has fifteen checkers that move between twenty-four points according to the roll of two dice."
]
Number of sentences: 3


In [12]:
# Break the paragraph held in `text` into word and punctuation tokens.
words = nltk.word_tokenize(text)

# Compact one-line JSON dump of the tokens, then the token count on its own line.
print(
    json.dumps(words, ensure_ascii=False),
    f"Number of words: {len(words)}",
    sep="\n",
)


["Backgammon", "is", "one", "of", "the", "oldest", "known", "board", "games", ".", "Its", "history", "goes", "back", "almost", "5,000", "years", "to", "archaeological", "discoveries", "in", "the", "Middle", "East", ".", "It", "is", "a", "two-player", "game", "in", "which", "each", "player", "has", "fifteen", "checkers", "that", "move", "between", "twenty-four", "points", "according", "to", "the", "roll", "of", "two", "dice", "."]
Number of words: 50


- Stemmer: A tool that chops words down to their base form by heuristic rules (e.g., “running” → “run”, “studies” → “studi”). Fast, crude, may produce non-words.
- Lemmatizer: A tool that reduces words to their dictionary form (lemma) using vocabulary and grammar (e.g., “better” → “good”, “running” → “run”). Slower, more accurate, returns real words.

In [18]:
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.corpus import wordnet


def compare_stemmer_and_lemmatizer(stemmer, lemmatizer, word, pos):
    """Print the stem and the lemma of ``word`` on two labelled lines.

    Args:
        stemmer: Object exposing ``stem(word)``.
        lemmatizer: Object exposing ``lemmatize(word, pos)``.
        word: The word to reduce.
        pos: Part-of-speech tag handed to the lemmatizer.

    Returns:
        None. The results are written to standard output.
    """
    # Each result is printed as soon as it is computed: stem first, then lemma.
    stemmed = stemmer.stem(word)
    print(f"Stemmer: {stemmed}")

    lemma = lemmatizer.lemmatize(word, pos)
    print(f"Lemmatizer: {lemma}")


# One stemmer and one lemmatizer instance, shared by both comparisons.
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()

# Compare the two tools on "seen" and "drove", both tagged as verbs.
compare_stemmer_and_lemmatizer(
    stemmer=stemmer, lemmatizer=lemmatizer, word="seen", pos=wordnet.VERB
)
print()  # blank line between the two results
compare_stemmer_and_lemmatizer(
    stemmer=stemmer, lemmatizer=lemmatizer, word="drove", pos=wordnet.VERB
)


Stemmer: seen
Lemmatizer: see

Stemmer: drove
Lemmatizer: drive


## Stop words are words that are filtered out before or after processing the text.

When applying machine learning to text, these words can add a lot of noise. That’s why we want to remove these irrelevant words.

Stop words generally refer to the most common words like “and,” “the,” “a” in a language, but there is no single universal stop‑word list. The list of stop words can change depending on your application.

NLTK ships with a predefined list of stop words covering the most common words. If you are using it for the first time, you need to download the list with `nltk.download("stopwords")`. Once the download is complete, import `stopwords` from `nltk.corpus` and call `stopwords.words("english")` to load the English stop words.

In [22]:
from nltk.corpus import stopwords
# Load NLTK's English stop-word list and print it in full.
english_stop_word_list = stopwords.words("english")
print(english_stop_word_list)

['a', 'about', 'above', 'after', 'again', 'against', 'ain', 'all', 'am', 'an', 'and', 'any', 'are', 'aren', "aren't", 'as', 'at', 'be', 'because', 'been', 'before', 'being', 'below', 'between', 'both', 'but', 'by', 'can', 'couldn', "couldn't", 'd', 'did', 'didn', "didn't", 'do', 'does', 'doesn', "doesn't", 'doing', 'don', "don't", 'down', 'during', 'each', 'few', 'for', 'from', 'further', 'had', 'hadn', "hadn't", 'has', 'hasn', "hasn't", 'have', 'haven', "haven't", 'having', 'he', "he'd", "he'll", 'her', 'here', 'hers', 'herself', "he's", 'him', 'himself', 'his', 'how', 'i', "i'd", 'if', "i'll", "i'm", 'in', 'into', 'is', 'isn', "isn't", 'it', "it'd", "it'll", "it's", 'its', 'itself', "i've", 'just', 'll', 'm', 'ma', 'me', 'mightn', "mightn't", 'more', 'most', 'mustn', "mustn't", 'my', 'myself', 'needn', "needn't", 'no', 'nor', 'not', 'now', 'o', 'of', 'off', 'on', 'once', 'only', 'or', 'other', 'our', 'ours', 'ourselves', 'out', 'over', 'own', 're', 's', 'same', 'shan', "shan't", 'she

In [23]:
# Build the stop-word set once so every membership test below is a set lookup.
stop_words = set(stopwords.words("english"))

# NLTK's English stop words are lowercase, so compare each token's lowercase
# form; a case-sensitive test would let capitalised stop words such as "Its"
# and "It" slip through. Tokens that are kept retain their original casing.
without_stop_words = [word for word in words if word.lower() not in stop_words]
print(without_stop_words)

['Backgammon', 'one', 'oldest', 'known', 'board', 'games', '.', 'history', 'goes', 'back', 'almost', '5,000', 'years', 'archaeological', 'discoveries', 'Middle', 'East', '.', 'two-player', 'game', 'player', 'fifteen', 'checkers', 'move', 'twenty-four', 'points', 'according', 'roll', 'two', 'dice', '.']


Keep in mind that the list comprehension used here is generally faster than an equivalent explicit `for` loop that appends to a list, because list comprehensions are optimized so that Python’s interpreter can detect a predictable pattern during the loop.

You might wonder why we convert our list into a set. A set is an abstract data type that stores unique values in no particular order. Membership lookups in a set are much faster than in a list, because a list has to be scanned element by element while a set uses hashing. For a small number of words there’s not much difference, but if you have a large number of words, it’s strongly recommended to use a set.