# Text preprocessings in NLP

In [9]:
%pip install nltk
%pip install numpy

Note: you may need to restart the kernel to use updated packages.
Collecting numpy
  Downloading numpy-2.0.0-cp39-cp39-win_amd64.whl.metadata (60 kB)
     ---------------------------------------- 0.0/60.9 kB ? eta -:--:--
     ------------------------- ------------ 41.0/60.9 kB 991.0 kB/s eta 0:00:01
     -------------------------------------- 60.9/60.9 kB 649.5 kB/s eta 0:00:00
Downloading numpy-2.0.0-cp39-cp39-win_amd64.whl (16.5 MB)
   ---------------------------------------- 0.0/16.5 MB ? eta -:--:--
   ---------------------------------------- 0.1/16.5 MB 3.0 MB/s eta 0:00:06
    --------------------------------------- 0.3/16.5 MB 3.5 MB/s eta 0:00:05
   - -------------------------------------- 0.6/16.5 MB 4.4 MB/s eta 0:00:04
   -- ------------------------------------- 1.0/16.5 MB 5.6 MB/s eta 0:00:03
   --- ------------------------------------ 1.4/16.5 MB 6.3 MB/s eta 0:00:03
   --- ------------------------------------ 1.5/16.5 MB 5.8 MB/s eta 0:00:03
   --- ---------------------

## Basics

In [2]:
import nltk
from nltk.corpus import gutenberg

# Load a corpus
nltk.download('gutenberg')
nltk.download('punkt')
corpus = gutenberg.words('austen-emma.txt')

# Display the first 10 words
print(corpus[:10])

# Create a vocabulary
vocabulary = set(corpus)
print(f"Vocabulary size: {len(vocabulary)}")
print(list(vocabulary)[:10])


['[', 'Emma', 'by', 'Jane', 'Austen', '1816', ']', 'VOLUME', 'I', 'CHAPTER']
Vocabulary size: 7811
['genuine', 'unexamined', 'entirely', 'long', '.', 'speedy', 'Astonished', 'prudence', 'hourly', 'stooping']


[nltk_data] Downloading package gutenberg to
[nltk_data]     C:\Users\debap\AppData\Roaming\nltk_data...
[nltk_data]   Package gutenberg is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\debap\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


## Tokenization examples

In [3]:
from nltk.tokenize import word_tokenize, sent_tokenize

# Sample text
text = "NLP is fascinating. It has many applications."

# Word Tokenization
word_tokens = word_tokenize(text)
print(f"Word Tokens: {word_tokens}")

# Sentence Tokenization
sent_tokens = sent_tokenize(text)
print(f"Sentence Tokens: {sent_tokens}")


Word Tokens: ['NLP', 'is', 'fascinating', '.', 'It', 'has', 'many', 'applications', '.']
Sentence Tokens: ['NLP is fascinating.', 'It has many applications.']


## Stemming techniques

In [4]:
from nltk.stem import PorterStemmer, LancasterStemmer, SnowballStemmer

# Sample words
words = ["running", "jumps", "easily", "happiness"]

# Porter Stemmer
porter = PorterStemmer()
print("Porter Stemmer Results:", [porter.stem(word) for word in words])

# Lancaster Stemmer
lancaster = LancasterStemmer()
print("Lancaster Stemmer Results:", [lancaster.stem(word) for word in words])

# Snowball Stemmer
snowball = SnowballStemmer(language='english')
print("Snowball Stemmer Results:", [snowball.stem(word) for word in words])


Porter Stemmer Results: ['run', 'jump', 'easili', 'happi']
Lancaster Stemmer Results: ['run', 'jump', 'easy', 'happy']
Snowball Stemmer Results: ['run', 'jump', 'easili', 'happi']


## Lemmatization

In [5]:
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet

# Download WordNet data
nltk.download('wordnet')
nltk.download('omw-1.4')

# Sample words
words = [("running", "v"), ("better", "a"), ("happier", "a")]

# Initialize Lemmatizer
lemmatizer = WordNetLemmatizer()

# Lemmatize words
lemmatized_words = [lemmatizer.lemmatize(word, pos) for word, pos in words]
print(f"Lemmatized Words: {lemmatized_words}")


[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\debap\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\debap\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


Lemmatized Words: ['run', 'good', 'happy']


## Stopwords and POS tagging

In [6]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk import pos_tag

# Sample text
text = "The quick brown fox jumps over the lazy dog."

# Tokenize text
words = word_tokenize(text)

# Remove stopwords
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))
filtered_words = [word for word in words if word.lower() not in stop_words]
print(f"Filtered Words: {filtered_words}")

# POS Tagging
nltk.download('averaged_perceptron_tagger')
pos_tags = pos_tag(filtered_words)
print(f"POS Tags: {pos_tags}")

Filtered Words: ['quick', 'brown', 'fox', 'jumps', 'lazy', 'dog', '.']
POS Tags: [('quick', 'JJ'), ('brown', 'NN'), ('fox', 'NN'), ('jumps', 'NNS'), ('lazy', 'JJ'), ('dog', 'NN'), ('.', '.')]


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\debap\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\debap\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


## Named Entity Recognition (NER) tagging

In [10]:
import nltk
from nltk import word_tokenize, pos_tag, ne_chunk

# Sample text
text = "Barack Obama was born on August 4, 1961, in Honolulu, Hawaii."

# Tokenize and POS tag the text
words = word_tokenize(text)
pos_tags = pos_tag(words)

# Perform NER tagging
nltk.download('maxent_ne_chunker')
nltk.download('words')
ner_tree = ne_chunk(pos_tags)

print(ner_tree)

[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     C:\Users\debap\AppData\Roaming\nltk_data...
[nltk_data]   Package maxent_ne_chunker is already up-to-date!
[nltk_data] Downloading package words to
[nltk_data]     C:\Users\debap\AppData\Roaming\nltk_data...
[nltk_data]   Package words is already up-to-date!


(S
  (PERSON Barack/NNP)
  (PERSON Obama/NNP)
  was/VBD
  born/VBN
  on/IN
  August/NNP
  4/CD
  ,/,
  1961/CD
  ,/,
  in/IN
  (GPE Honolulu/NNP)
  ,/,
  (GPE Hawaii/NNP)
  ./.)
