# Study and exploration of NLTK

In [84]:
# %pip install nltk[all]

In [85]:
import nltk

In [86]:
# nltk.download('all')

### NLTK modules

corpora : a package containing modoles if example text<br>
wordnet : interface to the WordNet lexical resource<br>
chunk : identify short non-nested phrases in text

In [87]:
from nltk.book import *

In [88]:
# List all corpora in NLTK
import os
import nltk

corpora_dir = nltk.data.find("corpora")
all_corpora = os.listdir(corpora_dir)
all_corpora


['abc',
 'abc.zip',
 'alpino',
 'alpino.zip',
 'bcp47.zip',
 'biocreative_ppi',
 'biocreative_ppi.zip',
 'brown',
 'brown.zip',
 'brown_tei',
 'brown_tei.zip',
 'cess_cat',
 'cess_cat.zip',
 'cess_esp',
 'cess_esp.zip',
 'chat80',
 'chat80.zip',
 'city_database',
 'city_database.zip',
 'cmudict',
 'cmudict.zip',
 'comparative_sentences',
 'comparative_sentences.zip',
 'comtrans.zip',
 'conll2000',
 'conll2000.zip',
 'conll2002',
 'conll2002.zip',
 'conll2007.zip',
 'crubadan',
 'crubadan.zip',
 'dependency_treebank',
 'dependency_treebank.zip',
 'dolch',
 'dolch.zip',
 'english_wordnet',
 'english_wordnet.zip',
 'europarl_raw',
 'europarl_raw.zip',
 'extended_omw.zip',
 'floresta',
 'floresta.zip',
 'framenet_v15',
 'framenet_v15.zip',
 'framenet_v17',
 'framenet_v17.zip',
 'gazetteers',
 'gazetteers.zip',
 'genesis',
 'genesis.zip',
 'gutenberg',
 'gutenberg.zip',
 'ieer',
 'ieer.zip',
 'inaugural',
 'inaugural.zip',
 'indian',
 'indian.zip',
 'jeita.zip',
 'kimmo',
 'kimmo.zip',
 'kn

In [89]:
text1

<Text: Moby Dick by Herman Melville 1851>

In [90]:
sents()

sent1: Call me Ishmael .
sent2: The family of Dashwood had long been settled in Sussex .
sent3: In the beginning God created the heaven and the earth .
sent4: Fellow - Citizens of the Senate and of the House of Representatives :
sent5: I have a problem with people PMing me to lol JOIN
sent6: SCENE 1 : [ wind ] [ clop clop clop ] KING ARTHUR : Whoa there !
sent7: Pierre Vinken , 61 years old , will join the board as a nonexecutive director Nov. 29 .
sent8: 25 SEXY MALE , seeks attrac older single lady , for discreet encounters .
sent9: THE suburb of Saffron Park lay on the sunset side of London , as red and ragged as a cloud of sunset .


In [91]:
text7

<Text: Wall Street Journal>

In [92]:
sent7

['Pierre',
 'Vinken',
 ',',
 '61',
 'years',
 'old',
 ',',
 'will',
 'join',
 'the',
 'board',
 'as',
 'a',
 'nonexecutive',
 'director',
 'Nov.',
 '29',
 '.']

In [93]:
len(sent7)

18

In [94]:
len(text7)

100676

In [95]:
len(set(text7))

12408

In [96]:
from nltk.corpus import gutenberg

In [97]:
print(gutenberg.fileids())
hamlet = gutenberg.words('shakespeare-hamlet.txt')
print(len(hamlet))
print(hamlet[1:10])

['austen-emma.txt', 'austen-persuasion.txt', 'austen-sense.txt', 'bible-kjv.txt', 'blake-poems.txt', 'bryant-stories.txt', 'burgess-busterbrown.txt', 'carroll-alice.txt', 'chesterton-ball.txt', 'chesterton-brown.txt', 'chesterton-thursday.txt', 'edgeworth-parents.txt', 'melville-moby_dick.txt', 'milton-paradise.txt', 'shakespeare-caesar.txt', 'shakespeare-hamlet.txt', 'shakespeare-macbeth.txt', 'whitman-leaves.txt']
37360
['The', 'Tragedie', 'of', 'Hamlet', 'by', 'William', 'Shakespeare', '1599', ']']


String processing

In [98]:
sentence = "The quick brown fox jumps over the lazy dog. NLTK is a powerful library for natural language processing."
tokens = nltk.word_tokenize(sentence) # also can use - WordPunctTokenizer
print(tokens)

tagged = nltk.pos_tag(tokens)
print(tagged)

sens = nltk.sent_tokenize(sentence)
print(sens)

['The', 'quick', 'brown', 'fox', 'jumps', 'over', 'the', 'lazy', 'dog', '.', 'NLTK', 'is', 'a', 'powerful', 'library', 'for', 'natural', 'language', 'processing', '.']
[('The', 'DT'), ('quick', 'JJ'), ('brown', 'NN'), ('fox', 'NN'), ('jumps', 'VBZ'), ('over', 'IN'), ('the', 'DT'), ('lazy', 'JJ'), ('dog', 'NN'), ('.', '.'), ('NLTK', 'NNP'), ('is', 'VBZ'), ('a', 'DT'), ('powerful', 'JJ'), ('library', 'NN'), ('for', 'IN'), ('natural', 'JJ'), ('language', 'NN'), ('processing', 'NN'), ('.', '.')]
['The quick brown fox jumps over the lazy dog.', 'NLTK is a powerful library for natural language processing.']


Stemming

In [99]:
from nltk.stem.porter import PorterStemmer
# more such libraries - SnowballStemmer, LancasterStemmer, RegexpStemmer

Lemmatization

In [100]:
from nltk.stem import WordNetLemmatizer

POS tagging

In [101]:
text = "The striped bats are hanging on their feet for best"
nltk.pos_tag(nltk.word_tokenize(text))

[('The', 'DT'),
 ('striped', 'JJ'),
 ('bats', 'NNS'),
 ('are', 'VBP'),
 ('hanging', 'VBG'),
 ('on', 'IN'),
 ('their', 'PRP$'),
 ('feet', 'NNS'),
 ('for', 'IN'),
 ('best', 'JJS')]

## Tasks
1) Stopword removal
2) Tokenization (words,sentences,punctuation)
3) Stemming
4) Lemmatization
5) POS tagging (all)
6) Removing punctuation
7) Lowercasing
8) split
9) Additional functions
10) WordNet(anything other lexical module) - document submission

Explore any 2 corpus and execute suitable NLP commands to demonstrate Text-processing

In [102]:
from nltk.corpus import brown, reuters
brown_sample = ' '.join(brown.words()[:300])
reuters_sample = ' '.join(reuters.words()[:300])

### Task 1: Stopword Removal

In [103]:
from nltk.corpus import stopwords

stop_words = set(stopwords.words('english'))
print(f'Stopwords: {stop_words}')
brown_words = brown_sample.split()
reuters_words = reuters_sample.split()

brown_filtered = [w for w in brown_words if w.lower() not in stop_words]
reuters_filtered = [w for w in reuters_words if w.lower() not in stop_words]

print("Brown - Stopword Removal:")
print(f"Original: {len(brown_words)} words")
print(brown_words[:20])
print(f"After removal: {len(brown_filtered)} words")
print(brown_filtered[:20])

print("\nReuters - Stopword Removal:")
print(f"Original: {len(reuters_words)} words")
print(reuters_words[:20])
print(f"After removal: {len(reuters_filtered)} words")
print(reuters_filtered[:20])

Stopwords: {'about', "they'd", 'once', "she'll", 'then', 'all', "i'm", 'nor', "it'll", "they've", 'doing', 'weren', 'herself', 'again', 'haven', 'me', 't', 'when', 'y', 'a', 'their', 'it', 'does', 'ours', 'i', 'ain', 'hasn', 'only', 'ourselves', 'wasn', 'own', 'you', 'by', 'doesn', 'did', "needn't", 'below', 'over', 'at', 'yours', 'during', 'wouldn', 'such', "i'd", 'mustn', 'as', "it'd", 'most', 'just', 'because', 'they', 'any', "he's", "we'll", 'until', 'theirs', 'he', 'them', 'do', 'isn', 'itself', 'few', 'been', "don't", 'before', 'himself', 'what', 'that', 'into', 'yourselves', 'hadn', 'where', 'down', 'under', 'being', 'on', "isn't", 'hers', 'her', 're', "shan't", "weren't", 'these', 'more', 'my', 'm', 'its', 'couldn', 'which', 'were', 'o', 'd', "you'll", 'if', 'after', 'she', 'of', "we're", 'same', "wasn't", "we've", "she'd", 'aren', 'in', 'is', 'no', 'will', 'ma', 'whom', 'very', 'from', 'other', 'with', 'we', 'each', 'don', 'him', 've', 'his', "aren't", 'out', "he'll", "wouldn'

In [104]:
stop_words = ['up','down']
text = 'Hello! What is up?'
words = nltk.word_tokenize(text)

filtered = [w for w in words if w.lower() not in stop_words]
print(filtered)

['Hello', '!', 'What', 'is', '?']


### Task 2: Tokenization

In [105]:
from nltk.tokenize import word_tokenize, sent_tokenize

brown_word_tokens = word_tokenize(brown_sample)
brown_sent_tokens = sent_tokenize(brown_sample)

reuters_word_tokens = word_tokenize(reuters_sample)
reuters_sent_tokens = sent_tokenize(reuters_sample)

print("Brown - Tokenization:")
print(f"Word tokens: {len(brown_word_tokens)}")
print(f"Sentence tokens: {len(brown_sent_tokens)}")
print("First 15 word tokens:", brown_word_tokens[:15])

print("\nReuters - Tokenization:")
print(f"Word tokens: {len(reuters_word_tokens)}")
print(f"Sentence tokens: {len(reuters_sent_tokens)}")
print("First 15 word tokens:", reuters_word_tokens[:15])

Brown - Tokenization:
Word tokens: 302
Sentence tokens: 11
First 15 word tokens: ['The', 'Fulton', 'County', 'Grand', 'Jury', 'said', 'Friday', 'an', 'investigation', 'of', 'Atlanta', "'s", 'recent', 'primary', 'election']

Reuters - Tokenization:
Word tokens: 305
Sentence tokens: 20
First 15 word tokens: ['ASIAN', 'EXPORTERS', 'FEAR', 'DAMAGE', 'FROM', 'U', '.', 'S', '.-', 'JAPAN', 'RIFT', 'Mounting', 'trade', 'friction', 'between']


### Task 3: Stemming

In [106]:
from nltk.stem import PorterStemmer

stemmer = PorterStemmer()

brown_words_for_stem = [w for w in brown_word_tokens if w.isalpha()][:20]
reuters_words_for_stem = [w for w in reuters_word_tokens if w.isalpha()][:20]

brown_stemmed = [(w, stemmer.stem(w)) for w in brown_words_for_stem]
reuters_stemmed = [(w, stemmer.stem(w)) for w in reuters_words_for_stem]

print("Brown - Stemming:")
for original, stemmed in brown_stemmed[:10]:
    print(f"{original} -> {stemmed}")

print("\nReuters - Stemming:")
for original, stemmed in reuters_stemmed[:10]:
    print(f"{original} -> {stemmed}")

Brown - Stemming:
The -> the
Fulton -> fulton
County -> counti
Grand -> grand
Jury -> juri
said -> said
Friday -> friday
an -> an
investigation -> investig
of -> of

Reuters - Stemming:
ASIAN -> asian
EXPORTERS -> export
FEAR -> fear
DAMAGE -> damag
FROM -> from
U -> u
S -> s
JAPAN -> japan
RIFT -> rift
Mounting -> mount


### Task 4: Lemmatization

In [107]:
from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()

brown_lemmatized = [(w, lemmatizer.lemmatize(w)) for w in brown_words_for_stem]
reuters_lemmatized = [(w, lemmatizer.lemmatize(w)) for w in reuters_words_for_stem]

print("Brown - Lemmatization:")
for original, lemmatized in brown_lemmatized[:10]:
    print(f"{original} -> {lemmatized}")

print("\nReuters - Lemmatization:")
for original, lemmatized in reuters_lemmatized[:10]:
    print(f"{original} -> {lemmatized}")

Brown - Lemmatization:
The -> The
Fulton -> Fulton
County -> County
Grand -> Grand
Jury -> Jury
said -> said
Friday -> Friday
an -> an
investigation -> investigation
of -> of

Reuters - Lemmatization:
ASIAN -> ASIAN
EXPORTERS -> EXPORTERS
FEAR -> FEAR
DAMAGE -> DAMAGE
FROM -> FROM
U -> U
S -> S
JAPAN -> JAPAN
RIFT -> RIFT
Mounting -> Mounting


### Task 5: POS Tagging

In [108]:
from nltk import pos_tag

brown_pos = pos_tag(brown_word_tokens[:30])
reuters_pos = pos_tag(reuters_word_tokens[:30])

print("Brown - POS Tagging:")
print(brown_pos)

print("\nReuters - POS Tagging:")
print(reuters_pos)

Brown - POS Tagging:
[('The', 'DT'), ('Fulton', 'NNP'), ('County', 'NNP'), ('Grand', 'NNP'), ('Jury', 'NNP'), ('said', 'VBD'), ('Friday', 'NNP'), ('an', 'DT'), ('investigation', 'NN'), ('of', 'IN'), ('Atlanta', 'NNP'), ("'s", 'POS'), ('recent', 'JJ'), ('primary', 'JJ'), ('election', 'NN'), ('produced', 'VBD'), ('``', '``'), ('no', 'DT'), ('evidence', 'NN'), ('``', '``'), ('that', 'IN'), ('any', 'DT'), ('irregularities', 'NNS'), ('took', 'VBD'), ('place', 'NN'), ('.', '.'), ('The', 'DT'), ('jury', 'NN'), ('further', 'RB'), ('said', 'VBD')]

Reuters - POS Tagging:
[('ASIAN', 'NNP'), ('EXPORTERS', 'NNP'), ('FEAR', 'NNP'), ('DAMAGE', 'NNP'), ('FROM', 'NNP'), ('U', 'NNP'), ('.', '.'), ('S', 'NNP'), ('.-', 'JJ'), ('JAPAN', 'NNP'), ('RIFT', 'NNP'), ('Mounting', 'NNP'), ('trade', 'NN'), ('friction', 'NN'), ('between', 'IN'), ('the', 'DT'), ('U', 'NNP'), ('.', '.'), ('S', 'NNP'), ('.', '.'), ('And', 'CC'), ('Japan', 'NNP'), ('has', 'VBZ'), ('raised', 'VBN'), ('fears', 'NNS'), ('among', 'IN'), (

### Task 6: Removing Punctuation

In [109]:
import string

brown_no_punct = [w for w in brown_word_tokens if w not in string.punctuation]
reuters_no_punct = [w for w in reuters_word_tokens if w not in string.punctuation]

print("Brown - Removing Punctuation:")
print(f"Original: {len(brown_word_tokens)} tokens")
print(f"After removal: {len(brown_no_punct)} tokens")
print(brown_no_punct[:20])

print("\nReuters - Removing Punctuation:")
print(f"Original: {len(reuters_word_tokens)} tokens")
print(f"After removal: {len(reuters_no_punct)} tokens")
print(reuters_no_punct[:20])

Brown - Removing Punctuation:
Original: 302 tokens
After removal: 283 tokens
['The', 'Fulton', 'County', 'Grand', 'Jury', 'said', 'Friday', 'an', 'investigation', 'of', 'Atlanta', "'s", 'recent', 'primary', 'election', 'produced', '``', 'no', 'evidence', '``']

Reuters - Removing Punctuation:
Original: 305 tokens
After removal: 263 tokens
['ASIAN', 'EXPORTERS', 'FEAR', 'DAMAGE', 'FROM', 'U', 'S', '.-', 'JAPAN', 'RIFT', 'Mounting', 'trade', 'friction', 'between', 'the', 'U', 'S', 'And', 'Japan', 'has']


### Task 7: Lowercasing

In [110]:
brown_lower = [w.lower() for w in brown_word_tokens[:30]]
reuters_lower = [w.lower() for w in reuters_word_tokens[:30]]

print("Brown - Lowercasing:")
print("Original:", brown_word_tokens[:15])
print("Lowercased:", brown_lower[:15])

print("\nReuters - Lowercasing:")
print("Original:", reuters_word_tokens[:15])
print("Lowercased:", reuters_lower[:15])

Brown - Lowercasing:
Original: ['The', 'Fulton', 'County', 'Grand', 'Jury', 'said', 'Friday', 'an', 'investigation', 'of', 'Atlanta', "'s", 'recent', 'primary', 'election']
Lowercased: ['the', 'fulton', 'county', 'grand', 'jury', 'said', 'friday', 'an', 'investigation', 'of', 'atlanta', "'s", 'recent', 'primary', 'election']

Reuters - Lowercasing:
Original: ['ASIAN', 'EXPORTERS', 'FEAR', 'DAMAGE', 'FROM', 'U', '.', 'S', '.-', 'JAPAN', 'RIFT', 'Mounting', 'trade', 'friction', 'between']
Lowercased: ['asian', 'exporters', 'fear', 'damage', 'from', 'u', '.', 's', '.-', 'japan', 'rift', 'mounting', 'trade', 'friction', 'between']


### Task 8: Split

In [111]:
brown_split = brown_sample.split()
reuters_split = reuters_sample.split()

print("Brown - Split:")
print(f"Total tokens: {len(brown_split)}")
print(brown_split[:20])

print("\nReuters - Split:")
print(f"Total tokens: {len(reuters_split)}")
print(reuters_split[:20])

Brown - Split:
Total tokens: 300
['The', 'Fulton', 'County', 'Grand', 'Jury', 'said', 'Friday', 'an', 'investigation', 'of', "Atlanta's", 'recent', 'primary', 'election', 'produced', '``', 'no', 'evidence', "''", 'that']

Reuters - Split:
Total tokens: 300
['ASIAN', 'EXPORTERS', 'FEAR', 'DAMAGE', 'FROM', 'U', '.', 'S', '.-', 'JAPAN', 'RIFT', 'Mounting', 'trade', 'friction', 'between', 'the', 'U', '.', 'S', '.']


### Task 9: Addiional function

### Task 10: Wordnet