# Study and exploration of NLTK

In [1]:
# %pip install nltk[all]

In [2]:
import nltk

In [3]:
# nltk.download('all')

### NLTK modules

corpora : a package containing modoles if example text<br>
wordnet : interface to the WordNet lexical resource<br>
chunk : identify short non-nested phrases in text

In [4]:
from nltk.book import *

*** Introductory Examples for the NLTK Book ***
Loading text1, ..., text9 and sent1, ..., sent9
Type the name of the text or sentence to view it.
Type: 'texts()' or 'sents()' to list the materials.
text1: Moby Dick by Herman Melville 1851
text2: Sense and Sensibility by Jane Austen 1811
text3: The Book of Genesis
text4: Inaugural Address Corpus
text5: Chat Corpus
text6: Monty Python and the Holy Grail
text7: Wall Street Journal
text8: Personals Corpus
text9: The Man Who Was Thursday by G . K . Chesterton 1908


In [5]:
# List all corpora in NLTK
import os
import nltk

corpora_dir = nltk.data.find("corpora")
all_corpora = os.listdir(corpora_dir)
all_corpora


['abc',
 'abc.zip',
 'alpino',
 'alpino.zip',
 'bcp47.zip',
 'biocreative_ppi',
 'biocreative_ppi.zip',
 'brown',
 'brown.zip',
 'brown_tei',
 'brown_tei.zip',
 'cess_cat',
 'cess_cat.zip',
 'cess_esp',
 'cess_esp.zip',
 'chat80',
 'chat80.zip',
 'city_database',
 'city_database.zip',
 'cmudict',
 'cmudict.zip',
 'comparative_sentences',
 'comparative_sentences.zip',
 'comtrans.zip',
 'conll2000',
 'conll2000.zip',
 'conll2002',
 'conll2002.zip',
 'conll2007.zip',
 'crubadan',
 'crubadan.zip',
 'dependency_treebank',
 'dependency_treebank.zip',
 'dolch',
 'dolch.zip',
 'english_wordnet',
 'english_wordnet.zip',
 'europarl_raw',
 'europarl_raw.zip',
 'extended_omw.zip',
 'floresta',
 'floresta.zip',
 'framenet_v15',
 'framenet_v15.zip',
 'framenet_v17',
 'framenet_v17.zip',
 'gazetteers',
 'gazetteers.zip',
 'genesis',
 'genesis.zip',
 'gutenberg',
 'gutenberg.zip',
 'ieer',
 'ieer.zip',
 'inaugural',
 'inaugural.zip',
 'indian',
 'indian.zip',
 'jeita.zip',
 'kimmo',
 'kimmo.zip',
 'kn

In [6]:
text1

<Text: Moby Dick by Herman Melville 1851>

In [7]:
sents()

sent1: Call me Ishmael .
sent2: The family of Dashwood had long been settled in Sussex .
sent3: In the beginning God created the heaven and the earth .
sent4: Fellow - Citizens of the Senate and of the House of Representatives :
sent5: I have a problem with people PMing me to lol JOIN
sent6: SCENE 1 : [ wind ] [ clop clop clop ] KING ARTHUR : Whoa there !
sent7: Pierre Vinken , 61 years old , will join the board as a nonexecutive director Nov. 29 .
sent8: 25 SEXY MALE , seeks attrac older single lady , for discreet encounters .
sent9: THE suburb of Saffron Park lay on the sunset side of London , as red and ragged as a cloud of sunset .


In [8]:
text7

<Text: Wall Street Journal>

In [9]:
sent7

['Pierre',
 'Vinken',
 ',',
 '61',
 'years',
 'old',
 ',',
 'will',
 'join',
 'the',
 'board',
 'as',
 'a',
 'nonexecutive',
 'director',
 'Nov.',
 '29',
 '.']

In [10]:
len(sent7)

18

In [11]:
len(text7)

100676

In [12]:
len(set(text7))

12408

In [13]:
from nltk.corpus import gutenberg

In [14]:
print(gutenberg.fileids())
hamlet = gutenberg.words('shakespeare-hamlet.txt')
print(len(hamlet))
print(hamlet[1:10])

['austen-emma.txt', 'austen-persuasion.txt', 'austen-sense.txt', 'bible-kjv.txt', 'blake-poems.txt', 'bryant-stories.txt', 'burgess-busterbrown.txt', 'carroll-alice.txt', 'chesterton-ball.txt', 'chesterton-brown.txt', 'chesterton-thursday.txt', 'edgeworth-parents.txt', 'melville-moby_dick.txt', 'milton-paradise.txt', 'shakespeare-caesar.txt', 'shakespeare-hamlet.txt', 'shakespeare-macbeth.txt', 'whitman-leaves.txt']
37360
['The', 'Tragedie', 'of', 'Hamlet', 'by', 'William', 'Shakespeare', '1599', ']']


String processing

In [15]:
sentence = "The quick brown fox jumps over the lazy dog. NLTK is a powerful library for natural language processing."
tokens = nltk.word_tokenize(sentence) # also can use - WordPunctTokenizer
print(tokens)

tagged = nltk.pos_tag(tokens)
print(tagged)

sens = nltk.sent_tokenize(sentence)
print(sens)

['The', 'quick', 'brown', 'fox', 'jumps', 'over', 'the', 'lazy', 'dog', '.', 'NLTK', 'is', 'a', 'powerful', 'library', 'for', 'natural', 'language', 'processing', '.']
[('The', 'DT'), ('quick', 'JJ'), ('brown', 'NN'), ('fox', 'NN'), ('jumps', 'VBZ'), ('over', 'IN'), ('the', 'DT'), ('lazy', 'JJ'), ('dog', 'NN'), ('.', '.'), ('NLTK', 'NNP'), ('is', 'VBZ'), ('a', 'DT'), ('powerful', 'JJ'), ('library', 'NN'), ('for', 'IN'), ('natural', 'JJ'), ('language', 'NN'), ('processing', 'NN'), ('.', '.')]
['The quick brown fox jumps over the lazy dog.', 'NLTK is a powerful library for natural language processing.']


Stemming

In [16]:
from nltk.stem.porter import PorterStemmer
# more such libraries - SnowballStemmer, LancasterStemmer, RegexpStemmer

Lemmatization

In [17]:
from nltk.stem import WordNetLemmatizer

POS tagging

In [18]:
text = "The striped bats are hanging on their feet for best"
nltk.pos_tag(nltk.word_tokenize(text))

[('The', 'DT'),
 ('striped', 'JJ'),
 ('bats', 'NNS'),
 ('are', 'VBP'),
 ('hanging', 'VBG'),
 ('on', 'IN'),
 ('their', 'PRP$'),
 ('feet', 'NNS'),
 ('for', 'IN'),
 ('best', 'JJS')]

## Tasks
1) Stopword removal
2) Tokenization (words,sentences,punctuation)
3) Stemming
4) Lemmatization
5) POS tagging (all)
6) Removing punctuation
7) Lowercasing
8) split
9) Additional functions
10) WordNet(anything other lexical module) - document submission

Explore any 2 corpus and execute suitable NLP commands to demonstrate Text-processing

In [19]:
from nltk.corpus import brown, reuters
brown_sample = ' '.join(brown.words()[:300])
reuters_sample = ' '.join(reuters.words()[:300])

### Task 1: Stopword Removal

In [20]:
from nltk.corpus import stopwords

stop_words = set(stopwords.words('english'))
print(f'Stopwords: {stop_words}')
brown_words = brown_sample.split()
reuters_words = reuters_sample.split()

brown_filtered = [w for w in brown_words if w.lower() not in stop_words]
reuters_filtered = [w for w in reuters_words if w.lower() not in stop_words]

print("Brown - Stopword Removal:")
print(f"Original: {len(brown_words)} words")
print(brown_words[:20])
print(f"After removal: {len(brown_filtered)} words")
print(brown_filtered[:20])

print("\nReuters - Stopword Removal:")
print(f"Original: {len(reuters_words)} words")
print(reuters_words[:20])
print(f"After removal: {len(reuters_filtered)} words")
print(reuters_filtered[:20])

Stopwords: {"mustn't", 'ma', 'as', 'needn', 'wouldn', 'did', 'more', "she'd", 'and', 'he', 'has', "he's", 'does', 'this', 'its', 'theirs', 'didn', 'mustn', "haven't", 'can', 'that', 'were', 'aren', 'but', 'a', "couldn't", 'most', 'until', 'm', 'hers', "weren't", "hasn't", 'to', 'we', 'his', "that'll", 'yourself', 'if', 's', 'an', "needn't", 'themselves', "shouldn't", "they'll", "we're", 'too', 'wasn', 'out', 'these', "you've", "don't", "it's", 'have', 'just', 'ourselves', 'shan', "you'd", 'it', 'at', 'been', "you're", 'me', 'again', 'was', 'will', 'against', "hadn't", 't', 'under', 'haven', 'my', 'below', 'shouldn', 'some', 'hasn', 've', "wasn't", 'they', 'what', 'whom', 'who', 'down', "he'd", 'own', 'should', 'not', "wouldn't", 'be', 'such', 'which', 'than', "we'd", "doesn't", 'through', 'myself', 'you', 'him', 'off', "i'd", 'am', "i'll", 'above', 'how', 'isn', "they're", 'there', 'with', "you'll", 'by', 'over', 'both', 'no', 'weren', 'only', 'or', 'your', 'on', 'while', 'where', 'll'

In [21]:
stop_words = ['up','down']
text = 'Hello! What is up?'
words = nltk.word_tokenize(text)

filtered = [w for w in words if w.lower() not in stop_words]
print(filtered)

['Hello', '!', 'What', 'is', '?']


### Task 2: Tokenization

In [22]:
from nltk.tokenize import word_tokenize, sent_tokenize

brown_word_tokens = word_tokenize(brown_sample)
brown_sent_tokens = sent_tokenize(brown_sample)

reuters_word_tokens = word_tokenize(reuters_sample)
reuters_sent_tokens = sent_tokenize(reuters_sample)

print("Brown - Tokenization:")
print(f"Word tokens: {len(brown_word_tokens)}")
print(f"Sentence tokens: {len(brown_sent_tokens)}")
print("First 15 word tokens:", brown_word_tokens[:15])

print("\nReuters - Tokenization:")
print(f"Word tokens: {len(reuters_word_tokens)}")
print(f"Sentence tokens: {len(reuters_sent_tokens)}")
print("First 15 word tokens:", reuters_word_tokens[:15])

Brown - Tokenization:
Word tokens: 302
Sentence tokens: 11
First 15 word tokens: ['The', 'Fulton', 'County', 'Grand', 'Jury', 'said', 'Friday', 'an', 'investigation', 'of', 'Atlanta', "'s", 'recent', 'primary', 'election']

Reuters - Tokenization:
Word tokens: 305
Sentence tokens: 20
First 15 word tokens: ['ASIAN', 'EXPORTERS', 'FEAR', 'DAMAGE', 'FROM', 'U', '.', 'S', '.-', 'JAPAN', 'RIFT', 'Mounting', 'trade', 'friction', 'between']


### Task 3: Stemming

In [23]:
from nltk.stem import PorterStemmer

stemmer = PorterStemmer()

brown_words_for_stem = [w for w in brown_word_tokens if w.isalpha()][:20]
reuters_words_for_stem = [w for w in reuters_word_tokens if w.isalpha()][:20]

brown_stemmed = [(w, stemmer.stem(w)) for w in brown_words_for_stem]
reuters_stemmed = [(w, stemmer.stem(w)) for w in reuters_words_for_stem]

print("Brown - Stemming:")
for original, stemmed in brown_stemmed[:10]:
    print(f"{original} -> {stemmed}")

print("\nReuters - Stemming:")
for original, stemmed in reuters_stemmed[:10]:
    print(f"{original} -> {stemmed}")

Brown - Stemming:
The -> the
Fulton -> fulton
County -> counti
Grand -> grand
Jury -> juri
said -> said
Friday -> friday
an -> an
investigation -> investig
of -> of

Reuters - Stemming:
ASIAN -> asian
EXPORTERS -> export
FEAR -> fear
DAMAGE -> damag
FROM -> from
U -> u
S -> s
JAPAN -> japan
RIFT -> rift
Mounting -> mount


### Task 4: Lemmatization

In [24]:
from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()

brown_lemmatized = [(w, lemmatizer.lemmatize(w)) for w in brown_words_for_stem]
reuters_lemmatized = [(w, lemmatizer.lemmatize(w)) for w in reuters_words_for_stem]

print("Brown - Lemmatization:")
for original, lemmatized in brown_lemmatized[:10]:
    print(f"{original} -> {lemmatized}")

print("\nReuters - Lemmatization:")
for original, lemmatized in reuters_lemmatized[:10]:
    print(f"{original} -> {lemmatized}")

Brown - Lemmatization:
The -> The
Fulton -> Fulton
County -> County
Grand -> Grand
Jury -> Jury
said -> said
Friday -> Friday
an -> an
investigation -> investigation
of -> of

Reuters - Lemmatization:
ASIAN -> ASIAN
EXPORTERS -> EXPORTERS
FEAR -> FEAR
DAMAGE -> DAMAGE
FROM -> FROM
U -> U
S -> S
JAPAN -> JAPAN
RIFT -> RIFT
Mounting -> Mounting


### Task 5: POS Tagging

In [25]:
from nltk import pos_tag

brown_pos = pos_tag(brown_word_tokens[:30])
reuters_pos = pos_tag(reuters_word_tokens[:30])

print("Brown - POS Tagging:")
print(brown_pos)

print("\nReuters - POS Tagging:")
print(reuters_pos)

Brown - POS Tagging:
[('The', 'DT'), ('Fulton', 'NNP'), ('County', 'NNP'), ('Grand', 'NNP'), ('Jury', 'NNP'), ('said', 'VBD'), ('Friday', 'NNP'), ('an', 'DT'), ('investigation', 'NN'), ('of', 'IN'), ('Atlanta', 'NNP'), ("'s", 'POS'), ('recent', 'JJ'), ('primary', 'JJ'), ('election', 'NN'), ('produced', 'VBD'), ('``', '``'), ('no', 'DT'), ('evidence', 'NN'), ('``', '``'), ('that', 'IN'), ('any', 'DT'), ('irregularities', 'NNS'), ('took', 'VBD'), ('place', 'NN'), ('.', '.'), ('The', 'DT'), ('jury', 'NN'), ('further', 'RB'), ('said', 'VBD')]

Reuters - POS Tagging:
[('ASIAN', 'NNP'), ('EXPORTERS', 'NNP'), ('FEAR', 'NNP'), ('DAMAGE', 'NNP'), ('FROM', 'NNP'), ('U', 'NNP'), ('.', '.'), ('S', 'NNP'), ('.-', 'JJ'), ('JAPAN', 'NNP'), ('RIFT', 'NNP'), ('Mounting', 'NNP'), ('trade', 'NN'), ('friction', 'NN'), ('between', 'IN'), ('the', 'DT'), ('U', 'NNP'), ('.', '.'), ('S', 'NNP'), ('.', '.'), ('And', 'CC'), ('Japan', 'NNP'), ('has', 'VBZ'), ('raised', 'VBN'), ('fears', 'NNS'), ('among', 'IN'), (

### Task 6: Removing Punctuation

In [26]:
import string

brown_no_punct = [w for w in brown_word_tokens if w not in string.punctuation]
reuters_no_punct = [w for w in reuters_word_tokens if w not in string.punctuation]

print("Brown - Removing Punctuation:")
print(f"Original: {len(brown_word_tokens)} tokens")
print(f"After removal: {len(brown_no_punct)} tokens")
print(brown_no_punct[:20])

print("\nReuters - Removing Punctuation:")
print(f"Original: {len(reuters_word_tokens)} tokens")
print(f"After removal: {len(reuters_no_punct)} tokens")
print(reuters_no_punct[:20])

Brown - Removing Punctuation:
Original: 302 tokens
After removal: 283 tokens
['The', 'Fulton', 'County', 'Grand', 'Jury', 'said', 'Friday', 'an', 'investigation', 'of', 'Atlanta', "'s", 'recent', 'primary', 'election', 'produced', '``', 'no', 'evidence', '``']

Reuters - Removing Punctuation:
Original: 305 tokens
After removal: 263 tokens
['ASIAN', 'EXPORTERS', 'FEAR', 'DAMAGE', 'FROM', 'U', 'S', '.-', 'JAPAN', 'RIFT', 'Mounting', 'trade', 'friction', 'between', 'the', 'U', 'S', 'And', 'Japan', 'has']


### Task 7: Lowercasing

In [27]:
brown_lower = [w.lower() for w in brown_word_tokens[:30]]
reuters_lower = [w.lower() for w in reuters_word_tokens[:30]]

print("Brown - Lowercasing:")
print("Original:", brown_word_tokens[:15])
print("Lowercased:", brown_lower[:15])

print("\nReuters - Lowercasing:")
print("Original:", reuters_word_tokens[:15])
print("Lowercased:", reuters_lower[:15])

Brown - Lowercasing:
Original: ['The', 'Fulton', 'County', 'Grand', 'Jury', 'said', 'Friday', 'an', 'investigation', 'of', 'Atlanta', "'s", 'recent', 'primary', 'election']
Lowercased: ['the', 'fulton', 'county', 'grand', 'jury', 'said', 'friday', 'an', 'investigation', 'of', 'atlanta', "'s", 'recent', 'primary', 'election']

Reuters - Lowercasing:
Original: ['ASIAN', 'EXPORTERS', 'FEAR', 'DAMAGE', 'FROM', 'U', '.', 'S', '.-', 'JAPAN', 'RIFT', 'Mounting', 'trade', 'friction', 'between']
Lowercased: ['asian', 'exporters', 'fear', 'damage', 'from', 'u', '.', 's', '.-', 'japan', 'rift', 'mounting', 'trade', 'friction', 'between']


### Task 8: Split

In [28]:
brown_split = brown_sample.split()
reuters_split = reuters_sample.split()

print("Brown - Split:")
print(f"Total tokens: {len(brown_split)}")
print(brown_split[:20])

print("\nReuters - Split:")
print(f"Total tokens: {len(reuters_split)}")
print(reuters_split[:20])

Brown - Split:
Total tokens: 300
['The', 'Fulton', 'County', 'Grand', 'Jury', 'said', 'Friday', 'an', 'investigation', 'of', "Atlanta's", 'recent', 'primary', 'election', 'produced', '``', 'no', 'evidence', "''", 'that']

Reuters - Split:
Total tokens: 300
['ASIAN', 'EXPORTERS', 'FEAR', 'DAMAGE', 'FROM', 'U', '.', 'S', '.-', 'JAPAN', 'RIFT', 'Mounting', 'trade', 'friction', 'between', 'the', 'U', '.', 'S', '.']


### Task 9: 4 Additional functions

In [29]:
from nltk import FreqDist, ngrams, collocations
from nltk.chunk import ne_chunk
from nltk.corpus import stopwords

# Prepare data for analysis
brown_tokens = nltk.word_tokenize(brown_sample)
reuters_tokens = nltk.word_tokenize(reuters_sample)

# Remove stopwords for better analysis
stop_words = set(stopwords.words('english'))
brown_filtered = [w.lower() for w in brown_tokens if w.isalpha() and w.lower() not in stop_words]
reuters_filtered = [w.lower() for w in reuters_tokens if w.isalpha() and w.lower() not in stop_words]

# 1. Frequency Distribution
print("=" * 60)
print("1. FREQUENCY DISTRIBUTION - Word Count & Common Words")
print("=" * 60)

brown_freq = FreqDist(brown_filtered)
reuters_freq = FreqDist(reuters_filtered)

print("\nBrown Corpus:")
print(f"Total unique words: {len(brown_freq)}")
print(f"Most common words: {brown_freq.most_common(10)}")

print("\nReuters Corpus:")
print(f"Total unique words: {len(reuters_freq)}")
print(f"Most common words: {reuters_freq.most_common(10)}")

# 2. N-grams - Word Pattern Analysis
print("\n" + "=" * 60)
print("2. N-GRAMS - Word Pattern Analysis")
print("=" * 60)

brown_bigrams = list(ngrams(brown_filtered, 2))
brown_trigrams = list(ngrams(brown_filtered, 3))
reuters_bigrams = list(ngrams(reuters_filtered, 2))
reuters_trigrams = list(ngrams(reuters_filtered, 3))

print("\nBrown Corpus:")
print(f"Sample Bigrams (2-grams): {brown_bigrams[:10]}")
print(f"Sample Trigrams (3-grams): {brown_trigrams[:5]}")

print("\nReuters Corpus:")
print(f"Sample Bigrams (2-grams): {reuters_bigrams[:10]}")
print(f"Sample Trigrams (3-grams): {reuters_trigrams[:5]}")

# Most common bigrams
brown_bigram_freq = FreqDist(brown_bigrams)
reuters_bigram_freq = FreqDist(reuters_bigrams)
print(f"\nMost common Brown bigrams: {brown_bigram_freq.most_common(5)}")
print(f"Most common Reuters bigrams: {reuters_bigram_freq.most_common(5)}")

# 3. Collocations - Meaningful Word Pairs
print("\n" + "=" * 60)
print("3. COLLOCATIONS - Meaningful Word Pairs")
print("=" * 60)

# Use bigram collocation finder
brown_bigram_measures = collocations.BigramAssocMeasures()
brown_finder = collocations.BigramCollocationFinder.from_words(brown_filtered)
brown_finder.apply_freq_filter(2)  # Only bigrams that appear at least 2 times

reuters_bigram_measures = collocations.BigramAssocMeasures()
reuters_finder = collocations.BigramCollocationFinder.from_words(reuters_filtered)
reuters_finder.apply_freq_filter(2)

print("\nBrown Corpus - Top Collocations (PMI):")
print(brown_finder.nbest(brown_bigram_measures.pmi, 10))

print("\nReuters Corpus - Top Collocations (PMI):")
print(reuters_finder.nbest(reuters_bigram_measures.pmi, 10))

# 4. Chunking - Phrase Extraction
print("\n" + "=" * 60)
print("4. CHUNKING - Phrase Extraction")
print("=" * 60)

# POS tag the original tokens (with punctuation)
brown_pos = nltk.pos_tag(brown_tokens[:50])
reuters_pos = nltk.pos_tag(reuters_tokens[:50])

# Define chunk grammar for noun phrases
chunk_grammar = r"""
  NP: {<DT>?<JJ>*<NN.*>+}  # Noun phrases
  VP: {<VB.*><NP|PP>}       # Verb phrases
  PP: {<IN><NP>}            # Prepositional phrases
"""
chunk_parser = nltk.RegexpParser(chunk_grammar)

# Chunk the sentences
brown_chunks = chunk_parser.parse(brown_pos)
reuters_chunks = chunk_parser.parse(reuters_pos)

print("\nBrown Corpus - Extracted Chunks:")
for subtree in brown_chunks.subtrees(filter=lambda t: t.label() != 'S'):
    print(f"{subtree.label()}: {' '.join([word for word, tag in subtree.leaves()])}")

print("\nReuters Corpus - Extracted Chunks:")
for subtree in reuters_chunks.subtrees(filter=lambda t: t.label() != 'S'):
    print(f"{subtree.label()}: {' '.join([word for word, tag in subtree.leaves()])}")

print("\n" + "=" * 60)
print("SUMMARY OF ADDITIONAL FUNCTIONS")
print("=" * 60)
print("✓ Frequency Distribution: Analyzed word counts and identified most common words")
print("✓ N-grams: Extracted bigrams and trigrams for pattern analysis")
print("✓ Collocations: Found meaningful word pairs using PMI scoring")
print("✓ Chunking: Extracted noun phrases, verb phrases, and prepositional phrases")

1. FREQUENCY DISTRIBUTION - Word Count & Common Words

Brown Corpus:
Total unique words: 104
Most common words: [('jury', 8), ('said', 7), ('city', 6), ('election', 5), ('fulton', 4), ('atlanta', 3), ('county', 2), ('grand', 2), ('primary', 2), ('irregularities', 2)]

Reuters Corpus:
Total unique words: 115
Most common words: [('u', 6), ('said', 6), ('japan', 4), ('tariffs', 4), ('japanese', 3), ('electronics', 3), ('asian', 2), ('exporters', 2), ('damage', 2), ('businessmen', 2)]

2. N-GRAMS - Word Pattern Analysis

Brown Corpus:
Sample Bigrams (2-grams): [('fulton', 'county'), ('county', 'grand'), ('grand', 'jury'), ('jury', 'said'), ('said', 'friday'), ('friday', 'investigation'), ('investigation', 'atlanta'), ('atlanta', 'recent'), ('recent', 'primary'), ('primary', 'election')]
Sample Trigrams (3-grams): [('fulton', 'county', 'grand'), ('county', 'grand', 'jury'), ('grand', 'jury', 'said'), ('jury', 'said', 'friday'), ('said', 'friday', 'investigation')]

Reuters Corpus:
Sample Bi

### Task 10: Wordnet

In [30]:
from nltk.corpus import wordnet as wn

print("=" * 60)
print("WORDNET EXPLORATION")
print("=" * 60)

# Select some words from our corpora for analysis
test_words = ['bank', 'good', 'run', 'company', 'market']

for word in test_words:
    print(f"\n{'='*60}")
    print(f"Word: '{word.upper()}'")
    print(f"{'='*60}")
    
    # Get synsets (synonym sets)
    synsets = wn.synsets(word)
    print(f"\nNumber of synsets: {len(synsets)}")
    
    # Show first 3 synsets with details
    for i, syn in enumerate(synsets[:3], 1):
        print(f"\n{i}. Synset: {syn.name()}")
        print(f"   POS: {syn.pos()} (n=noun, v=verb, a=adjective, r=adverb)")
        print(f"   Definition: {syn.definition()}")
        print(f"   Examples: {syn.examples()}")
        print(f"   Lemmas: {[lemma.name() for lemma in syn.lemmas()]}")
    
    # Demonstrate synonyms and antonyms
    if synsets:
        first_synset = synsets[0]
        print(f"\nSynonyms of '{word}' (from first synset):")
        synonyms = set()
        for lemma in first_synset.lemmas():
            synonyms.add(lemma.name())
        print(f"   {synonyms}")
        
        print(f"\nAntonyms of '{word}' (if available):")
        antonyms = set()
        for lemma in first_synset.lemmas():
            if lemma.antonyms():
                antonyms.add(lemma.antonyms()[0].name())
        print(f"   {antonyms if antonyms else 'No direct antonyms found'}")

# Demonstrate semantic similarity
print(f"\n{'='*60}")
print("SEMANTIC SIMILARITY")
print(f"{'='*60}")

# Compare similar words
word_pairs = [
    ('bank', 'river'),
    ('bank', 'money'),
    ('good', 'bad'),
    ('run', 'walk')
]

for word1, word2 in word_pairs:
    synsets1 = wn.synsets(word1)
    synsets2 = wn.synsets(word2)
    
    if synsets1 and synsets2:
        # Use Wu-Palmer similarity (0 to 1, higher is more similar)
        similarity = synsets1[0].wup_similarity(synsets2[0])
        print(f"\nSimilarity between '{word1}' and '{word2}': {similarity:.3f}")
        print(f"   {word1}: {synsets1[0].definition()}")
        print(f"   {word2}: {synsets2[0].definition()}")

# Demonstrate hypernyms and hyponyms
print(f"\n{'='*60}")
print("HYPERNYMS AND HYPONYMS")
print(f"{'='*60}")

word = 'company'
synsets = wn.synsets(word)
if synsets:
    syn = synsets[0]
    print(f"\nWord: '{word}'")
    print(f"Synset: {syn.name()}")
    print(f"Definition: {syn.definition()}")
    
    # Hypernyms (more general terms)
    print(f"\nHypernyms (more general):")
    for hyper in syn.hypernyms()[:3]:
        print(f"   - {hyper.name()}: {hyper.definition()}")
    
    # Hyponyms (more specific terms)
    print(f"\nHyponyms (more specific):")
    for hypo in syn.hyponyms()[:5]:
        print(f"   - {hypo.name()}: {hypo.definition()}")

print(f"\n{'='*60}")
print("WORDNET SUMMARY")
print(f"{'='*60}")
print("✓ Explored word senses and definitions")
print("✓ Found synonyms and antonyms")
print("✓ Calculated semantic similarity between words")
print("✓ Identified hypernyms (general) and hyponyms (specific)")

WORDNET EXPLORATION

Word: 'BANK'

Number of synsets: 18

1. Synset: bank.n.01
   POS: n (n=noun, v=verb, a=adjective, r=adverb)
   Definition: sloping land (especially the slope beside a body of water)
   Examples: ['they pulled the canoe up on the bank', 'he sat on the bank of the river and watched the currents']
   Lemmas: ['bank']

2. Synset: depository_financial_institution.n.01
   POS: n (n=noun, v=verb, a=adjective, r=adverb)
   Definition: a financial institution that accepts deposits and channels the money into lending activities
   Examples: ['he cashed a check at the bank', 'that bank holds the mortgage on my home']
   Lemmas: ['depository_financial_institution', 'bank', 'banking_concern', 'banking_company']

3. Synset: bank.n.03
   POS: n (n=noun, v=verb, a=adjective, r=adverb)
   Definition: a long ridge or pile
   Examples: ['a huge bank of earth']
   Lemmas: ['bank']

Synonyms of 'bank' (from first synset):
   {'bank'}

Antonyms of 'bank' (if available):
   No direct ant

### Task 11: Additional Lexical resource

In [31]:
from nltk.corpus import names, words, cmudict
import random

print("=" * 60)
print("ADDITIONAL LEXICAL RESOURCES")
print("=" * 60)

# 1. NAMES CORPUS - Common first names
print("\n" + "=" * 60)
print("1. NAMES CORPUS")
print("=" * 60)

male_names = names.words('male.txt')
female_names = names.words('female.txt')

print(f"\nTotal male names: {len(male_names)}")
print(f"Total female names: {len(female_names)}")
print(f"Sample male names: {random.sample(male_names, 10)}")
print(f"Sample female names: {random.sample(female_names, 10)}")

# Check if a word is a name
test_words = ['John', 'Alice', 'market', 'company', 'Michael', 'Sarah']
print("\nName Detection:")
for word in test_words:
    is_male = word in male_names
    is_female = word in female_names
    if is_male:
        print(f"   '{word}' is a MALE name")
    elif is_female:
        print(f"   '{word}' is a FEMALE name")
    else:
        print(f"   '{word}' is NOT a common first name")

# 2. WORDS CORPUS - English dictionary words
print("\n" + "=" * 60)
print("2. WORDS CORPUS - English Dictionary")
print("=" * 60)

english_words = set(words.words())
print(f"\nTotal words in dictionary: {len(english_words)}")
print(f"Sample words: {random.sample(list(english_words), 15)}")

# Spell checking
test_words = ['hello', 'wrld', 'company', 'businss', 'market', 'analyss']
print("\nSpell Checking:")
for word in test_words:
    if word.lower() in english_words:
        print(f"   ✓ '{word}' is spelled correctly")
    else:
        print(f"   ✗ '{word}' is NOT in dictionary (possible misspelling)")

# Find words by pattern
print("\nWords starting with 'financi' (first 10):")
financial_words = [w for w in english_words if w.lower().startswith('financi')]
print(f"   {sorted(financial_words)[:10]}")

# 3. CMU PRONOUNCING DICTIONARY - Phonetic transcriptions
print("\n" + "=" * 60)
print("3. CMU PRONOUNCING DICTIONARY")
print("=" * 60)

pronouncing_dict = cmudict.dict()
print(f"\nTotal words with pronunciations: {len(pronouncing_dict)}")

# Get pronunciations
test_words = ['company', 'market', 'finance', 'analysis', 'python']
print("\nPhonetic Pronunciations:")
for word in test_words:
    if word.lower() in pronouncing_dict:
        pronunciations = pronouncing_dict[word.lower()]
        print(f"\n   '{word}':")
        for i, pron in enumerate(pronunciations, 1):
            print(f"      {i}. {' '.join(pron)}")
    else:
        print(f"\n   '{word}': No pronunciation found")

# Find rhyming words
def get_rhyme_part(phonemes):
    """Extract the part of pronunciation used for rhyming"""
    for i, phoneme in enumerate(phonemes):
        if phoneme[-1].isdigit():  # Find first stressed vowel
            return phonemes[i:]
    return phonemes

print("\n" + "=" * 60)
print("RHYME DETECTION")
print("=" * 60)

word = 'good'
if word.lower() in pronouncing_dict:
    target_pron = pronouncing_dict[word.lower()][0]
    target_rhyme = get_rhyme_part(target_pron)
    
    print(f"\nFinding words that rhyme with '{word}':")
    print(f"Rhyme pattern: {' '.join(target_rhyme)}")
    
    rhymes = []
    for w, prons in list(pronouncing_dict.items())[:1000]:  # Check first 1000 words
        for pron in prons:
            if get_rhyme_part(pron) == target_rhyme and w != word.lower():
                rhymes.append(w)
                break
    
    print(f"Rhyming words (from first 1000): {rhymes[:10]}")

# Syllable counting
print("\n" + "=" * 60)
print("SYLLABLE COUNTING")
print("=" * 60)

def count_syllables(word):
    """Count syllables in a word using CMU dict"""
    if word.lower() in pronouncing_dict:
        # Count vowel phonemes (those ending in digits)
        return len([ph for ph in pronouncing_dict[word.lower()][0] if ph[-1].isdigit()])
    return 0

test_words = ['company', 'business', 'analysis', 'market', 'financial', 'economy']
print("\nSyllable counts:")
for word in test_words:
    syllables = count_syllables(word)
    print(f"   '{word}': {syllables} syllable(s)")

print("\n" + "=" * 60)
print("LEXICAL RESOURCES SUMMARY")
print("=" * 60)
print("✓ Names Corpus: Identified common first names (male/female)")
print("✓ Words Corpus: Dictionary lookup and spell checking")
print("✓ CMU Dictionary: Phonetic transcriptions, rhyme detection, syllable counting")

ADDITIONAL LEXICAL RESOURCES

1. NAMES CORPUS

Total male names: 2943
Total female names: 5001
Sample male names: ['Simmonds', 'Isa', 'Collins', 'Salmon', 'Antin', 'Philip', 'Wojciech', 'Barthel', 'Filip', 'Bradly']
Sample female names: ['Davine', 'Ariadne', 'Arleen', 'Neda', 'Cordelie', 'Dorolice', 'Viola', 'Eolande', 'Lillian', 'Amelie']

Name Detection:
   'John' is a MALE name
   'Alice' is a FEMALE name
   'market' is NOT a common first name
   'company' is NOT a common first name
   'Michael' is a MALE name
   'Sarah' is a FEMALE name

2. WORDS CORPUS - English Dictionary

Total words in dictionary: 235892
Sample words: ['berther', 'augurate', 'Logris', 'doon', 'suaveness', 'arrayment', 'pachylosis', 'rookeried', 'summage', 'Cestraciontes', 'pugilistically', 'pleurectomy', 'urometer', 'doddypoll', 'velours']

Spell Checking:
   ✓ 'hello' is spelled correctly
   ✗ 'wrld' is NOT in dictionary (possible misspelling)
   ✓ 'company' is spelled correctly
   ✗ 'businss' is NOT in dictio