In [5]:
# program 1 (Tokenization)
from nltk.tokenize import sent_tokenize,word_tokenize
text = '''The Natural Language Toolkit (NLTK) is an open source Python library for Natural Language 
Processing.  A free online book is available. If you use the library for academic research, please
cite the book.'''
sent = sent_tokenize(text)
word = word_tokenize(text)
print('Original Text:')
print(text)
print('\nSentence Tokenize:')
print(sent)
print('\nWord Tokenize:')
print(word)

Original Text:
The Natural Language Toolkit (NLTK) is an open source Python library for Natural Language 
Processing.  A free online book is available. If you use the library for academic research, please
cite the book.

Sentence Tokenize:
['The Natural Language Toolkit (NLTK) is an open source Python library for Natural Language \nProcessing.', 'A free online book is available.', 'If you use the library for academic research, please\ncite the book.']

Word Tokenize:
['The', 'Natural', 'Language', 'Toolkit', '(', 'NLTK', ')', 'is', 'an', 'open', 'source', 'Python', 'library', 'for', 'Natural', 'Language', 'Processing', '.', 'A', 'free', 'online', 'book', 'is', 'available', '.', 'If', 'you', 'use', 'the', 'library', 'for', 'academic', 'research', ',', 'please', 'cite', 'the', 'book', '.']


In [7]:
# program 2 (Stopwords)
from nltk.tokenize import sent_tokenize,word_tokenize
from nltk.corpus import stopwords

text = '''The Natural Language Toolkit (NLTK) is an open source Python library for Natural Language 
Processing.  A free online book is available. If you use the library for academic research, please
cite the book.'''
word = word_tokenize(text)
stop_word = stopwords.words('english')
filter_word = []

for i in [x.lower() for x in word]:
    if i not in stop_word:
        filter_word.append(i)
print(filter_word)

['natural', 'language', 'toolkit', '(', 'nltk', ')', 'open', 'source', 'python', 'library', 'natural', 'language', 'processing', '.', 'free', 'online', 'book', 'available', '.', 'use', 'library', 'academic', 'research', ',', 'please', 'cite', 'book', '.']


In [13]:
# program 3 (Stemming)
from nltk.stem import PorterStemmer

In [14]:
ps = PorterStemmer()

In [15]:
words = ['running','quickly','programming','foxes','happily']
for i in words:
    print(i,':',ps.stem(i))

running : run
quickly : quickli
programming : program
foxes : fox
happily : happili


In [16]:
# program 4 (Lemmatization)
from nltk.stem import WordNetLemmatizer

In [17]:
ls = WordNetLemmatizer()

In [22]:
words = ['books','goes','programming','foxes','scientific']
for i in words:
    print(i,':',ls.lemmatize(i))

books : book
goes : go
programming : programming
foxes : fox
scientific : scientific


In [26]:
# program 5 (Regular expression emails retrival)
import re
text = '''Please contact our support team at support@example.com for any assistance. If you have 
questions about your order, email sales@company123.com. We're excited to hear your feedback at 
feedback@emailco.net. Reach out to our marketing department at marketing.team@yourcompany.org. 
For general inquiries, email info@website1234.org.'''
lst = re.findall('\S+@\S+',text)
print(lst)

['support@example.com', 'sales@company123.com.', 'feedback@emailco.net.', 'marketing.team@yourcompany.org.', 'info@website1234.org.']


In [27]:
# program 6 (Regular expression numbers retrival)
import re
text = '''In a survey, 7 out of 9 people prefer using smartphones for communication, while 3 out 
of 4 still use traditional landlines. The average age of the participants was 5, and 2 in 8 were 
under 1 years old. We observed a 5% increase in web traffic this month, and the company's stock 
price rose by 9%.'''
lst = re.findall('[0-9]+',text)
print(lst)

['7', '9', '3', '4', '5', '2', '8', '1', '5', '9']


In [28]:
# program 7 (Word Punctuation)
import nltk
from nltk.tokenize import WordPunctTokenizer
sent = ["The sun was setting over the horizon; its warm, golden glow painted the sky in shades of orange and pink.",
"She couldn't believe her luck as she held the winning lottery ticket, worth a million dollars!",
"Despite his initial reservations, John decided to accept the job offer in New York City; he was excited about the new opportunities it would bring."]
tokenizor = WordPunctTokenizer()
tokenized_sent = [tokenizor.tokenize(sentence) for sentence in sent]
for i,tokens in enumerate(tokenized_sent):
    print(f'Sentence {i+1}',tokens)

Sentence 1 ['The', 'sun', 'was', 'setting', 'over', 'the', 'horizon', ';', 'its', 'warm', ',', 'golden', 'glow', 'painted', 'the', 'sky', 'in', 'shades', 'of', 'orange', 'and', 'pink', '.']
Sentence 2 ['She', 'couldn', "'", 't', 'believe', 'her', 'luck', 'as', 'she', 'held', 'the', 'winning', 'lottery', 'ticket', ',', 'worth', 'a', 'million', 'dollars', '!']
Sentence 3 ['Despite', 'his', 'initial', 'reservations', ',', 'John', 'decided', 'to', 'accept', 'the', 'job', 'offer', 'in', 'New', 'York', 'City', ';', 'he', 'was', 'excited', 'about', 'the', 'new', 'opportunities', 'it', 'would', 'bring', '.']


In [32]:
# program 8 (Monolingual Corpus)
import nltk
from nltk.util import bigrams,trigrams
from nltk import FreqDist
from nltk.corpus import reuters

corpus_word = reuters.words()
corpus_length = len(corpus_word)

word_freq = FreqDist(corpus_word)

bigram_freq = FreqDist(list(bigrams(corpus_word)))
trigram_freq = FreqDist(list(trigrams(corpus_word)))

distinct_word = len(word_freq)
print(f'Distinct Word: {distinct_word}')

print('\nTop 10 most common words:')
print(word_freq.most_common(10))

print('\nTop 10 most common bigrams:')
print(bigram_freq.most_common(10))

print('\nTop 10 most common trigrams:')
print(trigram_freq.most_common(10))

Distinct Word: 41600

Top 10 most common words:
[('.', 94687), (',', 72360), ('the', 58251), ('of', 35979), ('to', 34035), ('in', 26478), ('said', 25224), ('and', 25043), ('a', 23492), ('mln', 18037)]

Top 10 most common bigrams:
[((',', '000'), 10266), (("'", 's'), 9220), (('lt', ';'), 8693), (('&', 'lt'), 8688), (('.', 'The'), 8530), (('said', '.'), 7888), (('of', 'the'), 6803), (('in', 'the'), 6487), (('U', '.'), 6350), (('.', 'S'), 5833)]

Top 10 most common trigrams:
[(('&', 'lt', ';'), 8687), (('U', '.', 'S'), 5693), (('.', 'S', '.'), 5360), ((',', '000', 'vs'), 2577), (('the', 'U', '.'), 1959), ((',', '000', 'dlrs'), 1524), (('said', '.', 'The'), 1516), (('.', '5', 'mln'), 1337), (('he', 'said', '.'), 1229), ((',', '000', 'Revs'), 1198)]


In [35]:
# program 9 (POS Tags)
from nltk import pos_tag
from nltk.tokenize import word_tokenize
text = 'The Natural Language Toolkit (NLTK) is an open source Python library for Natural Language Processing.'
for i in text.split():
    print(pos_tag(word_tokenize(i)))

[('The', 'DT')]
[('Natural', 'JJ')]
[('Language', 'NN')]
[('Toolkit', 'NN')]
[('(', '('), ('NLTK', 'NNP'), (')', ')')]
[('is', 'VBZ')]
[('an', 'DT')]
[('open', 'JJ')]
[('source', 'NN')]
[('Python', 'NN')]
[('library', 'NN')]
[('for', 'IN')]
[('Natural', 'JJ')]
[('Language', 'NN')]
[('Processing', 'NN'), ('.', '.')]
