### Import and set up the NLTK library ###

In [1]:
import nltk

In [45]:

from nltk.tokenize import sent_tokenize, word_tokenize

# Fetch the Punkt tokenizer data used by the tokenizers imported above.
# After the first run this call only verifies that the local copy is current.
nltk.download("punkt_tab")


[nltk_data] Downloading package punkt_tab to
[nltk_data]     /Users/macbook/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


In [49]:
# Sample input: three short sentences (the first one ends with "!.").
text = "hello!. I'm fereshte. I love learning NLP"

# Tokenize the same string at two granularities:
# whole sentences, and individual words/punctuation marks.
sentences = sent_tokenize(text)
words = word_tokenize(text)

print("sentences:", sentences)
print("words:", words)

sentences: ['hello!.', "I'm fereshte.", 'I love learning NLP']
words: ['hello', '!', '.', 'I', "'m", 'fereshte', '.', 'I', 'love', 'learning', 'NLP']


**Stop words are words that occur very frequently in a language but add little meaning to our analysis, such as "is", "an", "a", and "the".**

In [57]:
from nltk.corpus import stopwords

# Fetch the stop-word corpus; after the first run this only checks for updates.
nltk.download('stopwords')

# NLTK's list of English stop words (kept as a list so existing uses still work).
stop_words_en = stopwords.words('english')

# Testing membership in a list rescans it for every token;
# a set gives constant-time lookups and exactly the same result.
stop_words_lookup = set(stop_words_en)

sample_text = "This is an example showing off stop word filtration."

words = word_tokenize(sample_text)
# Compare in lower case so capitalised tokens such as "This" are matched too.
# Punctuation tokens such as "." are not stop words, so they are kept.
filtered_words = [w for w in words if w.lower() not in stop_words_lookup]

print("Words without stopwords:", filtered_words)


Words without stopwords: ['example', 'showing', 'stop', 'word', 'filtration', '.']


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/macbook/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


### Stemming (reducing words to a stem by stripping suffixes) ###

In [68]:
from nltk.stem import PorterStemmer

# The Porter algorithm is one of the best-known stemmers for English.
stemmer = PorterStemmer()

words = ["learning", "learned", "learner", "learning"]

# Stem every word, keeping the input order.
stems = list(map(stemmer.stem, words))
print("Roots of words:", stems)


Roots of words: ['learn', 'learn', 'learner', 'learn']


### Lemmatization (more accurate than stemming: maps each word to its dictionary form) ###

In [39]:
from nltk.stem import WordNetLemmatizer

# WordNet is the lexical database that the lemmatizer looks words up in.
nltk.download("wordnet")
# omw-1.4 is release 1.4 of the Open Multilingual Wordnet, which links
# WordNet entries to words in other languages.
nltk.download("omw-1.4")

lemmatizer = WordNetLemmatizer()

# Each pair is (word, part of speech): "a" = adjective, "v" = verb.
# The part-of-speech hint tells the lemmatizer which dictionary form to look for.
print(
    *(
        lemmatizer.lemmatize(word, pos=pos)
        for word, pos in (("better", "a"), ("running", "v"))
    ),
    sep="\n",
)


[nltk_data] Downloading package wordnet to /Users/macbook/nltk_data...
[nltk_data] Downloading package omw-1.4 to /Users/macbook/nltk_data...


good
run


### POS Tagging (identifying each word's part of speech) ###

In [72]:
# Fetch the English averaged-perceptron model used for POS tagging.
# After the first run this call only checks that the local copy is current.
nltk.download("averaged_perceptron_tagger_eng")

text = "Fereshte is learning natural language processing."

# Split the sentence into tokens first, then tag the token list.
tokens = word_tokenize(text)
# Each entry of `tags` is a (token, tag) pair.
tags = nltk.pos_tag(tokens)

print("POS Tags:", tags)


POS Tags: [('Fereshte', 'NNP'), ('is', 'VBZ'), ('learning', 'VBG'), ('natural', 'JJ'), ('language', 'NN'), ('processing', 'NN'), ('.', '.')]


[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /Users/macbook/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!


In [None]:
#Meaning of the tags in the output above (Penn Treebank tag set):
#NNP: Proper noun, singular → "Fereshte"
#VBZ: Verb, 3rd person singular present → "is"
#VBG: Verb, gerund/present participle → "learning"
#JJ: Adjective → "natural"
#NN: Noun, singular or mass → "language"
#NN: Noun, singular or mass → "processing"
#.: Sentence-final punctuation → "."