
**Library required**

In [1]:
!pip install nltk




**Text**

In [2]:
# Sample sentence used for the tokenization and stop-word examples below.
# Adjacent string literals are concatenated by Python into one string.
text = (
    "NGC 4889 is a massive, early-type galaxy located in the Coma Cluster, "
    "approximately 300 million light-years from Earth, and is notable for "
    "hosting a supermassive black hole at its core that is estimated to have "
    "a mass of over 21 billion solar masses."
)

In [3]:
text

'NGC 4889 is a massive, early-type galaxy located in the Coma Cluster, approximately 300 million light-years from Earth, and is notable for hosting a supermassive black hole at its core that is estimated to have a mass of over 21 billion solar masses.'

**Stopwords**

In [6]:
import nltk

# word_tokenize() needs the Punkt sentence-tokenizer data. Newer NLTK releases
# (3.9+) load it from the 'punkt_tab' resource, while older releases use the
# 'punkt' package. Because the install cell does not pin a version, fetch both
# so the tokenization cell works on a fresh kernel with either.
nltk.download('punkt')
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [12]:
# Fetch the stop-word corpus that nltk.corpus.stopwords reads from in the
# cells below.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [13]:
from nltk.corpus import stopwords

In [14]:
# Load the English stop-word entries; the filtering cells below test each
# token for membership against this collection.
stop_words = stopwords.words('english')

In [15]:
from nltk.tokenize import word_tokenize
# Split the sentence into word-level tokens. Punctuation marks come out as
# their own tokens (see the ',' and '.' entries in the filtered output below).
words = word_tokenize(text)

**Removing stop words**

In [16]:
# Build the lookup set once, outside the loop, instead of once per token.
stop_word_set = set(stop_words)

# Keep every token that is not a stop word. The test is an exact
# (case-sensitive) string match, so punctuation tokens pass through.
holder = []
for w in words:
    if w not in stop_word_set:
        holder.append(w)

In [17]:
holder

['NGC',
 '4889',
 'massive',
 ',',
 'early-type',
 'galaxy',
 'located',
 'Coma',
 'Cluster',
 ',',
 'approximately',
 '300',
 'million',
 'light-years',
 'Earth',
 ',',
 'notable',
 'hosting',
 'supermassive',
 'black',
 'hole',
 'core',
 'estimated',
 'mass',
 '21',
 'billion',
 'solar',
 'masses',
 '.']

**List Comprehension for stop words**

In [18]:
# Same filter as the explicit loop, written as a list comprehension.
# The set is built once up front rather than for every token tested.
stop_word_set = set(stop_words)
holder = [w for w in words if w not in stop_word_set]
print(holder)

['NGC', '4889', 'massive', ',', 'early-type', 'galaxy', 'located', 'Coma', 'Cluster', ',', 'approximately', '300', 'million', 'light-years', 'Earth', ',', 'notable', 'hosting', 'supermassive', 'black', 'hole', 'core', 'estimated', 'mass', '21', 'billion', 'solar', 'masses', '.']


**Stemming**

In [19]:
from nltk.stem import PorterStemmer, SnowballStemmer, LancasterStemmer

In [20]:
# One instance of each stemmer, compared side by side in the cells below.
porter = PorterStemmer()
snow = SnowballStemmer("english")  # Snowball is configured for English here
lancaster = LancasterStemmer()

In [21]:
# NOTE: this rebinds `words` (previously the tokens of `text`) to a small set
# of inflections of "play" used by the stemming and lemmatization cells below.
words = "play plays played playing player".split()

**Porter Stemmer**

In [22]:
# Stem each sample word with the Porter stemmer, collecting results in order.
porter_stemmed = []
for word in words:
    porter_stemmed.append(porter.stem(word))

In [23]:
porter_stemmed

['play', 'play', 'play', 'play', 'player']


**Porter Stemmer List Comprehension**

In [40]:
# Comprehension form of the Porter loop; produces the same list.
porter_stemmed = [porter.stem(word) for word in words]
print(porter_stemmed)

['play', 'play', 'play', 'play', 'player']



**Snowball Stemmer**

In [26]:
# Stem each sample word with the Snowball stemmer, collecting results in order.
snow_stemmed = []
for word in words:
    snow_stemmed.append(snow.stem(word))

In [27]:
snow_stemmed

['play', 'play', 'play', 'play', 'player']


**Snowball Stemmer List Comprehension**

In [28]:
# Comprehension form of the Snowball loop; produces the same list.
snow_stemmed = [snow.stem(word) for word in words]
print(snow_stemmed)

['play', 'play', 'play', 'play', 'player']


**Lancaster Stemmer**

In [29]:
# Stem each sample word with the Lancaster stemmer, collecting results in order.
lancaster_stemmed = []
for word in words:
    lancaster_stemmed.append(lancaster.stem(word))

In [30]:

lancaster_stemmed

['play', 'play', 'play', 'play', 'play']


**Lancaster Stemmer List Comprehension**

In [31]:
# Comprehension form of the Lancaster loop; produces the same list.
lancaster_stemmed = [lancaster.stem(word) for word in words]
print(lancaster_stemmed)

['play', 'play', 'play', 'play', 'play']
['play', 'play', 'play', 'play', 'play']


**Lemmatization: reduces words to dictionary forms (lemmas) using the WordNet vocabulary, rather than stripping suffixes by rule as stemming does**

In [35]:
# Fetch the WordNet data that the WordNetLemmatizer created below relies on.
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [36]:
from nltk.stem import WordNetLemmatizer
# NOTE: despite its name, `wordnet` holds a WordNetLemmatizer instance, not the
# WordNet corpus itself.
wordnet = WordNetLemmatizer()

In [38]:
# lemmatize() looks each word up under a part of speech, and NLTK's default is
# pos="n" (noun); it is spelled out here so the choice is visible. Treated as
# nouns, 'played' and 'playing' are returned unchanged in the output below;
# passing pos="v" instead would look them up as verbs.
lemmatized = [wordnet.lemmatize(word, pos="n") for word in words]

In [39]:
lemmatized

['play', 'play', 'played', 'playing', 'player']