#### Stopwords are common words that appear frequently in text but usually don’t carry significant meaning or help in analysis — so they’re often removed during text preprocessing.

##### Examples of stopwords: a, an, the, in, or, at, off, for, etc.


In [166]:
# Sample paragraph used as the input text; it is split into sentences with sent_tokenize further down.
corpus = """The quiet morning air was filled with the soft hum of distant traffic and the gentle rustling of leaves. 
Sunlight filtered through the trees, painting golden patterns on the sidewalk. A man jogged past, earbuds in, lost in his rhythm. 
Nearby, a small café opened its doors, releasing the aroma of freshly brewed coffee into the street.
People began to stir, stepping out of their homes, ready to start another day.
The world felt calm yet alive, as if every small movement was part of a larger, invisible rhythm guiding the city toward a new beginning."""

In [167]:
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

In [168]:
import nltk

# Fetch every NLTK resource this notebook relies on, not just the stopword list:
#   - 'stopwords'          -> stopwords.words('english')
#   - 'punkt'/'punkt_tab'  -> sent_tokenize / word_tokenize (older NLTK releases
#                             load 'punkt', newer ones load 'punkt_tab')
#   - 'wordnet'            -> WordNetLemmatizer
# Without the tokenizer models and WordNet, a fresh environment raises
# LookupError in the cells below. Packages that are already installed are
# reported as up-to-date and not downloaded again.
for resource in ('stopwords', 'punkt', 'punkt_tab', 'wordnet'):
    nltk.download(resource)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\12368\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [169]:
# Display the English stopword list provided by the NLTK stopwords corpus.
stopwords.words('english')

['a',
 'about',
 'above',
 'after',
 'again',
 'against',
 'ain',
 'all',
 'am',
 'an',
 'and',
 'any',
 'are',
 'aren',
 "aren't",
 'as',
 'at',
 'be',
 'because',
 'been',
 'before',
 'being',
 'below',
 'between',
 'both',
 'but',
 'by',
 'can',
 'couldn',
 "couldn't",
 'd',
 'did',
 'didn',
 "didn't",
 'do',
 'does',
 'doesn',
 "doesn't",
 'doing',
 'don',
 "don't",
 'down',
 'during',
 'each',
 'few',
 'for',
 'from',
 'further',
 'had',
 'hadn',
 "hadn't",
 'has',
 'hasn',
 "hasn't",
 'have',
 'haven',
 "haven't",
 'having',
 'he',
 "he'd",
 "he'll",
 'her',
 'here',
 'hers',
 'herself',
 "he's",
 'him',
 'himself',
 'his',
 'how',
 'i',
 "i'd",
 'if',
 "i'll",
 "i'm",
 'in',
 'into',
 'is',
 'isn',
 "isn't",
 'it',
 "it'd",
 "it'll",
 "it's",
 'its',
 'itself',
 "i've",
 'just',
 'll',
 'm',
 'ma',
 'me',
 'mightn',
 "mightn't",
 'more',
 'most',
 'mustn',
 "mustn't",
 'my',
 'myself',
 'needn',
 "needn't",
 'no',
 'nor',
 'not',
 'now',
 'o',
 'of',
 'off',
 'on',
 'once',
 'on

In [170]:
# Number of entries in the English stopword list.
print(len(stopwords.words('english')))

198


In [171]:
# Split the paragraph into individual sentence strings and show them.
sentences = sent_tokenize(corpus)
print(sentences)

['The quiet morning air was filled with the soft hum of distant traffic and the gentle rustling of leaves.', 'Sunlight filtered through the trees, painting golden patterns on the sidewalk.', 'A man jogged past, earbuds in, lost in his rhythm.', 'Nearby, a small café opened its doors, releasing the aroma of freshly brewed coffee into the street.', 'People began to stir, stepping out of their homes, ready to start another day.', 'The world felt calm yet alive, as if every small movement was part of a larger, invisible rhythm guiding the city toward a new beginning.']


In [172]:
# Check the container type returned by sent_tokenize.
type(sentences)

list

In [173]:
# Single lemmatizer instance, reused for every token processed below.
lemmatizer = WordNetLemmatizer()

In [192]:
# Remove stopwords from each sentence and lemmatize the tokens that remain.
#
# The stopword set gets its own name on purpose: rebinding `stopwords` would
# shadow the imported NLTK corpus reader, so re-running this cell (or any cell
# that calls `stopwords.words(...)`) would fail with
# "AttributeError: 'set' object has no attribute 'words'".
# A set is used so each membership test is a constant-time lookup.
english_stopwords = set(stopwords.words('english'))

corpus_without_stopwords = []
for sentence in sentences:
    words = word_tokenize(sentence)
    # The NLTK list is lowercase, so compare case-insensitively while keeping
    # each surviving token's original casing. Punctuation tokens are not
    # stopwords and therefore stay in the output.
    words = [
        lemmatizer.lemmatize(word)
        for word in words
        if word.lower() not in english_stopwords
    ]
    corpus_without_stopwords.append(' '.join(words))

In [194]:
# Inspect the result: one stopword-free, lemmatized string per input sentence.
corpus_without_stopwords

['quiet morning air filled soft hum distant traffic gentle rustling leaf .',
 'Sunlight filtered tree , painting golden pattern sidewalk .',
 'man jogged past , earbuds , lost rhythm .',
 'Nearby , small café opened door , releasing aroma freshly brewed coffee street .',
 'People began stir , stepping home , ready start another day .',
 'world felt calm yet alive , every small movement part larger , invisible rhythm guiding city toward new beginning .']