In [27]:
from nltk.corpus import stopwords,wordnet
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer,PorterStemmer
from nltk import pos_tag

In [4]:
example_text = """16/09/2004 Set in Varanasi, Gopal, Aarti and Raghav have been best friends since school. As teenagers, Gopal has fallen in love with Aarti, often pushing to be more than friends. Aarti consistently rebuffs, expressing that she values their friendship and isn't ready for a relationship with anybody.

Gopal and Raghav are both studying to get into engineering colleges, but Gopal gets a low rank in the All India Engineering Entrance Exams, while Raghav is among the toppers. To help improve his rank, Gopal moves to Kota alone to join reputed coaching classes to resit the exam the following year. Raghav goes on to pass the IIT entrance exam as well, and starts attending Banaras Hindu University.

During Gopal's absence in Kota, Aarti and Raghav develop feelings for each other, and Aarti tells Gopal about her relationship when they chat online. Gopal is heartbroken and lashes out at her, causing them to lose touch briefly and Gopal to fall behind in his course. When they rekindle their friendship, Gopal begins to study hard again, but gets a low rank once more after sitting the AIEEE exam for the second time. Gopal returns home to Varanasi, where his father dies shortly after learning of his low rank. Meanwhile, Raghav and Aarti's relationship is flourishing, and Raghav has found his passion in activism and being on his university's newspaper committee.
"""

In [5]:
words = word_tokenize(example_text)

In [13]:
stop_words = set(stopwords.words("english"))

In [14]:
filtered_words = [word for word in words if not word in stop_words]

In [15]:
filtered_words

['16/09/2004',
 'Set',
 'Varanasi',
 ',',
 'Gopal',
 ',',
 'Aarti',
 'Raghav',
 'best',
 'friends',
 'since',
 'school',
 '.',
 'As',
 'teenagers',
 ',',
 'Gopal',
 'fallen',
 'love',
 'Aarti',
 ',',
 'often',
 'pushing',
 'friends',
 '.',
 'Aarti',
 'consistently',
 'rebuffs',
 ',',
 'expressing',
 'values',
 'friendship',
 "n't",
 'ready',
 'relationship',
 'anybody',
 '.',
 'Gopal',
 'Raghav',
 'studying',
 'get',
 'engineering',
 'colleges',
 ',',
 'Gopal',
 'gets',
 'low',
 'rank',
 'All',
 'India',
 'Engineering',
 'Entrance',
 'Exams',
 ',',
 'Raghav',
 'among',
 'toppers',
 '.',
 'To',
 'help',
 'improve',
 'rank',
 ',',
 'Gopal',
 'moves',
 'Kota',
 'alone',
 'join',
 'reputed',
 'coaching',
 'classes',
 'resit',
 'exam',
 'following',
 'year',
 '.',
 'Raghav',
 'goes',
 'pass',
 'IIT',
 'entrance',
 'exam',
 'well',
 ',',
 'starts',
 'attending',
 'Banaras',
 'Hindu',
 'University',
 '.',
 'During',
 'Gopal',
 "'s",
 'absence',
 'Kota',
 ',',
 'Aarti',
 'Raghav',
 'develop',


lemmatizer and stop words

In [17]:
lemmatizer = WordNetLemmatizer()

In [18]:
def get_wordnet_pos(treebank_tag):
    if treebank_tag.startswith('J'):
        return wordnet.ADJ
    elif treebank_tag.startswith('V'):
        return wordnet.VERB
    elif treebank_tag.startswith('N'):
        return wordnet.NOUN
    elif treebank_tag.startswith('R'):
        return wordnet.ADV
    else:
        return None

In [20]:
pos_tags = pos_tag(words)
# pos_tags

In [23]:
lemmatized_non_stop_words = [lemmatizer.lemmatize(word, pos=get_wordnet_pos(pos) or wordnet.NOUN) 
                             for word, pos in pos_tags if word.lower() not in stop_words]

In [26]:
print("Lemmatized (excluding stop words):", lemmatized_non_stop_words)

Lemmatized (excluding stop words): ['16/09/2004', 'Set', 'Varanasi', ',', 'Gopal', ',', 'Aarti', 'Raghav', 'best', 'friend', 'since', 'school', '.', 'teenager', ',', 'Gopal', 'fall', 'love', 'Aarti', ',', 'often', 'push', 'friend', '.', 'Aarti', 'consistently', 'rebuff', ',', 'express', 'value', 'friendship', "n't", 'ready', 'relationship', 'anybody', '.', 'Gopal', 'Raghav', 'study', 'get', 'engineering', 'college', ',', 'Gopal', 'get', 'low', 'rank', 'India', 'Engineering', 'Entrance', 'Exams', ',', 'Raghav', 'among', 'topper', '.', 'help', 'improve', 'rank', ',', 'Gopal', 'move', 'Kota', 'alone', 'join', 'reputed', 'coaching', 'class', 'resit', 'exam', 'following', 'year', '.', 'Raghav', 'go', 'pass', 'IIT', 'entrance', 'exam', 'well', ',', 'start', 'attend', 'Banaras', 'Hindu', 'University', '.', 'Gopal', "'s", 'absence', 'Kota', ',', 'Aarti', 'Raghav', 'develop', 'feeling', ',', 'Aarti', 'tell', 'Gopal', 'relationship', 'chat', 'online', '.', 'Gopal', 'heartbroken', 'lash', ',', 'c

Stop Words Stemming

In [28]:
stemmer = PorterStemmer()

In [29]:
stemmed_non_stop_words = [stemmer.stem(word) for word in words if word.lower() not in stop_words]
print("Stemmed (excluding stop words):", stemmed_non_stop_words)

Stemmed (excluding stop words): ['16/09/2004', 'set', 'varanasi', ',', 'gopal', ',', 'aarti', 'raghav', 'best', 'friend', 'sinc', 'school', '.', 'teenag', ',', 'gopal', 'fallen', 'love', 'aarti', ',', 'often', 'push', 'friend', '.', 'aarti', 'consist', 'rebuff', ',', 'express', 'valu', 'friendship', "n't", 'readi', 'relationship', 'anybodi', '.', 'gopal', 'raghav', 'studi', 'get', 'engin', 'colleg', ',', 'gopal', 'get', 'low', 'rank', 'india', 'engin', 'entranc', 'exam', ',', 'raghav', 'among', 'topper', '.', 'help', 'improv', 'rank', ',', 'gopal', 'move', 'kota', 'alon', 'join', 'reput', 'coach', 'class', 'resit', 'exam', 'follow', 'year', '.', 'raghav', 'goe', 'pass', 'iit', 'entranc', 'exam', 'well', ',', 'start', 'attend', 'banara', 'hindu', 'univers', '.', 'gopal', "'s", 'absenc', 'kota', ',', 'aarti', 'raghav', 'develop', 'feel', ',', 'aarti', 'tell', 'gopal', 'relationship', 'chat', 'onlin', '.', 'gopal', 'heartbroken', 'lash', ',', 'caus', 'lose', 'touch', 'briefli', 'gopal', '